## <center>DataLab Cup 1: Text Feature Engineering</center>

#### - To import the libraries

In [7]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

#### 1. To load the datasets

In [8]:
# Read the training and test CSV files into DataFrames (paths are relative to the notebook).
train_data = pd.read_csv('../dataset/train.csv')
test_data  = pd.read_csv('../dataset/test.csv')


In [9]:
print(test_data.shape)
train_data.head()

(11847, 2)


Unnamed: 0,Id,Popularity,Page content
0,0,-1,"<html><head><div class=""article-info""> <span c..."
1,1,1,"<html><head><div class=""article-info""><span cl..."
2,2,1,"<html><head><div class=""article-info""><span cl..."
3,3,-1,"<html><head><div class=""article-info""><span cl..."
4,4,-1,"<html><head><div class=""article-info""><span cl..."


#### 2. To extract the features from the dataset

將一些我們想要用到的feature從dataset中提取出來。以下為提取的特徵:

- title
- time(year/month/day/hour/minute/second)
- number of images (num_img)
- number of videos (num_video)
- author name
- topic
- channel
- length of content 

In [10]:
from bs4 import BeautifulSoup
from datetime import datetime
# to get the attribute of the 'title', 'year/month/date/day/hour/minute/second/is_weekend', 'num_img', 'num_video', 'author name', 'topic', 'channel', 'content length'

def preprocessor(text):
    """Extract hand-crafted features from the raw HTML of one article.

    Parameters
    ----------
    text : str
        HTML source of a single page.

    Returns
    -------
    tuple
        ``(title, author_name, channel, topic, year, month, date, day,
        is_weekend, hour, minute, second, num_img, num_video, len_content)``.
        ``day`` is the ISO weekday (Monday=1 ... Sunday=7) and ``is_weekend``
        is 1 for Saturday/Sunday.

    Notes
    -----
    Missing page elements never raise: text fields fall back to ``''``
    (title) or ``'not found'`` (author, topic, channel), the timestamp falls
    back to a fixed default date, and the content length falls back to 0.
    """
    soup = BeautifulSoup(text, 'html.parser')

    # 1. Title (<h1>). get_text() also works when the headline contains
    #    nested tags, where `.string` would be None.
    headline = soup.find('h1')
    title = headline.get_text().strip().lower() if headline is not None else ''

    # 2. Publication time (<time datetime="...">).
    fallback_date = 'wed, 10 oct 2014 15:00:43 +0000'
    date_format = '%a, %d %b %Y %H:%M:%S %z'
    time_tag = soup.find('time')
    try:
        date_string = time_tag['datetime']
    except (TypeError, KeyError):
        # TypeError: no <time> tag at all; KeyError: tag without the attribute.
        date_string = fallback_date
    try:
        datetimes = datetime.strptime(date_string.strip().lower(), date_format)
    except ValueError:
        # Malformed timestamp: use the same default as a missing one.
        datetimes = datetime.strptime(fallback_date, date_format)

    year = datetimes.year
    month = datetimes.month
    date = datetimes.day
    day = datetimes.isoweekday()
    is_weekend = 1 if day >= 6 else 0
    hour = datetimes.hour
    minute = datetimes.minute
    second = datetimes.second

    # 3. Number of images.
    num_img = len(soup.find_all('img'))

    # 4. Number of videos (embedded iframes).
    num_video = len(soup.find_all('iframe'))

    # 5. Author name: prefer the link text inside the author span.
    author = None
    article_info = soup.find('div', class_='article-info')
    if article_info is not None:
        author = (article_info.find('span', class_='author_name')
                  or article_info.find('span', class_='byline basic'))
    if author is None:
        author_name = 'not found'
    else:
        author_link = author.find('a')
        author_name = (author_link if author_link is not None else author).get_text().lower()

    # 6. Topics: the footer text looks like "<label>: topic, topic, ...".
    topic = 'not found'
    footer = soup.find('footer', class_='article-topics')
    if footer is not None:
        footer_parts = footer.get_text().split(': ')
        if len(footer_parts) > 1:
            topic = footer_parts[1]

    # 7. Channel (data-channel attribute of <article>).
    article = soup.find('article')
    if article is not None and article.has_attr('data-channel'):
        channel = article['data-channel'].strip().lower()
    else:
        channel = 'not found'

    # 8. Content length in characters of the article body text.
    body = soup.body
    section = body.find('section', class_='article-content') if body is not None else None
    len_content = len(section.get_text()) if section is not None else 0

    return title, author_name, channel, topic, year, month, date, day, is_weekend, hour, minute, second, num_img, num_video, len_content

In [11]:
# Features for the training rows come first, followed by the test rows, so
# the combined frame can later be split back by position.
feature_train_list = []
feature_test_list = []

for source_frame in (train_data, test_data):
    for content in source_frame['Page content']:
        feature_train_list.append(preprocessor(content))

# Column names, in the same order as the tuple returned by preprocessor().
feature_columns = [
    'title', 'author_name', 'channel', 'topic',
    'year', 'month', 'date', 'day', 'is_weekend',
    'hour', 'minute', 'second',
    'num_img', 'num_video', 'len_content',
]
df_all = pd.DataFrame(feature_train_list, columns=feature_columns)


### 3-1 . Preprocessing - tokenization

To split the text corpora into individual elements

In [12]:
import re

def tokenizer(text):
    """Split ``text`` into tokens on runs of whitespace.

    Leading/trailing whitespace is ignored. Blank input yields an empty list
    rather than a single empty-string token.
    """
    stripped = text.strip()
    if not stripped:
        return []
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    return re.split(r'\s+', stripped)

### 3-2 . Preprocessing - Word Stemming

There are two ways of word stemming

1. PorterStemmer (stemming): breaks words down in a rule-based way, which can lead to the problem of over-stemming

2. WordNetLemmatizer (lemmatization): reduces words to their dictionary form, which gives better results but is more time-consuming


In [13]:
import nltk
from nltk.stem import WordNetLemmatizer

# NOTE(review): machine-specific absolute path; consider making it configurable.
nltk.data.path.append('/home/mfhsieh/NTHU-Deep-Learning-Competition/2023_DataLab_Cup1-Predicting_News_Popularity')

# Corpora needed by WordNetLemmatizer.
for corpus_name in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

def word_stemming(text):
    """Tokenize ``text`` on whitespace and lemmatize each token with WordNet.

    Parameters
    ----------
    text : str or numpy.ndarray
        A string, or an array whose first element is the string (the form in
        which a single-column selection arrives from the column transformer).

    Returns
    -------
    list of str
        Lemmatized tokens; empty for blank input.
    """
    if isinstance(text, np.ndarray):
        text = text[0]
    stripped = text.strip()
    if not stripped:
        return []
    lm = WordNetLemmatizer()
    # Split on whitespace *runs* so repeated spaces/newlines do not produce
    # empty-string tokens that would end up in the vocabulary.
    return [lm.lemmatize(word) for word in re.split(r'\s+', stripped)]

print(word_stemming('university, universal, universities'))

[nltk_data] Downloading package wordnet to /home/mfhsieh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mfhsieh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['university,', 'universal,', 'university']


In [14]:
from nltk.stem.porter import PorterStemmer

def tokenizer_stem(text):
    """Split ``text`` on whitespace runs and Porter-stem every token.

    Blank input yields an empty list instead of a single empty token.
    """
    stripped = text.strip()
    if not stripped:
        return []
    porter = PorterStemmer()
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    return [porter.stem(word) for word in re.split(r'\s+', stripped)]

print(tokenizer_stem('university, universal, universities'))

['university,', 'universal,', 'univers']


### 3-3 Preprocessing - Stop-Word Removal

儘管刪除停用詞在某些情況下（例如 BoW 和特徵哈希）可以有益於簡化表示，並可能提高文字分析的準確性，但並不總是必要，特別是在使用 TF-IDF 時。是否刪除停用詞應基於文本分析任務的具體要求以及資料集的特性來進行決策

In [15]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Download NLTK's stop-word corpus and keep the English list in `stop`.
nltk.download('stopwords')
stop = stopwords.words('english')

def stop_word_removal(text):
    """Tokenize ``text``, drop stop words and non-alphabetic tokens, then stem.

    A token is kept when it is not in ``stop`` (case-sensitive comparison)
    and starts with an ASCII letter; kept tokens are Porter-stemmed.
    """
    porter = PorterStemmer()
    # Raw strings avoid the invalid '\s' escape of a normal literal.
    return [
        porter.stem(w)
        for w in re.split(r'\s+', text.strip())
        if w not in stop and re.match(r'[a-zA-Z]+', w)
    ]

print(stop_word_removal('runners like running and thus they run'))

['runner', 'like', 'run', 'thu', 'run']


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mfhsieh/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### 3-4 Preprocessing - Word Stemming + Stop-Word Removal

In [16]:
def stem_word_remove_stopword(text):
    """Lemmatize ``text`` with ``word_stemming`` and drop stop words.

    Parameters
    ----------
    text : str or numpy.ndarray
        Same input forms as ``word_stemming``.

    Returns
    -------
    list of str
        Lemmatized tokens that are not in the ``stop`` list. The stop-word
        check runs after lemmatization and is case-sensitive.
    """
    # Reuse the shared tokenizer/lemmatizer instead of duplicating its body.
    return [word for word in word_stemming(text) if word not in stop]


#### 3-5 Create TF-IDF feature representation ([ref](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html))

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

# Settings shared by both text columns: custom lemmatizing tokenizer,
# unigrams only, no lower-casing by the vectorizer.
_vectorizer_kwargs = dict(
    tokenizer=word_stemming,
    token_pattern=None,
    ngram_range=(1, 1),
    lowercase=False,
)

# Column 0 and column 1 of the feature matrix are turned into TF-IDF
# features; every other column is passed through unchanged.
text_transformer = ColumnTransformer(
    transformers=[
        ('author name process', TfidfVectorizer(**_vectorizer_kwargs), [0]),
        ('topic name process', TfidfVectorizer(**_vectorizer_kwargs), [1]),
    ],
    remainder='passthrough',
    n_jobs=-1,
)


In [18]:
# Fit a TF-IDF model on the titles to inspect the most common vocabulary.
# token_pattern=None matches the other vectorizers in this notebook and
# avoids the "token_pattern will not be used" warning with a custom tokenizer.
tfidf = TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,1), lowercase=False)
tfidf.fit(df_all['title'])
top = 10

# Look the vocabulary up once instead of on every loop iteration.
feature_names = tfidf.get_feature_names_out()

# get idf score of vocabularies
idf = tfidf.idf_
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()

for i in sorted_idx[:top]:
    print('%s: %.2f' %(feature_names[i], idf[i]))

# Keep the document-term matrix sparse: summing columns does not need a dense
# (n_documents x n_vocabulary) array. Sums may differ from the dense version
# in the last floating-point digits only.
doc_tfidf = tfidf.transform(df_all['title'])
tfidf_sum = np.asarray(doc_tfidf.sum(axis=0)).ravel()
print("\n[vocabularies with highest tf-idf scores]")
for i in tfidf_sum.argsort()[::-1][:top]:
    print('{}: {}'.format(feature_names[i], tfidf_sum[i]))


[vocabularies with smallest idf scores]
to: 2.52
the: 2.58
in: 2.96
a: 3.03
of: 3.07
for: 3.10
and: 3.44
is: 3.51
on: 3.54
your: 3.60

[vocabularies with highest tf-idf scores]
the: 1142.4830180213792
to: 1109.6698985304176
a: 795.7866740412087
in: 787.5419357082401
of: 746.7665375362841
for: 734.8807138633431
and: 555.5640584744767
your: 551.5848064261177
is: 544.621163447621
you: 533.4917456688497


### 4. Model training

- XGBoost

- LightGBM

- CatBoost

- AdaBoost

- Random Forest

- VotingClassifier


#### - To split the dataset 

In [19]:
# Columns fed to the models. The two text columns come first because the
# column transformer addresses them by position (0 and 1).
# Not used: title, channel, minute, second, num_img.
selected_columns = [
    'author_name',
    'topic',
    'year',
    'month',
    'date',
    'day',
    'is_weekend',
    'hour',
    'num_video',
    'len_content',
]
df = df_all.loc[:, selected_columns]
df.head()

Unnamed: 0,author_name,topic,year,month,date,day,is_weekend,hour,num_video,len_content
0,clara moskowitz,"Asteroid, Asteroids, challenge, Earth, Space, ...",2013,6,19,3,0,15,0,3591
1,christina warren,"Apps and Software, Google, open source, opn pl...",2013,3,28,4,0,17,0,1843
2,sam laird,"Entertainment, NFL, NFL Draft, Sports, Televis...",2014,5,7,3,0,19,25,6646
3,sam laird,"Sports, Video, Videos, Watercooler",2013,10,11,5,0,2,21,1821
4,connor finnegan,"Entertainment, instagram, instagram video, NFL...",2014,4,17,4,0,3,1,8919


In [20]:
from sklearn.model_selection import train_test_split

# df holds the training rows first, then the test rows.
n_train = train_data.shape[0]
X_train = df.values[:n_train]
X_test = df.values[n_train:]

# Map the labels from {-1, 1} to {0, 1} on a fresh array. Assigning through
# `.values` would modify train_data itself and fails when pandas hands out a
# read-only array (copy-on-write).
popularity = train_data['Popularity'].to_numpy()
y_train = np.where(popularity == -1, 0, popularity)

print(X_train.shape)
print(type(X_train))
print(X_train)
print(y_train.shape)
print(type(y_train))
print(y_train)

# Fixed 70/30 hold-out split used by training() for its second pair of scores.
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train, y_train, test_size=0.3, random_state=0)

(27643, 10)
<class 'numpy.ndarray'>
[['clara moskowitz'
  'Asteroid, Asteroids, challenge, Earth, Space, U.S., World ' 2013 ...
  15 0 3591]
 ['christina warren'
  'Apps and Software, Google, open source, opn pledge, patent lawsuit theater, software patents, Tech, U.S. '
  2013 ... 17 0 1843]
 ['sam laird' 'Entertainment, NFL, NFL Draft, Sports, Television ' 2014
  ... 19 25 6646]
 ...
 ['christine erickson' 'Food, hot dogs, humor, Photography, Watercooler '
  2014 ... 12 0 1274]
 ['seth fiegerman' 'Business, marissa mayer, Media, stocks, Yahoo ' 2013
  ... 20 0 2657]
 ['megan ranney' 'austin, Business, CurioCity, Small Business, Startups '
  2014 ... 18 0 3027]]
(27643,)
<class 'numpy.ndarray'>
[0 1 1 ... 0 0 1]


#### - To construct the training function

In [21]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score

def training(clf):
    """Cross-validate ``clf``, then fit it on the hold-out training split.

    Prints the mean and standard deviation of the cross-validated ROC AUC on
    the full training data, then fits ``clf`` on ``X_train_split`` and prints
    its ROC AUC on the training and validation splits.

    Returns
    -------
    tuple
        ``(clf, train_score, valid_score)`` where ``clf`` is the estimator
        fitted on the training split and the two scores are the mean
        cross-validated train/validation ROC AUC.
    """
    cv_result = cross_validate(
        clf, X_train, y_train,
        scoring='roc_auc',
        return_train_score=True,
        return_estimator=True,
    )
    cv_train_scores = cv_result['train_score']
    cv_valid_scores = cv_result['test_score']

    print('train score: {:.5f} (+/-{:.5f})'.format(
        np.mean(cv_train_scores), np.std(cv_train_scores)))
    print('valid score: {:.5f} (+/-{:.5f})'.format(
        np.mean(cv_valid_scores), np.std(cv_valid_scores)))

    train_score = np.mean(cv_train_scores)
    valid_score = np.mean(cv_valid_scores)

    # Second pass: a single fit on the 70/30 hold-out split.
    clf.fit(X_train_split, y_train_split)
    holdout_train_auc = roc_auc_score(
        y_train_split, clf.predict_proba(X_train_split)[:, 1])
    print('train score: {:.5f}'.format(holdout_train_auc))
    holdout_valid_auc = roc_auc_score(
        y_valid_split, clf.predict_proba(X_valid_split)[:, 1])
    print('valid score: {:.5f}'.format(holdout_valid_auc))

    return clf, train_score, valid_score

#### - To construct the grid search function

In [23]:
from sklearn.model_selection import GridSearchCV

def grid_search_cv(ct, clf, param_grid):
    """Grid-search ``clf`` over ``param_grid`` with 5-fold ROC-AUC scoring.

    Parameters
    ----------
    ct : transformer
        Fitted on, and applied to, ``X_train`` before the search starts.
    clf : estimator
        Estimator to tune on the transformed features.
    param_grid : dict
        Parameter grid passed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_params, best_estimator)`` from the fitted search.
    """
    # NOTE(review): the transformer is fitted on all of X_train up front, so
    # each validation fold has already been seen by it.
    transformed_features = ct.fit_transform(X_train)

    search = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        scoring='roc_auc',
        n_jobs=-1,
        cv=5,
        return_train_score=True,
    )
    search.fit(transformed_features, y_train)

    cv_table = search.cv_results_
    best = search.best_index_
    print('train score: {:.5f} (+/-{:.5f})'.format(cv_table['mean_train_score'][best], cv_table['std_train_score'][best]))
    print('valid score: {:.5f} (+/-{:.5f})'.format(cv_table['mean_test_score'][best], cv_table['std_test_score'][best]))
    print('best params:', search.best_params_)
    return search.best_params_, search.best_estimator_

### 4-1. XGBOOST

4-1-1. Grid search for XGBoost

In [28]:
from xgboost import XGBClassifier

In [17]:
# Hyper-parameter grid searched for XGBoost (5 * 4 * 5 * 5 * 3 = 1500 combinations).
param_grid_xgb = {
    'gamma' : [0, 0.5, 1, 1.5, 2],
    'lambda' : [1.5, 2, 2.5, 3],
    'n_estimators': [100, 120, 140, 160, 180],
    'max_depth': [6, 8, 10, 12, 14],
    'learning_rate' : [0.14, 0.15, 0.16]  
}

# Search on the features produced by text_transformer; keeps the best parameters and the refitted estimator.
best_xgb_param, best_xgb = grid_search_cv(text_transformer, XGBClassifier(n_jobs=-1), param_grid_xgb)


train score: 0.81088 (+/-0.00377)
valid score: 0.59139 (+/-0.00843)
best params: {'gamma': 1, 'lambda': 2.5, 'learning_rate': 0.14, 'max_depth': 8, 'n_estimators': 100}


4-1-2. Training for XGBoost

In [31]:
from sklearn.pipeline import Pipeline

# Best XGBoost hyper-parameters reported by the grid search above.
# (Previously stored under the LightGBM grid's variable name.)
params_xgb = {
    'gamma' : 1,
    'lambda' : 2.5,
    'n_estimators': 100,
    'max_depth': 8,
    'learning_rate' : 0.14,
    'n_jobs' : -1
}

# '**' unpacks the dict so its key/value pairs are passed as keyword arguments.
xgboost = Pipeline([('vect', text_transformer),
                    ('clf', XGBClassifier(**params_xgb))])

training(xgboost)

train score: 0.81234 (+/-0.00367)
valid score: 0.58892 (+/-0.00724)
train score: 0.82065
valid score: 0.58606


(Pipeline(steps=[('vect',
                  ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                    transformers=[('author name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f1cf51079c0>),
                                                   [0]),
                                                  ('topic name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f1cf51079c0>),
                                                   [1])])),
                 ('...
                             

### 4-2. LightGBM

4-2-1. Grid search for LightGBM

In [27]:
from lightgbm import LGBMClassifier

In [19]:
# Hyper-parameter grid searched for LightGBM (13 * 19 * 4 * 3 = 2964 combinations).
# NOTE(review): the 'objective' values look like regression objectives being
# searched for a classifier; confirm this is intended.
param_grid_lgbm = {
    'learning_rate' : [0.005, 0.006, 0.007, 0.008, 0.009 , 0.0095, 0.0098, 0.01, 0.011, 0.012, 0.013, 0.014, 0.015], 
    'n_estimators' : [200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380],
    'min_sum_hessian_in_leaf': [0.001, 0.01, 0.1, 1],
    'objective' : ['regression', 'regression_l1', 'poisson']
}

# Search on the features produced by text_transformer; keeps the best parameters and the refitted estimator.
best_lgbm_param, best_lgbm = grid_search_cv(text_transformer, LGBMClassifier(n_jobs=-1), param_grid_lgbm)
































train score: 0.66707 (+/-0.00226)
valid score: 0.59813 (+/-0.00820)
best params: {'learning_rate': 0.014, 'min_sum_hessian_in_leaf': 0.001, 'n_estimators': 240, 'objective': 'poisson'}


4-2-2. Training for LightGBM

In [32]:
# Best LightGBM settings from the grid search, plus a fixed seed.
params_LGBM = dict(
    random_state=0,
    learning_rate=0.014,
    min_sum_hessian_in_leaf=0.001,
    n_estimators=240,
    n_jobs=-1,
    objective='poisson',
)

# Text vectorization followed by the LightGBM classifier.
lgbm = Pipeline(steps=[
    ('vect', text_transformer),
    ('clf', LGBMClassifier(**params_LGBM)),
])

training(lgbm)

train score: 0.66729 (+/-0.00243)
valid score: 0.59753 (+/-0.00710)
train score: 0.67436
valid score: 0.59156


(Pipeline(steps=[('vect',
                  ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                    transformers=[('author name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f1cf51079c0>),
                                                   [0]),
                                                  ('topic name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f1cf51079c0>),
                                                   [1])])),
                 ('clf',
                  LGBMClass

### 4-3. CatBoost

In [29]:
from catboost import CatBoostClassifier

In [None]:
# Hyper-parameter grid for CatBoost (9 * 12 * 14 = 1512 combinations).
# NOTE(review): this cell has no execution count or output, so the search does not appear to have been run.
param_grid_catboost = {
    'learning_rate' : [0.001, 0.005, 0.01, 0.05, 0.06 , 0.09, 0.1, 0.15, 0.2], 
    'n_estimators' : [300, 350, 400, 450, 500, 550, 600,650,700,800,900,1000],
    'depth': [3,4,5,6,7,8,9,10,11,12,13,14,15,16],
}

# NOTE(review): unlike the training cell below, verbose=False is not set here.
best_cat_params, best_estimator_ = grid_search_cv(text_transformer, CatBoostClassifier( eval_metric='AUC',random_state=0 ), param_grid_catboost)

In [38]:
# Text vectorization followed by a CatBoost classifier with fixed settings.
catboost = Pipeline(steps=[
    ('ct', text_transformer),
    ('clf', CatBoostClassifier(
        verbose=False,
        eval_metric='AUC',
        n_estimators=300,
        depth=10,
        learning_rate=0.06,
        random_state=0,
    )),
])
training(catboost)

train score: 0.79647 (+/-0.00167)
valid score: 0.59347 (+/-0.01051)
train score: 0.81273
valid score: 0.58901


(Pipeline(steps=[('ct',
                  ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                    transformers=[('author name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f1cf51079c0>),
                                                   [0]),
                                                  ('topic name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f1cf51079c0>),
                                                   [1])])),
                 ('clf',
                  <catboost.c

### 4-4. AdaBoost

In [21]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Text vectorization followed by AdaBoost over depth-4 decision trees.
adaBoost = Pipeline(steps=[
    ('vect', text_transformer),
    ('clf', AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=4),
        learning_rate=0.005,
        n_estimators=900,
    )),
])
training(adaBoost)

train score: 0.67757 (+/-0.00542)
valid score: 0.58832 (+/-0.01050)
train score: 0.68843
valid score: 0.57826


### 4-5. Random Forest

### 4-6. VotingClassifier

4-6-1. Grid search for Voting classifier

In [40]:
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import FunctionTransformer
from itertools import combinations_with_replacement, product

num_classifier = 3

# Candidate weight values 0.0, 0.1, ..., 1.0.
params_list = [i / 10 for i in range(0, 11)]

# Enumerate every ordered weight tuple whose entries sum to 1.
# - combinations_with_replacement only yields non-decreasing tuples, so a
#   weighting such as (0.9, 0.1, 0.0) was never tried.
# - Soft voting averages with normalized weights, so only the ratio between
#   weights matters; fixing the sum removes rescaled duplicates and the
#   all-zero tuple, keeping the grid small.
# The sum is checked on integer indices to avoid floating-point comparison.
weight_total = len(params_list) - 1
weight_list = [
    tuple(params_list[k] for k in combo)
    for combo in product(range(len(params_list)), repeat=num_classifier)
    if sum(combo) == weight_total
]

param_grid_voting = {
    'weights' : weight_list,
}

voting_candidate = VotingClassifier(
    estimators=[('xgboost', xgboost), ('lgbm', lgbm), ('catboost', catboost)],
    n_jobs=-1,
    voting='soft',
    flatten_transform=True,
    verbose=True,
)

# Each estimator is already a pipeline that starts with text_transformer, so
# the search must see the raw feature matrix. An identity FunctionTransformer
# stops grid_search_cv from vectorizing the data a second time.
best_voting_param, best_voting = grid_search_cv(FunctionTransformer(), voting_candidate, param_grid_voting)

KeyboardInterrupt: 

In [22]:

# Soft-voting ensemble of the XGBoost and LightGBM pipelines.
params_voting = {
    'estimators' : [('xgboost', xgboost), ('lgbm', lgbm)],
    'voting' : 'soft',
    'weights' : [0.1, 0.9],
}

# training() returns (fitted estimator, mean CV train AUC, mean CV valid AUC).
# Unpack it so `voting` is the estimator itself and can be used for prediction.
voting, voting_train_score, voting_valid_score = training(VotingClassifier(**params_voting))

train score: 0.69838 (+/-0.00254)
valid score: 0.59840 (+/-0.00770)
train score: 0.70515
valid score: 0.59241


### 5. Testing data prediction

In [23]:
from pathlib import Path

# training() returns a (estimator, train_score, valid_score) tuple; accept
# either that tuple or a bare fitted estimator.
best_model = voting[0] if isinstance(voting, tuple) else voting

# Probability of the positive class for every test row.
y_score = best_model.predict_proba(X_test)[:, 1]

df_pred = pd.DataFrame({'Id': test_data['Id'], 'Popularity': y_score})

# Create the output directory if needed so the export cannot fail on a fresh checkout.
output_path = Path('../output/test_pred.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_pred.to_csv(output_path, index=False)