## <center>DataLab Cup 1: Text Feature Engineering</center>

#### - To import the libraries

In [1]:
import pandas as pd
import numpy as np
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings; consider narrowing the filter.
warnings.filterwarnings("ignore")

#### 1. To load the datasets

In [2]:
# Read the training CSV and the test CSV, in that order, into two DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../dataset/train.csv', '../dataset/test.csv')
)


In [3]:
# Report the test-set dimensions, then preview the first rows of each frame.
print(test_data.shape)
for frame in (train_data, test_data):
    display(frame.head())

(11847, 2)


Unnamed: 0,Id,Popularity,Page content
0,0,-1,"<html><head><div class=""article-info""> <span c..."
1,1,1,"<html><head><div class=""article-info""><span cl..."
2,2,1,"<html><head><div class=""article-info""><span cl..."
3,3,-1,"<html><head><div class=""article-info""><span cl..."
4,4,-1,"<html><head><div class=""article-info""><span cl..."


Unnamed: 0,Id,Page content
0,27643,"<html><head><div class=""article-info""><span cl..."
1,27644,"<html><head><div class=""article-info""><span cl..."
2,27645,"<html><head><div class=""article-info""><span cl..."
3,27646,"<html><head><div class=""article-info""><span cl..."
4,27647,"<html><head><div class=""article-info""><span cl..."


#### 2. To extract the features from the dataset

將一些我們想要用到的feature從dataset中提取出來。以下為提取的特徵:

- title
- time(year/month/day/hour/minute/second)
- number of images (num_img)
- number of videos (num_video)
- author name
- topic
- channel
- length of content 

In [4]:
from bs4 import BeautifulSoup
from datetime import datetime
# to get the attribute of the 'title', 'year/month/date/day/hour/minute/second/is_weekend', 'num_img', 'num_video', 'author name', 'topic', 'channel', 'content length'

def preprocessor(text):
    """Extract hand-crafted features from the raw HTML of one article.

    Parameters
    ----------
    text : str
        HTML source of a single article page.

    Returns
    -------
    tuple
        ``(title, author_name, channel, topic, year, month, date, day,
        is_weekend, hour, minute, second, num_img, num_video, len_content)``
        where ``day`` is the ISO weekday (1 = Monday ... 7 = Sunday),
        ``is_weekend`` is 1 for Saturday/Sunday and 0 otherwise, and
        ``len_content`` is the character count of the article-content text.

    Raises
    ------
    AttributeError, TypeError, IndexError
        If the page lacks the ``<h1>``, ``div.article-info``,
        ``footer.article-topics``, ``<article>`` or
        ``section.article-content`` elements this function reads, or if the
        footer text has no ``': '`` separator.
    """
    soup = BeautifulSoup(text, 'html.parser')

    # 1. title: text of the first <h1>, trimmed and lower-cased
    title = soup.find('h1').string.strip().lower()

    # 2. publication time: the 'datetime' attribute of the first <time> tag
    time_tag = soup.find('time')
    try:
        date_string = time_tag['datetime']
    except (TypeError, KeyError):
        # TypeError: there is no <time> tag (None is not subscriptable).
        # KeyError: the tag exists but has no 'datetime' attribute.
        # Either way fall back to the fixed placeholder timestamp.
        date_string = 'wed, 10 oct 2014 15:00:43 +0000'

    date_string = date_string.strip().lower()
    datetimes = datetime.strptime(date_string, '%a, %d %b %Y %H:%M:%S %z')

    year = datetimes.year
    month = datetimes.month
    date = datetimes.day
    # ISO weekday of the calendar date as written in the timestamp (Monday = 1).
    day = datetimes.isoweekday()
    is_weekend = 1 if day in (6, 7) else 0
    hour = datetimes.hour
    minute = datetimes.minute
    second = datetimes.second

    # 3. number of images
    num_img = len(soup.find_all('img'))

    # 4. number of videos, counted as <iframe> elements
    num_video = len(soup.find_all('iframe'))

    # 5. author name: prefer span.author_name, otherwise span."byline basic"
    article_info = soup.find('div', class_='article-info')
    author = article_info.find('span', class_='author_name') or article_info.find('span', class_='byline basic')

    if author is not None:
        # When the span wraps a link, the link text is the name.
        author_link = author.find('a')
        name_node = author_link if author_link is not None else author
        author_name = name_node.get_text().lower()
    else:
        author_name = 'not found'

    # 6. article topics: whatever follows the first ': ' in the footer text
    footer = soup.find('footer', class_='article-topics')
    topic = footer.get_text().split(': ')[1]

    # 7. channel: 'data-channel' attribute of the <article> element
    channel = soup.find('article')['data-channel'].strip().lower()

    # 8. content length in characters
    content = soup.body.find('section', class_='article-content').get_text()
    len_content = len(content)

    return title, author_name, channel, topic, year, month, date, day, is_weekend, hour, minute, second, num_img, num_video, len_content

In [5]:
# Run the extractor over every page: all training rows first, then all test
# rows, so the leading len(train_data) rows of df_all are the training set.
feature_list = [
    preprocessor(page)
    for pages in (train_data['Page content'], test_data['Page content'])
    for page in pages
]

df_all = pd.DataFrame(
    data=feature_list,
    columns=[
        'title', 'author_name', 'channel', 'topic',
        'year', 'month', 'date', 'day', 'is_weekend',
        'hour', 'minute', 'second',
        'num_img', 'num_video', 'len_content',
    ],
)

In [24]:
# Preview the first rows of the extracted feature table.
df_all.head()

Unnamed: 0,title,author_name,channel,topic,year,month,date,day,is_weekend,hour,minute,second,num_img,num_video,len_content
0,nasa's grand challenge: stop asteroids from de...,clara moskowitz,world,"Asteroid, Asteroids, challenge, Earth, Space, ...",2013,6,19,3,0,15,4,30,1,0,3591
1,google's new open source patent pledge: we won...,christina warren,tech,"Apps and Software, Google, open source, opn pl...",2013,3,28,4,0,17,40,55,2,0,1843
2,ballin': 2014 nfl draft picks get to choose th...,sam laird,entertainment,"Entertainment, NFL, NFL Draft, Sports, Televis...",2014,5,7,3,0,19,15,20,2,25,6646
3,cameraperson fails deliver slapstick laughs,sam laird,watercooler,"Sports, Video, Videos, Watercooler",2013,10,11,5,0,2,26,50,1,21,1821
4,nfl star helps young fan prove friendship with...,connor finnegan,entertainment,"Entertainment, instagram, instagram video, NFL...",2014,4,17,4,0,3,31,43,52,1,8919


### 3-1 . Preprocessing - tokenization

To split the text corpora into individual elements

In [6]:
import re

def tokenizer(text):
    """Split ``text`` into tokens on runs of whitespace.

    Leading and trailing whitespace is removed first. An empty or
    all-whitespace input yields ``['']``.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    return re.split(r'\s+', text.strip())

### 3-2 . Preprocessing - Word Stemming

There are two ways of word stemming

1. PorterStemmer (stemming): breaks the word in a rule-based way, which can lead to the problem of overstemming

2. WordNetLemmatizer (lemmatization): stems the words with better performance, but is more time-consuming


In [7]:
import nltk
from nltk.stem import WordNetLemmatizer

# nltk.download('wordnet')
# nltk.download('omw-1.4')

def word_stemming(text):
    """Lemmatize every whitespace-separated token of ``text`` with WordNet.

    Parameters
    ----------
    text : str or numpy.ndarray
        The document. When an array is passed, its first element is used as
        the document (presumably how a ColumnTransformer column selection
        such as ``[0]`` delivers each row; verify against the caller).

    Returns
    -------
    list of str
        The lemmatized tokens, in order. Punctuation attached to a token is
        kept as part of it.
    """
    if isinstance(text, np.ndarray):
        text = text[0]
    lm = WordNetLemmatizer()
    # Split on *runs* of whitespace (raw string avoids the invalid '\s' escape)
    # so repeated spaces do not produce empty-string tokens, consistent with
    # the other tokenizers in this notebook.
    words = re.split(r'\s+', text.strip())
    return [lm.lemmatize(word) for word in words]

print(word_stemming('university, universal, universities'))

['university,', 'universal,', 'university']


In [8]:
from nltk.stem.porter import PorterStemmer

def tokenizer_stem(text):
    """Split ``text`` on runs of whitespace and Porter-stem each token.

    Returns the list of stemmed tokens in their original order.
    """
    porter = PorterStemmer()
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    return [porter.stem(word) for word in re.split(r'\s+', text.strip())]

print(tokenizer_stem('university, universal, universities'))

['university,', 'universal,', 'univers']


### 3-3 Preprocessing - Stop-Word Removal

儘管刪除停用詞在某些情況下（例如 BoW 和特徵哈希）可以有益於簡化表示，並可能提高文字分析的準確性，但並不總是必要，特別是在使用 TF-IDF 時。是否刪除停用詞應基於文本分析任務的具體要求以及資料集的特性來進行決策

In [9]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# English stop-word list consulted by stop_word_removal.
stop = stopwords.words('english')

def stop_word_removal(text):
    """Tokenize ``text``, drop stop words, and Porter-stem what remains.

    A token is kept only if it is not in the module-level ``stop`` list
    (compared verbatim, with no case folding) and it begins with at least
    one ASCII letter. Surviving tokens are returned stemmed, in order.
    """
    porter = PorterStemmer()
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    return [porter.stem(w) for w in re.split(r'\s+', text.strip())
            if w not in stop and re.match('[a-zA-Z]+', w)]

print(stop_word_removal('runners like running and thus they run'))

['runner', 'like', 'run', 'thu', 'run']


[nltk_data] Downloading package stopwords to
[nltk_data]     /users/student/mr111//jhliu22/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### 3-4 Create TF-IDF feature representation ([ref](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html))

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

def _make_author_topic_transformer():
    """Build an unfitted ColumnTransformer for the author and topic columns.

    Column 0 and column 1 (named as the author-name and topic steps) are each
    encoded by their own TF-IDF vectorizer that tokenizes with
    ``word_stemming``, uses unigrams only, and does not lower-case. All other
    columns are passed through unchanged.
    """
    def _lemma_tfidf():
        return TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,1), lowercase=False)

    return ColumnTransformer(
        [
            ('author name process', _lemma_tfidf(), [0]),
            ('topic name process', _lemma_tfidf(), [1]),
        ],
        remainder='passthrough',  # do not touch the remaining data
        n_jobs=-1,
    )


# Two independent, identically configured instances: one is fitted directly by
# the grid-search helper, the other is used inside the CatBoost pipelines.
text_transformer = _make_author_topic_transformer()
catboost_transformer = _make_author_topic_transformer()

In [11]:
# Fit a TF-IDF model on the titles and list the most common / heaviest terms.
# token_pattern=None matches the other vectorizers in this notebook: the
# pattern is unused once a custom tokenizer is supplied.
tfidf = TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,1), lowercase=False)
tfidf.fit(df_all['title'])
top = 10
# Vocabulary in column order; looked up once instead of on every iteration.
vocab = tfidf.get_feature_names_out()

# get idf score of vocabularies
idf = tfidf.idf_
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()

for i in sorted_idx[:top]:
    print('%s: %.2f' %(vocab[i], idf[i]))

# Keep the document-term matrix sparse: summing its columns directly avoids
# materialising a dense (n_documents x n_terms) array.
doc_tfidf = tfidf.transform(df_all['title'])
tfidf_sum = np.asarray(doc_tfidf.sum(axis=0)).ravel()
print("\n[vocabularies with highest tf-idf scores]")
top_idx = tfidf_sum.argsort()[::-1][:top]
for tok, v in zip(vocab[top_idx], tfidf_sum[top_idx]):
    print('{}: {}'.format(tok, v))


[vocabularies with smallest idf scores]
to: 2.52
the: 2.58
in: 2.96
a: 3.03
of: 3.07
for: 3.10
and: 3.44
is: 3.51
on: 3.54
your: 3.60

[vocabularies with highest tf-idf scores]
the: 1142.4830180213792
to: 1109.6698985304176
a: 795.7866740412087
in: 787.5419357082401
of: 746.7665375362841
for: 734.8807138633431
and: 555.5640584744767
your: 551.5848064261177
is: 544.621163447621
you: 533.4917456688497


### 4. Model training

- XGBoost

- LightGBM

- CatBoost

#### - To split the dataset 

In [12]:
# Columns handed to the models, in the order the transformers expect
# (author_name first, topic second). Extracted but currently left out:
# title, channel, minute, second, num_img.
selected_columns = [
    'author_name',
    'topic',
    'year',
    'month',
    'date',
    'day',
    'is_weekend',
    'hour',
    'num_video',
    'len_content',
]
df = df_all.loc[:, selected_columns]
df.head()

Unnamed: 0,author_name,topic,year,month,date,day,is_weekend,hour,num_video,len_content
0,clara moskowitz,"Asteroid, Asteroids, challenge, Earth, Space, ...",2013,6,19,3,0,15,0,3591
1,christina warren,"Apps and Software, Google, open source, opn pl...",2013,3,28,4,0,17,0,1843
2,sam laird,"Entertainment, NFL, NFL Draft, Sports, Televis...",2014,5,7,3,0,19,25,6646
3,sam laird,"Sports, Video, Videos, Watercooler",2013,10,11,5,0,2,21,1821
4,connor finnegan,"Entertainment, instagram, instagram video, NFL...",2014,4,17,4,0,3,1,8919


In [13]:
from sklearn.model_selection import train_test_split

# df holds the training rows first, followed by the test rows.
n_train = train_data.shape[0]
X_train = df.values[:n_train]
# Map the -1 label to 0 in a fresh array. Assigning into `.values` in place
# would write through to train_data (and fails on a read-only array under
# pandas Copy-on-Write).
popularity = train_data['Popularity'].to_numpy()
y_train = np.where(popularity == -1, 0, popularity)
X_test = df.values[n_train:]

print(X_train.shape)
print(type(X_train))
print(X_train)
print(y_train.shape)
print(type(y_train))
print(y_train)


# Hold out 20% of the labelled rows; fixed seed for a reproducible split.
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

(27643, 10)
<class 'numpy.ndarray'>
[['clara moskowitz'
  'Asteroid, Asteroids, challenge, Earth, Space, U.S., World ' 2013 ...
  15 0 3591]
 ['christina warren'
  'Apps and Software, Google, open source, opn pledge, patent lawsuit theater, software patents, Tech, U.S. '
  2013 ... 17 0 1843]
 ['sam laird' 'Entertainment, NFL, NFL Draft, Sports, Television ' 2014
  ... 19 25 6646]
 ...
 ['christine erickson' 'Food, hot dogs, humor, Photography, Watercooler '
  2014 ... 12 0 1274]
 ['seth fiegerman' 'Business, marissa mayer, Media, stocks, Yahoo ' 2013
  ... 20 0 2657]
 ['megan ranney' 'austin, Business, CurioCity, Small Business, Startups '
  2014 ... 18 0 3027]]
(27643,)
<class 'numpy.ndarray'>
[0 1 1 ... 0 0 1]


#### - To construct the training function

In [14]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score

def training(clf):
    """Cross-validate ``clf`` and then fit it on the training split.

    Reads the module-level ``X_train``, ``y_train``, ``X_train_split`` and
    ``y_train_split``.

    Parameters
    ----------
    clf : estimator
        Any scikit-learn compatible classifier or pipeline.

    Returns
    -------
    tuple
        ``(clf, mean_train_auc, mean_valid_auc)``: the same ``clf`` object,
        now fitted on ``X_train_split`` / ``y_train_split`` only, plus the
        mean cross-validated ROC AUC on the training and validation folds.

    Notes
    -----
    Prints the mean and standard deviation of both fold scores. The hold-out
    AUCs that were previously computed here were never returned or printed,
    so they are no longer computed, and the per-fold estimators are no longer
    retained.
    """
    clf_cv = cross_validate(clf, X_train, y_train, scoring='roc_auc', return_train_score=True)
    train_scores = clf_cv['train_score']
    valid_scores = clf_cv['test_score']
    mean_train = np.mean(train_scores)
    mean_valid = np.mean(valid_scores)

    print('train score: {:.5f} (+/-{:.5f})'.format(mean_train, np.std(train_scores)))
    print('valid score: {:.5f} (+/-{:.5f})'.format(mean_valid, np.std(valid_scores)))

    # cross_validate works on clones, so clf itself still has to be fitted.
    clf.fit(X_train_split, y_train_split)

    return clf, mean_train, mean_valid

### 4-1. XGBOOST

#### 4-1-1. Grid Sizing for XGBoost


In [15]:
# from xgboost import XGBClassifier

# param_grid_xgb = {
#     'gamma' : [0, 0.5, 1, 1.5, 2],
#     'lambda' : [1.5, 2, 2.5, 3],
#     'n_estimators': [100, 120, 140, 160, 180],
#     'max_depth': [6, 8, 10, 12, 14],
#     'learning_rate' : [0.14, 0.15, 0.16]  
# }

# best_xgb_param, best_xgb = grid_search_cv(text_transformer, XGBClassifier(n_jobs=-1), param_grid_xgb)

#### 4-1-2. Training for XGBoost



In [16]:
# from sklearn.pipeline import Pipeline
# from xgboost import XGBClassifier

# param_grid_lgbm = {
#     'gamma' : 1,
#     'lambda' : 2.5,
#     'n_estimators': 100,
#     'max_depth': 8,
#     'learning_rate' : 0.14,
#     'n_jobs' : -1
# }

# # '**' 是一種解包（unpacking）操作符，它可以用於將字典中的鍵值對以關鍵字參數的方式傳遞給函數或方法
# xgboost = Pipeline([('vect', text_transformer),
#                     ('clf', XGBClassifier(**param_grid_lgbm))])

# training(xgboost)

### 4-3 CatBoost Classifier

#### 4-3-1 Use a simple for-loop to tune the parameters

In [17]:
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier

# catboost = Pipeline([('ct', catboost_transformer),
#                      ('clf', CatBoostClassifier(verbose=False, eval_metric='AUC', n_estimators=290, learning_rate=0.06))])
# catboost, train_error, valid_error = training(catboost)

train_acc_list = []
valid_acc_list = []

# The sweep cross-validates one CatBoost model per (tree_num, tree_depth,
# learning rate) combination, so it is opt-in. The previous loop header,
# range(100, 0, 100), was an empty range: nothing was trained, yet the log
# file was still opened with 'w' and therefore truncated on every run.
# NOTE(review): the 100..1000 range mirrors tree_num_list in the plotting
# cell; confirm it is the intended sweep before enabling.
RUN_CATBOOST_SWEEP = False

if RUN_CATBOOST_SWEEP:
    with open('../catboost_log/tree_num_tree_depth_vs_lr.log','w') as file:
        for tree_num in range(100, 1100, 100):
            for tree_depth in range(3, 11): # depth from 3 to 10 (8)
                for i in range(1,11): # lr from 0.01 to 0.10 (10)
                    learning_rate = i*0.01
                    print ("tree_num: ", tree_num, "tree_depth :", tree_depth, " learning rate: ", learning_rate)
                    catboost = Pipeline([('ct', catboost_transformer),
                                        ('clf', CatBoostClassifier(verbose=False, eval_metric='AUC', n_estimators=tree_num, depth = tree_depth, learning_rate=learning_rate, random_state = 0))])
                    # training() returns mean cross-validated ROC AUC values.
                    catboost, train_acc, valid_acc = training(catboost)
                    train_acc_list.append(train_acc)
                    valid_acc_list.append(valid_acc)
                    write_str = "tree_num: " + str(tree_num) + " tree_depth:"+ str(tree_depth) + " learning_rate: " +  str(learning_rate) + " train_acc: " + str(train_acc) + " valid_acc: " + str(valid_acc) + '\n'
                    file.write (write_str)
                    # Flush after each configuration so partial results survive an interruption.
                    file.flush()


#### 4-3-2 Use matplotlib to plot the results

In [18]:
# import matplotlib.pyplot as plt


# train_acc_list = []
# valid_acc_list = []

# with open ('../catboost_log/tree_num_tree_depth_vs_lr.log','r') as file:
#     for line in file:
#         if 'train_acc' in line and 'valid_acc' in line:
#             train_acc = float(line.split('train_acc: ')[1].split(' valid_acc: ')[0])
#             valid_acc = float(line.split('valid_acc: ')[1])
#             train_acc_list.append(train_acc)
#             valid_acc_list.append(valid_acc)

# print("Train Accuracy List: ", train_acc_list[:5])
# print("Valid Accuracy List: ", valid_acc_list[:5])

# tree_num_list = list(range(100,1100,100))
# tree_depth_list = list(range(3,11))

# plt.figure(figsize=(20,10))
# best_result_list = []
# for i in range(5):
#     tree_num_train_list = train_acc_list[i*80:(i+1)*80] # there are 80 data in each list
#     tree_num_valid_list = valid_acc_list[i*80:(i+1)*80]
#     max_valid_acc = max(tree_num_valid_list)
#     max_index = tree_num_valid_list.index(max_valid_acc) # the index should be 0 ~ 79
#     train_acc = tree_num_train_list[max_index]
#     max_lr = (max_index % 10) * 0.01
#     max_tree_depth  = (max_index-1) // 10 + 3
#     best_result_list.append((max_tree_depth, max_lr, max_valid_acc))
#     print("In tree num: ", str((i+1)*100), ", the best param (tree_depth, max_lr) is: (", max_tree_depth,", " ,max_lr, "), and its train/valid acc = ",train_acc,'/', max_valid_acc)


#     # plt.subplot(5,2,i+1)
#     # # 繪製圖形
#     # plt.plot(tree_depth_list, train_iter_300_list, marker='o', linestyle='--', color='b', label='Train acc')
#     # plt.plot(tree_depth_list, valid_iter_300_list, marker='o', linestyle='--', color='r', label='Valid acc')

#     # # 添加標籤和標題
#     # plt.xlabel('tree_depth')
#     # plt.ylabel('Accuracy')
#     # title_name = 'Training and Validation Accuracy vs tree_depth on tree number = ' + str((i+1)*100)
#     # plt.title(title_name)
#     # plt.legend()

# # 顯示圖形
# plt.show()

#### 4-3-3 Use grid search to select best parameters

In [19]:
from sklearn.model_selection import GridSearchCV

def grid_search_cv(ct, clf, param_grid, cv=5, n_jobs=16):
    """Grid-search ``clf`` over ``param_grid`` on the transformed training set.

    Reads the module-level ``X_train`` and ``y_train``.

    Parameters
    ----------
    ct : transformer
        Fitted in place on ``X_train`` and used to transform it.
    clf : estimator
        Classifier to tune; it receives the transformed features.
    param_grid : dict
        Parameter grid forwarded to ``GridSearchCV``.
    cv : int, default 5
        Number of cross-validation folds.
    n_jobs : int, default 16
        Number of parallel jobs used by the search.

    Returns
    -------
    tuple
        ``(best_params, best_estimator)`` from the search. The estimator
        expects input already transformed by ``ct``.

    Notes
    -----
    Prints the mean and standard deviation of the train and validation ROC
    AUC for the best parameter combination, followed by the parameters.
    NOTE(review): ``ct`` is fitted on all of ``X_train`` before the folds are
    split, so each fold's validation rows influence the fitted transformer.
    """
    X_train_ct = ct.fit_transform(X_train)
    gs = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', n_jobs=n_jobs, cv=cv, return_train_score=True, verbose=2)
    gs.fit(X_train_ct, y_train)
    results, idx = gs.cv_results_, gs.best_index_
    print('train score: {:.5f} (+/-{:.5f})'.format(results['mean_train_score'][idx], results['std_train_score'][idx]))
    print('valid score: {:.5f} (+/-{:.5f})'.format(results['mean_test_score'][idx], results['std_test_score'][idx]))
    print('best params:', gs.best_params_)
    return gs.best_params_, gs.best_estimator_

In [26]:
from catboost import CatBoostClassifier

# Candidate values explored by the CatBoost grid search.
param_grid_catboost = dict(
    learning_rate=[0.001, 0.005, 0.01, 0.05],
    n_estimators=[400, 500, 600],
    depth=[10, 11, 12],
)

# Base classifier shared by every grid point; the seed is fixed.
catboost_base = CatBoostClassifier(verbose=False, eval_metric='AUC', random_state=0)
best_cat_params, best_estimator_ = grid_search_cv(text_transformer, catboost_base, param_grid_catboost)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END ....depth=10, learning_rate=0.001, n_estimators=400; total time= 6.6min
[CV] END ....depth=10, learning_rate=0.001, n_estimators=400; total time= 6.7min
[CV] END ....depth=10, learning_rate=0.005, n_estimators=400; total time= 6.7min
[CV] END ....depth=10, learning_rate=0.001, n_estimators=400; total time= 6.7min
[CV] END ....depth=10, learning_rate=0.001, n_estimators=400; total time= 6.8min
[CV] END ....depth=10, learning_rate=0.001, n_estimators=400; total time= 6.8min
[CV] END ....depth=10, learning_rate=0.001, n_estimators=500; total time= 8.3min
[CV] END ....depth=10, learning_rate=0.001, n_estimators=500; total time= 8.4min
[CV] END ....depth=10, learning_rate=0.001, n_estimators=500; total time= 8.4min
[CV] END ....depth=10, learning_rate=0.001, n_estimators=500; total time= 8.4min
[CV] END ....depth=10, learning_rate=0.001, n_estimators=500; total time= 8.5min
[CV] END ....depth=10, learning_rate=0.001, n_e

### 5. Use specified param to get the y_pred:

In [23]:
# Final model: TF-IDF column transformer followed by CatBoost with the chosen
# hyper-parameters.
final_steps = [
    ('ct', catboost_transformer),
    ('clf', CatBoostClassifier(verbose=False, eval_metric='AUC', n_estimators=500, depth=10, learning_rate=0.01, random_state=0)),
]
# NOTE(review): training() fits the pipeline on X_train_split only, so the
# submitted model has not seen the validation split.
catboost, train_acc, valid_acc = training(Pipeline(final_steps))

# The positive-class probability is the submitted popularity score.
y_score = catboost.predict_proba(X_test)[:, 1]
df_pred = pd.DataFrame({'Id': test_data['Id'], 'Popularity': y_score})
df_pred.to_csv('../output/test_pred.csv', index=False)

train score: 0.68975 (+/-0.00182)
valid score: 0.59867 (+/-0.00856)
