## <center>DataLab Cup 1: Text Feature Engineering</center>

#### - To import the libraries

In [40]:
import pandas as pd
import numpy as np
import warnings
import os

warnings.filterwarnings("ignore")

#### 1. To load the datasets

In [41]:
train_data = pd.read_csv('../dataset/train.csv')
test_data  = pd.read_csv('../dataset/test.csv')


[Voting] ..................... (2 of 3) Processing lgbm, total=  49.6s
[Voting] ..................... (2 of 3) Processing lgbm, total=  50.9s


In [42]:
print(test_data.shape)
train_data.head()

(11847, 2)


Unnamed: 0,Id,Popularity,Page content
0,0,-1,"<html><head><div class=""article-info""> <span c..."
1,1,1,"<html><head><div class=""article-info""><span cl..."
2,2,1,"<html><head><div class=""article-info""><span cl..."
3,3,-1,"<html><head><div class=""article-info""><span cl..."
4,4,-1,"<html><head><div class=""article-info""><span cl..."


#### 2. To extract the features from the dataset

將一些我們想要用到的feature從dataset中提取出來。以下為提取的特徵:

- title
- time(year/month/day/hour/minute/second)
- number of images (num_img)
- number of videos (num_video)
- author name
- topic
- channel
- length of content 

--- 

* beautiful soup
    - conda install -c conda-forge beautifulsoup4
    
<br>

* vadersentiment
    - conda install -c conda-forge vadersentiment

---

In [43]:
from bs4 import BeautifulSoup
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# to get the attribute of the 'title', 'year/month/date/day/hour/minute/second/is_weekend', 'num_img', 'num_video', 'author name', 'topic', 'channel', 'content length', 'title_sentiment'

def preprocessor(text):
    soup = BeautifulSoup(text, 'html.parser')

    # 1. to find the 'title' (body > h1)
    title = soup.find('h1').string.strip().lower()

    # 2. to find time(body > div > span > time)

    
    date_string = soup.find('time')
    try:
        date_string = date_string['datetime']
    except:
        date_string = 'wed, 10 oct 2014 15:00:43 +0000'
        
    date_string = date_string.strip().lower()
    datetimes = datetime.strptime(date_string, '%a, %d %b %Y %H:%M:%S %z')
    
    
    year = datetimes.year
    month = datetimes.month
    date = datetimes.day
    day = pd.Timestamp(str(year)+'-'+str(month)+'-'+str(date)).dayofweek+1
    is_weekend = 1 if (day==6 or day==7) else 0
    hour = datetimes.hour
    minute = datetimes.minute
    second = datetimes.second
    

    # 3. to find the number of images
    num_img  = len(soup.find_all('img'))
    

    # 4. to find the number of videos
    num_video = len(soup.find_all('iframe'))
    

    # 5. to find the author name
    article_info = soup.find('div', class_='article-info')
    author = article_info.find('span', class_='author_name') or article_info.find('span', class_='byline basic')

    if (author != None):
        if (author.find('a') != None):
            author = author.find('a')
            author_name = author.get_text().lower()
        else :
            author_name = author.get_text().lower()
    else :
        author_name = 'not found'
    
    

    # 6. to find the article topic
    footer = soup.find('footer', class_='article-topics')
    topic = footer.get_text().split(': ')[1]
    
    # 7. to find the channel
    channel = soup.find('article')['data-channel'].strip().lower()
    
    '''
    print('title = ', title, type(title))
    print('time = ', year, "/", month, "/",day, " ",hour, ":",minute, ":",second, type(year))
    print('number of images = ', num_img, type(num_img))
    print('number of videos = ', num_video, type(num_video))
    print('author_name = ', author_name, type(author_name))
    print('topic = ', topic, type(topic))
    print('channel = ', channel, type(channel))
    '''
    
    # 8. to find the content length
    content = soup.body.find('section', class_='article-content').get_text()
    len_content = len(content)

    # print('topic = ', topic, type(topic))
    
    # 9. to find the sentiment of title
    analyzer = SentimentIntensityAnalyzer()
    title_sentiment = analyzer.polarity_scores(topic)
    sentiment_neg = title_sentiment['neg']
    sentiment_neu = title_sentiment['neu']
    sentiment_pos = title_sentiment['pos']
    sentiment_compound = title_sentiment['compound']

    return title, author_name, channel, topic, year, month, date, day, is_weekend, hour, minute, second, num_img, num_video, len_content, sentiment_neg, sentiment_neu, sentiment_pos, sentiment_compound

In [44]:
feature_train_list = []
feature_test_list = []

for content in (train_data['Page content']):
    feature_train_list.append(preprocessor(content))
for content in (test_data['Page content']):
    feature_train_list.append(preprocessor(content))

df_all = pd.DataFrame(
        feature_train_list, 
        columns=['title', 'author_name', 'channel', 'topic', 'year', 'month', 'date', 'day', 'is_weekend', 'hour', 'minute', 'second', 'num_img', 'num_video', 'len_content', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos', 'sentiment_compound'])


[Voting] ..................... (2 of 3) Processing lgbm, total= 1.4min
[Voting] ..................... (2 of 3) Processing lgbm, total= 1.4min
[Voting] ..................... (2 of 3) Processing lgbm, total= 1.5min
[Voting] ....................... (3 of 3) Processing RF, total= 1.4min
[2, 2, 2] Finish!!
[Voting] ....................... (3 of 3) Processing RF, total= 1.4min
[2, 2, 1] Finish!!
[Voting] ....................... (3 of 3) Processing RF, total= 1.5min
[Voting] ....................... (3 of 3) Processing RF, total= 1.4min
[Voting] ....................... (3 of 3) Processing RF, total= 1.4min
[Voting] ....................... (3 of 3) Processing RF, total= 1.1min
[Voting] ....................... (3 of 3) Processing RF, total= 1.0min
[Voting] ....................... (3 of 3) Processing RF, total= 1.0min
[Voting] ..................... (2 of 3) Processing lgbm, total= 2.5min
[Voting] ..................... (2 of 3) Processing lgbm, total= 2.5min
[Voting] ....................... (3 of 

Exception in thread Thread-5843 (process_weight):
Traceback (most recent call last):
  File "/users/student/mr111/mfhsieh22/miniconda3/envs/DL/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "/users/student/mr111/mfhsieh22/miniconda3/envs/DL/lib/python3.11/threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_53697/2761700238.py", line 30, in process_weight
  File "/users/student/mr111/mfhsieh22/miniconda3/envs/DL/lib/python3.11/threading.py", line 538, in release
    raise ValueError("Semaphore released too many times")
ValueError: Semaphore released too many times


[Voting] ....................... (3 of 3) Processing RF, total=  28.2s[Voting] .................. (1 of 3) Processing xgboost, total=  31.7s
[Voting] .................. (1 of 3) Processing xgboost, total=  20.5s
[Voting] .................. (1 of 3) Processing xgboost, total=  19.1s
[Voting] .................. (1 of 3) Processing xgboost, total=  23.0s
[Voting] .................. (1 of 3) Processing xgboost, total=  20.5s
[Voting] .................. (1 of 3) Processing xgboost, total=  21.1s
[Voting] .................. (1 of 3) Processing xgboost, total=  25.6s
[Voting] ..................... (2 of 3) Processing lgbm, total=  31.0s
[Voting] ..................... (2 of 3) Processing lgbm, total=  34.4s
[Voting] ..................... (2 of 3) Processing lgbm, total=  38.6s
[Voting] ..................... (2 of 3) Processing lgbm, total=  34.3s
[Voting] .................. (1 of 3) Processing xgboost, total=  25.0s
[Voting] ..................... (2 of 3) Processing lgbm, total=  34.6s
[Voting

Exception in thread Exception in threading.excepthook:
Exception ignored in thread started by: <bound method Thread._bootstrap of <Thread(Thread-5842 (process_weight), stopped 140376190977792)>>
Traceback (most recent call last):
  File "/users/student/mr111/mfhsieh22/miniconda3/envs/DL/lib/python3.11/threading.py", line 995, in _bootstrap
    self._bootstrap_inner()
  File "/users/student/mr111/mfhsieh22/miniconda3/envs/DL/lib/python3.11/threading.py", line 1040, in _bootstrap_inner
    self._invoke_excepthook(self)
  File "/users/student/mr111/mfhsieh22/miniconda3/envs/DL/lib/python3.11/threading.py", line 1352, in invoke_excepthook
    local_print("Exception in threading.excepthook:",
  File "/users/student/mr111/mfhsieh22/miniconda3/envs/DL/lib/python3.11/site-packages/ipykernel/iostream.py", line 575, in flush
    self.pub_thread.schedule(self._flush)
  File "/users/student/mr111/mfhsieh22/miniconda3/envs/DL/lib/python3.11/site-packages/ipykernel/iostream.py", line 267, in schedul

[Voting] ..................... (2 of 3) Processing lgbm, total=  53.0s[Voting] ..................... (2 of 3) Processing lgbm, total=  57.0s
[Voting] ....................... (3 of 3) Processing RF, total=   9.4s
[1, 2, 2] Finish!!


### 3-1 . Preprocessing - tokenization

To split the text corpora into individual elements

In [45]:
import re

def tokenizer(text):
    return re.split('\s+', text.strip())

### 3-2 . Preprocessing - Word Stemming

There are two ways of word stemming

1. PorterStemmer(Stemming): break the word in rule-besed way, which will lead to the probelm of overstemming

2. WordNetLemmatizer(Lemmatization): Stem the words will better performance, while time-consuming


In [46]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.data.path.append('/home/mfhsieh/NTHU-Deep-Learning-Competition/2023_DataLab_Cup1-Predicting_News_Popularity')

nltk.download('wordnet')
nltk.download('omw-1.4')

def word_stemming(text):
    if type(text) == np.ndarray:
        text = text[0]
    lm = WordNetLemmatizer()
    words = re.split('\s', text.strip())
    lemmatized_words = [lm.lemmatize(word) for word in words]
    return lemmatized_words

print(word_stemming('university, universal, universities'))

['university,', 'universal,', 'university']


[nltk_data] Downloading package wordnet to
[nltk_data]     /users/student/mr111//mfhsieh22/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/student/mr111//mfhsieh22/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [47]:
from nltk.stem.porter import PorterStemmer

def tokenizer_stem(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in re.split('\s+', text.strip())]

print(tokenizer_stem('university, universal, universities'))

['university,', 'universal,', 'univers']


### 3-3 Preprocessing - Stop-Word Removal

儘管刪除停用詞在某些情況下（例如 BoW 和特徵哈希）可以有益於簡化表示，並可能提高文字分析的準確性，但並不總是必要，特別是在使用 TF-IDF 時。是否刪除停用詞應基於文本分析任務的具體要求以及資料集的特性來進行決策

In [48]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
stop = stopwords.words('english')

def stop_word_removal(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

print(stop_word_removal('runners like running and thus they run'))

['runner', 'like', 'run', 'thu', 'run']


[nltk_data] Downloading package stopwords to
[nltk_data]     /users/student/mr111//mfhsieh22/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 3-5 Preprocessing - Word Stemming + Stop-Word Removal

In [49]:
def stem_word_remove_stopword(text):
    if type(text) == np.ndarray:
        text = text[0]
    lm = WordNetLemmatizer()
    words = re.split('\s', text.strip())
    lemmatized_words = [lm.lemmatize(word) for word in words]
    
    filtered_list = [word for word in lemmatized_words if word not in stop]
    
    return filtered_list


#### 3-5 Create TF-IDF feature representation ([ref](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html))

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

ngram_range_ = (1,1)

text_transformer = ColumnTransformer(
    [
        #('title preprocess', TfidfVectorizer(tokenizer=word_stemming, ngram_range=(1,1), lowercase=False), [0]),
        ('author name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=ngram_range_, lowercase=False), [0]),
        #('channel process', TfidfVectorizer(tokenizer=word_stemming, ngram_range=(1,1), lowercase=False), [1]),
        ('topic name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=ngram_range_, lowercase=False), [1]),
    ],
    remainder='passthrough', # do not touch the remaining data
    n_jobs=-1
)


In [51]:
tfidf = TfidfVectorizer(tokenizer=word_stemming, ngram_range=(1,1), lowercase=False)
tfidf.fit(df_all['title'])
top = 10
# get idf score of vocabularies
idf = tfidf.idf_
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()

for i in range(top):
    print('%s: %.2f' %(tfidf.get_feature_names_out()[sorted_idx[i]], idf[sorted_idx[i]]))

doc_tfidf = tfidf.transform(df_all['title']).toarray()
tfidf_sum = np.sum(doc_tfidf, axis=0)
print("\n[vocabularies with highest tf-idf scores]")
for tok, v in zip(tfidf.inverse_transform(np.ones((1, tfidf_sum.shape[0])))[0][tfidf_sum.argsort()[::-1]][:top], \
                        np.sort(tfidf_sum)[::-1][:top]):
    print('{}: {}'.format(tok, v))


[vocabularies with smallest idf scores]
to: 2.52
the: 2.58
in: 2.96
a: 3.03
of: 3.07
for: 3.10
and: 3.44
is: 3.51
on: 3.54
your: 3.60

[vocabularies with highest tf-idf scores]
the: 1142.4830180213792
to: 1109.6698985304176
a: 795.7866740412087
in: 787.5419357082401
of: 746.7665375362841
for: 734.8807138633431
and: 555.5640584744767
your: 551.5848064261177
is: 544.621163447621
you: 533.4917456688497


### 4. Model training

- XGBoost

- LightGBM

- CatBoost

- AdaBoost

- Random Forest

- VotingClassifier


#### - To split the dataset 

In [52]:
remaining_dict = [  #'title', 
                    'author_name', 
                    #'channel', 
                    'topic', 
                    'year', 
                    'month',
                    'date', 
                    'day', 
                    'is_weekend',
                    'hour', 
                    # 'minute', 
                    # 'second', 
                    # 'num_img', 
                    'num_video', 
                    'len_content',
                    'sentiment_neg', 
                    'sentiment_neu', 
                    'sentiment_pos', 
                    'sentiment_compound'
                    ]

df = df_all.loc[:, remaining_dict]
df.head()

Unnamed: 0,author_name,topic,year,month,date,day,is_weekend,hour,num_video,len_content,sentiment_neg,sentiment_neu,sentiment_pos,sentiment_compound
0,clara moskowitz,"Asteroid, Asteroids, challenge, Earth, Space, ...",2013,6,19,3,0,15,0,3591,0.0,0.822,0.178,0.0772
1,christina warren,"Apps and Software, Google, open source, opn pl...",2013,3,28,4,0,17,0,1843,0.119,0.881,0.0,-0.2263
2,sam laird,"Entertainment, NFL, NFL Draft, Sports, Televis...",2014,5,7,3,0,19,25,6646,0.0,0.641,0.359,0.4215
3,sam laird,"Sports, Video, Videos, Watercooler",2013,10,11,5,0,2,21,1821,0.0,1.0,0.0,0.0
4,connor finnegan,"Entertainment, instagram, instagram video, NFL...",2014,4,17,4,0,3,1,8919,0.0,0.641,0.359,0.4215


In [53]:
from sklearn.model_selection import train_test_split

X_train = df.values[:train_data.shape[0]]
y_train = train_data['Popularity'].values
y_train[y_train==-1] = 0
X_test = df.values[train_data.shape[0]:]

print(X_train.shape)
print(type(X_train))
print(X_train)
print(y_train.shape)
print(type(y_train))
print(y_train)

X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train, y_train, test_size=0.3, random_state=0)

(27643, 14)
<class 'numpy.ndarray'>
[['clara moskowitz'
  'Asteroid, Asteroids, challenge, Earth, Space, U.S., World ' 2013 ...
  0.822 0.178 0.0772]
 ['christina warren'
  'Apps and Software, Google, open source, opn pledge, patent lawsuit theater, software patents, Tech, U.S. '
  2013 ... 0.881 0.0 -0.2263]
 ['sam laird' 'Entertainment, NFL, NFL Draft, Sports, Television ' 2014
  ... 0.641 0.359 0.4215]
 ...
 ['christine erickson' 'Food, hot dogs, humor, Photography, Watercooler '
  2014 ... 0.704 0.296 0.2732]
 ['seth fiegerman' 'Business, marissa mayer, Media, stocks, Yahoo ' 2013
  ... 1.0 0.0 0.0]
 ['megan ranney' 'austin, Business, CurioCity, Small Business, Startups '
  2014 ... 1.0 0.0 0.0]]
(27643,)
<class 'numpy.ndarray'>
[0 1 1 ... 0 0 1]


#### - To construct the training function

In [54]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score

def training(clf):
    score = cross_validate(clf, X_train, y_train, scoring='roc_auc', return_train_score=True, return_estimator=True)
    print('train score: {:.6f} (+/-{:.6f})'.format(
        np.mean(score['train_score']), np.std(score['train_score'])))
    print('valid score: {:.6f} (+/-{:.6f})'.format(
        np.mean(score['test_score']), np.std(score['test_score'])))

    clf.fit(X_train, y_train)
    
    return clf, np.mean(score['train_score']), np.mean(score['test_score'])

#### - To contruct the grid search function

In [55]:
from sklearn.model_selection import GridSearchCV

def grid_search_cv(ct, clf, param_grid, verbose_=False):
    X_train_ct = ct.fit_transform(X_train)
    
    # to report the grid search information
    if(verbose_):
        gs = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=5, return_train_score=True, verbose = 2)
    else:
        gs = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=5, return_train_score=True)
    gs.fit(X_train_ct, y_train)
    
    results, idx = gs.cv_results_, gs.best_index_
    print('train score: {:.6f} (+/-{:.6f})'.format(results['mean_train_score'][idx], results['std_train_score'][idx]))
    print('valid score: {:.6f} (+/-{:.6f})'.format(results['mean_test_score'][idx], results['std_test_score'][idx]))
    print('best params:', gs.best_params_)
    return gs.best_params_, gs.best_estimator_

#### **- To set whether to run the grid search**

In [56]:
grid_search_en = True
# grid_search_en = False

#### - to store the best parameter to a file

In [57]:
def parameter_storage(dict_path, file_name, best_xgb_param):
    if not os.path.exists(dict_path):
        os.makedirs(dict_path)
        
    file_path = os.path.join(dict_path, file_name + ".txt")
    
    with open(file_path, "w") as file:
        file.write(f'The features: {remaining_dict}\n')
        file.write(f'The best parameter: {best_xgb_param}\n')
        file.write(f'ngram_range_: {ngram_range_}')
        file.write('--------------------------------------------------------\n')

### 4-1. XGBOOST

4-1-1. Grid sizing for XGBoost

In [58]:
from xgboost import XGBClassifier

In [59]:
param_grid_xgb = {
    'gamma' : [0.8, 0.9, 1, 1.1, 1.2],
    'lambda' : [2.3, 2.4, 2.5],
    'n_estimators': [97, 98, 99, 100, 101, 102, 103],
    'max_depth': [7, 8, 9],
    'learning_rate' : [0.137, 0.138, 0.139, 0.14, 0.141, 0.142, 0.143]  
}

# param_grid_xgb = {
#     'lambda' : [2, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3]  
# }

if (grid_search_en):
    best_xgb_param, best_xgb = grid_search_cv(text_transformer, XGBClassifier(n_jobs=-1), param_grid_xgb, True)
    parameter_storage('../output/best_parameters', 'best_xgb_param', best_xgb_param)
    

4-1-2. Training for XGBoost

In [60]:
from sklearn.pipeline import Pipeline

param_grid_lgbm = {
    'gamma' : 1,
    'lambda' : 2.5,
    'n_estimators': 100,
    'max_depth': 8,
    'learning_rate' : 0.14,
    'n_jobs' : -1
}

if (grid_search_en):
    xgboost = Pipeline([('vect', text_transformer), ('clf', best_xgb)])
else :
    xgboost = Pipeline([('vect', text_transformer), ('clf', XGBClassifier(**param_grid_lgbm))])
    
_ = training(xgboost)

train score: 0.815946 (+/-0.003000)
valid score: 0.588796 (+/-0.011443)


### 4-2. LightGBM

4-2-1. Grid sizing for LightGBM

In [61]:
from lightgbm import LGBMClassifier

In [62]:
param_grid_lgbm = {
    'learning_rate' : [0.013, 0.0135 , 0.0136, 0.0137, 0.0138, 0.0139, 0.014, 0.0141, 0.0142, 0.0143, 0.0144 ,0.0145, 0.015], 
    'n_estimators' : [230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250],
    'objective' : ['regression', 'regression_l1', 'poisson']
}

if (grid_search_en):
    best_lgbm_param, best_lgbm = grid_search_cv(text_transformer, LGBMClassifier(n_jobs=-1, verbose=-1), param_grid_lgbm, True)
    parameter_storage('../output/best_parameters', 'best_lgbm_param', best_lgbm_param)

4-2-2. Training for LightGBM

In [63]:
params_LGBM = {
    'random_state': 0, 
    'learning_rate' : 0.014,
    'n_estimators' : 240,
    'n_jobs' : -1,
    'objective' : 'poisson'
}

if (grid_search_en):
    lgbm = Pipeline([('vect', text_transformer), ('clf', best_lgbm)])
else :
    lgbm = Pipeline([('vect', text_transformer), ('clf', LGBMClassifier(**params_LGBM))])

_ = training(lgbm)

train score: 0.669064 (+/-0.002302)
valid score: 0.596976 (+/-0.007465)


### 4-3. CatBoost

4-3-1. Grid sizing for Catboost

In [64]:
from catboost import CatBoostClassifier

In [65]:
param_grid_catboost = {
    'learning_rate' : [0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.02, 0.03], 
    'n_estimators' : [450, 500, 550],
    'depth' : [9, 10, 11]
}

if (grid_search_en):
    best_cat_params, best_cat_estimator = grid_search_cv(text_transformer, CatBoostClassifier(eval_metric='AUC',random_state=0, verbose=False), param_grid_catboost, True)
    parameter_storage('../output/best_parameters', 'best_cat_params', best_cat_params)

4-3-2. Training for CatBoost

In [66]:
paramsCatBoost = {
    'eval_metric' : 'AUC',
    'n_estimators' : 500,
    'depth' : 10,
    'learning_rate' : 0.01,
    'random_state' : 0,
    'verbose' : False
}

if (grid_search_en):
    catboost = Pipeline([('ct', text_transformer),('clf', best_cat_estimator)])
else :
    catboost = Pipeline([('ct', text_transformer),('clf', CatBoostClassifier(**paramsCatBoost))])

_ = training(catboost)

train score: 0.694347 (+/-0.002958)
valid score: 0.597723 (+/-0.009453)


### 4-4. AdaBoost

In [67]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

4-4-1. Grid sizing for AdaBoost

In [68]:

params_grid_AdaBoost = {
    'clf__estimator__max_depth' : [1,2,3,4,5,6,7,8,9,10],
    'clf__n_estimators': [num for num in range(50, 1000, 50)],
    'clf__learning_rate': [0.005, 0.01, 0.05, 0.1, 0.5] 
}

if (grid_search_en):
    best_ada_params, best_ada_estimator = grid_search_cv(text_transformer, 
                                                         AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_jobs=-1), 
                                                         params_grid_AdaBoost, True)
    parameter_storage('../output/best_parameters', 'best_ada_params', best_ada_params)

4-4-2. Training for AdaBoost

In [69]:
param_Adaboost = {
    'estimator' : DecisionTreeClassifier(max_depth = 4), 
    'learning_rate' : 0.005, 
    'n_estimators' : 900
}

if (grid_search_en):
    adaBoost = Pipeline([('vect', text_transformer), ('clf', best_ada_estimator)])
else :
    adaBoost = Pipeline([('vect', text_transformer), ('clf', AdaBoostClassifier(**param_Adaboost))])
    
_ = training(adaBoost)

train score: 0.680774 (+/-0.005286)
valid score: 0.586246 (+/-0.007764)


### 4-5. Random Forest Classifier

4-5-1. Grid sizing for RF

In [70]:
from sklearn.ensemble import RandomForestClassifier

In [71]:

params_grid_RF = {
    'n_estimator' : {800, 900, 1000, 1100},
    'max_depth' : {70, 80, 90}
}

if (grid_search_en):
    best_RF_params, best_RF_estimator = grid_search_cv(text_transformer, RandomForestClassifier(n_jobs=-1), params_grid_RF, True)
    parameter_storage('../output/best_parameters', 'best_RF_params', best_RF_params)

4-5-2. Training for RF

In [72]:
param_RF = {
    'n_jobs' : -1,
    'random_state' : 0,
    'n_estimators' : 1000,
    'max_depth' : 80
}

if (grid_search_en):
    RF = Pipeline([('vect', text_transformer), ('clf', best_RF_estimator)])
else :
    RF = Pipeline([('vect', text_transformer), ('clf', RandomForestClassifier(**param_RF))])
    
_ = training(RF)

train score: 0.997939 (+/-0.000347)
valid score: 0.585703 (+/-0.010655)


### 4-6. VotingClassifier

4-2-1. Grid sizing for Voting classifier

In [73]:
num_classifier = 3
weight_range = 2
estimator_list = [('xgboost', xgboost), ('lgbm', lgbm), ('RF', RF)]
# estimator_list = [('xgboost', xgboost), ('lgbm', lgbm), ('catboost', catboost), ('adaBoost', adaBoost), ('RF', RF)]

In [74]:
if (len(estimator_list) != num_classifier) :
    print("Error: the numver of the classifier must equal to the estimator_list element number! Please check again!")

- to find the weight combinations

In [75]:
import itertools

def weight_list_generator(weight_range_, num_classifier_):   
    weight_range = weight_range_
    weight_list = []

    binary_values = (i for i in range(1, weight_range+1))
    weight_list = [list(i) for i in list(itertools.product(binary_values, repeat=num_classifier_))]

    print('weight list = ', weight_list)
    print('length of weight list = ', len(weight_list))
    return weight_list

In [76]:
weight_list = weight_list_generator(weight_range_ = weight_range, 
                      num_classifier_ = num_classifier)

weight list =  [[1, 1, 1], [1, 1, 2], [1, 2, 1], [1, 2, 2], [2, 1, 1], [2, 1, 2], [2, 2, 1], [2, 2, 2]]
length of weight list =  8


* multi-thread grid search for voting classifier

In [77]:
import threading
from sklearn.ensemble import VotingClassifier

param_grid_voting_static = {
    'estimators' : estimator_list,
    'voting' : 'soft',
    'flatten_transform' : True, 
    'verbose' : True
}

lock = threading.Lock()


def process_weight(weight):
    # share in multiple threads
    global best_valid_score, best_weight, best_voting

    voting = VotingClassifier(**param_grid_voting_static, weights=weight)
    print('current weight =', weight)
    clf_voting, _, valid_voting = training(voting)

    # to protect the safety of shared variables
    with lock:
        if valid_voting > best_valid_score:
            best_valid_score = valid_voting
            best_weight = weight
            best_voting = clf_voting
    
    print(f'{weight} Finish!!')
    threadmax.release()

'''
def process_weight(weight):
    # share in multiple threads
    global best_valid_score, best_weight, best_voting

    voting = VotingClassifier(**param_grid_voting_static, weights=weight)
    print('current weight =', weight)
    valid_voting = 2

    # to protect the safety of shared variables
    with lock:
        if valid_voting >= best_valid_score:
            best_valid_score = valid_voting
            best_weight = weight
    
    print(f'{weight} Finish!!')
    threadmax.release()
''' 
        
if (1):
    best_valid_score = 0
    best_weight = None
    best_voting = None
    mem = []
    threadmax = threading.BoundedSemaphore(64)
    
    for weight in weight_list:
        threadmax.acquire()
        thread = threading.Thread(target=process_weight, args=(weight,))
    
        thread.start()
        mem.append(thread)

    for thread in mem:
        thread.join()
        mem.remove(thread)
    
    print('end once')

    print('best_valid_score = %.6f' % best_valid_score)
    print('best_weight = ', best_weight)
    
    parameter_storage('../output/best_parameters', 'best_weight', best_weight)

current weight = [1, 1, 1]
current weight = [1, 1, 2]
current weight = [1, 2, 1]
current weight = [1, 2, 2]
current weight = [2, 1, 1]
current weight = [2, 1, 2]
current weight = [2, 2, 1]
current weight = [2, 2, 2]
[Voting] .................. (1 of 3) Processing xgboost, total=  12.6s
[Voting] .................. (1 of 3) Processing xgboost, total=  14.3s
[Voting] .................. (1 of 3) Processing xgboost, total=  14.4s
[Voting] .................. (1 of 3) Processing xgboost, total=  17.7s
[Voting] .................. (1 of 3) Processing xgboost, total=  17.9s
[Voting] .................. (1 of 3) Processing xgboost, total=  18.1s
[Voting] .................. (1 of 3) Processing xgboost, total=  21.8s
[Voting] .................. (1 of 3) Processing xgboost, total=  23.1s


[Voting] ..................... (2 of 3) Processing lgbm, total=  31.1s
[Voting] ..................... (2 of 3) Processing lgbm, total=  27.5s
[Voting] ..................... (2 of 3) Processing lgbm, total=  31.5s
[Voting] ..................... (2 of 3) Processing lgbm, total=  31.8s
[Voting] ..................... (2 of 3) Processing lgbm, total=  36.9s
[Voting] ..................... (2 of 3) Processing lgbm, total=  52.2s
[Voting] ....................... (3 of 3) Processing RF, total=  47.4s
[Voting] ....................... (3 of 3) Processing RF, total= 1.1min
[Voting] ....................... (3 of 3) Processing RF, total= 1.1min
[Voting] ....................... (3 of 3) Processing RF, total= 1.1min
[Voting] ....................... (3 of 3) Processing RF, total=  59.9s
[Voting] ....................... (3 of 3) Processing RF, total=  49.3s
[Voting] ..................... (2 of 3) Processing lgbm, total= 1.9min
[Voting] .................. (1 of 3) Processing xgboost, total=  30.4s
[Votin

[Voting] ....................... (3 of 3) Processing RF, total=  28.6s
[2, 2, 2] Finish!!
[Voting] ..................... (2 of 3) Processing lgbm, total= 1.2min
[Voting] ....................... (3 of 3) Processing RF, total=   6.0s
[1, 2, 2] Finish!!


In [None]:
# from sklearn.ensemble import VotingClassifier

# param_grid_voting = {
#     'weight' : weight_list
# }

# param_grid_voting_static = {
#     'estimators' : estimator_list,
#     'voting' : 'soft',
#     'flatten_transform' : True, 
#     'verbose' : True
# }

# print(param_grid_voting['weight'])

# best_valid_score = 0
# best_weight = []

# # grid search for the weight of voting classifier
# if (grid_search_en):
#     for weight in param_grid_voting['weight']:
#         voting = VotingClassifier(**param_grid_voting_static, weights=weight)
#         clf_voting, train_voting, valid_voting = training(voting)
#         if(valid_voting>best_valid_score):
#             best_valid_score = valid_voting
#             best_weight = weight
#             best_voting = clf_voting

# print('best_valid_score = ', best_valid_score)
# print('best_weight = ', best_weight)


In [78]:
# no n_jobs
prarms_voting = {
    'estimators' : estimator_list, 
    'voting' : 'soft',
    'weights' : [1, 2, 1],
    'flatten_transform' : True,
    'verbose' : True
}

if (grid_search_en):
    voting = VotingClassifier(**param_grid_voting_static, weights=best_weight)
else :
    voting = VotingClassifier(**prarms_voting)

_ = training(voting)


[Voting] .................. (1 of 3) Processing xgboost, total=  29.3s
[Voting] ..................... (2 of 3) Processing lgbm, total=   6.2s
[Voting] ....................... (3 of 3) Processing RF, total=   6.8s
[Voting] .................. (1 of 3) Processing xgboost, total=   3.2s
[Voting] ..................... (2 of 3) Processing lgbm, total=   5.8s
[Voting] ....................... (3 of 3) Processing RF, total=   5.7s
[Voting] .................. (1 of 3) Processing xgboost, total=   3.3s
[Voting] ..................... (2 of 3) Processing lgbm, total=   5.0s
[Voting] ....................... (3 of 3) Processing RF, total=   5.3s
[Voting] .................. (1 of 3) Processing xgboost, total=   3.3s
[Voting] ..................... (2 of 3) Processing lgbm, total=   5.8s
[Voting] ....................... (3 of 3) Processing RF, total=   4.8s
[Voting] .................. (1 of 3) Processing xgboost, total=   2.9s
[Voting] ..................... (2 of 3) Processing lgbm, total=   6.0s
[Votin

### 5. Testing data prediction

In [79]:
best_model = voting

y_score = best_model.predict_proba(X_test)[:, 1]

df_pred = pd.DataFrame({'Id': test_data['Id'], 'Popularity': y_score})
df_pred.to_csv('../output/test_pred.csv', index=False)