## <center>DataLab Cup 1: Text Feature Engineering</center>

#### - To import the libraries

In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

#### 1. To load the datasets

In [2]:
train_data = pd.read_csv('../dataset/train.csv')
test_data  = pd.read_csv('../dataset/test.csv')


In [3]:
print(test_data.shape)
train_data.head()

(11847, 2)


Unnamed: 0,Id,Popularity,Page content
0,0,-1,"<html><head><div class=""article-info""> <span c..."
1,1,1,"<html><head><div class=""article-info""><span cl..."
2,2,1,"<html><head><div class=""article-info""><span cl..."
3,3,-1,"<html><head><div class=""article-info""><span cl..."
4,4,-1,"<html><head><div class=""article-info""><span cl..."


#### 2. To extract the features from the dataset

將一些我們想要用到的feature從dataset中提取出來。以下為提取的特徵:

- title
- time(year/month/day/hour/minute/second)
- number of images (num_img)
- number of videos (num_video)
- author name
- topic
- channel
- length of content 

In [4]:
from bs4 import BeautifulSoup
from datetime import datetime
# to get the attribute of the 'title', 'year/month/date/day/hour/minute/second/is_weekend', 'num_img', 'num_video', 'author name', 'topic', 'channel', 'content length'

def preprocessor(text):
    soup = BeautifulSoup(text, 'html.parser')

    # 1. to find the 'title' (body > h1)
    title = soup.find('h1').string.strip().lower()

    # 2. to find time(body > div > span > time)

    
    date_string = soup.find('time')
    try:
        date_string = date_string['datetime']
    except:
        date_string = 'wed, 10 oct 2014 15:00:43 +0000'
        
    date_string = date_string.strip().lower()
    datetimes = datetime.strptime(date_string, '%a, %d %b %Y %H:%M:%S %z')
    
    
    year = datetimes.year
    month = datetimes.month
    date = datetimes.day
    day = pd.Timestamp(str(year)+'-'+str(month)+'-'+str(date)).dayofweek+1
    is_weekend = 1 if (day==6 or day==7) else 0
    hour = datetimes.hour
    minute = datetimes.minute
    second = datetimes.second
    

    # 3. to find the number of images
    num_img  = len(soup.find_all('img'))
    

    # 4. to find the number of videos
    num_video = len(soup.find_all('iframe'))
    

    # 5. to find the author name
    article_info = soup.find('div', class_='article-info')
    author = article_info.find('span', class_='author_name') or article_info.find('span', class_='byline basic')

    if (author != None):
        if (author.find('a') != None):
            author = author.find('a')
            author_name = author.get_text().lower()
        else :
            author_name = author.get_text().lower()
    else :
        author_name = 'not found'
    
    

    # 6. to find the article topic
    footer = soup.find('footer', class_='article-topics')
    topic = footer.get_text().split(': ')[1]
    
    # 7. to find the channel
    channel = soup.find('article')['data-channel'].strip().lower()
    
    '''
    print('title = ', title, type(title))
    print('time = ', year, "/", month, "/",day, " ",hour, ":",minute, ":",second, type(year))
    print('number of images = ', num_img, type(num_img))
    print('number of videos = ', num_video, type(num_video))
    print('author_name = ', author_name, type(author_name))
    print('topic = ', topic, type(topic))
    print('channel = ', channel, type(channel))
    '''
    
    # 8. to find the content length
    content = soup.body.find('section', class_='article-content').get_text()
    len_content = len(content)

    # print('topic = ', topic, type(topic))

    return title, author_name, channel, topic, year, month, date, day, is_weekend, hour, minute, second, num_img, num_video, len_content

In [5]:
feature_train_list = []

for content in (train_data['Page content']):
    feature_train_list.append(preprocessor(content))
for content in (test_data['Page content']):
    feature_train_list.append(preprocessor(content))

df_all = pd.DataFrame(
        feature_train_list, 
        columns=['title', 'author_name', 'channel', 'topic', 'year', 'month', 'date', 'day', 'is_weekend',
                 'hour', 'minute', 'second', 'num_img', 'num_video', 'len_content'])

print(df_all.head())


                                               title       author_name  \
0  nasa's grand challenge: stop asteroids from de...   clara moskowitz   
1  google's new open source patent pledge: we won...  christina warren   
2  ballin': 2014 nfl draft picks get to choose th...         sam laird   
3        cameraperson fails deliver slapstick laughs         sam laird   
4  nfl star helps young fan prove friendship with...   connor finnegan   

         channel                                              topic  year  \
0          world  Asteroid, Asteroids, challenge, Earth, Space, ...  2013   
1           tech  Apps and Software, Google, open source, opn pl...  2013   
2  entertainment  Entertainment, NFL, NFL Draft, Sports, Televis...  2014   
3    watercooler                Sports, Video, Videos, Watercooler   2013   
4  entertainment  Entertainment, instagram, instagram video, NFL...  2014   

   month  date  day  is_weekend  hour  minute  second  num_img  num_video  \
0      6    19 

### 3-1 . Preprocessing - tokenization

To split the text corpora into individual elements

In [7]:
import re

def tokenizer(text):
    return re.split('\s+', text.strip())

### 3-2 . Preprocessing - Word Stemming

There are two ways of word stemming

1. PorterStemmer(Stemming): break the word in rule-besed way, which will lead to the probelm of overstemming

2. WordNetLemmatizer(Lemmatization): Stem the words will better performance, while time-consuming


In [8]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.data.path.append('/home/mfhsieh/NTHU-Deep-Learning-Competition/2023_DataLab_Cup1-Predicting_News_Popularity')

nltk.download('wordnet')
nltk.download('omw-1.4')

def word_stemming(text):
    if type(text) == np.ndarray:
        text = text[0]
    lm = WordNetLemmatizer()
    words = re.split('\s', text.strip())
    lemmatized_words = [lm.lemmatize(word) for word in words]
    return lemmatized_words

print(word_stemming('university, universal, universities'))

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/student/mr111//lywu22/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/student/mr111//lywu22/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['university,', 'universal,', 'university']


In [9]:
from nltk.stem.porter import PorterStemmer

def tokenizer_stem(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in re.split('\s+', text.strip())]

print(tokenizer_stem('university, universal, universities'))

['university,', 'universal,', 'univers']


### 3-3 Preprocessing - Stop-Word Removal

儘管刪除停用詞在某些情況下（例如 BoW 和特徵哈希）可以有益於簡化表示，並可能提高文字分析的準確性，但並不總是必要，特別是在使用 TF-IDF 時。是否刪除停用詞應基於文本分析任務的具體要求以及資料集的特性來進行決策

In [10]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
stop = stopwords.words('english')

def stop_word_removal(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

print(stop_word_removal('runners like running and thus they run'))

['runner', 'like', 'run', 'thu', 'run']


[nltk_data] Downloading package stopwords to
[nltk_data]     /users/student/mr111//lywu22/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 3-5 Preprocessing - Word Stemming + Stop-Word Removal

In [11]:
def stem_word_remove_stopword(text):
    if type(text) == np.ndarray:
        text = text[0]
    lm = WordNetLemmatizer()
    words = re.split('\s', text.strip())
    lemmatized_words = [lm.lemmatize(word) for word in words]
    
    filtered_list = [word for word in lemmatized_words if word not in stop]
    
    return filtered_list


#### 3-5 Create TF-IDF feature representation ([ref](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html))

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

text_transformer = ColumnTransformer(
    [
        #('title preprocess', TfidfVectorizer(tokenizer=word_stemming, ngram_range=(1,1), lowercase=False), [0]),            # to process the title paragraph
        ('author name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,1), lowercase=False), [0]),
        #('channel process', TfidfVectorizer(tokenizer=word_stemming, ngram_range=(1,1), lowercase=False), [1]),
        ('topic name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,1), lowercase=False), [1]),
    ],
    remainder='passthrough', # do not touch the remaining data
    n_jobs=-1
)


In [13]:
tfidf = TfidfVectorizer(tokenizer=word_stemming, ngram_range=(1,1), lowercase=False)
tfidf.fit(df_all['title'])
top = 10
# get idf score of vocabularies
idf = tfidf.idf_
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()

for i in range(top):
    print('%s: %.2f' %(tfidf.get_feature_names_out()[sorted_idx[i]], idf[sorted_idx[i]]))

doc_tfidf = tfidf.transform(df_all['title']).toarray()
tfidf_sum = np.sum(doc_tfidf, axis=0)
print("\n[vocabularies with highest tf-idf scores]")
for tok, v in zip(tfidf.inverse_transform(np.ones((1, tfidf_sum.shape[0])))[0][tfidf_sum.argsort()[::-1]][:top], \
                        np.sort(tfidf_sum)[::-1][:top]):
    print('{}: {}'.format(tok, v))


[vocabularies with smallest idf scores]
to: 2.52
the: 2.58
in: 2.96
a: 3.03
of: 3.07
for: 3.10
and: 3.44
is: 3.51
on: 3.54
your: 3.60

[vocabularies with highest tf-idf scores]
the: 1142.4830180213792
to: 1109.6698985304176
a: 795.7866740412087
in: 787.5419357082401
of: 746.7665375362841
for: 734.8807138633431
and: 555.5640584744767
your: 551.5848064261177
is: 544.621163447621
you: 533.4917456688497


### 4. Model training

- XGBoost

- LightGBM

- CatBoost

- AdaBoost

- Random Forest

- VotingClassifier


#### - To split the dataset 

In [14]:
df = df_all.loc[:, [
                    #'title', 
                    'author_name', 
                    #'channel', 
                    'topic', 
                    'year', 
                    'month',
                    'date', 
                    'day', 
                    'is_weekend',
                    'hour', 
                    # 'minute', 
                    # 'second', 
                    # 'num_img', 
                    'num_video', 
                    'len_content'
                    ]]
df.head()

Unnamed: 0,author_name,topic,year,month,date,day,is_weekend,hour,num_video,len_content
0,clara moskowitz,"Asteroid, Asteroids, challenge, Earth, Space, ...",2013,6,19,3,0,15,0,3591
1,christina warren,"Apps and Software, Google, open source, opn pl...",2013,3,28,4,0,17,0,1843
2,sam laird,"Entertainment, NFL, NFL Draft, Sports, Televis...",2014,5,7,3,0,19,25,6646
3,sam laird,"Sports, Video, Videos, Watercooler",2013,10,11,5,0,2,21,1821
4,connor finnegan,"Entertainment, instagram, instagram video, NFL...",2014,4,17,4,0,3,1,8919


In [15]:
from sklearn.model_selection import train_test_split

X_train = df.values[:train_data.shape[0]]
y_train = train_data['Popularity'].values
y_train[y_train==-1] = 0
X_test = df.values[train_data.shape[0]:]

print(X_train.shape)
print(type(X_train))
print(X_train)
print(y_train.shape)
print(type(y_train))
print(y_train)

X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train, y_train, test_size=0.3, random_state=0)

(27643, 10)
<class 'numpy.ndarray'>
[['clara moskowitz'
  'Asteroid, Asteroids, challenge, Earth, Space, U.S., World ' 2013 ...
  15 0 3591]
 ['christina warren'
  'Apps and Software, Google, open source, opn pledge, patent lawsuit theater, software patents, Tech, U.S. '
  2013 ... 17 0 1843]
 ['sam laird' 'Entertainment, NFL, NFL Draft, Sports, Television ' 2014
  ... 19 25 6646]
 ...
 ['christine erickson' 'Food, hot dogs, humor, Photography, Watercooler '
  2014 ... 12 0 1274]
 ['seth fiegerman' 'Business, marissa mayer, Media, stocks, Yahoo ' 2013
  ... 20 0 2657]
 ['megan ranney' 'austin, Business, CurioCity, Small Business, Startups '
  2014 ... 18 0 3027]]
(27643,)
<class 'numpy.ndarray'>
[0 1 1 ... 0 0 1]


#### - To construct the training function

In [16]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score

def training(clf):
    clf_cv = cross_validate(clf, X_train, y_train, scoring='roc_auc', return_train_score=True, return_estimator=True)
    print('train score: {:.5f} (+/-{:.5f})'.format(
        np.mean(clf_cv['train_score']), np.std(clf_cv['train_score'])))
    print('valid score: {:.5f} (+/-{:.5f})'.format(
        np.mean(clf_cv['test_score']), np.std(clf_cv['test_score'])))

    train_score =  np.mean(clf_cv['train_score'])
    valid_score = np.mean(clf_cv['test_score'])

    clf.fit(X_train_split, y_train_split)
    print('train score: {:.5f}'.format(roc_auc_score(
        y_train_split, clf.predict_proba(X_train_split)[:, 1])))
    print('valid score: {:.5f}'.format(roc_auc_score(
        y_valid_split, clf.predict_proba(X_valid_split)[:, 1])))
    
    return clf, train_score, valid_score

#### - To contruct the grid search function

In [17]:
from sklearn.model_selection import GridSearchCV

def grid_search_cv(ct, clf, param_grid):
    X_train_ct = ct.fit_transform(X_train)
    
    gs = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=5, return_train_score=True)
    gs.fit(X_train_ct, y_train)
    
    results, idx = gs.cv_results_, gs.best_index_
    print('train score: {:.5f} (+/-{:.5f})'.format(results['mean_train_score'][idx], results['std_train_score'][idx]))
    print('valid score: {:.5f} (+/-{:.5f})'.format(results['mean_test_score'][idx], results['std_test_score'][idx]))
    print('best params:', gs.best_params_)
    return gs.best_params_, gs.best_estimator_

#### - To set whether to run the grid search 

In [18]:
grid_search_en = False

### 4-1. XGBOOST

4-1-1. Grid sizing for XGBoost

In [19]:
from xgboost import XGBClassifier

In [34]:
param_grid_xgb = {
    'gamma' : [0, 0.5, 1, 1.5, 2],
    'lambda' : [1.5, 2, 2.5, 3],
    'n_estimators': [100, 120, 140, 160, 180],
    'max_depth': [6, 8, 10, 12, 14],
    'learning_rate' : [0.14, 0.15, 0.16] ,
    'n_jobs' : [-1]
}

best_xgb_param, best_xgb = grid_search_cv(text_transformer, XGBClassifier(), param_grid_xgb)


KeyboardInterrupt: 

4-1-2. Training for XGBoost

In [20]:
from sklearn.pipeline import Pipeline

param_grid_lgbm = {
    'gamma' : 1,
    'lambda' : 2.5,
    'n_estimators': 100,
    'max_depth': 8,
    'learning_rate' : 0.14,
    'n_jobs' : -1
}

if (grid_search_en):
    xgboost = Pipeline([('vect', text_transformer), ('clf', best_xgb)])
else :
    xgboost = Pipeline([('vect', text_transformer), ('clf', XGBClassifier(**param_grid_lgbm))])
    
training(xgboost)

train score: 0.81234 (+/-0.00367)
valid score: 0.58892 (+/-0.00724)
train score: 0.82065
valid score: 0.58606


(Pipeline(steps=[('vect',
                  ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                    transformers=[('author name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f6021438820>),
                                                   [0]),
                                                  ('topic name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f6021438820>),
                                                   [1])])),
                 ('...
                             

### 4-2. LightGBM

4-2-1. Grid sizing for LightGBM

In [21]:
from lightgbm import LGBMClassifier

In [19]:
param_grid_lgbm = {
    'learning_rate' : [0.005, 0.006, 0.007, 0.008, 0.009 , 0.0095, 0.0098, 0.01, 0.011, 0.012, 0.013, 0.014, 0.015], 
    'n_estimators' : [200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380],
    'min_sum_hessian_in_leaf': [0.001, 0.01, 0.1, 1],
    'objective' : ['regression', 'regression_l1', 'poisson'],
    'n_jobs' : [-1],
    'verbose' : [-1]
}

best_lgbm_param, best_lgbm = grid_search_cv(text_transformer, LGBMClassifier(), param_grid_lgbm)
































train score: 0.66707 (+/-0.00226)
valid score: 0.59813 (+/-0.00820)
best params: {'learning_rate': 0.014, 'min_sum_hessian_in_leaf': 0.001, 'n_estimators': 240, 'objective': 'poisson'}


4-2-2. Training for LightGBM

In [22]:
params_LGBM = {
    'random_state': 0, 
    'learning_rate' : 0.009,
    'min_sum_hessian_in_leaf' : 0.001,
    'n_estimators' : 380,
    'n_jobs' : -1,
    'objective' : 'poisson'
}

if (grid_search_en):
    lgbm = Pipeline([('vect', text_transformer), ('clf', best_lgbm)])
else :
    lgbm = Pipeline([('vect', text_transformer), ('clf', LGBMClassifier(**params_LGBM))])

training(lgbm)

train score: 0.66793 (+/-0.00239)
valid score: 0.59754 (+/-0.00735)
train score: 0.67509
valid score: 0.59149


(Pipeline(steps=[('vect',
                  ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                    transformers=[('author name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f6021438820>),
                                                   [0]),
                                                  ('topic name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f6021438820>),
                                                   [1])])),
                 ('clf',
                  LGBMClass

### 4-3. CatBoost

4-3-1. Grid sizing for Catboost

In [23]:
from catboost import CatBoostClassifier

In [None]:
param_grid_catboost = {
    'learning_rate' : [0.001, 0.005, 0.01, 0.05, 0.06 , 0.09, 0.1, 0.15, 0.2], 
    'n_estimators' : [300, 350, 400, 450, 500, 550, 600,650,700,800,900,1000],
    'depth': [3,4,5,6,7,8,9,10,11,12,13,14,15,16],
    'random_state' : [0],
    'eval_metric' : ['AUC'],
    'thread_count' : [-1]
}

best_cat_params, best_cat_estimator = grid_search_cv(text_transformer, CatBoostClassifier(), param_grid_catboost)

4-3-2. Training for CatBoost

In [24]:
paramsCatBoost = {
    'verbose' : False,
    'eval_metric' : 'AUC',
    'n_estimators' : 500,
    'depth' : 10,
    'learning_rate' : 0.01,
    'random_state' : 0,
    'thread_count' : -1
}

if (grid_search_en):
    catboost = Pipeline([('ct', text_transformer),('clf', best_cat_estimator)])
else :
    catboost = Pipeline([('ct', text_transformer),('clf', CatBoostClassifier(**paramsCatBoost))])

training(catboost)

train score: 0.68975 (+/-0.00182)
valid score: 0.59867 (+/-0.00856)
train score: 0.69824
valid score: 0.59119


(Pipeline(steps=[('ct',
                  ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                    transformers=[('author name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f6021438820>),
                                                   [0]),
                                                  ('topic name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f6021438820>),
                                                   [1])])),
                 ('clf',
                  <catboost.c

### 4-4. AdaBoost

In [25]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

4-4-1. Grid sizing for AdaBoost

In [None]:
params_grid_AdaBoost = {
    'estimator__max_depth' : [1,2,3,4,5,6,7,8,9,10],
    'n_estimators': [num for num in range(50, 1000, 50)],
    'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.5],
    'random_state' : 0
}

best_ada_params, best_ada_estimator = grid_search_cv(text_transformer, 
                                                     AdaBoostClassifier(), 
                                                     params_grid_AdaBoost)

4-4-2. Training for AdaBoost

In [26]:
param_Adaboost = {
    'estimator' : DecisionTreeClassifier(max_depth = 4), 
    'learning_rate' : 0.005, 
    'n_estimators' : 900,
    'random_state' : 0
}

if (grid_search_en):
    adaBoost = Pipeline([('vect', text_transformer), ('clf', best_ada_estimator)])
else :
    adaBoost = Pipeline([('vect', text_transformer), ('clf', AdaBoostClassifier(**param_Adaboost))])
    
training(adaBoost)

train score: 0.67851 (+/-0.00528)
valid score: 0.58886 (+/-0.00965)
train score: 0.68843
valid score: 0.57824


(Pipeline(steps=[('vect',
                  ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                    transformers=[('author name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f6021438820>),
                                                   [0]),
                                                  ('topic name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f6021438820>),
                                                   [1])])),
                 ('clf',
                  AdaBoostC

### 4-5. Random Forest Classifier

4-5-1. Grid sizing for RF

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
params_grid_RF = {
    'random_state' : [0],
    'n_estimators': [num for num in range(50, 1000, 50)],
    'max_depth': [num for num in range(1, 100)],
    'n_jobs' : [-1]
}

best_RF_params, best_RF_estimator = grid_search_cv(text_transformer, RandomForestClassifier(n_jobs=-1), params_grid_RF)

4-5-2. Training for RF

In [28]:
param_RF = {
    'n_jobs' : -1,
    'random_state' : 0,
    'n_estimators' : 1000,
    'max_depth' : 80
}

if (grid_search_en):
    RF = Pipeline([('vect', text_transformer), ('clf', best_RF_estimator)])
else :
    RF = Pipeline([('vect', text_transformer), ('clf', RandomForestClassifier(**param_RF))])
    
training(RF)

train score: 0.99647 (+/-0.00032)
valid score: 0.58696 (+/-0.00963)
train score: 0.99756
valid score: 0.58557


(Pipeline(steps=[('vect',
                  ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                    transformers=[('author name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f6021438820>),
                                                   [0]),
                                                  ('topic name process',
                                                   TfidfVectorizer(lowercase=False,
                                                                   token_pattern=None,
                                                                   tokenizer=<function word_stemming at 0x7f6021438820>),
                                                   [1])])),
                 ('clf',
                  RandomFor

### 4-6. VotingClassifier

4-2-1. Grid sizing for Voting classifier

- to find the weight combinations

In [37]:
num_classifier = 5
estimator_list = [('xgboost', xgboost), ('lgbm', lgbm), ('catboost', catboost), ('adaBoost', adaBoost), ('RF', RF)]

In [38]:
from itertools import combinations_with_replacement
from sklearn.ensemble import VotingClassifier
params_list= []
weight_list = []

for i in range(0, 11):
    params_list.append(i/10)

for i in combinations_with_replacement(params_list, num_classifier):
    if(i != tuple(0 for _ in range(num_classifier))):
        weight_list.append(i)

In [34]:
from sklearn.ensemble import VotingClassifier


param_grid_voting = {
    'weights' : weight_list,   
}

param_grid_voting_static = {
    'estimators' : estimator_list, 
     'n_jobs' : -1, 
     'voting' : 'soft',               
     'flatten_transform' : True, 
     'verbose' : True
}

best_voting_param, best_voting = grid_search_cv(text_transformer, VotingClassifier(**param_grid_voting_static), param_grid_voting)

KeyboardInterrupt: 

In [42]:
prarms_voting = {
    'estimators' : estimator_list, 
    'voting' : 'soft',
    # 'weights' : [0.5, 0.5, 0.5, 0.5, 0.5],
    'n_jobs' : -1,
    'flatten_transform' : True,
    'verbose' : True
}

if (grid_search_en):
    voting = VotingClassifier(**param_grid_voting_static, weights=best_voting_param)
else :
    voting = VotingClassifier(**prarms_voting)

training(voting)


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 428, in _process_worker
    r = call_item()
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 275, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 620, in __call__
    return self.func(*args, **kwargs)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/joblib/parallel.py", line 288, in __call__
    return [func(*args, **kwargs)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/joblib/parallel.py", line 288, in <listcomp>
    return [func(*args, **kwargs)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/utils/parallel.py", line 127, in __call__
    return self.function(*args, **kwargs)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/ensemble/_base.py", line 36, in _fit_single_estimator
    estimator.fit(X, y)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/pipeline.py", line 416, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/pipeline.py", line 370, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/pipeline.py", line 950, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/utils/_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 743, in fit_transform
    result = self._fit_transform(X, y, _fit_transform_one)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 670, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/utils/parallel.py", line 65, in __call__
    return super().__call__(iterable_with_config)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/joblib/parallel.py", line 1098, in __call__
    self.retrieve()
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/joblib/parallel.py", line 975, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/multiprocessing/pool.py", line 771, in get
    raise self._value
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 620, in __call__
    return self.func(*args, **kwargs)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/joblib/parallel.py", line 288, in __call__
    return [func(*args, **kwargs)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/joblib/parallel.py", line 288, in <listcomp>
    return [func(*args, **kwargs)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/utils/parallel.py", line 127, in __call__
    return self.function(*args, **kwargs)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/pipeline.py", line 950, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/feature_extraction/text.py", line 2126, in fit_transform
    X = super().fit_transform(raw_documents)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/feature_extraction/text.py", line 1383, in fit_transform
    vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/feature_extraction/text.py", line 1270, in _count_vocab
    for feature in analyze(doc):
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/feature_extraction/text.py", line 112, in _analyze
    doc = tokenizer(doc)
  File "/tmp/ipykernel_81188/2473757962.py", line 14, in word_stemming
  File "/tmp/ipykernel_81188/2473757962.py", line 14, in <listcomp>
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/nltk/stem/wordnet.py", line 45, in lemmatize
    lemmas = wn._morphy(word, pos)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/nltk/corpus/util.py", line 121, in __getattr__
    self.__load()
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/nltk/corpus/util.py", line 95, in __load
    args, kwargs = self.__args, self.__kwargs
AttributeError: 'WordNetCorpusReader' object has no attribute '_LazyCorpusLoader__args'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/ensemble/_voting.py", line 349, in fit
    return super().fit(X, transformed_y, sample_weight)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/ensemble/_voting.py", line 81, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/utils/parallel.py", line 65, in __call__
    return super().__call__(iterable_with_config)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/joblib/parallel.py", line 1098, in __call__
    self.retrieve()
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/joblib/parallel.py", line 975, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 567, in wrap_future_result
    return future.result(timeout=timeout)
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/concurrent/futures/_base.py", line 446, in result
    return self.__get_result()
  File "/users/student/mr111/lywu22/miniconda3/envs/dl/lib/python3.9/concurrent/futures/_base.py", line 391, in __get_result
    raise self._exception
AttributeError: 'WordNetCorpusReader' object has no attribute '_LazyCorpusLoader__args'


### 5. Testing data prediction

In [43]:
best_model = voting

y_score = best_model.predict_proba(X_test)[:, 1]

df_pred = pd.DataFrame({'Id': test_data['Id'], 'Popularity': y_score})
df_pred.to_csv('../output/test_pred.csv', index=False)

NotFittedError: This VotingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [46]:
params_grid_AdaBoost = {
    'estimator__max_depth' : [i for i in range(1, 20)],
    'n_estimators': [num for num in range(1, 2000, 10)],
    'learning_rate': [0.0001 * i for i in range(1, 10000)],
    'random_state' : [0]
}

# 'estimator__max_depth' : 1~20
# 'n_estimators': 1~2000
# 'learning_rate': 0.0001~1

best_ada_params, best_ada_estimator = grid_search_cv(text_transformer, 
                                                     AdaBoostClassifier(estimator=DecisionTreeClassifier()), 
                                                     params_grid_AdaBoost)

print(best_ada_params)