## <center>DataLab Cup 1: Text Feature Engineering</center>

#### - To import the libraries

In [66]:
import pandas as pd
import numpy as np
import warnings
import os

warnings.filterwarnings("ignore")

#### 1. To load the datasets

In [67]:
train_data = pd.read_csv('../dataset/train.csv')
test_data  = pd.read_csv('../dataset/test.csv')


In [68]:
print(test_data.shape)
train_data.head()

(11847, 2)


Unnamed: 0,Id,Popularity,Page content
0,0,-1,"<html><head><div class=""article-info""> <span c..."
1,1,1,"<html><head><div class=""article-info""><span cl..."
2,2,1,"<html><head><div class=""article-info""><span cl..."
3,3,-1,"<html><head><div class=""article-info""><span cl..."
4,4,-1,"<html><head><div class=""article-info""><span cl..."


#### 2. To extract the features from the dataset

將一些我們想要用到的feature從dataset中提取出來。以下為提取的特徵:

- title
- time(year/month/day/hour/minute/second)
- number of images (num_img)
- number of videos (num_video)
- author name
- topic
- channel
- length of content 

--- 

* beautiful soup
    - conda install -c conda-forge beautifulsoup4
    
<br>

* vadersentiment
    - conda install -c conda-forge vadersentiment

---

In [69]:
from bs4 import BeautifulSoup
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# to get the attribute of the 'title', 'year/month/date/day/hour/minute/second/is_weekend', 'num_img', 'num_video', 'author name', 'topic', 'channel', 'content length', 'title_sentiment'

def preprocessor(text):
    soup = BeautifulSoup(text, 'html.parser')

    # 1. to find the 'title' (body > h1)
    title = soup.find('h1').string.strip().lower()

    # 2. to find time(body > div > span > time)

    
    date_string = soup.find('time')
    try:
        date_string = date_string['datetime']
    except:
        date_string = 'wed, 10 oct 2014 15:00:43 +0000'
        
    date_string = date_string.strip().lower()
    datetimes = datetime.strptime(date_string, '%a, %d %b %Y %H:%M:%S %z')
    
    
    year = datetimes.year
    month = datetimes.month
    date = datetimes.day
    day = pd.Timestamp(str(year)+'-'+str(month)+'-'+str(date)).dayofweek+1
    is_weekend = 1 if (day==6 or day==7) else 0
    hour = datetimes.hour
    minute = datetimes.minute
    second = datetimes.second
    

    # 3. to find the number of images
    num_img  = len(soup.find_all('img'))
    

    # 4. to find the number of videos
    num_video = len(soup.find_all('iframe'))
    

    # 5. to find the author name
    article_info = soup.find('div', class_='article-info')
    author = article_info.find('span', class_='author_name') or article_info.find('span', class_='byline basic')

    if (author != None):
        if (author.find('a') != None):
            author = author.find('a')
            author_name = author.get_text().lower()
        else :
            author_name = author.get_text().lower()
    else :
        author_name = 'not found'
    
    

    # 6. to find the article topic
    footer = soup.find('footer', class_='article-topics')
    topic = footer.get_text().split(': ')[1]
    
    # 7. to find the channel
    channel = soup.find('article')['data-channel'].strip().lower()
    
    '''
    print('title = ', title, type(title))
    print('time = ', year, "/", month, "/",day, " ",hour, ":",minute, ":",second, type(year))
    print('number of images = ', num_img, type(num_img))
    print('number of videos = ', num_video, type(num_video))
    print('author_name = ', author_name, type(author_name))
    print('topic = ', topic, type(topic))
    print('channel = ', channel, type(channel))
    '''
    
    # 8. to find the content length
    content = soup.body.find('section', class_='article-content').get_text()
    len_content = len(content)

    # print('topic = ', topic, type(topic))
    
    # 9. to find the sentiment of title
    analyzer = SentimentIntensityAnalyzer()
    title_sentiment = analyzer.polarity_scores(topic)
    sentiment_neg = title_sentiment['neg']
    sentiment_neu = title_sentiment['neu']
    sentiment_pos = title_sentiment['pos']
    sentiment_compound = title_sentiment['compound']

    return title, author_name, channel, topic, year, month, date, day, is_weekend, hour, minute, second, num_img, num_video, len_content, sentiment_neg, sentiment_neu, sentiment_pos, sentiment_compound

In [70]:
feature_train_list = []
feature_test_list = []

for content in (train_data['Page content']):
    feature_train_list.append(preprocessor(content))
for content in (test_data['Page content']):
    feature_train_list.append(preprocessor(content))

df_all = pd.DataFrame(
        feature_train_list, 
        columns=['title', 'author_name', 'channel', 'topic', 'year', 'month', 'date', 'day', 'is_weekend', 'hour', 'minute', 'second', 'num_img', 'num_video', 'len_content', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos', 'sentiment_compound'])


### 3-1 . Preprocessing - tokenization

To split the text corpora into individual elements

In [71]:
import re

def tokenizer(text):
    return re.split('\s+', text.strip())

### 3-2 . Preprocessing - Word Stemming

There are two ways of word stemming

1. PorterStemmer(Stemming): break the word in rule-besed way, which will lead to the probelm of overstemming

2. WordNetLemmatizer(Lemmatization): Stem the words will better performance, while time-consuming


In [72]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.data.path.append('/home/mfhsieh/NTHU-Deep-Learning-Competition/2023_DataLab_Cup1-Predicting_News_Popularity')

nltk.download('wordnet')
nltk.download('omw-1.4')

def word_stemming(text):
    if type(text) == np.ndarray:
        text = text[0]
    lm = WordNetLemmatizer()
    words = re.split('\s', text.strip())
    lemmatized_words = [lm.lemmatize(word) for word in words]
    return lemmatized_words

print(word_stemming('university, universal, universities'))

['university,', 'universal,', 'university']


[nltk_data] Downloading package wordnet to
[nltk_data]     /users/student/mr111//mfhsieh22/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/student/mr111//mfhsieh22/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [73]:
from nltk.stem.porter import PorterStemmer

def tokenizer_stem(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in re.split('\s+', text.strip())]

print(tokenizer_stem('university, universal, universities'))

['university,', 'universal,', 'univers']


### 3-3 Preprocessing - Stop-Word Removal

儘管刪除停用詞在某些情況下（例如 BoW 和特徵哈希）可以有益於簡化表示，並可能提高文字分析的準確性，但並不總是必要，特別是在使用 TF-IDF 時。是否刪除停用詞應基於文本分析任務的具體要求以及資料集的特性來進行決策

In [74]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
stop = stopwords.words('english')

def stop_word_removal(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

print(stop_word_removal('runners like running and thus they run'))

['runner', 'like', 'run', 'thu', 'run']


[nltk_data] Downloading package stopwords to
[nltk_data]     /users/student/mr111//mfhsieh22/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 3-5 Preprocessing - Word Stemming + Stop-Word Removal

In [75]:
def stem_word_remove_stopword(text):
    if type(text) == np.ndarray:
        text = text[0]
    lm = WordNetLemmatizer()
    words = re.split('\s', text.strip())
    lemmatized_words = [lm.lemmatize(word) for word in words]
    
    filtered_list = [word for word in lemmatized_words if word not in stop]
    
    return filtered_list


#### 3-5 Create TF-IDF feature representation ([ref](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html))

In [137]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

ngram_range_ = [(1,2), (1,1)]

text_transformer = ColumnTransformer(
    [
        #('title preprocess', TfidfVectorizer(tokenizer=word_stemming, ngram_range=(1,1), lowercase=False), [0]),
        ('author name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [0]),
        #('channel process', TfidfVectorizer(tokenizer=word_stemming, ngram_range=(1,1), lowercase=False), [1]),
        ('topic name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,1), lowercase=False), [1]),
    ],
    remainder='passthrough', # do not touch the remaining data
    n_jobs=-1
)


In [77]:
tfidf = TfidfVectorizer(tokenizer=word_stemming, ngram_range=(1,1), lowercase=False)
tfidf.fit(df_all['title'])
top = 10
# get idf score of vocabularies
idf = tfidf.idf_
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()

for i in range(top):
    print('%s: %.2f' %(tfidf.get_feature_names_out()[sorted_idx[i]], idf[sorted_idx[i]]))

doc_tfidf = tfidf.transform(df_all['title']).toarray()
tfidf_sum = np.sum(doc_tfidf, axis=0)
print("\n[vocabularies with highest tf-idf scores]")
for tok, v in zip(tfidf.inverse_transform(np.ones((1, tfidf_sum.shape[0])))[0][tfidf_sum.argsort()[::-1]][:top], \
                        np.sort(tfidf_sum)[::-1][:top]):
    print('{}: {}'.format(tok, v))


[vocabularies with smallest idf scores]
to: 2.52
the: 2.58
in: 2.96
a: 3.03
of: 3.07
for: 3.10
and: 3.44
is: 3.51
on: 3.54
your: 3.60

[vocabularies with highest tf-idf scores]
the: 1142.4830180213792
to: 1109.6698985304176
a: 795.7866740412087
in: 787.5419357082401
of: 746.7665375362841
for: 734.8807138633431
and: 555.5640584744767
your: 551.5848064261177
is: 544.621163447621
you: 533.4917456688497


### 4. Model training

- XGBoost

- LightGBM

- CatBoost

- AdaBoost

- Random Forest

- VotingClassifier


#### - To split the dataset 

In [78]:
from sklearn.model_selection import train_test_split

1. XGBoost

In [79]:
remaining_dict_xgboost = [  #'title', 
                            'author_name', 
                            #'channel', 
                            'topic', 
                            'year', 
                            'month',
                            'date', 
                            'day', 
                            'is_weekend',
                            'hour', 
                            # 'minute', 
                            # 'second', 
                            # 'num_img', 
                            'num_video', 
                            'len_content',
                            'sentiment_neg', 
                            'sentiment_neu', 
                            'sentiment_pos', 
                            'sentiment_compound'
                            ]

df_xgboost = df_all.loc[:, remaining_dict_xgboost]

X_xgboost_train = df_xgboost.values[:train_data.shape[0]]
y_xgboost_train = train_data['Popularity'].values
y_xgboost_train[y_xgboost_train==-1] = 0
X_xgboost_test = df_xgboost.values[train_data.shape[0]:]

print(X_xgboost_train.shape)
print(type(X_xgboost_train))
print(X_xgboost_train)
print(y_xgboost_train.shape)
print(type(y_xgboost_train))
print(y_xgboost_train)

X_xgboost_train_split, X_xgboost_valid_split, y_xgboost_train_split, y_xgboost_valid_split = train_test_split(X_xgboost_train, y_xgboost_train, test_size=0.3, random_state=0)

(27643, 14)
<class 'numpy.ndarray'>
[['clara moskowitz'
  'Asteroid, Asteroids, challenge, Earth, Space, U.S., World ' 2013 ...
  0.822 0.178 0.0772]
 ['christina warren'
  'Apps and Software, Google, open source, opn pledge, patent lawsuit theater, software patents, Tech, U.S. '
  2013 ... 0.881 0.0 -0.2263]
 ['sam laird' 'Entertainment, NFL, NFL Draft, Sports, Television ' 2014
  ... 0.641 0.359 0.4215]
 ...
 ['christine erickson' 'Food, hot dogs, humor, Photography, Watercooler '
  2014 ... 0.704 0.296 0.2732]
 ['seth fiegerman' 'Business, marissa mayer, Media, stocks, Yahoo ' 2013
  ... 1.0 0.0 0.0]
 ['megan ranney' 'austin, Business, CurioCity, Small Business, Startups '
  2014 ... 1.0 0.0 0.0]]
(27643,)
<class 'numpy.ndarray'>
[0 1 1 ... 0 0 1]


2. LightGbm

In [130]:
remaining_dict_lightgbm = [  #'title', 
                            'author_name', 
                            #'channel', 
                            'topic', 
                            'year', 
                            'month',
                            'date', 
                            'day', 
                            'is_weekend',
                            'hour', 
                            # 'minute', 
                            # 'second', 
                            # 'num_img', 
                            'num_video', 
                            'len_content',
                            # 'sentiment_neg', 
                            # 'sentiment_neu', 
                            # 'sentiment_pos', 
                            # 'sentiment_compound'
                            ]

df_lightgbm = df_all.loc[:, remaining_dict_lightgbm]

X_lightgbm_train = df_lightgbm.values[:train_data.shape[0]]
y_lightgbm_train = train_data['Popularity'].values
y_lightgbm_train[y_lightgbm_train==-1] = 0
X_lightgbm_test = df_lightgbm.values[train_data.shape[0]:]

print(X_lightgbm_train.shape)
print(type(X_lightgbm_train))
print(X_lightgbm_train)
print(y_lightgbm_train.shape)
print(type(y_lightgbm_train))
print(y_lightgbm_train)

X_lightgbm_train_split, X_lightgbm_valid_split, y_lightgbm_train_split, y_lightgbm_valid_split = train_test_split(X_lightgbm_train, y_lightgbm_train, test_size=0.3, random_state=0)

(27643, 10)
<class 'numpy.ndarray'>
[['clara moskowitz'
  'Asteroid, Asteroids, challenge, Earth, Space, U.S., World ' 2013 ...
  15 0 3591]
 ['christina warren'
  'Apps and Software, Google, open source, opn pledge, patent lawsuit theater, software patents, Tech, U.S. '
  2013 ... 17 0 1843]
 ['sam laird' 'Entertainment, NFL, NFL Draft, Sports, Television ' 2014
  ... 19 25 6646]
 ...
 ['christine erickson' 'Food, hot dogs, humor, Photography, Watercooler '
  2014 ... 12 0 1274]
 ['seth fiegerman' 'Business, marissa mayer, Media, stocks, Yahoo ' 2013
  ... 20 0 2657]
 ['megan ranney' 'austin, Business, CurioCity, Small Business, Startups '
  2014 ... 18 0 3027]]
(27643,)
<class 'numpy.ndarray'>
[0 1 1 ... 0 0 1]


3. catboost

In [81]:
remaining_dict_catboost = [  #'title', 
                            'author_name', 
                            #'channel', 
                            'topic', 
                            'year', 
                            'month',
                            'date', 
                            'day', 
                            'is_weekend',
                            'hour', 
                            # 'minute', 
                            # 'second', 
                            # 'num_img', 
                            'num_video', 
                            'len_content',
                            # 'sentiment_neg', 
                            # 'sentiment_neu', 
                            # 'sentiment_pos', 
                            # 'sentiment_compound'
                            ]

df_catboost = df_all.loc[:, remaining_dict_catboost]

X_catboost_train = df_catboost.values[:train_data.shape[0]]
y_catboost_train = train_data['Popularity'].values
y_catboost_train[y_catboost_train==-1] = 0
X_catboost_test = df_catboost.values[train_data.shape[0]:]

print(X_catboost_train.shape)
print(type(X_catboost_train))
print(X_catboost_train)
print(y_catboost_train.shape)
print(type(y_catboost_train))
print(y_catboost_train)

X_catboost_train_split, X_catboost_valid_split, y_catboost_train_split, y_catboost_valid_split = train_test_split(X_catboost_train, y_catboost_train, test_size=0.3, random_state=0)

(27643, 10)
<class 'numpy.ndarray'>
[['clara moskowitz'
  'Asteroid, Asteroids, challenge, Earth, Space, U.S., World ' 2013 ...
  15 0 3591]
 ['christina warren'
  'Apps and Software, Google, open source, opn pledge, patent lawsuit theater, software patents, Tech, U.S. '
  2013 ... 17 0 1843]
 ['sam laird' 'Entertainment, NFL, NFL Draft, Sports, Television ' 2014
  ... 19 25 6646]
 ...
 ['christine erickson' 'Food, hot dogs, humor, Photography, Watercooler '
  2014 ... 12 0 1274]
 ['seth fiegerman' 'Business, marissa mayer, Media, stocks, Yahoo ' 2013
  ... 20 0 2657]
 ['megan ranney' 'austin, Business, CurioCity, Small Business, Startups '
  2014 ... 18 0 3027]]
(27643,)
<class 'numpy.ndarray'>
[0 1 1 ... 0 0 1]


4. adaboost

In [82]:
remaining_dict_adaboost = [  #'title', 
                            'author_name', 
                            #'channel', 
                            'topic', 
                            'year', 
                            'month',
                            'date', 
                            'day', 
                            'is_weekend',
                            'hour', 
                            # 'minute', 
                            # 'second', 
                            # 'num_img', 
                            'num_video', 
                            'len_content',
                            # 'sentiment_neg', 
                            # 'sentiment_neu', 
                            # 'sentiment_pos', 
                            # 'sentiment_compound'
                            ]

df_adaboost = df_all.loc[:, remaining_dict_adaboost]

X_adaboost_train = df_adaboost.values[:train_data.shape[0]]
y_adaboost_train = train_data['Popularity'].values
y_adaboost_train[y_adaboost_train==-1] = 0
X_adaboost_test = df_adaboost.values[train_data.shape[0]:]

print(X_adaboost_train.shape)
print(type(X_adaboost_train))
print(X_adaboost_train)
print(y_adaboost_train.shape)
print(type(y_adaboost_train))
print(y_adaboost_train)

X_adaboost_train_split, X_adaboost_valid_split, y_adaboost_train_split, y_adaboost_valid_split = train_test_split(X_adaboost_train, y_adaboost_train, test_size=0.3, random_state=0)

(27643, 10)
<class 'numpy.ndarray'>
[['clara moskowitz'
  'Asteroid, Asteroids, challenge, Earth, Space, U.S., World ' 2013 ...
  15 0 3591]
 ['christina warren'
  'Apps and Software, Google, open source, opn pledge, patent lawsuit theater, software patents, Tech, U.S. '
  2013 ... 17 0 1843]
 ['sam laird' 'Entertainment, NFL, NFL Draft, Sports, Television ' 2014
  ... 19 25 6646]
 ...
 ['christine erickson' 'Food, hot dogs, humor, Photography, Watercooler '
  2014 ... 12 0 1274]
 ['seth fiegerman' 'Business, marissa mayer, Media, stocks, Yahoo ' 2013
  ... 20 0 2657]
 ['megan ranney' 'austin, Business, CurioCity, Small Business, Startups '
  2014 ... 18 0 3027]]
(27643,)
<class 'numpy.ndarray'>
[0 1 1 ... 0 0 1]


5. RF

In [83]:
remaining_dict_RF = [  #'title', 
                            'author_name', 
                            #'channel', 
                            'topic', 
                            'year', 
                            'month',
                            'date', 
                            'day', 
                            'is_weekend',
                            'hour', 
                            # 'minute', 
                            # 'second', 
                            # 'num_img', 
                            'num_video', 
                            'len_content',
                            # 'sentiment_neg', 
                            # 'sentiment_neu', 
                            # 'sentiment_pos', 
                            # 'sentiment_compound'
                            ]

df_RF = df_all.loc[:, remaining_dict_RF]

X_RF_train = df_RF.values[:train_data.shape[0]]
y_RF_train = train_data['Popularity'].values
y_RF_train[y_RF_train==-1] = 0
X_RF_test = df_RF.values[train_data.shape[0]:]

print(X_RF_train.shape)
print(type(X_RF_train))
print(X_RF_train)
print(y_RF_train.shape)
print(type(y_RF_train))
print(y_RF_train)

X_RF_train_split, X_RF_valid_split, y_RF_train_split, y_RF_valid_split = train_test_split(X_RF_train, y_RF_train, test_size=0.3, random_state=0)

(27643, 10)
<class 'numpy.ndarray'>
[['clara moskowitz'
  'Asteroid, Asteroids, challenge, Earth, Space, U.S., World ' 2013 ...
  15 0 3591]
 ['christina warren'
  'Apps and Software, Google, open source, opn pledge, patent lawsuit theater, software patents, Tech, U.S. '
  2013 ... 17 0 1843]
 ['sam laird' 'Entertainment, NFL, NFL Draft, Sports, Television ' 2014
  ... 19 25 6646]
 ...
 ['christine erickson' 'Food, hot dogs, humor, Photography, Watercooler '
  2014 ... 12 0 1274]
 ['seth fiegerman' 'Business, marissa mayer, Media, stocks, Yahoo ' 2013
  ... 20 0 2657]
 ['megan ranney' 'austin, Business, CurioCity, Small Business, Startups '
  2014 ... 18 0 3027]]
(27643,)
<class 'numpy.ndarray'>
[0 1 1 ... 0 0 1]


6. Voting

In [84]:
remaining_dict_voting = [  #'title', 
                            'author_name', 
                            #'channel', 
                            'topic', 
                            'year', 
                            'month',
                            'date', 
                            'day', 
                            'is_weekend',
                            'hour', 
                            # 'minute', 
                            # 'second', 
                            # 'num_img', 
                            'num_video', 
                            'len_content',
                            # 'sentiment_neg', 
                            # 'sentiment_neu', 
                            # 'sentiment_pos', 
                            # 'sentiment_compound'
                            ]

df_voting = df_all.loc[:, remaining_dict_voting]

X_voting_train = df_voting.values[:train_data.shape[0]]
y_voting_train = train_data['Popularity'].values
y_voting_train[y_voting_train==-1] = 0
X_voting_test = df_voting.values[train_data.shape[0]:]

print(X_voting_train.shape)
print(type(X_voting_train))
print(X_voting_train)
print(y_voting_train.shape)
print(type(y_voting_train))
print(y_voting_train)

X_voting_train_split, X_voting_valid_split, y_voting_train_split, y_voting_valid_split = train_test_split(X_voting_train, y_voting_train, test_size=0.3, random_state=0)

(27643, 10)
<class 'numpy.ndarray'>
[['clara moskowitz'
  'Asteroid, Asteroids, challenge, Earth, Space, U.S., World ' 2013 ...
  15 0 3591]
 ['christina warren'
  'Apps and Software, Google, open source, opn pledge, patent lawsuit theater, software patents, Tech, U.S. '
  2013 ... 17 0 1843]
 ['sam laird' 'Entertainment, NFL, NFL Draft, Sports, Television ' 2014
  ... 19 25 6646]
 ...
 ['christine erickson' 'Food, hot dogs, humor, Photography, Watercooler '
  2014 ... 12 0 1274]
 ['seth fiegerman' 'Business, marissa mayer, Media, stocks, Yahoo ' 2013
  ... 20 0 2657]
 ['megan ranney' 'austin, Business, CurioCity, Small Business, Startups '
  2014 ... 18 0 3027]]
(27643,)
<class 'numpy.ndarray'>
[0 1 1 ... 0 0 1]


In [85]:
# from sklearn.model_selection import train_test_split

# X_train = df.values[:train_data.shape[0]]
# y_train = train_data['Popularity'].values
# y_train[y_train==-1] = 0
# X_test = df.values[train_data.shape[0]:]

# print(X_train.shape)
# print(type(X_train))
# print(X_train)
# print(y_train.shape)
# print(type(y_train))
# print(y_train)

# X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train, y_train, test_size=0.3, random_state=0)

#### - To construct the training function

In [86]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score

def training(X_train, y_train, clf):
    score = cross_validate(clf, X_train, y_train, scoring='roc_auc', return_train_score=True, return_estimator=True)
    print('train score: {:.6f} (+/-{:.6f})'.format(
        np.mean(score['train_score']), np.std(score['train_score'])))
    print('valid score: {:.6f} (+/-{:.6f})'.format(
        np.mean(score['test_score']), np.std(score['test_score'])))

    clf.fit(X_train, y_train)
    
    return clf, np.mean(score['train_score']), np.mean(score['test_score'])

#### - To contruct the grid search function

In [147]:
from sklearn.model_selection import GridSearchCV

def grid_search_cv(ct, X_train, y_train, clf, param_grid, verbose_=False):
    X_train_ct = ct.fit_transform(X_train)
    
    # to report the grid search information
    if(verbose_):
        gs = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=5, return_train_score=True, verbose = 3)
    else:
        gs = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=5, return_train_score=True)
        
    gs.fit(X_train_ct, y_train)
    
    results, idx = gs.cv_results_, gs.best_index_
    print('train score: {:.6f} (+/-{:.6f})'.format(results['mean_train_score'][idx], results['std_train_score'][idx]))
    print('valid score: {:.6f} (+/-{:.6f})'.format(results['mean_test_score'][idx], results['std_test_score'][idx]))
    print('best params:', gs.best_params_)
    return gs.best_params_, gs.best_estimator_, results['mean_test_score'][idx]

#### **- To set whether to run the grid search**

In [88]:
# grid_search_en = True
grid_search_en = False

#### - to store the best parameter to a file

In [89]:
def parameter_storage(dict_path, file_name, best_param, best_validation, remaining_dict, best_estimator_list=None):
    if not os.path.exists(dict_path):
        os.makedirs(dict_path)
        
    file_path = os.path.join(dict_path, file_name + ".txt")
    
    with open(file_path, "w") as file:
        file.write(f'The features: {remaining_dict}\n')
        file.write(f'The best parameter: {best_param}\n')
        file.write(f'ngram_range : {ngram_range_}\n')
        if not (best_estimator_list==None):
            file.write('The best estimator_list: ')
            for i in range(len(best_estimator_list)):
                file.write(f' {best_estimator_list[i][0]}')
            file.write('\n')
        file.write(f'The best validation: {best_validation}\n')
        file.write('--------------------------------------------------------\n')

### 4-1. XGBOOST

4-1-1. Grid sizing for XGBoost

In [113]:
from xgboost import XGBClassifier
import joblib

xgb_model_path = '../output/best_models/xgboost/'
if not os.path.exists(xgb_model_path):
        os.makedirs(xgb_model_path)

In [115]:
param_grid_xgb = {
    'gamma' : [1.1, 1.2],
    'lambda' : [2.4, 2.5],
    'n_estimators': [97, 98],
    'max_depth': [7, 8],
    'learning_rate' : [0.14, 0.141]  
}

if (1):
    best_xgb_param, best_xgb, best_xgb_valid = grid_search_cv(text_transformer, X_xgboost_train, y_xgboost_train, XGBClassifier(n_jobs=-1), param_grid_xgb, True)
    xgb_path = xgb_model_path+'xgb_'+ str(round(best_xgb_valid, 6))+ '.pkl'
    joblib.dump(best_xgb, xgb_path)
    parameter_storage('../output/best_parameters', 'best_xgb_param', best_xgb_param, best_xgb_valid, remaining_dict_xgboost)
    

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END gamma=1.1, lambda=2.4, learning_rate=0.141, max_depth=7, n_estimators=97; total time=  20.1s
[CV] END gamma=1.1, lambda=2.4, learning_rate=0.14, max_depth=7, n_estimators=97; total time=  20.2s
[CV] END gamma=1.1, lambda=2.4, learning_rate=0.14, max_depth=7, n_estimators=97; total time=  20.2s
[CV] END gamma=1.1, lambda=2.4, learning_rate=0.14, max_depth=7, n_estimators=98; total time=  20.3s
[CV] END gamma=1.1, lambda=2.4, learning_rate=0.14, max_depth=7, n_estimators=97; total time=  20.3s
[CV] END gamma=1.1, lambda=2.4, learning_rate=0.14, max_depth=7, n_estimators=98; total time=  20.3s
[CV] END gamma=1.1, lambda=2.5, learning_rate=0.14, max_depth=7, n_estimators=97; total time=  20.2s
[CV] END gamma=1.1, lambda=2.4, learning_rate=0.141, max_depth=7, n_estimators=97; total time=  20.3s
[CV] END gamma=1.1, lambda=2.5, learning_rate=0.14, max_depth=7, n_estimators=97; total time=  20.3s
[CV] END gamma=1.1, lambda=

4-1-2. Training for XGBoost

In [173]:
best_xgb_model_path = xgb_model_path+'xgb_0.593022.pkl'

In [117]:
from sklearn.pipeline import Pipeline

param_grid_lgbm = {
    'gamma' : 1.2,
    'lambda' : 2.5,
    'n_estimators': 97,
    'max_depth': 7,
    'learning_rate' : 0.141,
    'n_jobs' : -1,
    'random_state' : 0
}

clf_xgb = joblib.load(best_xgb_model_path)

# if (grid_search_en):
#     xgboost = Pipeline([('vect', text_transformer), ('clf', best_xgb)])
# else :
#     xgboost = Pipeline([('vect', text_transformer), ('clf', XGBClassifier(**param_grid_lgbm))])

xgboost = Pipeline([('vect', text_transformer), ('clf', clf_xgb)])
    
_ = training(X_xgboost_train, y_xgboost_train, xgboost)

train score: 0.783027 (+/-0.003699)
valid score: 0.589135 (+/-0.005340)


### 4-2. LightGBM

4-2-1. Grid sizing for LightGBM

In [126]:
from lightgbm import LGBMClassifier

lgb_model_path = '../output/best_models/lgboost/'
if not os.path.exists(lgb_model_path):
        os.makedirs(lgb_model_path)

In [133]:
param_grid_lgbm = {
    'learning_rate' : [0.13, 0.0131], 
    'n_estimators' : [230, 231],
    'objective' : ['regression', 'poisson']
}

if (grid_search_en):
    best_lgbm_param, best_lgbm, best_lgbm_valid = grid_search_cv(text_transformer, X_lightgbm_train, y_lightgbm_train, LGBMClassifier(n_jobs=-1, verbose=-1), param_grid_lgbm, True)
    lgb_path = lgb_model_path+'lgb_'+ str(round(best_lgbm_valid, 6))+ '.pkl'
    joblib.dump(best_lgbm, lgb_path)
    parameter_storage('../output/best_parameters', 'best_lgbm_param', best_lgbm_param, best_lgbm_valid, remaining_dict_lightgbm)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END learning_rate=0.13, n_estimators=230, objective=regression; total time=   6.3s
[CV] END learning_rate=0.13, n_estimators=231, objective=regression; total time=   6.4s
[CV] END learning_rate=0.0131, n_estimators=231, objective=poisson; total time=   6.8s
[CV] END learning_rate=0.0131, n_estimators=231, objective=poisson; total time=   6.8s
[CV] END learning_rate=0.0131, n_estimators=231, objective=poisson; total time=   6.8s
[CV] END learning_rate=0.0131, n_estimators=231, objective=regression; total time=   6.9s
[CV] END learning_rate=0.13, n_estimators=230, objective=regression; total time=   6.9s
[CV] END learning_rate=0.0131, n_estimators=231, objective=regression; total time=   7.0s
[CV] END learning_rate=0.13, n_estimators=230, objective=poisson; total time=   7.1s
[CV] END learning_rate=0.13, n_estimators=231, objective=poisson; total time=   7.2s
[CV] END learning_rate=0.13, n_estimators=230, objective=regressi

In [134]:
best_lgb_model_path = lgb_model_path+'lgb_0.599386.pkl'

4-2-2. Training for LightGBM

In [135]:
params_LGBM = {
    'random_state': 0, 
    'learning_rate' : 0.013,
    'n_estimators' : 230,
    'n_jobs' : -1,
    'objective' : 'poisson'
}

# if (grid_search_en):
#     lgbm = Pipeline([('vect', text_transformer), ('clf', best_lgbm)])
# else :
#     lgbm = Pipeline([('vect', text_transformer), ('clf', LGBMClassifier(**params_LGBM))])

clf_lgb = joblib.load(best_lgb_model_path)

lgbm = Pipeline([('vect', text_transformer), ('clf', clf_xgb)])

_ = training(X_lightgbm_train, y_lightgbm_train, lgbm)

train score: 0.779805 (+/-0.003876)
valid score: 0.589276 (+/-0.008707)


### 4-3. CatBoost

4-3-1. Grid sizing for Catboost

In [136]:
from catboost import CatBoostClassifier

cat_model_path = '../output/best_models/catboost/'
if not os.path.exists(cat_model_path):
        os.makedirs(cat_model_path)

In [142]:
param_grid_catboost = {
    'learning_rate' : [0.01, 0.02], 
    'n_estimators' : [700, 650],
    'depth' : [11, 12]
}

if (grid_search_en):
    best_cat_params, best_cat_estimator, best_cat_valid= grid_search_cv(text_transformer, X_catboost_train, y_catboost_train, CatBoostClassifier(eval_metric='AUC',random_state=0, verbose=False), param_grid_catboost, True)
    cat_path = cat_model_path+'cat_'+ str(round(best_cat_valid, 6))+ '.pkl'
    joblib.dump(best_cat_estimator, cat_path)
    parameter_storage('../output/best_parameters', 'best_cat_params', best_cat_params, best_cat_valid, remaining_dict_catboost)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


KeyboardInterrupt: 

4-3-2. Training for CatBoost

In [98]:
paramsCatBoost = {
    'eval_metric' : 'AUC',
    'n_estimators' : 700,
    'depth' : 12,
    'learning_rate' : 0.01,
    'random_state' : 0,
    'verbose' : False
}

if (grid_search_en):
    catboost = Pipeline([('ct', text_transformer),('clf', best_cat_estimator)])
else :
    catboost = Pipeline([('ct', text_transformer),('clf', CatBoostClassifier(**paramsCatBoost))])

_ = training(X_catboost_train, y_catboost_train, catboost)

KeyboardInterrupt: 

### 4-4. AdaBoost

In [178]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_model_path = '../output/best_models/adaboost/'
if not os.path.exists(ada_model_path):
        os.makedirs(ada_model_path)

4-4-1. Grid sizing for AdaBoost

In [181]:

params_grid_AdaBoost = {
    'estimator__max_depth' : [4],
    'n_estimators': [1500],
    'learning_rate': [0.001],
    'random_state' : [0]
}

if (grid_search_en):
    best_ada_params, best_ada_estimator, best_ada_valid = grid_search_cv(text_transformer, X_adaboost_train, y_adaboost_train,
                                                         AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=0)), 
                                                         params_grid_AdaBoost, True)
    ada_path = ada_model_path+'ada_'+ str(round(best_ada_valid, 6))+ '.pkl'
    joblib.dump(best_ada_estimator, ada_path)
    parameter_storage('../output/best_parameters', 'best_ada_params', best_ada_params, best_ada_valid, remaining_dict_adaboost)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [174]:
best_ada_model_path = ada_model_path+'ada_0.599386.pkl'

4-4-2. Training for AdaBoost

In [None]:
param_Adaboost = {
    'estimator' : DecisionTreeClassifier(max_depth = 4), 
    'learning_rate' : 0.001, 
    'n_estimators' : 1560
}

if (grid_search_en):
    adaBoost = Pipeline([('vect', text_transformer), ('clf', best_ada_estimator)])
else :
    adaBoost = Pipeline([('vect', text_transformer), ('clf', AdaBoostClassifier(**param_Adaboost))])
    
_ = training(X_adaboost_train, y_adaboost_train, adaBoost)

KeyboardInterrupt: 

### 4-5. Random Forest Classifier

4-5-1. Grid sizing for RF

In [155]:
from sklearn.ensemble import RandomForestClassifier

RF_model_path = '../output/best_models/RF/'
if not os.path.exists(RF_model_path):
        os.makedirs(RF_model_path)

In [164]:

params_grid_RF = {
    'n_estimators' : [1150, 1200, 1250],
    'max_depth' : [90, 100, 110],
    'min_samples_leaf' : [1]
}

if (grid_search_en):
    best_RF_params, best_RF_estimator, best_RF_valid = grid_search_cv(text_transformer, X_RF_train, y_RF_train, RandomForestClassifier(n_jobs=-1), params_grid_RF, True)
    RF_path = RF_model_path+'ada_'+ str(round(best_RF_valid, 6))+ '.pkl'
    joblib.dump(best_RF_estimator, RF_path)
    parameter_storage('../output/best_parameters', 'best_RF_params', best_RF_params, best_RF_valid, remaining_dict_RF)

[CV 2/5] END max_depth=90, n_estimators=1150;, score=(train=0.998, test=0.589) total time= 2.6min
[CV 1/5] END max_depth=90, n_estimators=1150;, score=(train=0.998, test=0.596) total time= 2.6min
[CV 3/5] END max_depth=90, n_estimators=1150;, score=(train=0.999, test=0.589) total time= 2.6min
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 3/5] END max_depth=110, min_samples_leaf=1, n_estimators=1150;, score=(train=nan, test=nan) total time=   8.1s
[CV 2/5] END max_depth=100, min_samples_leaf=1, n_estimators=1250;, score=(train=nan, test=nan) total time=   8.6s
[CV 2/5] END max_depth=110, min_samples_leaf=1, n_estimators=1250;, score=(train=nan, test=nan) total time=   8.6s
[CV 4/5] END max_depth=100, min_samples_leaf=1, n_estimators=1150;, score=(train=nan, test=nan) total time=   8.4s
[CV 4/5] END max_depth=110, min_samples_leaf=1, n_estimators=1250;, score=(train=nan, test=nan) total time=   8.4s
[CV 3/5] END max_depth=100, min_samples_leaf=1, n_estimators=1250;, sco

In [175]:
best_RF_model_path = RF_model_path+'ada_0.587187.pkl'

4-5-2. Training for RF

In [169]:
param_RF = {
    'n_jobs' : -1,
    'random_state' : 0,
    'n_estimators' : 1200,
    'max_depth' : 100,
    'min_samples_leaf': 1
}

# if (grid_search_en):
#     RF = Pipeline([('vect', text_transformer), ('clf', best_RF_estimator)])
# else :
#     RF = Pipeline([('vect', text_transformer), ('clf', RandomForestClassifier(**param_RF))])

clf_RF = joblib.load(best_RF_model_path)

RF = Pipeline([('vect', text_transformer), ('clf', clf_RF)])

_ = training(X_RF_train, y_RF_train, RF)

train score: 0.998601 (+/-0.000162)
valid score: 0.586302 (+/-0.009809)


### 4-6. VotingClassifier

4-2-1. Grid sizing for Voting classifier

In [176]:
xgb_transformer = ColumnTransformer(
    [
        ('drop process', 'drop', [0, 2, 10, 11, 12]),
        ('author name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [1]),
        ('topic name process' , TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [3]),
    ],
    remainder='passthrough', # do not touch the remaining data
    n_jobs=-1
)
lgb_transformer = ColumnTransformer(
    [
        ('drop process', 'drop', [0, 2, 10, 11, 12, 15, 16, 17, 18]),
        ('author name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [1]),
        ('topic name process' , TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [3])
    ],
    remainder='passthrough', # do not touch the remaining data
    n_jobs=-1
)
cat_transformer = ColumnTransformer(
    [
        ('drop process', 'drop', [0, 2, 10, 11, 12]),
        ('author name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [1]),
        ('topic name process' , TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,1), lowercase=False), [3])
    ],
    remainder='passthrough', # do not touch the remaining data
    n_jobs=-1
)
ada_transformer = ColumnTransformer(
    [
        ('drop process', 'drop', [0, 2, 10, 11, 12]),
        ('author name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [1]),
        ('topic name process' , TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [3])
    ],
    remainder='passthrough', # do not touch the remaining data
    n_jobs=-1
)
rfc_transformer = ColumnTransformer(
    [
        ('drop process', 'drop', [0, 2, 10, 11, 12]),
        ('author name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [1]),
        ('topic name process' , TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [3])
    ],
    remainder='passthrough', # do not touch the remaining data
    n_jobs=-1
)

In [177]:
pipeline_xgb = Pipeline([('vect', xgb_transformer), ('clf', joblib.load(best_xgb_model_path))])
pipeline_lgb = Pipeline([('vect', lgb_transformer), ('clf', joblib.load(best_lgb_model_path))])
# pipeline_cat = Pipeline([('vect', cat_transformer), ('clf', joblib.load(best_xgb_model_path))])
pipeline_ada = Pipeline([('vect', ada_transformer), ('clf', joblib.load(best_ada_model_path))])
pipeline_rfc = Pipeline([('vect', rfc_transformer), ('clf', joblib.load(best_RF_model_path))])

FileNotFoundError: [Errno 2] No such file or directory: '../output/best_models/adaboost/ada_0.599386.pkl'

In [172]:
num_classifier = 3
weight_range = 5
voting_estimator_list = [pipeline_xgb, pipeline_lgb, pipeline_rfc]
# voting_estimator_list = [pipeline_xgb, pipeline_lgb, pipeline_cat, pipeline_ada, pipeline_rfc]

NameError: name 'pipeline_xgb' is not defined

In [None]:
if (len(voting_estimator_list) != num_classifier) :
    print("Error: the numver of the classifier must equal to the estimator_list element number! Please check again!")

- to find the weight combinations

In [None]:
import itertools

def weight_list_generator(weight_range_, num_classifier_):   
    weight_range = weight_range_
    weight_list = []

    binary_values = (1, 2)
    weight_list = [list(i) for i in list(itertools.product(binary_values, repeat=num_classifier_))]

    print('weight list = ', weight_list)
    print('length of weight list = ', len(weight_list))
    return weight_list

In [None]:
weight_list = weight_list_generator(weight_range_ = weight_range, 
                                    num_classifier_ = num_classifier)

weight list =  [[1, 1], [1, 3], [1, 5], [1, 7], [1, 9], [3, 1], [3, 3], [3, 5], [3, 7], [3, 9], [5, 1], [5, 3], [5, 5], [5, 7], [5, 9], [7, 1], [7, 3], [7, 5], [7, 7], [7, 9], [9, 1], [9, 3], [9, 5], [9, 7], [9, 9]]
length of weight list =  25


* multi-thread grid search for voting classifier

In [None]:
import threading
from sklearn.ensemble import VotingClassifier

param_grid_voting_static = {
    'estimators' : voting_estimator_list,
    'voting' : 'soft',
    'flatten_transform' : True, 
    'verbose' : True
}

lock = threading.Lock()


def process_weight(weight):
    # share in multiple threads
    global best_valid_score, best_weight, best_voting

    voting = VotingClassifier(**param_grid_voting_static, weights=weight)
    print('current weight =', weight)
    clf_voting, _, valid_voting = training(X_voting_train, y_voting_train, voting)

    # to protect the safety of shared variables
    with lock:
        if valid_voting > best_valid_score:
            best_valid_score = valid_voting
            best_weight = weight
            best_voting = clf_voting
    
    print(f'{weight} Finish!!')
    threadmax.release()
        
if (1):
    best_valid_score = 0
    best_weight = None
    best_voting = None
    mem = []
    threadmax = threading.BoundedSemaphore(32)
    
    for weight in weight_list:
        threadmax.acquire()
        thread = threading.Thread(target=process_weight, args=(weight,))
    
        thread.start()
        mem.append(thread)

    for thread in mem:
        thread.join()
        mem.remove(thread)
    
    print('end once')

    print('best_valid_score = %.6f' % best_valid_score)
    print('best_weight = ', best_weight)
    
    parameter_storage('../output/best_parameters', 'best_weight', best_weight, best_valid_score, estimator_list, remaining_dict_voting)

current weight = [1, 1]
current weight = [1, 3]
current weight = [1, 5]
current weight = [1, 7]
current weight = [1, 9]
current weight = [3, 1]
current weight = [3, 3]
current weight = [3, 5]
current weight = [3, 7]
current weight = [3, 9]
current weight = [5, 1]
current weight = [5, 3]
current weight = [5, 5]
current weight = [5, 7]
current weight = [5, 9]
current weight = [7, 1]
current weight = [7, 3]
current weight = [7, 5]
current weight = [7, 7]
current weight = [7, 9]
current weight = [9, 1]
current weight = [9, 3]
current weight = [9, 5]
current weight = [9, 7]
current weight = [9, 9]
[Voting] .................. (1 of 2) Processing xgboost, total=  24.3s
[Voting] .................. (1 of 2) Processing xgboost, total=  25.0s
[Voting] .................. (1 of 2) Processing xgboost, total=  28.2s
[Voting] .................. (1 of 2) Processing xgboost, total=  28.7s
[Voting] .................. (1 of 2) Processing xgboost, total=  29.3s
[Voting] .................. (1 of 2) Processi

[Voting] ..................... (2 of 2) Processing lgbm, total= 1.2min
[3, 1] Finish!!


In [None]:
# no n_jobs
prarms_voting = {
    'estimators' : voting_estimator_list, 
    'voting' : 'soft',
    'weights' : [1, 2, 1],
    'flatten_transform' : True,
    'verbose' : True
}

if (1):
    voting = VotingClassifier(**param_grid_voting_static, weights=best_weight)
else :
    voting = VotingClassifier(**prarms_voting)

_ = training(X_voting_train, y_voting_train, voting)


### 5. Testing data prediction

In [None]:
best_model = voting

y_score = best_model.predict_proba(X_voting_test)[:, 1]

df_pred = pd.DataFrame({'Id': test_data['Id'], 'Popularity': y_score})
df_pred.to_csv('../output/test_pred.csv', index=False)