In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import time

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from Preprocessing import  load, cleaning, text_mining_tfdf, text_mining_sentiment, preprocessing, hashtag_tfidf, add_word_embeddings, replace_pattern

Brief description of the two function that I have implemented:
 1) compare_models --> train and test the models contained in the model list and create a dataframe to easily visualize the metrics 
 2) compare_models_KF --> do exactly the same, but it uses the K-Fold method to make robust result of the metrcs that we obtain.

In [2]:
def compare_models(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series, models: list, names: list) -> pd.DataFrame:

    """Inputs:
        X_train = training data set
        X_test = testing data set
        y_train = training label 
        y_test = testing label 
        models = list of models to compare
        names = list of trings containing the names of the models 

        Return: comparison table 
        """

    f1_scores, precision_scores, recall_scores, accuracy_scores = [], [], [],[]
    
    for clf, name in zip(models, names):

        print(f'Start training model: {name}')

        start_time = time.time()

        clf.fit(X_train, y_train)
        print('\n')

        finish_time = time.time()

        print(f'Finishing training model: {name}, trained in {finish_time-start_time}\n')

        y_pred = clf.predict(X_test)
        f1 = f1_score(y_test, y_pred, average = 'macro')
        f1_scores.append(f1)


        acc = accuracy_score(y_test, y_pred)
        accuracy_scores.append(acc)

        prec = precision_score(y_test, y_pred, average = 'macro')
        precision_scores.append(prec)

        rec = recall_score(y_test, y_pred, average='macro')
        recall_scores.append(rec)



        print(f'Score of {name} model performed: {f1}')

    col1 = pd.Series(names)
    col2 = pd.Series(f1_scores)
    col3 = pd.Series(recall_scores)
    col4 = pd.Series(accuracy_scores)
    col5 = pd.Series(precision_scores)

    result = pd.concat([col1, col2, col3, col4, col5], axis = 'columns')
    result.columns = ['Model Name', 'F1 Score', 'Recall', 'Accuray', 'Precision']

    return result

In [3]:
def compare_models_KF(X: pd.DataFrame, y: pd.Series, models: list, names: list, k = int) -> pd.DataFrame:

    """Inputs:
        X = training data set
        y = training label  
        models = list of models to compare
        names = list of trings containing the names of the models 
        k = int for the cross validation 

        Return: comparison table 
        """

    f1_scores = []
    
    for clf, name in zip(models, names):

        print(f'Start training model: {name}')

        start_time = time.time()

        f1_score = cross_val_score(clf, X, y, cv = k, scoring = 'f1_macro')

        finish_time = time.time()

        print(f'Finishing training model: {name}, trained in {finish_time-start_time}\n')

        f1 = f1_score.mean()
        f1_scores.append(f1)

        print(f'Score of {name} model performed: {f1}')

    col1 = pd.Series(names)
    col2 = pd.Series(f1_scores)
    

    result = pd.concat([col1, col2], axis = 'columns')
    result.columns = ['Model Name', 'F1 Score']

    return result

In [4]:
clf_models = [LinearSVC(), RandomForestClassifier(), BernoulliNB()]

clf_names = ['Linear SVC', 'Random Forest', ' Bernoulli Nayve Bayes']

X, y = load().pipe(cleaning).pipe(replace_pattern).pipe(text_mining_sentiment).pipe(preprocessing)

100%|██████████| 1000/1000 [00:08<00:00, 120.09it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2300.93it/s]
100%|██████████| 1000/1000 [00:00<00:00, 3413.63it/s]


In [9]:
X_train

Unnamed: 0,ids,day_of_week,month_of_year,day_of_month,hour_of_day,night,neg,neu,pos,compound,polarity,subjectivity,embedding_negativity,embedding_positivity
0,0.694106,1.000000,1.0,0.200000,0.043478,1.0,0.000,1.000,0.000,0.466604,0.500000,0.000000,0.821088,0.178912
1,0.584181,0.666667,0.5,0.933333,0.956522,1.0,0.000,0.843,0.157,0.697931,0.800000,1.000000,0.908368,0.091632
2,0.480936,0.666667,0.5,0.700000,0.043478,1.0,0.174,0.826,0.000,0.130509,0.583333,0.750000,0.936724,0.063276
3,0.429789,0.000000,0.5,0.566667,0.347826,0.0,0.072,0.928,0.000,0.410241,0.518056,0.313889,0.969564,0.030435
4,0.891410,0.500000,1.0,0.566667,0.826087,1.0,0.093,0.465,0.442,0.834586,0.900000,0.700000,0.930857,0.069143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0.953181,0.000000,1.0,0.700000,0.652174,0.0,0.062,0.888,0.050,0.424236,0.500000,0.000000,0.974777,0.025223
796,0.000000,0.000000,0.0,0.166667,0.956522,1.0,0.341,0.553,0.106,0.122716,0.282143,0.521429,0.909329,0.090671
797,0.569716,0.500000,0.5,0.900000,0.956522,1.0,0.445,0.555,0.000,0.279348,0.750000,0.500000,0.862889,0.137111
798,0.703810,1.000000,1.0,0.200000,0.826087,1.0,0.000,0.379,0.621,0.906756,0.575000,0.350000,0.884152,0.115848


In [8]:
X_test

Unnamed: 0,ids,day_of_week,month_of_year,day_of_month,hour_of_day,night,neg,neu,pos,compound,polarity,subjectivity,embedding_negativity,embedding_positivity
0,0.702409,1.000000,1.0,0.200000,0.739130,0.0,0.000,1.000,0.000,0.466604,0.500000,0.000000,0.690881,0.309119
1,0.382264,0.500000,0.5,0.433333,0.347826,0.0,0.000,0.457,0.543,0.858460,0.850000,0.600000,0.836404,0.163596
2,0.888419,0.500000,1.0,0.566667,0.695652,0.0,0.085,0.789,0.126,0.649416,0.442969,0.525000,0.958623,0.041377
3,0.378482,0.333333,0.5,0.400000,1.000000,1.0,0.000,0.734,0.266,0.896877,0.668750,0.200000,0.894359,0.105641
4,0.860828,0.333333,1.0,0.533333,0.260870,0.0,0.600,0.400,0.000,0.072718,0.500000,0.000000,0.765628,0.234372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.693189,0.833333,1.0,0.166667,1.000000,1.0,0.000,0.286,0.714,0.915372,0.890625,0.500000,0.705961,0.294039
196,0.636608,0.166667,1.0,0.033333,1.000000,1.0,0.000,0.756,0.244,0.850941,0.650000,0.300000,0.914930,0.085070
197,0.482245,0.666667,0.5,0.700000,0.173913,1.0,0.200,0.800,0.000,0.396301,0.600000,0.200000,0.881099,0.118901
198,0.911763,0.666667,1.0,0.600000,1.000000,1.0,0.000,0.531,0.469,0.766972,0.900000,0.700000,0.820909,0.179091


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y)
X_train, X_test = add_word_embeddings(X_train, X_test, y_train)

df_result = compare_models(X_train= X_train, X_test= X_test, y_test=y_test, y_train=y_train, models= clf_models, names= clf_names)

Read 0M words
Number of words:  3715
Number of labels: 2
Progress: 100.0% words/sec/thread:   51420 lr:  0.000000 avg.loss:  0.668736 ETA:   0h 0m 0s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Start training model: Linear SVC


Finishing training model: Linear SVC, trained in 0.012132883071899414

Score of Linear SVC model performed: 0.6925227113906359
Start training model: Random Forest


Finishing training model: Random Forest, trained in 0.2577660083770752

Score of Random Forest model performed: 0.7173224785165084
Start training model:  Bernoulli Nayve Bayes


Finishing training model:  Bernoulli Nayve Bayes, trained in 0.008181095123291016

Score of  Bernoulli Nayve Bayes model performed: 0.6280980266992313


In [19]:
clf = RandomForestClassifier()

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

f1 = f1_score(y_test, y_pred, average='macro')

print(f1)
importances = pd.Series(X_train.columns, clf.feature_importances_)
importances.sort_index(ascending = False).head(20)

0.7165699617636835


0.142130     day_of_month
0.124731      hour_of_day
0.119548         compound
0.104537              neg
0.103954         polarity
0.090816              pos
0.084652              neu
0.079989     subjectivity
0.068748      day_of_week
0.057686    month_of_year
0.023209            night
dtype: object