In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import time

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from Preprocessing import  load, extract_date, text_mining_tfdf, text_mining_sentiment, preprocessing, add_word_embeddings, clean_text, add_user_text

Brief description of the two function that I have implemented:
 1) compare_models --> train and test the models contained in the model list and create a dataframe to easily visualize the metrics 
 2) compare_models_KF --> do exactly the same, but it uses the K-Fold method to make robust result of the metrcs that we obtain.

In [2]:
def compare_models(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series, models: list, names: list) -> pd.DataFrame:

    """Inputs:
        X_train = training data set
        X_test = testing data set
        y_train = training label 
        y_test = testing label 
        models = list of models to compare
        names = list of trings containing the names of the models 

        Return: comparison table 
        """

    f1_scores, precision_scores, recall_scores, accuracy_scores = [], [], [],[]
    
    for clf, name in zip(models, names):

        print(f'Start training model: {name}')

        start_time = time.time()

        clf.fit(X_train, y_train)
        print('\n')

        finish_time = time.time()

        print(f'Finishing training model: {name}, trained in {finish_time-start_time}\n')

        y_pred = clf.predict(X_test)
        f1 = f1_score(y_test, y_pred, average = 'macro')
        f1_scores.append(f1)


        acc = accuracy_score(y_test, y_pred)
        accuracy_scores.append(acc)

        prec = precision_score(y_test, y_pred, average = 'macro')
        precision_scores.append(prec)

        rec = recall_score(y_test, y_pred, average='macro')
        recall_scores.append(rec)



        print(f'Score of {name} model performed: {f1}')

    col1 = pd.Series(names)
    col2 = pd.Series(f1_scores)
    col3 = pd.Series(recall_scores)
    col4 = pd.Series(accuracy_scores)
    col5 = pd.Series(precision_scores)

    result = pd.concat([col1, col2, col3, col4, col5], axis = 'columns')
    result.columns = ['Model Name', 'F1 Score', 'Recall', 'Accuray', 'Precision']

    return result

In [3]:
def compare_models_KF(X: pd.DataFrame, y: pd.Series, models: list, names: list, k = int) -> pd.DataFrame:

    """Inputs:
        X = training data set
        y = training label  
        models = list of models to compare
        names = list of trings containing the names of the models 
        k = int for the cross validation 

        Return: comparison table 
        """

    f1_scores = []
    
    for clf, name in zip(models, names):

        print(f'Start training model: {name}')

        start_time = time.time()

        f1_score = cross_val_score(clf, X, y, cv = k, scoring = 'f1_macro')

        finish_time = time.time()

        print(f'Finishing training model: {name}, trained in {finish_time-start_time}\n')

        f1 = f1_score.mean()
        f1_scores.append(f1)

        print(f'Score of {name} model performed: {f1}')

    col1 = pd.Series(names)
    col2 = pd.Series(f1_scores)
    

    result = pd.concat([col1, col2], axis = 'columns')
    result.columns = ['Model Name', 'F1 Score']

    return result

In [4]:
clf_models = [LinearSVC(), RandomForestClassifier(), BernoulliNB()]

clf_names = ['Linear SVC', 'Random Forest', ' Bernoulli Nayve Bayes']

X, y = load().pipe(extract_date).pipe(clean_text).pipe(add_user_text).pipe(text_mining_sentiment).pipe(preprocessing)

100%|██████████| 1000/1000 [00:05<00:00, 187.39it/s]
100%|██████████| 1000/1000 [00:00<00:00, 4193.26it/s]
100%|██████████| 1000/1000 [00:00<00:00, 4958.64it/s]


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y)
X_train, X_test = add_word_embeddings(X_train, X_test, y_train)

df_result = compare_models(X_train= X_train, X_test= X_test, y_test=y_test, y_train=y_train, models= clf_models, names= clf_names)

Read 0M words
Number of words:  5035
Number of labels: 2
Progress: 100.0% words/sec/thread:   59940 lr:  0.000000 avg.loss:  0.694166 ETA:   0h 0m 0s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Start training model: Linear SVC


Finishing training model: Linear SVC, trained in 0.01400613784790039

Score of Linear SVC model performed: 0.6683636363636363
Start training model: Random Forest


Finishing training model: Random Forest, trained in 0.1874709129333496

Score of Random Forest model performed: 0.7877758913412563
Start training model:  Bernoulli Nayve Bayes


Finishing training model:  Bernoulli Nayve Bayes, trained in 0.007173061370849609

Score of  Bernoulli Nayve Bayes model performed: 0.6592292089249493


In [7]:
clf = RandomForestClassifier()

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

f1 = f1_score(y_test, y_pred, average='macro')

print(f1)
importances = pd.Series(X_train.columns, clf.feature_importances_)
importances.sort_index(ascending = False).head(20)

0.7606319317003112


0.185062                     ids
0.110524    embedding_positivity
0.107131                compound
0.106172    embedding_negativity
0.075699                     pos
0.074099                     neg
0.071647                polarity
0.059200                     neu
0.058242             hour_of_day
0.050821            subjectivity
0.046531            day_of_month
0.032548             day_of_week
0.022323           month_of_year
dtype: object