In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import time

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from Preprocessing import  load, extract_features, text_mining_tfdf, text_mining_sentiment, add_word_embeddings, clean_text, add_user_text, drop_duplicates, convert_categorical, normalize, save_results

Brief description of the two functions implemented below:
 1) compare_models --> trains and tests each model contained in the model list and builds a DataFrame to easily visualize the metrics.
 2) compare_models_KF --> does the same, but uses K-fold cross-validation to obtain more robust estimates of the metrics.

In [2]:
def compare_models(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series, models: list, names: list) -> pd.DataFrame:
    """Fit every model on the training split and tabulate its hold-out metrics.

    Parameters
    ----------
    X_train, X_test : training / testing feature tables.
    y_train, y_test : training / testing labels.
    models : list of estimators; each one is fitted with ``fit`` and queried
        with ``predict``.
    names : list of display names, paired positionally with ``models``.

    Returns
    -------
    pd.DataFrame
        One row per model with the columns 'Model Name', 'F1 Score',
        'Recall', 'Accuray' (spelled exactly like that) and 'Precision'.
        F1, precision and recall are macro-averaged.
    """
    collected = {'f1': [], 'recall': [], 'accuracy': [], 'precision': []}

    for model_name, estimator in zip(names, models):
        print(f'Start training model: {model_name}')

        # Wall-clock timing of the fit (the blank-line print is inside the timed span).
        tic = time.time()
        estimator.fit(X_train, y_train)
        print('\n')
        toc = time.time()

        print(f'Finishing training model: {model_name}, trained in {toc-tic}\n')

        predicted = estimator.predict(X_test)

        f1 = f1_score(y_test, predicted, average='macro')
        collected['f1'].append(f1)
        collected['accuracy'].append(accuracy_score(y_test, predicted))
        collected['precision'].append(precision_score(y_test, predicted, average='macro'))
        collected['recall'].append(recall_score(y_test, predicted, average='macro'))

        print(f'Score of {model_name} model performed: {f1}')

    # Column order of the output table: name, F1, recall, accuracy, precision.
    table_columns = [
        pd.Series(names),
        pd.Series(collected['f1']),
        pd.Series(collected['recall']),
        pd.Series(collected['accuracy']),
        pd.Series(collected['precision']),
    ]
    table = pd.concat(table_columns, axis='columns')
    table.columns = ['Model Name', 'F1 Score', 'Recall', 'Accuray', 'Precision']

    return table

In [3]:
def compare_models_KF(X: pd.DataFrame, y: pd.Series, models: list, names: list, k: int = 5) -> pd.DataFrame:
    """Compare models by their mean macro-F1 over k-fold cross-validation.

    Parameters
    ----------
    X : feature table used for cross-validation.
    y : labels aligned with ``X``.
    models : list of estimators to compare.
    names : list of strings containing the names of the models, paired
        positionally with ``models``.
    k : value forwarded to ``cross_val_score`` as ``cv`` (number of folds).
        Defaults to 5. The previous signature used ``k = int``, which made
        the *type* ``int`` the default value instead of annotating the
        parameter, so omitting ``k`` could never work.

    Returns
    -------
    pd.DataFrame
        One row per model with the columns 'Model Name' and 'F1 Score'
        (the mean of the per-fold 'f1_macro' scores).
    """

    mean_f1_scores = []

    for clf, name in zip(models, names):

        print(f'Start training model: {name}')

        start_time = time.time()

        # Named fold_scores (not f1_score) so the imported metric function
        # of the same name is not shadowed inside this function.
        fold_scores = cross_val_score(clf, X, y, cv=k, scoring='f1_macro')

        finish_time = time.time()

        print(f'Finishing training model: {name}, trained in {finish_time-start_time}\n')

        mean_f1 = fold_scores.mean()
        mean_f1_scores.append(mean_f1)

        print(f'Score of {name} model performed: {mean_f1}')

    col1 = pd.Series(names)
    col2 = pd.Series(mean_f1_scores)

    result = pd.concat([col1, col2], axis='columns')
    result.columns = ['Model Name', 'F1 Score']

    return result

In [10]:
# Candidate classifiers to compare, each constructed with its default arguments.
clf_models = [LinearSVC(), RandomForestClassifier(), BernoulliNB()]

# Display names, paired positionally with clf_models.
# NOTE(review): the last name carries a leading space and the spelling "Nayve";
# both appear verbatim in the printed logs and in the result table.
clf_names = ['Linear SVC', 'Random Forest', ' Bernoulli Nayve Bayes']

# NOTE(review): `extract_date` and `preprocessing` are not among the names
# imported from Preprocessing in the visible import cell, so this line fails
# with a NameError on a fresh kernel. TODO: confirm the intended pipeline
# steps and import (or replace) them.
X, y = load().pipe(extract_date).pipe(clean_text).pipe(add_user_text).pipe(preprocessing)

NameError: name 'preprocessing' is not defined

In [None]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The helper receives the training labels and hands back both feature tables.
X_train, X_test = add_word_embeddings(X_train, X_test, y_train)

# Fit and score every candidate model on the split above.
df_result = compare_models(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    models=clf_models,
    names=clf_names,
)

Read 2M words
Number of words:  251012
Number of labels: 2
Progress: 100.0% words/sec/thread: 1648931 lr:  0.000000 avg.loss:  0.358665 ETA:   0h 0m 0s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Start training model: Linear SVC


Finishing training model: Linear SVC, trained in 3.4832990169525146

Score of Linear SVC model performed: 0.809720948247317
Start training model: Random Forest


Finishing training model: Random Forest, trained in 25.77042007446289

Score of Random Forest model performed: 0.8259210688737337
Start training model:  Bernoulli Nayve Bayes


Finishing training model:  Bernoulli Nayve Bayes, trained in 0.07192516326904297

Score of  Bernoulli Nayve Bayes model performed: 0.36679438058748404


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Display the current training feature table (rich DataFrame repr).
X_train

Unnamed: 0,embedding_negativity,embedding_positivity
0,1.566800e-01,0.843320
1,8.715825e-01,0.128418
2,1.189675e-01,0.881032
3,6.133610e-01,0.386639
4,5.428984e-01,0.457102
...,...,...
178479,9.211846e-02,0.907881
178480,5.627953e-09,1.000000
178481,3.120898e-05,0.999969
178482,7.022826e-01,0.297717


In [4]:
# Fit a seeded random forest on the current training split.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predictions on the test split (kept for later inspection).
y_pred = clf.predict(X_test)

# The Series is keyed by importance value and holds the feature names as its
# values, so sorting the index in descending order ranks the features from
# most to least important.
importances = pd.Series(data=X_train.columns, index=clf.feature_importances_)
importances.sort_index(ascending=False).head(20)

0.073850    embedding_positivity
0.070641                     ids
0.067279    embedding_negativity
0.058744                     neg
0.048191                compound
0.025772                polarity
0.025334              char_count
0.022674                     pos
0.022161            day_of_month
0.019548             day_of_week
0.017202                       .
0.016374             hour_of_day
0.016332                       @
0.014284                      wa
0.013901                     neu
0.013485            subjectivity
0.007743           month_of_year
0.005653                  though
0.005349                     see
0.005129                     ...
dtype: object

In [2]:
# Build the development features from a 100-row sample. The sample is seeded
# (random_state=42, the same seed used elsewhere in this notebook) so the
# sampled rows are identical after every kernel restart.
# Assumes load() returns a pandas DataFrame, since it is chained with
# .sample/.pipe. TODO confirm.
X_train = (
    load(filepath="./DSL2122_january_dataset/development.csv")
    .sample(100, random_state=42)
    .pipe(extract_features)
    .pipe(drop_duplicates, drop_long_text=True)
    .pipe(clean_text)
    .pipe(text_mining_sentiment)
    .pipe(add_user_text)
)

# Same steps for the evaluation sample, except that no duplicates are dropped.
X_test = (
    load(filepath="./DSL2122_january_dataset/evaluation.csv")
    .sample(100, random_state=42)
    .pipe(extract_features)
    .pipe(clean_text)
    .pipe(text_mining_sentiment)
    .pipe(add_user_text)
)

X_train, X_test = text_mining_tfdf(X_train, X_test) #if you don't want to use it, just comment it

# NOTE(review): another cell calls add_word_embeddings with a third argument
# (y_train); confirm which call signature is current.
X_train, X_test = add_word_embeddings(X_train, X_test) #if you don't want to use it, just comment it

X_train = X_train.pipe(convert_categorical) #necessary
X_test = X_test.pipe(convert_categorical)   #necessary

# normalize returns the two feature tables together with the training labels.
X_train, X_test, y_train = normalize(X_train, X_test)

100%|██████████| 100/100 [00:00<00:00, 118.40it/s]
100%|██████████| 100/100 [00:00<00:00, 1325.20it/s]
100%|██████████| 100/100 [00:00<00:00, 3417.14it/s]
100%|██████████| 100/100 [00:00<00:00, 118.73it/s]
100%|██████████| 100/100 [00:00<00:00, 3623.96it/s]
100%|██████████| 100/100 [00:00<00:00, 3595.69it/s]
Read 0M words
Number of words:  880
Number of labels: 2
Progress: 100.0% words/sec/thread:    7252 lr:  0.000000 avg.loss:  0.694575 ETA:   0h 0m 0s
