In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import time

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from Preprocessing import  load, cleaning, text_mining_tfdf, text_mining_sentiment, preprocessing, create_public_holiday, hashtag_tfidf

Brief description of the two functions implemented below:
 1) compare_models --> trains and tests each model in the given list and builds a dataframe that makes the metrics easy to compare
 2) compare_models_KF --> does the same, but uses K-fold cross-validation to obtain a more robust estimate of the macro F1 score.

In [2]:
def compare_models(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series, models: list, names: list) -> pd.DataFrame:
    """Fit each model on the training split and tabulate its test-set metrics.

    Inputs:
        X_train = training data set
        X_test = testing data set
        y_train = training labels
        y_test = testing labels
        models = list of estimators to compare; each one is fitted in place
        names = list of strings containing the names of the models,
                positionally aligned with `models`

    Return:
        Comparison table with one row per model and the columns
        'Model Name', 'F1 Score', 'Recall', 'Accuray', 'Precision'.
        F1, recall and precision are macro-averaged.

    Raises:
        ValueError: if `models` and `names` have different lengths.
    """

    # zip() would silently drop the unmatched tail and leave the table with
    # misaligned / NaN rows, so reject mismatched inputs up front.
    if len(models) != len(names):
        raise ValueError(
            f'models and names must have the same length, '
            f'got {len(models)} and {len(names)}'
        )

    # NOTE: 'Accuray' is misspelt, but it is kept as-is because callers may
    # already select the column by this label.
    columns = ['Model Name', 'F1 Score', 'Recall', 'Accuray', 'Precision']
    records = []

    for clf, name in zip(models, names):

        print(f'Start training model: {name}')

        # Time only the fit call, so console I/O is not counted as training time.
        start_time = time.time()
        clf.fit(X_train, y_train)
        finish_time = time.time()

        print('\n')
        print(f'Finishing training model: {name}, trained in {finish_time-start_time}\n')

        y_pred = clf.predict(X_test)

        f1 = f1_score(y_test, y_pred, average='macro')
        rec = recall_score(y_test, y_pred, average='macro')
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='macro')

        # Tuple order must match `columns`.
        records.append((name, f1, rec, acc, prec))

        print(f'Score of {name} model performed: {f1}')

    # Build the table once from the collected rows.
    return pd.DataFrame(records, columns=columns)

In [3]:
def compare_models_KF(X: pd.DataFrame, y: pd.Series, models: list, names: list, k: int = 5) -> pd.DataFrame:
    """Cross-validate each model and tabulate its mean macro F1 score.

    Inputs:
        X = training data set
        y = training labels
        models = list of estimators to compare
        names = list of strings containing the names of the models,
                positionally aligned with `models`
        k = number of folds, forwarded as `cv` to cross_val_score
            (default 5, the same as scikit-learn's own default)

    Return:
        Comparison table with one row per model and the columns
        'Model Name' and 'F1 Score' (mean of the per-fold scores).

    Raises:
        ValueError: if `models` and `names` have different lengths.
    """

    # zip() would silently drop the unmatched tail and leave the table with
    # misaligned / NaN rows, so reject mismatched inputs up front.
    if len(models) != len(names):
        raise ValueError(
            f'models and names must have the same length, '
            f'got {len(models)} and {len(names)}'
        )

    records = []

    for clf, name in zip(models, names):

        print(f'Start training model: {name}')

        start_time = time.time()

        # Named fold_scores so the imported f1_score metric is not shadowed.
        fold_scores = cross_val_score(clf, X, y, cv=k, scoring='f1_macro')

        finish_time = time.time()

        print(f'Finishing training model: {name}, trained in {finish_time-start_time}\n')

        f1 = fold_scores.mean()
        records.append((name, f1))

        print(f'Score of {name} model performed: {f1}')

    # Build the table once from the collected rows.
    return pd.DataFrame(records, columns=['Model Name', 'F1 Score'])

In [4]:
# Candidate classifiers, each constructed with its default hyperparameters.
clf_models = [
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    SVC(),
    RandomForestClassifier(),
    GaussianNB(),
    MLPClassifier(),
]

# Display labels, positionally aligned with clf_models.
clf_names = [
    'Decision Tree',
    'K-Nearest Neighbor',
    'Support Vector Machine',
    'Random Forest',
    'Nayve Bayes',
    'Multi Layer Neural Network',
]

# Chain the helpers imported from the local Preprocessing module; the result
# of the final stage is unpacked into X and y.
X, y = (
    load()
    .pipe(cleaning)
    .pipe(create_public_holiday)
    .pipe(text_mining_sentiment)
    .pipe(text_mining_tfdf)
    .pipe(preprocessing)
)

100%|██████████| 10000/10000 [01:00<00:00, 166.31it/s]
100%|██████████| 10000/10000 [00:01<00:00, 5447.52it/s]
100%|██████████| 10000/10000 [00:01<00:00, 5502.72it/s]


In [6]:
# Hold out 20% of the samples for testing, stratified on y.
# NOTE(review): no random_state is passed, so the split (and therefore the
# reported scores) can differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
)

# Fit every candidate on the training split and collect its test-set metrics.
df_result = compare_models(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    models=clf_models,
    names=clf_names,
)

Start training model: Decision Tree


Finishing training model: Decision Tree, trained in 0.30092501640319824

Score of Decision Tree model performed: 0.7058956608298266
Start training model: K-Nearest Neighbor


Finishing training model: K-Nearest Neighbor, trained in 0.004374980926513672

Score of K-Nearest Neighbor model performed: 0.666065963031506
Start training model: Support Vector Machine


Finishing training model: Support Vector Machine, trained in 6.308397054672241

Score of Support Vector Machine model performed: 0.7613605232518321
Start training model: Random Forest


Finishing training model: Random Forest, trained in 1.6253759860992432

Score of Random Forest model performed: 0.769748995006693
Start training model: Nayve Bayes


Finishing training model: Nayve Bayes, trained in 0.019955873489379883

Score of Nayve Bayes model performed: 0.6897568914657715
Start training model: Multi Layer Neural Network


Finishing training model: Multi Layer Neural Network, trained in 8

