In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

import time

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from preprocessing import  load, extract_features, text_mining_tfdf, text_mining_sentiment, add_word_embeddings, clean_text, add_user_text, drop_duplicates, convert_categorical, normalize, save_results

Brief description of the two functions implemented below:
 1) compare_models --> trains and tests every model in the model list and builds a DataFrame so the metrics can be compared at a glance
 2) compare_models_KF --> runs the same comparison, but uses K-fold cross-validation so that the reported metric (F1 score only) is more robust.

In [None]:
def compare_models(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series, models: list, names: list) -> pd.DataFrame:
    """Fit each model on the training split and tabulate its test-split metrics.

    Args:
        X_train: features every model is fitted on.
        X_test: features the fitted models predict on.
        y_train: labels paired with ``X_train``.
        y_test: labels the predictions are scored against.
        models: estimators to compare; each one is used through ``fit`` and ``predict``.
        names: display names, paired positionally with ``models``.

    Returns:
        A DataFrame with the columns 'Model Name', 'F1 Score', 'Recall',
        'Accuray' and 'Precision'. F1, recall and precision are macro-averaged.
    """
    f1_values, recall_values, accuracy_values, precision_values = [], [], [], []

    for estimator, label in zip(models, names):
        print(f'Start training model: {label}')

        fit_started = time.time()
        estimator.fit(X_train, y_train)
        print('\n')
        # The clock is read after the blank-line print, exactly where the fit is considered finished.
        elapsed = time.time() - fit_started
        print(f'Finishing training model: {label}, trained in {elapsed}\n')

        predictions = estimator.predict(X_test)

        macro_f1 = f1_score(y_test, predictions, average='macro')
        f1_values.append(macro_f1)
        accuracy_values.append(accuracy_score(y_test, predictions))
        precision_values.append(precision_score(y_test, predictions, average='macro'))
        recall_values.append(recall_score(y_test, predictions, average='macro'))

        print(f'Score of {label} model performed: {macro_f1}')

    # Column labels are kept verbatim (including the 'Accuray' spelling) because they are part of the returned table.
    headers = ['Model Name', 'F1 Score', 'Recall', 'Accuray', 'Precision']
    series_per_column = [
        pd.Series(values)
        for values in (names, f1_values, recall_values, accuracy_values, precision_values)
    ]

    table = pd.concat(series_per_column, axis='columns')
    table.columns = headers
    return table

In [None]:
def compare_models_KF(X: pd.DataFrame, y: pd.Series, models: list, names: list, k: int = 5) -> pd.DataFrame:
    """Compare models by their mean macro-F1 score under cross-validation.

    Args:
        X: feature set handed to ``cross_val_score``.
        y: labels paired with ``X``.
        models: estimators to compare.
        names: display names, paired positionally with ``models``.
        k: cross-validation setting, passed to ``cross_val_score`` as ``cv``.
            Defaults to 5. (It was previously written ``k = int``, which made
            the builtin type ``int`` the default value rather than annotating
            the parameter.)

    Returns:
        A DataFrame with the columns 'Model Name' and 'F1 Score', where the
        score is the mean of the per-fold 'f1_macro' scores.
    """
    mean_f1_scores = []

    for clf, name in zip(models, names):
        print(f'Start training model: {name}')

        start_time = time.time()

        # Named fold_scores (not f1_score) so the sklearn metric imported under that name is not shadowed.
        fold_scores = cross_val_score(clf, X, y, cv=k, scoring='f1_macro')

        finish_time = time.time()

        print(f'Finishing training model: {name}, trained in {finish_time-start_time}\n')

        f1 = fold_scores.mean()
        mean_f1_scores.append(f1)

        print(f'Score of {name} model performed: {f1}')

    result = pd.concat([pd.Series(names), pd.Series(mean_f1_scores)], axis='columns')
    result.columns = ['Model Name', 'F1 Score']

    return result

In [None]:
# Each display name is written next to its estimator so the two lists cannot drift out of step.
# NOTE(review): no random_state is passed to any estimator; confirm whether run-to-run reproducibility matters here.
clf_names, clf_models = map(list, zip(
    ('Linear SVC', LinearSVC()),
    ('Random Forest', RandomForestClassifier()),
    (' Bernoulli Nayve Bayes', BernoulliNB()),
))

In [None]:
# Development split: load the CSV, then run the preprocessing helpers in sequence.
X_train = (
    load(filepath="./DSL2122_january_dataset/development.csv")
    .pipe(extract_features)
    .pipe(drop_duplicates, drop_long_text=True)
    .pipe(clean_text)
    .pipe(text_mining_sentiment)
    .pipe(add_user_text)
)

# Evaluation split: same chain, except that drop_duplicates is not applied.
X_test = (
    load(filepath="./DSL2122_january_dataset/evaluation.csv")
    .pipe(extract_features)
    .pipe(clean_text)
    .pipe(text_mining_sentiment)
    .pipe(add_user_text)
)

# Optional step: comment the next statement out to skip it.
X_train, X_test = text_mining_tfdf(X_train, X_test)

# Optional step: comment the next statement out to skip it.
X_train, X_test = add_word_embeddings(X_train, X_test)

# Necessary for both splits.
X_train = X_train.pipe(convert_categorical)
X_test = X_test.pipe(convert_categorical)

# normalize returns three values; the third is bound to y_train (presumably the training labels; verify in preprocessing).
X_train, X_test, y_train = normalize(X_train, X_test)