In [1]:
import pandas as pd
import numpy as np

import time

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


Brief description of the two functions implemented below:
 1) compare_models --> trains every model in the model list on the training split, evaluates it on the test split, and builds a DataFrame to easily compare the metrics (macro F1, recall, accuracy, precision).
 2) compare_models_KF --> runs the same kind of comparison, but scores each model with K-Fold cross-validation to obtain a more robust estimate; it reports only the macro F1 score, averaged over the K folds.

In [2]:
def compare_models(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series, models: list, names: list) -> pd.DataFrame:

    """Fit every model on the training split and score it on the test split.

        Inputs:
        X_train = training data set
        X_test = testing data set
        y_train = training labels
        y_test = testing labels
        models = list of models to compare; each one is fitted in place with
                 ``fit`` and then queried with ``predict``
        names = list of strings containing the names of the models, in the
                same order as ``models``

        Return: comparison table with one row per model and the columns
                'Model Name', 'F1 Score', 'Recall', 'Accuray', 'Precision'
                (F1, recall and precision are macro-averaged)

        Raises: ValueError if ``models`` and ``names`` differ in length
        """

    # zip() would silently drop the surplus entries and leave the table
    # misaligned, so reject mismatched inputs up front.
    if len(models) != len(names):
        raise ValueError(
            f'models and names must have the same length, got {len(models)} and {len(names)}'
        )

    f1_scores, precision_scores, recall_scores, accuracy_scores = [], [], [], []

    for clf, name in zip(models, names):

        print(f'Start training model: {name}')

        # perf_counter is the monotonic, high-resolution clock meant for
        # measuring elapsed intervals; only fit() is inside the timed region.
        start_time = time.perf_counter()
        clf.fit(X_train, y_train)
        finish_time = time.perf_counter()

        print('\n')
        print(f'Fiishing training model: {name}, trained in {finish_time-start_time}\n')

        # Predict once and reuse the result for all four metrics.
        y_pred = clf.predict(X_test)

        f1 = f1_score(y_test, y_pred, average='macro')
        f1_scores.append(f1)
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred, average='macro'))
        recall_scores.append(recall_score(y_test, y_pred, average='macro'))

        print(f'Score of {name} model performed: {f1}')

    # NOTE: the 'Accuray' label is misspelled but is kept unchanged so that
    # existing code selecting this column keeps working.
    result = pd.DataFrame({
        'Model Name': list(names),
        'F1 Score': f1_scores,
        'Recall': recall_scores,
        'Accuray': accuracy_scores,
        'Precision': precision_scores,
    })

    return result

In [3]:
def compare_models_KF(X: pd.DataFrame, y: pd.Series, models: list, names: list, k: int = 5) -> pd.DataFrame:

    """Compare models by their mean macro-F1 score under k-fold cross-validation.

        Inputs:
        X = full data set (cross_val_score does the train/test splitting)
        y = labels for X
        models = list of models to compare
        names = list of strings containing the names of the models, in the
                same order as ``models``
        k = value passed as ``cv`` to cross_val_score (number of folds);
            defaults to 5

        Return: comparison table with one row per model and the columns
                'Model Name' and 'F1 Score' (mean of the per-fold scores)

        Raises: ValueError if ``models`` and ``names`` differ in length
        """

    # zip() would silently drop the surplus entries and leave the table
    # misaligned, so reject mismatched inputs up front.
    if len(models) != len(names):
        raise ValueError(
            f'models and names must have the same length, got {len(models)} and {len(names)}'
        )

    f1_scores = []

    for clf, name in zip(models, names):

        print(f'Start training model: {name}')

        # perf_counter is the monotonic, high-resolution clock meant for
        # measuring elapsed intervals; here it spans the whole cross-validation.
        start_time = time.perf_counter()

        # Named fold_scores (not f1_score) so the imported metric function of
        # the same name is not shadowed inside this function.
        fold_scores = cross_val_score(clf, X, y, cv=k, scoring='f1_macro')

        finish_time = time.perf_counter()

        print(f'Fiishing training model: {name}, trained in {finish_time-start_time}\n')

        f1 = fold_scores.mean()
        f1_scores.append(f1)

        print(f'Score of {name} model performed: {f1}')

    result = pd.DataFrame({
        'Model Name': list(names),
        'F1 Score': f1_scores,
    })

    return result

In [4]:
# Seed passed to the estimators below whose training is randomized, so that
# re-running the notebook reproduces the same scores.
RANDOM_STATE = 42

# Candidate classifiers, otherwise left at their scikit-learn defaults.
clf_models = [
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    KNeighborsClassifier(),
    SVC(),
    RandomForestClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
    MLPClassifier(random_state=RANDOM_STATE),
]

# Display names, in the same order as clf_models.
clf_names = ['Decision Tree', 'K-Nearest Neighbor', 'Support Vector Machine', 'Random Forest', 'Nayve Bayes', 'Multi Layer Neural Network']

# The comparison functions pair models and names by position.
assert len(clf_models) == len(clf_names), 'clf_models and clf_names must line up'

# Usage template for your own feature matrix X and label vector y:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=RANDOM_STATE)

#df_result = compare_models(X_train= X_train, X_test= X_test, y_test=y_test, y_train=y_train, models= clf_models, names= clf_names)

In [5]:
# Smoke test of the two comparison functions implemented above, on the Iris data set.

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)

# K-fold comparison (macro F1 only) on the full data set.
df_result1 = compare_models_KF(X, y, models=clf_models, names=clf_names, k=5)

# Stratified 80/20 hold-out split; the fixed seed keeps the split, and
# therefore the hold-out table, the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
df_result0 = compare_models(X_train=X_train, X_test=X_test, y_test=y_test, y_train=y_train, models=clf_models, names=clf_names)

Start training model: Decision Tree
Fiishing training model: Decision Tree, trained in 0.006243467330932617

Score of Decision Tree model performed: 0.9598997493734336
Start training model: K-Nearest Neighbor
Fiishing training model: K-Nearest Neighbor, trained in 0.009898185729980469

Score of K-Nearest Neighbor model performed: 0.973165236323131
Start training model: Support Vector Machine
Fiishing training model: Support Vector Machine, trained in 0.006592273712158203

Score of Support Vector Machine model performed: 0.9666165413533834
Start training model: Random Forest
Fiishing training model: Random Forest, trained in 0.37074947357177734

Score of Random Forest model performed: 0.9598319029897976
Start training model: Nayve Bayes
Fiishing training model: Nayve Bayes, trained in 0.006242275238037109

Score of Nayve Bayes model performed: 0.9530472646262119
Start training model: Multi Layer Neural Network




Fiishing training model: Multi Layer Neural Network, trained in 0.3696417808532715

Score of Multi Layer Neural Network model performed: 0.9664818612187034
Start training model: Decision Tree


Fiishing training model: Decision Tree, trained in 0.0004968643188476562

Score of Decision Tree model performed: 0.9665831244778613
Start training model: K-Nearest Neighbor


Fiishing training model: K-Nearest Neighbor, trained in 0.0004100799560546875

Score of K-Nearest Neighbor model performed: 0.9665831244778613
Start training model: Support Vector Machine


Fiishing training model: Support Vector Machine, trained in 0.0006253719329833984

Score of Support Vector Machine model performed: 1.0
Start training model: Random Forest


Fiishing training model: Random Forest, trained in 0.06635665893554688

Score of Random Forest model performed: 0.9326599326599326
Start training model: Nayve Bayes


Fiishing training model: Nayve Bayes, trained in 0.0005474090576171875

Score of Nayve Bayes model 



In [6]:
# Hold-out comparison table returned by compare_models (one row per model).
df_result0

Unnamed: 0,Model Name,F1 Score,Recall,Accuray,Precision
0,Decision Tree,0.966583,0.966667,0.966667,0.969697
1,K-Nearest Neighbor,0.966583,0.966667,0.966667,0.969697
2,Support Vector Machine,1.0,1.0,1.0,1.0
3,Random Forest,0.93266,0.933333,0.933333,0.944444
4,Nayve Bayes,0.966583,0.966667,0.966667,0.969697
5,Multi Layer Neural Network,0.966583,0.966667,0.966667,0.969697


In [7]:
# Cross-validated comparison table returned by compare_models_KF (mean macro F1 per model).
df_result1

Unnamed: 0,Model Name,F1 Score
0,Decision Tree,0.9599
1,K-Nearest Neighbor,0.973165
2,Support Vector Machine,0.966617
3,Random Forest,0.959832
4,Nayve Bayes,0.953047
5,Multi Layer Neural Network,0.966482
