In [1]:
import pandas as pd
import ast
import os
from pprint import pprint
import matplotlib.pyplot as plt
from auxiliar_func import *
from plot_func import *

# Importing the models
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
# Registry mapping the short model key used in results file names
# (results_<key>.csv) to the estimator class to instantiate for it.
models = {
    'nb': GaussianNB,
    'lda': LDA,
    'qda': QDA,
    'knn': KNN,
    'logreg': LogisticRegression,
    'svm': LinearSVC,
    'rf': RandomForestClassifier,
    'xgb': XGBClassifier,
    'catboost': CatBoostClassifier,
}

df_tr = pd.read_csv('../train.csv')


def _strip_brackets(value):
    """Remove '[', ']' and '<' from string cells; return any other value unchanged."""
    if isinstance(value, str):
        return value.replace('[', '').replace(']', '').replace('<', '')
    return value


# for xgboost
# NOTE(review): presumably these characters would otherwise end up in feature
# names derived from cell values during preprocessing — confirm.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so use whichever element-wise method the installed pandas provides.
if hasattr(pd.DataFrame, 'map'):
    df_tr = df_tr.map(_strip_brackets)
else:
    df_tr = df_tr.applymap(_strip_brackets)

TARGET_METRIC = 'f1_macro'  # column the final results table is sorted by
SEED = 42                   # passed to cross_validation as random_state
CV_FOLDS = 20               # passed to cross_validation as cv

# Cross-validate the best configuration recorded in every ./results/results_<model>.csv
# file, save one confusion matrix per model, and collect the scores in `results`
# (indexed by model key, sorted by TARGET_METRIC, best first).
metric_columns = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro']

# Confusion matrices are written under ./figures; make sure the directory exists.
os.makedirs('./figures', exist_ok=True)

score_rows = []
# os.listdir returns entries in arbitrary order; sort for a reproducible run order.
for file in sorted(os.listdir('./results')):
    if file.endswith('.csv') and file.startswith('results_'):
        # NOTE(review): `res` is not used in this cell — confirm whether a later
        # cell reads it before removing this call.
        res = read_results('./results/'+file)
        prep_par, model_par = get_best_params('./results/'+file)

        # Strip the 'results_' prefix and '.csv' suffix so that keys containing
        # underscores are recovered intact.
        mod_name = file[len('results_'):-len('.csv')]
        if mod_name not in models:
            raise KeyError(f"no model registered for results file {file!r} (parsed key {mod_name!r})")

        model = models[mod_name](**model_par)
        score, y_pred, y_true = cross_validation(model, df_tr, prep_par, cv=CV_FOLDS, random_state=SEED, return_predict=True)

        # One labelled row per model; the table is assembled once after the loop.
        score_rows.append(pd.Series(score, name=mod_name))

        plot_conf_matrix(y_true, y_pred, './figures/conf_matrix_'+mod_name+'.pdf', show=False)
        # plot_conf_matrix may leave its figure open (not visible from here);
        # close everything so figures do not accumulate across models.
        plt.close('all')

results = (
    pd.DataFrame(score_rows)
    .reindex(columns=metric_columns)  # keep only the reported metrics, in a fixed order
    .astype(float)
    .rename_axis('model')
    .sort_values(by=TARGET_METRIC, ascending=False)
)

0 0.8005060747743677


In [None]:
# Persist the ranked metrics. The file name is derived from CV_FOLDS so it always
# matches the number of folds that was actually used; it does not start with
# 'results_', so the per-model results scan over ./results will not pick it up.
results.to_csv(f'./results/cv{CV_FOLDS}_results.csv')
results.head(10)

Unnamed: 0_level_0,accuracy,f1_macro,precision_macro,recall_macro
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
catboost,0.953654,0.799174,0.806165,0.79285
xgb,0.952358,0.795985,0.799193,0.793043
rf,0.947403,0.779654,0.776612,0.783019
logreg,0.945241,0.775301,0.767037,0.784516
svm,0.94532,0.773143,0.767616,0.779126
lda,0.945047,0.764396,0.767593,0.761507
knn,0.945928,0.758085,0.773709,0.744668
qda,0.930821,0.743132,0.717565,0.77835
nb,0.909004,0.691692,0.662621,0.742476
