In [None]:
import numpy as np
import sys
import yaml
import os
import datetime
import glob
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('VCDI')
from shutil import copyfile
from sklearn.model_selection import StratifiedKFold
sys.path.insert(0,'../models/')
from utils import generate_model_config, generate_gridsearch_configs, train_model

sys.path.insert(0,'/anaconda3/envs/model_search/lib/python3.7/site-packages') # Environment-specific path so that xgboost can be imported; probably not needed in other environments
import xgboost

In [None]:
train_type = 'classification' #classification or regression

# Pick the last experiment folder in sorted order (presumably the folders are
# timestamp-named, so this is the most recent run -- TODO confirm).
experiment_folders = sorted(glob.glob(f'../results/{train_type}/20*'))
if not experiment_folders:
    raise FileNotFoundError(f'No experiment folders found under ../results/{train_type}/')
experiment_folder = experiment_folders[-1]
last_results = experiment_folder+ '/results_df.pkl'
# NOTE: unpickling can execute arbitrary code; only load results from trusted runs.
results_df = pd.read_pickle(last_results)

with open(f"{experiment_folder}/config_{train_type}.yaml", 'r') as stream:
    # yaml.load() without a Loader is deprecated (PyYAML 5.1) and an error in
    # PyYAML 6; safe_load is sufficient for a plain configuration file.
    config_experiment = yaml.safe_load(stream)

# Replace missing class weights with the string 'None' so the column can be
# filtered and grouped on. Assign the result back instead of using a chained
# in-place fillna, which does not modify the frame under pandas copy-on-write.
if 'class_weight' in results_df.columns:
    results_df['class_weight'] = results_df['class_weight'].fillna('None')

### Get metric names

In [None]:
# Keys of the 'metrics' config section whose value is truthy.
enabled_keys = [key for key, is_enabled in config_experiment['metrics'].items() if is_enabled]

# These two keys live in the 'metrics' section but have no <name>_train / <name>_val
# result columns (presumably they are settings rather than scores -- verify against
# the training code). Filter each one independently: previously the second key was
# only removed when the first was present, and the removal raised ValueError if the
# second key was not enabled.
non_score_keys = {'label_distribution', 'multiclass_average_strategy'}
metrics = [key for key in enabled_keys if key not in non_score_keys]

# Column names in the results frame: every metric has a train and a val variant.
metrics = [m+'_train' for m in metrics] + [m+'_val' for m in metrics]



### Average over cross val folds

In [None]:
# Mean of every metric over the cross-validation folds of each model
# (result is indexed by model_num).
fold_means = results_df.groupby(['model_num'], sort=False)[metrics].mean()

# Non-metric columns, taken from the fold-0 row of each model.
model_params = results_df[results_df['fold']==0].drop(columns=metrics).reset_index(drop=True)

# Attach the averaged metrics by matching on model_num explicitly. A plain
# pd.concat(axis=1) aligns on index labels and is only correct when model_num
# happens to be exactly 0..n-1 in row order. join(on=...) keeps model_params'
# 0..n-1 index, so later lookups by index label still line up with model_params.
crossfold_df = model_params.join(fold_means, on='model_num')


### Visualise best results for each model

In [None]:
metric = 'f1_val'
# Boolean mask selecting, for every model type, the row(s) with the highest score.
# The string 'max' is used instead of the builtin max, which pandas deprecates
# as a groupby transform argument.
idx = crossfold_df.groupby(['model'])[metric].transform('max') == crossfold_df[metric]

# One bar per model type, showing its best cross-validated score.
fig, ax = plt.subplots(figsize=(10,10))
sns.barplot(x="model", y=metric, data=crossfold_df[idx], ax=ax);

### Pick best model and look into parameters 

In [None]:
metric = 'f1_val'

# Model type of the configuration with the highest score on `metric`.
best_row = crossfold_df[metric].idxmax()
best_type = crossfold_df.loc[best_row, 'model']

# Parameter columns for that model type: skip columns whose name contains one of
# the excluded substrings, then drop columns that are entirely NaN for this type.
excluded_substrings = ['label_count', 'train_time', 'model_num']
kept_columns = [
    column for column in model_params.columns
    if all(x not in column for x in excluded_substrings)
]
is_best_type = model_params['model'] == best_type
tested_params = model_params.loc[is_best_type, kept_columns].dropna(axis=1, how='all')


In [None]:
print('The following parameters were tested:')
# Only parameters that took more than one distinct value are reported.
varied_columns = [col for col in tested_params.columns if len(tested_params[col].unique())>1]
for col in varied_columns:
    print(f'    {col}: {tested_params[col].unique()}')

print('\nThe best model was:')
# Index label of the best-scoring configuration; looked up in tested_params by label.
best_row = crossfold_df[metric].idxmax()
for col in varied_columns:
    print(f'    {col}: {tested_params.loc[best_row, col]}')

In [None]:
parameter1 = 'max_depth'
parameter2 = 'max_features'
metric = 'f1_val'

# Score against parameter1, one line per value of parameter2, for the best model type.
# catplot is figure-level and creates its own figure, so a preceding plt.figure(...)
# only produces an empty extra figure; the size is set through height/aspect instead.
sns.catplot(
    x=parameter1,
    y=metric,
    hue=parameter2,
    data=crossfold_df[crossfold_df.loc[:,'model']==best_type],
    kind="point",
    height=10,
    aspect=1,
);

### Investigate effect of model complexity

In [None]:
parameter1 = 'max_depth'
# Values at which the remaining parameters are held fixed.
param_dict = {
    'max_features': 0.1, 
    'class_weight': 'None',
}
metric = 'precision'

# Keep only rows of the best model type that match every fixed parameter value.
plot_df = crossfold_df[crossfold_df.loc[:,'model']==best_type]
for key, value in param_dict.items():
    plot_df = plot_df[plot_df.loc[:,key]==value]

# One figure per split (validation first, then training), each plotting the score
# against parameter1. catplot is figure-level and creates its own figure, so a
# preceding plt.figure(...) only produces an empty extra figure; the size is set
# through height/aspect instead.
for split_suffix in ('_val', '_train'):
    sns.catplot(x=parameter1, y=metric+split_suffix, data=plot_df, kind="point", height=10, aspect=1)