# Model Selection

- make_otherreport function: Using the whole results in all the model folders in a result folder, generate the dataframe with all models of all permutations of hyperparameters 

In [1]:
#!pip install pandas==1.4.1
#!pip freeze

import os
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import label_binarize
from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score, f1_score,classification_report 
from sklearn.metrics import roc_curve, auc

import model_team14 
from model_team14 import *



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def make_auc_dict(model_name, clf, labels, y_test, y_pred, y_pred_prob):

    dict_auc={'model':[], 'class':[], 'auc':[]}

    if len(labels)==2:

        fpr, tpr, threshold = roc_curve(y_test, y_pred)
        roc_auc = auc(fpr, tpr)

        dict_auc['model'].append(model_name)
        dict_auc['class'].append(0)
        dict_auc['auc'].append(roc_auc)

        dict_auc['model'].append(model_name)
        dict_auc['class'].append(1)
        dict_auc['auc'].append(roc_auc)

    else:

        #Binarize the output
        y_test_bin = label_binarize(y_test, classes=sorted(labels))
        n_classes = y_test_bin.shape[1]

        fpr = dict()
        tpr = dict()
        roc_auc = dict()
                                    
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_prob[:, i])  #y_score[:, i]

            dict_auc['model'].append(model_name)
            dict_auc['class'].append(i)
            dict_auc['auc'].append(auc(fpr[i], tpr[i]))

    return pd.DataFrame(dict_auc)

In [3]:
def make_otherreport():

    folders=[x for x in os.listdir("../result") if ('ft' in x)]  ##&('ls' not in x)

    for i, folder in enumerate(folders):
        reports=[x for x in os.listdir("../result/{}".format(folder)) if 'creport' in x]
    
        models=[x for x in os.listdir("../result/{}".format(folder)) if 'clf' in x]
        
        ft_criteria=folder.split('_')[1]
        ft_threshold=folder.split('_')[2]
        
        ft_scaling=folder.split('_')[-1]
        if ft_scaling=='scaling':
            ft_scaling='scaled'
        else:
            ft_scaling='non-scaled'

        path='../result/{}'.format(folder)

        for k, report in enumerate(reports):
            with open (path+'/{}'.format(report), 'rb') as f:
                df=pickle.load(f)
            if k==0:
                df_report=df
            else:    
                df_report=pd.concat([df_report, df])

        df_report['ft_criteria']=ft_criteria    
        df_report['ft_threshold']=ft_threshold
        df_report['ft_scaling']=ft_scaling
        
         
        print('folder is', folder)

        for j, model in enumerate(models):
            model_name=model.split('.')[0]
            dtype=model.split('_')[4]
            y_type=model.split('_')[2]+'_'+model.split('_')[3]
            test_year=5

            with open (path+'/{}'.format(model), 'rb') as f:
                [clf, df_cvresult, y_pred, y_pred_prob, clf_report]=pd.read_pickle(f)

            if dtype=='tr':
                X=pd.read_csv('../data/X_data_tr.csv', index_col='date', parse_dates=True)
                y=pd.read_csv('../data/y_data_tr.csv', index_col='date', parse_dates=True)
            else:
                X=pd.read_csv('../data/X_data.csv', index_col='date', parse_dates=True)
                y=pd.read_csv('../data/y_data.csv', index_col='date', parse_dates=True)

            labels=list(y[y_type].unique())    
            y_test=y[y_type][-(test_year*12):]


            df_tmp=make_auc_dict(model_name, clf, labels, y_test, y_pred, y_pred_prob)

            if j==0:
                df_auc=df_tmp
            else:
                df_auc=pd.concat([df_auc, df_tmp])

        df_report=df_report.merge(df_auc, left_on=['class','model'], right_on=['class','model'])

        if i==0:
            df_final_report=df_report
        else:
            df_final_report=pd.concat([df_final_report, df_report])
            
    return df_final_report            

In [4]:
# uncomment to update and generate new pickle files
import pickle

df_creport_other=make_otherreport()
df_creport_other.head()
len(df_creport_other)

df_selection=df_creport_other.groupby(['ft_criteria','ft_threshold','ft_scaling','model']).mean().reset_index()
df_selection.head()

final_selection=df_selection.sort_values(['recall','accuracy'], ascending=[False, False])

with open('../result/final_selection.pkl','wb') as f:
    pickle.dump(final_selection, f)

final_selection[:20]  ##[final_selection.ft_scaling=='non-scaled'][:10]

folder is ft_None_0.3_scaling
folder is ft_None_0.5
folder is ft_None_0.2_scaling
folder is ft_None_0.2
folder is ft_None_0.4_scaling
folder is ft_None_0.3
folder is ft_None_0.3_sensitivity
folder is ft_None_0.5_scaling
folder is ft_None_0.4


Unnamed: 0,ft_criteria,ft_threshold,ft_scaling,model,class,precision,recall,f1_score,support,accuracy,auc
34,,0.2,scaled,clf_rf_y_agg_tr_tss_t5_spl5,1.0,0.566667,0.94,0.633333,20.0,0.83,0.955229
54,,0.3,non-scaled,clf_knn_y_oecd_tr_tss_t5_spl5,0.5,0.655,0.92,0.69,30.0,0.85,0.919643
6,,0.2,non-scaled,clf_knn_y_oecd_tr_tss_t5_spl5,0.5,0.62,0.885,0.625,30.0,0.78,0.883929
102,,0.4,non-scaled,clf_knn_y_oecd_tr_tss_t5_spl5,0.5,0.62,0.885,0.625,30.0,0.78,0.883929
150,,0.5,non-scaled,clf_knn_y_oecd_tr_tss_t5_spl5,0.5,0.62,0.885,0.625,30.0,0.78,0.883929
12,,0.2,non-scaled,clf_rf_y_oecd_ntr_tss_t5_spl5,0.5,0.99,0.875,0.925,30.0,0.98,0.875
36,,0.2,scaled,clf_rf_y_oecd_ntr_tss_t5_spl5,0.5,0.99,0.875,0.925,30.0,0.98,0.875
18,,0.2,non-scaled,clf_svc_y_agg_tr_tss_t5_spl5,1.0,0.69,0.84,0.603333,20.0,0.55,0.874897
76,,0.3,scaled,clf_knn_y_oecd_ntr_tss_t5_spl5,0.5,0.68,0.83,0.72,30.0,0.9,0.830357
124,,0.4,scaled,clf_knn_y_oecd_ntr_tss_t5_spl5,0.5,0.68,0.83,0.72,30.0,0.9,0.830357


In [5]:
with open('../result/ft_None_{}{}/df_{}.pkl'.format(0.2, '_scaling', 'rf'),'rb') as f:
        df_tmp=pickle.load(f)

df_tmp.sort_values('mean_test_recall', ascending=False)

Unnamed: 0,class_weight,max_features,n_estimators,warm_start,model,data,y,cv,mean_test_recall,std_test_recall,rank_test_recall
14,balanced,0.5,50,True,RF,tr,y_oecd,tss,0.687388,0.108870,1
15,balanced,0.5,50,False,RF,tr,y_oecd,tss,0.687388,0.108870,1
61,,0.2,30,False,RF,tr,y_oecd,tss,0.684590,0.101021,3
60,,0.2,30,True,RF,tr,y_oecd,tss,0.684590,0.101021,3
49,balanced_subsample,0.7,30,False,RF,tr,y_oecd,tss,0.681367,0.124834,5
...,...,...,...,...,...,...,...,...,...,...,...
7,balanced,0.3,30,False,RF,ntr,y_agg,tss,0.322009,0.125501,85
42,balanced_subsample,0.5,30,True,RF,ntr,y_agg,tss,0.317870,0.138308,87
43,balanced_subsample,0.5,30,False,RF,ntr,y_agg,tss,0.317870,0.138308,87
37,balanced_subsample,0.3,30,False,RF,ntr,y_agg,tss,0.291785,0.160226,89


In [17]:
thresholds=[0.2,0.3,0.4,0.5]
scalings=['_scaling',''] # ['_scaling','']
final_cv=pd.DataFrame()
for scaling in scalings:
    for threshold in thresholds:
        for i, model in enumerate(['knn','rf','svc','knn_km','rf_km','svc_km']):
            with open('../result/ft_None_{}{}/df_{}.pkl'.format(threshold, scaling, model),'rb') as f:
                df_tmp=pickle.load(f)
                df_tmp['grouping']='km' if  'km' in model else "original"
                df_tmp['threshold']=threshold
                df_tmp['scaling']=scaling
            final_cv=pd.concat([final_cv, df_tmp])
            
# uncomment to save the result
with open('../result/final_cv.pkl', 'wb') as f:
    pickle.dump(final_cv, f)

In [18]:
final_cv[(final_cv.threshold==0.3)
      &(final_cv.y=='y_oecd')
      &(final_cv.data=='tr')
      &(final_cv.scaling=='_scaling')      
      &(final_cv.rank_test_recall==1)            
     ].sort_values('mean_test_recall', ascending=False)[:10]

Unnamed: 0,n_neighbors,weights,model,data,y,cv,mean_test_recall,std_test_recall,rank_test_recall,grouping,threshold,scaling,class_weight,max_features,n_estimators,warm_start,C,kernel
156,,,RF,tr,y_oecd,tss,0.760129,0.04989,1,original,0.3,_scaling,balanced_subsample,0.5,10.0,True,,
157,,,RF,tr,y_oecd,tss,0.760129,0.04989,1,original,0.3,_scaling,balanced_subsample,0.5,10.0,False,,
8,10.0,uniform,KNN,tr,y_oecd,tss,0.703995,0.128376,1,original,0.3,_scaling,,,,,,
7,,,SVC,tr,y_oecd,tss,0.69339,0.065548,1,original,0.3,_scaling,,,,,2.0,rbf
32,,,RF,tr,y_oecd,tss,0.667786,0.057932,1,km,0.3,_scaling,balanced_subsample,0.2,50.0,True,,
33,,,RF,tr,y_oecd,tss,0.667786,0.057932,1,km,0.3,_scaling,balanced_subsample,0.2,50.0,False,,
38,,,RF,tr,y_oecd,tss,0.667786,0.057932,1,km,0.3,_scaling,balanced_subsample,0.3,50.0,True,,
39,,,RF,tr,y_oecd,tss,0.667786,0.057932,1,km,0.3,_scaling,balanced_subsample,0.3,50.0,False,,
8,10.0,uniform,KNN,tr,y_oecd,tss,0.657454,0.03949,1,km,0.3,_scaling,,,,,,
8,,,SVC,tr,y_oecd,tss,0.649191,0.107472,1,km,0.3,_scaling,,,,,2.0,sigmoid


In [9]:
final_selection
final_selection['model_type']=final_selection['model'].apply(lambda x:x.split('_')[1])
final_selection['y_type']=final_selection['class'].apply(lambda x:'binary' if x==0.5 else 'trinary')
final_selection['dtype']=final_selection['model'].apply(lambda x:'non-stationary' if x.split('_')[4]=='ntr' else 'stationary')
final_selection['grouping']=final_selection['model'].apply(lambda x:'km' if x.split('_')[-1]=='km' else 'original')


final_selection.head(10)

Unnamed: 0,ft_criteria,ft_threshold,ft_scaling,model,class,precision,recall,f1_score,support,accuracy,auc,model_type,y_type,dtype,grouping
34,,0.2,scaled,clf_rf_y_agg_tr_tss_t5_spl5,1.0,0.566667,0.94,0.633333,20.0,0.83,0.955229,rf,trinary,stationary,original
54,,0.3,non-scaled,clf_knn_y_oecd_tr_tss_t5_spl5,0.5,0.655,0.92,0.69,30.0,0.85,0.919643,knn,binary,stationary,original
6,,0.2,non-scaled,clf_knn_y_oecd_tr_tss_t5_spl5,0.5,0.62,0.885,0.625,30.0,0.78,0.883929,knn,binary,stationary,original
102,,0.4,non-scaled,clf_knn_y_oecd_tr_tss_t5_spl5,0.5,0.62,0.885,0.625,30.0,0.78,0.883929,knn,binary,stationary,original
150,,0.5,non-scaled,clf_knn_y_oecd_tr_tss_t5_spl5,0.5,0.62,0.885,0.625,30.0,0.78,0.883929,knn,binary,stationary,original
12,,0.2,non-scaled,clf_rf_y_oecd_ntr_tss_t5_spl5,0.5,0.99,0.875,0.925,30.0,0.98,0.875,rf,binary,non-stationary,original
36,,0.2,scaled,clf_rf_y_oecd_ntr_tss_t5_spl5,0.5,0.99,0.875,0.925,30.0,0.98,0.875,rf,binary,non-stationary,original
18,,0.2,non-scaled,clf_svc_y_agg_tr_tss_t5_spl5,1.0,0.69,0.84,0.603333,20.0,0.55,0.874897,svc,trinary,stationary,original
76,,0.3,scaled,clf_knn_y_oecd_ntr_tss_t5_spl5,0.5,0.68,0.83,0.72,30.0,0.9,0.830357,knn,binary,non-stationary,original
124,,0.4,scaled,clf_knn_y_oecd_ntr_tss_t5_spl5,0.5,0.68,0.83,0.72,30.0,0.9,0.830357,knn,binary,non-stationary,original


In [19]:
final_selection[(final_selection.ft_threshold=='0.3')
                &(final_selection.ft_scaling=='scaled')
                &(final_selection.model=='clf_rf_y_oecd_tr_tss_t5_spl5')
               ]

Unnamed: 0,ft_criteria,ft_threshold,ft_scaling,model,class,precision,recall,f1_score,support,accuracy,auc,model_type,y_type,dtype,grouping
86,,0.3,scaled,clf_rf_y_oecd_tr_tss_t5_spl5,0.5,0.595,0.775,0.605,30.0,0.8,0.776786,rf,binary,stationary,original


In [20]:
import altair as alt

base1=alt.Chart(final_selection).mark_point(size=30, shape='circle').encode(
    x=alt.X('precision:Q', scale=alt.Scale(domainMin=0.2, domainMax=1),  ##domainMin=0.45, domainMax=0.7
           title='Is classified cycle true?'), #domainMin=0.0, domainMax=0.37
    y=alt.Y('recall:Q', scale=alt.Scale(domainMin=0.1, domainMax=1),  ##domainMin=0.75, domainMax=0.9
           title='How completely cycle can be detected?'), #domainMin=0.9, domainMax=1.0
    color=alt.Color('y_type:N', scale=alt.Scale(range=['blue','red']), title='# of classes'), #scheme= 'turbo',domainMin=0.78, domainMax=0.86
    shape=alt.Shape('model_type:N', title='model type')
#    color=alt.Color('auc:Q', scale=alt.Scale(scheme= 'turbo', ), title='auc score') #domainMin=0.78, domainMax=0.86
#).transform_filter(alt.datum.model_type=='knn')
).transform_filter((alt.datum.precision>0.2)&(alt.datum.recall>0.1)&(alt.datum.model_type=='rf'
                  ))


emp=alt.Chart(final_selection[final_selection.index==86]).mark_circle(size=200, filled=False, stroke='orange', strokeWidth=5).encode(
    x=alt.X('precision:Q',  #scale=alt.Scale(domainMin=0, domainMax=1)
           title='Is classified cycle true?'), #domainMin=0.0, domainMax=0.37
    y=alt.Y('recall:Q',  #scale=alt.Scale(domainMin=0, domainMax=1)
           title='How completely cycle can be detected?')
)


(emp+base1).properties(                       ##+emp1+emp2+emp3
    width=200, height=300,
    title={'text':['Precision(X) and Recall(Y) of RF models'],
       'subtitle':['the orange point: The Final RF Model'],
       'align':'left',
       'anchor':'start'
      })

In [21]:
import altair as alt

base1=alt.Chart(final_selection).mark_point(size=30, shape='circle').encode(
    x=alt.X('precision:Q', scale=alt.Scale(domainMin=0.2, domainMax=1),  ##domainMin=0.45, domainMax=0.7
           title='Is classified cycle true?'), #domainMin=0.0, domainMax=0.37
    y=alt.Y('recall:Q', scale=alt.Scale(domainMin=0.3, domainMax=1),  ##domainMin=0.75, domainMax=0.9
           title='How completely cycle can be detected?'), #domainMin=0.9, domainMax=1.0
    color=alt.Color('ft_threshold:N', scale=alt.Scale(scheme='blues'), title='Threshold for PCA'), #scheme= 'turbo',domainMin=0.78, domainMax=0.86
    shape=alt.Shape('model_type:N', title='model type')
#    color=alt.Color('auc:Q', scale=alt.Scale(scheme= 'turbo', ), title='auc score') #domainMin=0.78, domainMax=0.86
).transform_filter((alt.datum.precision>0.2)&(alt.datum.recall>0.3)&(alt.datum.model_type=='rf'
                  ))


emp=alt.Chart(final_selection[final_selection.index==86]).mark_circle(size=100, filled=False, stroke='orange', strokeWidth=5).encode(
    x=alt.X('precision:Q',  #scale=alt.Scale(domainMin=0, domainMax=1)
           title='Is classified cycle true?'), #domainMin=0.0, domainMax=0.37
    y=alt.Y('recall:Q',  #scale=alt.Scale(domainMin=0, domainMax=1)
           title='How completely cycle can be detected?')
)


(emp+base1).properties(                       ##+emp1+emp2+emp3
    width=200, height=300,
    title={'text':['Precision(X) and Recall(Y) of RF models'],
       'subtitle':['the orange point: The Final RF Model'],
       'align':'left',
       'anchor':'start'
      })

In [22]:
import altair as alt

base1=alt.Chart(final_selection).mark_point(size=30, shape='circle').encode(
    x=alt.X('precision:Q', scale=alt.Scale(domainMin=0.2, domainMax=1),  ##domainMin=0.45, domainMax=0.7
           title='Is classified cycle true?'), #domainMin=0.0, domainMax=0.37
    y=alt.Y('recall:Q', scale=alt.Scale(domainMin=0.3, domainMax=1),  ##domainMin=0.75, domainMax=0.9
           title='How completely cycle can be detected?'), #domainMin=0.9, domainMax=1.0
    color=alt.Color('dtype:N', scale=alt.Scale(range=['blue','red']), title='type of X data'), #scheme= 'turbo',domainMin=0.78, domainMax=0.86
    shape=alt.Shape('model_type:N', title='model type')
#    color=alt.Color('auc:Q', scale=alt.Scale(scheme= 'turbo', ), title='auc score') #domainMin=0.78, domainMax=0.86
).transform_filter((alt.datum.precision>0.2)&(alt.datum.recall>0.3)&(alt.datum.model_type=='rf'
                  ))

emp=alt.Chart(final_selection[final_selection.index==86]).mark_circle(size=100, filled=False, stroke='orange', strokeWidth=5).encode(
    x=alt.X('precision:Q',  #scale=alt.Scale(domainMin=0, domainMax=1)
           title='Is classified cycle true?'), #domainMin=0.0, domainMax=0.37
    y=alt.Y('recall:Q',  #scale=alt.Scale(domainMin=0, domainMax=1)
           title='How completely cycle can be detected?')
)

(emp+base1).properties(                       ##+emp1+emp2+emp3
    width=200, height=300,
    title={'text':['Precision(X) and Recall(Y) of RF models'],
       'subtitle':['the orange point: The Final RF Model'],
       'align':'left',
       'anchor':'start'
      })

In [23]:
import altair as alt

base1=alt.Chart(final_selection).mark_point(size=30, shape='circle').encode(
    x=alt.X('precision:Q', scale=alt.Scale(domainMin=0.2, domainMax=1),  ##domainMin=0.45, domainMax=0.7
           title='Is classified cycle true?'), #domainMin=0.0, domainMax=0.37
    y=alt.Y('recall:Q', scale=alt.Scale(domainMin=0.3, domainMax=1),  ##domainMin=0.75, domainMax=0.9
           title='How completely cycle can be detected?'), #domainMin=0.9, domainMax=1.0
    color=alt.Color('grouping:N', scale=alt.Scale(range=['blue','red']), title='methods of grouping'), #scheme= 'turbo',domainMin=0.78, domainMax=0.86
    shape=alt.Shape('model_type:N', title='model type')
#    color=alt.Color('auc:Q', scale=alt.Scale(scheme= 'turbo', ), title='auc score') #domainMin=0.78, domainMax=0.86
).transform_filter((alt.datum.precision>0.2)&(alt.datum.recall>0.3)&(alt.datum.model_type=='rf'
                  ))


(base1).properties(                       ##+emp1+emp2+emp3
    width=200, height=300,
    title={'text':['Precision(X) and Recall(Y) of candidate models'],
#       'subtitle':['the annotated point: The Best Model for each model type'],
       'align':'left',
       'anchor':'start'
      })

In [24]:
import altair as alt

base1=alt.Chart(final_selection).mark_point(size=30, shape='circle', filled=True).encode(
    x=alt.X('precision:Q', scale=alt.Scale(domainMin=0, domainMax=1),  ##domainMin=0.45, domainMax=0.7
           title='Is classified cycle true?'), #domainMin=0.0, domainMax=0.37
    y=alt.Y('recall:Q', scale=alt.Scale(domainMin=0, domainMax=1),  ##domainMin=0.75, domainMax=0.9
           title='How completely cycle can be detected?'), #domainMin=0.9, domainMax=1.0
    color=alt.Color('model_type:N', scale=alt.Scale(range=['orange','blue','lightgreen']), title='model_type'), #scheme= 'turbo',domainMin=0.78, domainMax=0.86
#    shape=alt.Shape('model_type:N', title='model type')
#    color=alt.Color('auc:Q', scale=alt.Scale(scheme= 'turbo', ), title='auc score') #domainMin=0.78, domainMax=0.86
#).transform_filter(alt.datum.model_type=='knn')
)

emp=alt.Chart(final_selection[final_selection.index==86]).mark_circle(size=100, filled=False, stroke='red', strokeWidth=3).encode(
    x=alt.X('precision:Q',  #scale=alt.Scale(domainMin=0, domainMax=1)
           title='Is classified cycle true?'), #domainMin=0.0, domainMax=0.37
    y=alt.Y('recall:Q',  #scale=alt.Scale(domainMin=0, domainMax=1)
           title='How completely cycle can be detected?')
)


(emp+base1).properties(                       ##+emp1+emp2+emp3
    width=200, height=300,
    title={'text':['Precision(X) and Recall(Y) of candidate models'],
       'subtitle':['the red point: The Final RF Model'],
       'align':'left',
       'anchor':'start'
      })

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=dc2156ff-f31b-485a-9893-d89a520307c4' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>