In [1]:
#!/usr/bin/python

# import sys
import pickle
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# import math
# from scipy import stats
from functools import partial

# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.feature_selection import SelectPercentile, SelectFromModel, \
        f_classif, mutual_info_classif, chi2, SelectFpr, SelectFdr, RFECV
from sklearn.decomposition import FastICA, IncrementalPCA, KernelPCA, PCA, \
                                  TruncatedSVD
from sklearn.pipeline import Pipeline

# from sklearn.linear_model import LinearRegression
# from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

from sklearn.model_selection import GridSearchCV
# from genetic_selection import GeneticSelectionCV
# from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# from time import time

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# sys.path.append("../tools/")
# from feature_format import featureFormat, targetFeatureSplit
# from dos2unix import crlf_to_lf
# from tester import dump_classifier_and_data

In [2]:
### Load data sets.
with open(file='y_train.pkl', mode='rb') as file:
    y_train = pickle.load(file=file)
y_test = None
    
X_train_scaled_imp0 = None
with open('X_train_scaled_imp0.pkl', 'rb') as file:
    X_train_scaled_imp0 = pickle.load(file=file)
X_train_scaled_imp_med = None
with open('X_train_scaled_imp_med.pkl', 'rb') as file:
    X_train_scaled_imp_med = pickle.load(file=file)
X_train_scaled_imp_mv = None
with open('X_train_scaled_imp_mv.pkl', 'rb') as file:
    X_train_scaled_imp_mv = pickle.load(file=file)
    
with open('imp_gscvs_dict_1.pkl', 'rb') as file:
    imp_gscvs_dict1 = pickle.load(file=file)

In [3]:
### Initial search params/steps

n_jobs = -1

mutual_info_classif_partial = partial(mutual_info_classif, random_state=42)
DecisionTreeClassifier_partial = \
    partial(DecisionTreeClassifier, random_state=42)
RandomForestClassifier_partial = \
    partial(RandomForestClassifier, random_state=42, n_jobs=n_jobs)
AdaBoostClassifier_partial = partial(AdaBoostClassifier, random_state=42)
svm_SVC_partial = partial(svm.SVC, random_state=42)
KNeighborsClassifier_partial = partial(KNeighborsClassifier, n_jobs=n_jobs)

selectors = {
    'sel_per': {
        'sel': SelectPercentile(),
        'params': {
            'sel_per__score_func': [f_classif, chi2,
                                    mutual_info_classif_partial],
            'sel_per__percentile': [5, 10, 20, 50, 100]
        }
    }
}

decomps = {
    'fica': {
        'dec': FastICA(),
        'params': {
            'fica__algorithm': ['parallel', 'deflation'],
            'fica__fun': ['logcosh', 'exp', 'cube'],
            'fica__random_state': [42]
        }
    },
    'ipca': {
        'dec': IncrementalPCA(),
        'params': {
            ### defaults
        }
    }
}

classifiers = {
    'dt_clf': {
        'clf': DecisionTreeClassifier(),
        'params': {
            'dt_clf__random_state': [42]
        }
    },
    'rf_clf': {
        'clf': RandomForestClassifier(),
        'params': {
            'rf_clf__n_estimators': [4, 8, 16],
            'rf_clf__max_features': ['sqrt', 'log2', 0.75, None],
            'rf_clf__max_depth': [32, 128, None],
            'rf_clf__min_samples_split': [2, 4, 8],
            'rf_clf__min_samples_leaf': [1, 2, 4],
            'rf_clf__bootstrap': [True, False],
            'rf_clf__random_state': [42],
            'rf_clf__n_jobs': [n_jobs]
        }
    },
    'ab_clf': {
        'clf': AdaBoostClassifier(),
        'params': {
            'ab_clf__base_estimator': [
                DecisionTreeClassifier_partial(),
                RandomForestClassifier_partial(),
                AdaBoostClassifier_partial(),
                svm_SVC_partial(),
                KNeighborsClassifier_partial(),
                GaussianNB()
            ],
            'ab_clf__n_estimators': [32, 50, 64],
            'ab_clf__algorithm': ['SAMME', 'SAMME.R'],
            'ab_clf__random_state': [42]
          }
    },
    'kn_clf': {
        'clf': KNeighborsClassifier(),
        'params': {
            'kn_clf__n_neighbors': [2, 4, 8],
            'kn_clf__weights': ['uniform', 'distance'],
            'kn_clf__algorithm': ['ball_tree', 'kd_tree', 'brute'],
            'kn_clf__leaf_size': [16, 32, 64],
            'kn_clf__n_jobs': [n_jobs]
        }
    },
    'gnb_clf': {
        'clf': GaussianNB(),
        'params': {
            ### Defaults only
        }
    },
}

In [5]:
### Well, that was silly of me to return my search data in this structure.
### Reformat and expand data into a dataframe.
imp_gscvs_df = pd.DataFrame(columns=['imp_method', 'selector', 'decomp', 'classifier', 'best_score_',
                                     'best_params_', 'gscv'])

for imp, gscv_dict in imp_gscvs_dict1.items():
    for steps_name, gscv in gscv_dict.items():
        row_dict = {'imp_method': imp}
        
        steps_lst = steps_name.split('_')
        if len(steps_lst) == 5:
            row_dict['selector'] = '_'.join(steps_lst[:2])
            row_dict['decomp'] = steps_lst[2]
            row_dict['classifier'] = '_'.join(steps_lst[3:])
            row_dict['best_score_'] = gscv.best_score_
            row_dict['best_params_'] = gscv.best_params_
            row_dict['gscv'] = gscv
        
        row_dict.update(gscv.best_params_)
        imp_gscvs_df = imp_gscvs_df.append(row_dict, ignore_index=True)
        
imp_gscvs_df = imp_gscvs_df.set_index(keys=['imp_method', 'selector', 'decomp', 'classifier'])
imp_gscvs_df.info()
imp_gscvs_df

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 30 entries, ('imp0', 'sel_per', 'fica', 'dt_clf') to ('imp_mv', 'sel_per', 'ipca', 'gnb_clf')
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   best_score_                30 non-null     float64
 1   best_params_               30 non-null     object 
 2   gscv                       30 non-null     object 
 3   dt_clf__random_state       6 non-null      float64
 4   fica__algorithm            15 non-null     object 
 5   fica__fun                  15 non-null     object 
 6   fica__random_state         15 non-null     float64
 7   sel_per__percentile        30 non-null     float64
 8   sel_per__score_func        30 non-null     object 
 9   rf_clf__bootstrap          6 non-null      float64
 10  rf_clf__max_depth          6 non-null      float64
 11  rf_clf__max_features       6 non-null      object 
 12  rf_clf__min_samples_leaf   6 non-n

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,best_score_,best_params_,gscv,dt_clf__random_state,fica__algorithm,fica__fun,fica__random_state,sel_per__percentile,sel_per__score_func,rf_clf__bootstrap,...,rf_clf__random_state,ab_clf__algorithm,ab_clf__base_estimator,ab_clf__n_estimators,ab_clf__random_state,kn_clf__algorithm,kn_clf__leaf_size,kn_clf__n_jobs,kn_clf__n_neighbors,kn_clf__weights
imp_method,selector,decomp,classifier,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
imp0,sel_per,fica,dt_clf,0.824837,"{'dt_clf__random_state': 42, 'fica__algorithm'...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,42.0,parallel,logcosh,42.0,5.0,<function chi2 at 0x0000024138DC3D30>,,...,,,,,,,,,,
imp0,sel_per,fica,rf_clf,0.872549,"{'fica__algorithm': 'parallel', 'fica__fun': '...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,parallel,logcosh,42.0,20.0,<function f_classif at 0x0000024138DC39D0>,1.0,...,42.0,,,,,,,,,
imp0,sel_per,fica,ab_clf,0.860784,"{'ab_clf__algorithm': 'SAMME', 'ab_clf__base_e...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,parallel,cube,42.0,50.0,functools.partial(<function mutual_info_classi...,,...,,SAMME,AdaBoostClassifier(random_state=42),32.0,42.0,,,,,
imp0,sel_per,fica,kn_clf,0.884314,"{'fica__algorithm': 'parallel', 'fica__fun': '...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,parallel,exp,42.0,10.0,<function chi2 at 0x0000024138DC3D30>,,...,,,,,,ball_tree,16.0,-1.0,4.0,distance
imp0,sel_per,fica,gnb_clf,0.860784,"{'fica__algorithm': 'parallel', 'fica__fun': '...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,parallel,logcosh,42.0,20.0,<function chi2 at 0x0000024138DC3D30>,,...,,,,,,,,,,
imp0,sel_per,ipca,dt_clf,0.732026,"{'dt_clf__random_state': 42, 'sel_per__percent...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,42.0,,,,5.0,<function f_classif at 0x0000024138DC39D0>,,...,,,,,,,,,,
imp0,sel_per,ipca,rf_clf,0.872549,"{'rf_clf__bootstrap': True, 'rf_clf__max_depth...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,,,,10.0,functools.partial(<function mutual_info_classi...,1.0,...,42.0,,,,,,,,,
imp0,sel_per,ipca,ab_clf,0.860784,"{'ab_clf__algorithm': 'SAMME', 'ab_clf__base_e...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,,,,50.0,<function chi2 at 0x0000024138DC3D30>,,...,,SAMME,"RandomForestClassifier(n_jobs=-1, random_state...",32.0,42.0,,,,,
imp0,sel_per,ipca,kn_clf,0.872549,"{'kn_clf__algorithm': 'ball_tree', 'kn_clf__le...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,,,,10.0,<function f_classif at 0x0000024138DC39D0>,,...,,,,,,ball_tree,16.0,-1.0,4.0,uniform
imp0,sel_per,ipca,gnb_clf,0.860784,"{'sel_per__percentile': 20, 'sel_per__score_fu...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,,,,20.0,<function f_classif at 0x0000024138DC39D0>,,...,,,,,,,,,,


In [24]:
### Sort by best_score_.
print('Best scores sorted:\n')
imp_gscvs_df.sort_values(by='best_score_', ascending=False)

Best scores sorted:



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,best_score_,best_params_,gscv,dt_clf__random_state,fica__algorithm,fica__fun,fica__random_state,sel_per__percentile,sel_per__score_func,rf_clf__bootstrap,...,rf_clf__random_state,ab_clf__algorithm,ab_clf__base_estimator,ab_clf__n_estimators,ab_clf__random_state,kn_clf__algorithm,kn_clf__leaf_size,kn_clf__n_jobs,kn_clf__n_neighbors,kn_clf__weights
imp_method,selector,decomp,classifier,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
imp0,sel_per,fica,kn_clf,0.884314,"{'fica__algorithm': 'parallel', 'fica__fun': '...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,parallel,exp,42.0,10.0,<function chi2 at 0x0000024138DC3D30>,,...,,,,,,ball_tree,16.0,-1.0,4.0,distance
imp_med,sel_per,fica,kn_clf,0.884314,"{'fica__algorithm': 'deflation', 'fica__fun': ...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,deflation,logcosh,42.0,50.0,functools.partial(<function mutual_info_classi...,,...,,,,,,ball_tree,16.0,-1.0,2.0,uniform
imp_mv,sel_per,fica,rf_clf,0.88366,"{'fica__algorithm': 'parallel', 'fica__fun': '...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,parallel,cube,42.0,5.0,<function f_classif at 0x0000024138DC39D0>,1.0,...,42.0,,,,,,,,,
imp_med,sel_per,ipca,rf_clf,0.88366,"{'rf_clf__bootstrap': False, 'rf_clf__max_dept...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,,,,50.0,functools.partial(<function mutual_info_classi...,0.0,...,42.0,,,,,,,,,
imp_med,sel_per,fica,rf_clf,0.88366,"{'fica__algorithm': 'parallel', 'fica__fun': '...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,parallel,logcosh,42.0,5.0,<function f_classif at 0x0000024138DC39D0>,0.0,...,42.0,,,,,,,,,
imp_med,sel_per,fica,ab_clf,0.872549,"{'ab_clf__algorithm': 'SAMME.R', 'ab_clf__base...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,parallel,exp,42.0,20.0,functools.partial(<function mutual_info_classi...,,...,,SAMME.R,GaussianNB(),32.0,42.0,,,,,
imp0,sel_per,fica,rf_clf,0.872549,"{'fica__algorithm': 'parallel', 'fica__fun': '...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,parallel,logcosh,42.0,20.0,<function f_classif at 0x0000024138DC39D0>,1.0,...,42.0,,,,,,,,,
imp_mv,sel_per,ipca,kn_clf,0.872549,"{'kn_clf__algorithm': 'ball_tree', 'kn_clf__le...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,,,,5.0,functools.partial(<function mutual_info_classi...,,...,,,,,,ball_tree,16.0,-1.0,8.0,uniform
imp0,sel_per,ipca,rf_clf,0.872549,"{'rf_clf__bootstrap': True, 'rf_clf__max_depth...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,,,,10.0,functools.partial(<function mutual_info_classi...,1.0,...,42.0,,,,,,,,,
imp0,sel_per,ipca,kn_clf,0.872549,"{'kn_clf__algorithm': 'ball_tree', 'kn_clf__le...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,,,,10.0,<function f_classif at 0x0000024138DC39D0>,,...,,,,,,ball_tree,16.0,-1.0,4.0,uniform


In [36]:
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,best_score_,best_params_,gscv,dt_clf__random_state,fica__algorithm,fica__fun,fica__random_state,sel_per__percentile,sel_per__score_func,rf_clf__bootstrap,...,rf_clf__random_state,ab_clf__algorithm,ab_clf__base_estimator,ab_clf__n_estimators,ab_clf__random_state,kn_clf__algorithm,kn_clf__leaf_size,kn_clf__n_jobs,kn_clf__n_neighbors,kn_clf__weights
imp_method,selector,decomp,classifier,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
imp0,sel_per,fica,kn_clf,0.884314,"{'fica__algorithm': 'parallel', 'fica__fun': '...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,parallel,exp,42.0,10.0,<function chi2 at 0x0000024138DC3D30>,,...,,,,,,ball_tree,16.0,-1.0,4.0,distance
imp_med,sel_per,fica,kn_clf,0.884314,"{'fica__algorithm': 'deflation', 'fica__fun': ...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,deflation,logcosh,42.0,50.0,functools.partial(<function mutual_info_classi...,,...,,,,,,ball_tree,16.0,-1.0,2.0,uniform
imp_mv,sel_per,fica,rf_clf,0.88366,"{'fica__algorithm': 'parallel', 'fica__fun': '...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,parallel,cube,42.0,5.0,<function f_classif at 0x0000024138DC39D0>,1.0,...,42.0,,,,,,,,,
imp_med,sel_per,ipca,rf_clf,0.88366,"{'rf_clf__bootstrap': False, 'rf_clf__max_dept...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,,,,50.0,functools.partial(<function mutual_info_classi...,0.0,...,42.0,,,,,,,,,
imp_med,sel_per,fica,rf_clf,0.88366,"{'fica__algorithm': 'parallel', 'fica__fun': '...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,parallel,logcosh,42.0,5.0,<function f_classif at 0x0000024138DC39D0>,0.0,...,42.0,,,,,,,,,
imp_med,sel_per,fica,ab_clf,0.872549,"{'ab_clf__algorithm': 'SAMME.R', 'ab_clf__base...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,parallel,exp,42.0,20.0,functools.partial(<function mutual_info_classi...,,...,,SAMME.R,GaussianNB(),32.0,42.0,,,,,
imp0,sel_per,fica,rf_clf,0.872549,"{'fica__algorithm': 'parallel', 'fica__fun': '...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,parallel,logcosh,42.0,20.0,<function f_classif at 0x0000024138DC39D0>,1.0,...,42.0,,,,,,,,,
imp_mv,sel_per,ipca,kn_clf,0.872549,"{'kn_clf__algorithm': 'ball_tree', 'kn_clf__le...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,,,,5.0,functools.partial(<function mutual_info_classi...,,...,,,,,,ball_tree,16.0,-1.0,8.0,uniform
imp0,sel_per,ipca,rf_clf,0.872549,"{'rf_clf__bootstrap': True, 'rf_clf__max_depth...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,,,,10.0,functools.partial(<function mutual_info_classi...,1.0,...,42.0,,,,,,,,,
imp0,sel_per,ipca,kn_clf,0.872549,"{'kn_clf__algorithm': 'ball_tree', 'kn_clf__le...",GridSearchCV(estimator=Pipeline(steps=[('sel_p...,,,,,10.0,<function f_classif at 0x0000024138DC39D0>,,...,,,,,,ball_tree,16.0,-1.0,4.0,uniform


In [49]:
print('\nClassifier stats:\n')
imp_gscvs_df.groupby(by='classifier')['best_score_'].min().sort_values(ascending=False)
imp_gscvs_df.groupby(by='classifier')['best_score_'].mean().sort_values(ascending=False)
imp_gscvs_df.groupby(by='classifier')['best_score_'].max().sort_values(ascending=False)

imp_gscvs_df.sort_values(by='best_score_', ascending=False).head(15).groupby(by='classifier')\
    ['best_score_'].count()

### Groupby classifier parameters.
for clf, clf_dict in classifiers.items():
    for param in clf_dict['params'].keys():
        print('Count of', str(clf_dict['clf']), 'best', param, ':')
        print('Possible values:')
        clf_dict['params'][param]
        imp_gscvs_df.groupby(by=param, sort=False)['best_score_'].count()
        imp_gscvs_df.sort_values(by='best_score_', ascending=False).head(15)\
            .groupby(by=param, sort=False)['best_score_'].count()


Classifier stats:



classifier
rf_clf     0.872549
kn_clf     0.861438
ab_clf     0.860784
gnb_clf    0.860784
dt_clf     0.732026
Name: best_score_, dtype: float64

classifier
rf_clf     0.878105
kn_clf     0.874619
ab_clf     0.866667
gnb_clf    0.862963
dt_clf     0.800545
Name: best_score_, dtype: float64

classifier
kn_clf     0.884314
rf_clf     0.883660
ab_clf     0.872549
gnb_clf    0.872549
dt_clf     0.825490
Name: best_score_, dtype: float64

classifier
ab_clf     3
gnb_clf    1
kn_clf     5
rf_clf     6
Name: best_score_, dtype: int64

Count of DecisionTreeClassifier() best dt_clf__random_state :
Possible values:


[42]

dt_clf__random_state
42.0    6
Name: best_score_, dtype: int64

Series([], Name: best_score_, dtype: int64)

Count of RandomForestClassifier() best rf_clf__n_estimators :
Possible values:


[4, 8, 16]

rf_clf__n_estimators
4.0     2
8.0     3
16.0    1
Name: best_score_, dtype: int64

rf_clf__n_estimators
8.0     3
16.0    1
4.0     2
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__max_features :
Possible values:


['sqrt', 'log2', 0.75, None]

rf_clf__max_features
sqrt    5
log2    1
Name: best_score_, dtype: int64

rf_clf__max_features
sqrt    5
log2    1
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__max_depth :
Possible values:


[32, 128, None]

rf_clf__max_depth
32.0    6
Name: best_score_, dtype: int64

rf_clf__max_depth
32.0    6
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__min_samples_split :
Possible values:


[2, 4, 8]

rf_clf__min_samples_split
2.0    5
8.0    1
Name: best_score_, dtype: int64

rf_clf__min_samples_split
2.0    5
8.0    1
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__min_samples_leaf :
Possible values:


[1, 2, 4]

rf_clf__min_samples_leaf
4.0    2
2.0    3
1.0    1
Name: best_score_, dtype: int64

rf_clf__min_samples_leaf
4.0    2
2.0    3
1.0    1
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__bootstrap :
Possible values:


[True, False]

rf_clf__bootstrap
1.0    4
0.0    2
Name: best_score_, dtype: int64

rf_clf__bootstrap
1.0    4
0.0    2
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__random_state :
Possible values:


[42]

rf_clf__random_state
42.0    6
Name: best_score_, dtype: int64

rf_clf__random_state
42.0    6
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__n_jobs :
Possible values:


[-1]

rf_clf__n_jobs
-1.0    6
Name: best_score_, dtype: int64

rf_clf__n_jobs
-1.0    6
Name: best_score_, dtype: int64

Count of AdaBoostClassifier() best ab_clf__base_estimator :
Possible values:


[DecisionTreeClassifier(random_state=42),
 RandomForestClassifier(n_jobs=-1, random_state=42),
 AdaBoostClassifier(random_state=42),
 SVC(random_state=42),
 KNeighborsClassifier(n_jobs=-1),
 GaussianNB()]

ab_clf__base_estimator
AdaBoostClassifier(random_state=42)                   1
RandomForestClassifier(n_jobs=-1, random_state=42)    2
GaussianNB()                                          3
Name: best_score_, dtype: int64

ab_clf__base_estimator
GaussianNB()    3
Name: best_score_, dtype: int64

Count of AdaBoostClassifier() best ab_clf__n_estimators :
Possible values:


[32, 50, 64]

ab_clf__n_estimators
32.0    5
50.0    1
Name: best_score_, dtype: int64

ab_clf__n_estimators
32.0    2
50.0    1
Name: best_score_, dtype: int64

Count of AdaBoostClassifier() best ab_clf__algorithm :
Possible values:


['SAMME', 'SAMME.R']

ab_clf__algorithm
SAMME      3
SAMME.R    3
Name: best_score_, dtype: int64

ab_clf__algorithm
SAMME.R    3
Name: best_score_, dtype: int64

Count of AdaBoostClassifier() best ab_clf__random_state :
Possible values:


[42]

ab_clf__random_state
42.0    6
Name: best_score_, dtype: int64

ab_clf__random_state
42.0    3
Name: best_score_, dtype: int64

Count of KNeighborsClassifier() best kn_clf__n_neighbors :
Possible values:


[2, 4, 8]

kn_clf__n_neighbors
4.0    3
2.0    1
8.0    2
Name: best_score_, dtype: int64

kn_clf__n_neighbors
4.0    2
2.0    1
8.0    2
Name: best_score_, dtype: int64

Count of KNeighborsClassifier() best kn_clf__weights :
Possible values:


['uniform', 'distance']

kn_clf__weights
distance    3
uniform     3
Name: best_score_, dtype: int64

kn_clf__weights
distance    2
uniform     3
Name: best_score_, dtype: int64

Count of KNeighborsClassifier() best kn_clf__algorithm :
Possible values:


['ball_tree', 'kd_tree', 'brute']

kn_clf__algorithm
ball_tree    6
Name: best_score_, dtype: int64

kn_clf__algorithm
ball_tree    5
Name: best_score_, dtype: int64

Count of KNeighborsClassifier() best kn_clf__leaf_size :
Possible values:


[16, 32, 64]

kn_clf__leaf_size
16.0    5
32.0    1
Name: best_score_, dtype: int64

kn_clf__leaf_size
16.0    4
32.0    1
Name: best_score_, dtype: int64

Count of KNeighborsClassifier() best kn_clf__n_jobs :
Possible values:


[-1]

kn_clf__n_jobs
-1.0    6
Name: best_score_, dtype: int64

kn_clf__n_jobs
-1.0    5
Name: best_score_, dtype: int64

In [51]:
print('\nDecomposition method stats:\n')
imp_gscvs_df.groupby(by='decomp')['best_score_'].min().sort_values(ascending=False)
imp_gscvs_df.groupby(by='decomp')['best_score_'].mean().sort_values(ascending=False)
imp_gscvs_df.groupby(by='decomp')['best_score_'].max().sort_values(ascending=False)

imp_gscvs_df.sort_values(by='best_score_', ascending=False).head(15).groupby(by='decomp')\
    ['best_score_'].count()

### Groupby decomp parameters.
print('Count of FastICA best algorithms:')
print('Possible values:')
decomps['fica']['params']['fica__algorithm']
imp_gscvs_df.groupby(by='fica__algorithm')['best_score_'].count()
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head(15)\
    .groupby(by='fica__algorithm')['best_score_'].count()

print('Count of FastICA best functions:')
print('Possible values:')
decomps['fica']['params']['fica__fun']
imp_gscvs_df.groupby(by='fica__fun')['best_score_'].count()
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head(15)\
    .groupby(by='fica__fun')['best_score_'].count()


Decomposition method stats:



decomp
fica    0.815033
ipca    0.732026
Name: best_score_, dtype: float64

decomp
fica    0.863181
ipca    0.849978
Name: best_score_, dtype: float64

decomp
fica    0.884314
ipca    0.883660
Name: best_score_, dtype: float64

decomp
fica    9
ipca    6
Name: best_score_, dtype: int64

Count of FastICA best algorithms:
Possible values:


['parallel', 'deflation']

fica__algorithm
deflation     4
parallel     11
Name: best_score_, dtype: int64

fica__algorithm
deflation    3
parallel     6
Name: best_score_, dtype: int64

Count of FastICA best functions:
Possible values:


['logcosh', 'exp', 'cube']

fica__fun
cube       3
exp        5
logcosh    7
Name: best_score_, dtype: int64

fica__fun
cube       1
exp        4
logcosh    4
Name: best_score_, dtype: int64

In [54]:
###  Groupby selector parameters.
print('Count of SelectPercentile best score functions:')
print('Possible values:')
selectors['sel_per']['params']['sel_per__score_func']
imp_gscvs_df.groupby(by='sel_per__score_func', sort=False)['best_score_'].count()
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head(15)\
    .groupby(by='sel_per__score_func', sort=False)['best_score_'].count()

print('Count of SelectPercentile best percentiles:')
print('Possible values:')
selectors['sel_per']['params']['sel_per__percentile']
imp_gscvs_df.groupby(by='sel_per__percentile')['best_score_'].count()
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head(15)\
    .groupby(by='sel_per__percentile')['best_score_'].count()

Count of SelectPercentile best score functions:
Possible values:


[<function sklearn.feature_selection._univariate_selection.f_classif(X, y)>,
 <function sklearn.feature_selection._univariate_selection.chi2(X, y)>,
 functools.partial(<function mutual_info_classif at 0x00000241390F9040>, random_state=42)]

sel_per__score_func
<function chi2 at 0x0000024138DC3D30>                                                        6
<function f_classif at 0x0000024138DC39D0>                                                  16
functools.partial(<function mutual_info_classif at 0x00000241390F9040>, random_state=42)     8
Name: best_score_, dtype: int64

sel_per__score_func
<function chi2 at 0x0000024138DC3D30>                                                       1
functools.partial(<function mutual_info_classif at 0x00000241390F9040>, random_state=42)    6
<function f_classif at 0x0000024138DC39D0>                                                  8
Name: best_score_, dtype: int64

Count of SelectPercentile best percentiles:
Possible values:


[5, 10, 20, 50, 100]

sel_per__percentile
5.0      8
10.0     8
20.0     9
50.0     4
100.0    1
Name: best_score_, dtype: int64

sel_per__percentile
5.0      3
10.0     5
20.0     4
50.0     2
100.0    1
Name: best_score_, dtype: int64

In [55]:
### Groupby methods/steps and compare count and score min, max, mean.
print('\nImputation method stats:\n')
imp_gscvs_df.groupby(by='imp_method')['best_score_'].min().sort_values(ascending=False)
imp_gscvs_df.groupby(by='imp_method')['best_score_'].mean().sort_values(ascending=False)
imp_gscvs_df.groupby(by='imp_method')['best_score_'].max().sort_values(ascending=False)

imp_gscvs_df.sort_values(by='best_score_', ascending=False).head(15).groupby(by='imp_method')\
    ['best_score_'].count()


Imputation method stats:



imp_method
imp_mv     0.814379
imp_med    0.791503
imp0       0.732026
Name: best_score_, dtype: float64

imp_method
imp_mv     0.859869
imp_med    0.859673
imp0       0.850196
Name: best_score_, dtype: float64

imp_method
imp0       0.884314
imp_med    0.884314
imp_mv     0.883660
Name: best_score_, dtype: float64

imp_method
imp0       4
imp_med    5
imp_mv     6
Name: best_score_, dtype: int64

In [61]:
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head(15)\
    [classifiers['kn_clf']['params'].keys()]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,kn_clf__n_neighbors,kn_clf__weights,kn_clf__algorithm,kn_clf__leaf_size,kn_clf__n_jobs
imp_method,selector,decomp,classifier,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
imp0,sel_per,fica,kn_clf,4.0,distance,ball_tree,16.0,-1.0
imp_med,sel_per,fica,kn_clf,2.0,uniform,ball_tree,16.0,-1.0
imp_mv,sel_per,fica,rf_clf,,,,,
imp_med,sel_per,ipca,rf_clf,,,,,
imp_med,sel_per,fica,rf_clf,,,,,
imp_med,sel_per,fica,ab_clf,,,,,
imp0,sel_per,fica,rf_clf,,,,,
imp_mv,sel_per,ipca,kn_clf,8.0,uniform,ball_tree,16.0,-1.0
imp0,sel_per,ipca,rf_clf,,,,,
imp0,sel_per,ipca,kn_clf,4.0,uniform,ball_tree,16.0,-1.0


In [62]:
### Final search.

n_jobs = -1

mutual_info_classif_partial = partial(mutual_info_classif, random_state=42)
DecisionTreeClassifier_partial = partial(DecisionTreeClassifier, random_state=42)
RandomForestClassifier_partial = partial(RandomForestClassifier, random_state=42, n_jobs=n_jobs)
AdaBoostClassifier_partial = partial(AdaBoostClassifier, random_state=42)
svm_SVC_partial = partial(svm.SVC, random_state=42)
KNeighborsClassifier_partial = partial(KNeighborsClassifier, n_jobs=n_jobs)

selectors = {
    'sel_per': {
        'sel': SelectPercentile(),
        'params': {
            'sel_per__score_func': [f_classif, chi2, mutual_info_classif_partial],
            'sel_per__percentile': [2, 5, 10, 15, 20, 25, 30]
        }
    }
}

decomps = {
    'fica': {
        'dec': FastICA(),
        'params': {
            'fica__algorithm': ['parallel', 'deflation'],
            'fica__fun': ['logcosh', 'exp', 'cube'],
            'fica__random_state': [42]
        }
    },
}

classifiers = {
    'rf_clf': {
        'clf': RandomForestClassifier(),
        'params': {
            'rf_clf__n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
            'rf_clf__max_features': ['sqrt', 'log2'],
            'rf_clf__max_depth': [16, 32, 64],
            'rf_clf__min_samples_split': [2],
            'rf_clf__min_samples_leaf': [1, 2, 3, 4, 5],
            'rf_clf__bootstrap': [True, False],
            'rf_clf__random_state': [42],
            'rf_clf__n_jobs': [n_jobs]
        }
    },
    'ab_clf': {
        'clf': AdaBoostClassifier(),
        'params': {
            'ab_clf__base_estimator': [GaussianNB()],
            'ab_clf__n_estimators': [16, 32, 48],
            'ab_clf__algorithm': ['SAMME', 'SAMME.R'],
            'ab_clf__random_state': [42]
        }
    },
    'kn_clf': {
        'clf': KNeighborsClassifier(),
        'params': {
            'kn_clf__n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
            'kn_clf__weights': ['uniform', 'distance'],
            'kn_clf__algorithm': ['ball_tree'],
            'kn_clf__leaf_size': [8, 16, 24],
            'kn_clf__n_jobs': [n_jobs]
        }
    },
}

def search_em_all(X_train, y_train=y_train, selectors=selectors,
                  decomps=decomps, classifiers=classifiers, pipe_verbose=True,
                  scoring='recall_weighted', n_jobs=n_jobs):
    ### Try also 'jaccard_weighted'
    gscv_dict = {}
    
    i = 0
    for selector in selectors:
        for decomp in decomps:
            for classifier in classifiers:
                
                pipe = Pipeline(steps=[
                    (selector, selectors[selector]['sel']),
                    (decomp, decomps[decomp]['dec']),
                    (classifier, classifiers[classifier]['clf'])
                ], verbose=pipe_verbose)
                
                params = {}
                params.update(selectors[selector]['params'])
                params.update(decomps[decomp]['params'])
                params.update(classifiers[classifier]['params'])
                
                params_name = selector + '_' + decomp + '_' + classifier
                print('\n', i, params_name, '\n')
                
                gscv = GridSearchCV(estimator=pipe, param_grid=params,
                                    scoring=scoring, n_jobs=n_jobs, verbose=3)

                gscv_dict[params_name] = gscv.fit(X=X_train, y=y_train)
                print('\n', gscv_dict[params_name])
                print('\nbest_score_:', gscv_dict[params_name].best_score_)
                print('\nbest_params_:', gscv_dict[params_name].best_params_)
                
                i += 1
                
    return gscv_dict

imp_gscvs_dict = {}
print('\nimp0\n')
imp_gscvs_dict['imp0'] = search_em_all(X_train=X_train_scaled_imp0)
print('\nimp_med\n')
imp_gscvs_dict['imp_med'] = search_em_all(X_train=X_train_scaled_imp_med)
print('\nimp_mv\n')
imp_gscvs_dict['imp_mv'] = search_em_all(X_train=X_train_scaled_imp_mv)


imp0


 0 sel_per_fica_rf_clf 

Fitting 5 folds for each of 60480 candidates, totalling 302400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   35.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | e

[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.1s
[Pipeline] ............ (step 3 of 3) Processing rf_clf, total=   0.0s



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('rf_clf', RandomForestClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'rf_clf__bootstrap': [True, False],
                         'rf_clf__max_depth': [16, 32, 64],
                         'rf_clf__max_featu...
                         'rf_clf__n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
                         'rf_clf__n_jobs': [-1], 'rf_clf__random_state': [42],
                         'sel_per__percentile': [2, 5, 10, 15, 20, 25, 30],
                         'sel_per__score_func': [<function f_classif at 0x0000024138DC39D0>,
                                 

[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:   52.0s
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1592 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 2072 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3224 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 3780 out of 3780 | elapsed:  4.2min finished
 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
 271 272 273 274 275 278 279 280] are constant.
  f = msb / msw
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.0s
[Pipeline] ............ (step 3 of 3) Processing ab_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('ab_clf', AdaBoostClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'ab_clf__algorithm': ['SAMME', 'SAMME.R'],
                         'ab_clf__base_estimator': [GaussianNB()],
                         'ab_clf__n_estimators': [16, 32, 48],
                         'ab_clf__random_state': [42],
                         'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'sel_per__percentile': [2, 5, 10, 15, 20

[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 360 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done 584 tasks      | elapsed:   36.9s
[Parallel(n_jobs=-1)]: Done 872 tasks      | elapsed:   55.3s
[Parallel(n_jobs=-1)]: Done 1224 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1640 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2120 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 2664 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3272 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 3944 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 4680 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 5480 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 6344 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 7272 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 8264 tasks      | elapsed:  8.5m

[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.1s
[Pipeline] ............ (step 3 of 3) Processing kn_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('kn_clf', KNeighborsClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'kn_clf__algorithm': ['ball_tree'],
                         'kn_clf__leaf_size': [8, 16, 24],
                         'kn_clf__n_jobs': [-1],
                         'kn_clf__n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'kn_clf__weights': ['uniform', 'distance'],


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   35.6s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:   56.1s
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1592 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2072 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 3224 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 3896 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 4632 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 5432 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 6296 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 7224 tasks      | e

[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.1s
[Pipeline] ............ (step 3 of 3) Processing rf_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('rf_clf', RandomForestClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'rf_clf__bootstrap': [True, False],
                         'rf_clf__max_depth': [16, 32, 64],
                         'rf_clf__max_featu...
                         'rf_clf__n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
                         'rf_clf__n_jobs': [-1], 'rf_clf__random_st

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:   54.7s
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1592 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2072 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 3224 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 3780 out of 3780 | elapsed:  4.5min finished


[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.6s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.1s
[Pipeline] ............ (step 3 of 3) Processing ab_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('ab_clf', AdaBoostClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'ab_clf__algorithm': ['SAMME', 'SAMME.R'],
                         'ab_clf__base_estimator': [GaussianNB()],
                         'ab_clf__n_estimators': [16, 32, 48],
                         'ab_clf__random_state': [42],
                         'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'sel_per__percentile': [2, 5, 10, 15, 20

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   43.1s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1592 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2072 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 3224 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 3896 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 4632 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 5432 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 6296 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 7224 tasks      | e

[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.1s
[Pipeline] ............ (step 3 of 3) Processing kn_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('kn_clf', KNeighborsClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'kn_clf__algorithm': ['ball_tree'],
                         'kn_clf__leaf_size': [8, 16, 24],
                         'kn_clf__n_jobs': [-1],
                         'kn_clf__n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'kn_clf__weights': ['uniform', 'distance'],




Fitting 5 folds for each of 60480 candidates, totalling 302400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:   50.5s
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1592 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 2072 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 3224 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 3896 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 4632 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 5432 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 6296 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 7224 tasks      | e

[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.6s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.3s
[Pipeline] ............ (step 3 of 3) Processing rf_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('rf_clf', RandomForestClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'rf_clf__bootstrap': [True, False],
                         'rf_clf__max_depth': [16, 32, 64],
                         'rf_clf__max_featu...
                         'rf_clf__n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
                         'rf_clf__n_jobs': [-1], 'rf_clf__random_state': [42],
                         'sel_per__percentile': [2, 5, 10, 

[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:   48.2s
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1592 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 2072 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 3224 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 3780 out of 3780 | elapsed:  3.9min finished
 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
 271 272 273 274 275 278 279 280] are constant.
  f = msb / msw
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.1s
[Pipeline] ............ (step 3 of 3) Processing ab_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('ab_clf', AdaBoostClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'ab_clf__algorithm': ['SAMME', 'SAMME.R'],
                         'ab_clf__base_estimator': [GaussianNB()],
                         'ab_clf__n_estimators': [16, 32, 48],
                         'ab_clf__random_state': [42],
                         'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'sel_per__percentile': [2, 5, 10, 15, 20

[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:   47.2s
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1592 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 2072 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3224 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 3896 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 4632 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 5432 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 6296 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 7224 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 8216 tasks      | elapsed:  7.6m

[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.1s
[Pipeline] ............ (step 3 of 3) Processing kn_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('kn_clf', KNeighborsClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'kn_clf__algorithm': ['ball_tree'],
                         'kn_clf__leaf_size': [8, 16, 24],
                         'kn_clf__n_jobs': [-1],
                         'kn_clf__n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'kn_clf__weights': ['uniform', 'distance'],




In [64]:
with open('imp_gscvs_dict_2.pkl', 'wb') as file:
    pickle.dump(obj=imp_gscvs_dict, file=file)

In [65]:
### Well, that was silly of me to return my search data in this structure.
### Reformat and expand data into a dataframe.
imp_gscvs_df = pd.DataFrame(columns=['imp_method', 'selector', 'decomp', 'classifier', 'best_score_',
                                     'gscv'])

for imp, gscv_dict in imp_gscvs_dict.items():
    for steps_name, gscv in gscv_dict.items():
        row_dict = {'imp_method': imp}
        
        steps_lst = steps_name.split('_')
        if len(steps_lst) == 5:
            row_dict['selector'] = '_'.join(steps_lst[:2])
            row_dict['decomp'] = steps_lst[2]
            row_dict['classifier'] = '_'.join(steps_lst[3:])
            row_dict['best_score_'] = gscv.best_score_
            row_dict['gscv'] = gscv
        
        row_dict.update(gscv.best_params_)
        imp_gscvs_df = imp_gscvs_df.append(row_dict, ignore_index=True)
        
imp_gscvs_df = imp_gscvs_df.set_index(keys=['imp_method', 'selector', 'decomp', 'classifier'])
imp_gscvs_df.info()
imp_gscvs_df

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 9 entries, ('imp0', 'sel_per', 'fica', 'rf_clf') to ('imp_mv', 'sel_per', 'fica', 'kn_clf')
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   best_score_                9 non-null      float64
 1   gscv                       9 non-null      object 
 2   fica__algorithm            9 non-null      object 
 3   fica__fun                  9 non-null      object 
 4   fica__random_state         9 non-null      float64
 5   rf_clf__bootstrap          3 non-null      float64
 6   rf_clf__max_depth          3 non-null      float64
 7   rf_clf__max_features       3 non-null      object 
 8   rf_clf__min_samples_leaf   3 non-null      float64
 9   rf_clf__min_samples_split  3 non-null      float64
 10  rf_clf__n_estimators       3 non-null      float64
 11  rf_clf__n_jobs             3 non-null      float64
 12  rf_clf__random_state       3 non-nul

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,best_score_,gscv,fica__algorithm,fica__fun,fica__random_state,rf_clf__bootstrap,rf_clf__max_depth,rf_clf__max_features,rf_clf__min_samples_leaf,rf_clf__min_samples_split,...,sel_per__score_func,ab_clf__algorithm,ab_clf__base_estimator,ab_clf__n_estimators,ab_clf__random_state,kn_clf__algorithm,kn_clf__leaf_size,kn_clf__n_jobs,kn_clf__n_neighbors,kn_clf__weights
imp_method,selector,decomp,classifier,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
imp0,sel_per,fica,rf_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,1.0,16.0,log2,3.0,2.0,...,<function chi2 at 0x0000024138DC3D30>,,,,,,,,,
imp0,sel_per,fica,ab_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,cube,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,SAMME.R,GaussianNB(),16.0,42.0,,,,,
imp0,sel_per,fica,kn_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,,,,,,...,<function chi2 at 0x0000024138DC3D30>,,,,,ball_tree,8.0,-1.0,3.0,uniform
imp_med,sel_per,fica,rf_clf,0.895425,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,deflation,exp,42.0,1.0,16.0,sqrt,3.0,2.0,...,<function chi2 at 0x0000024138DC3D30>,,,,,,,,,
imp_med,sel_per,fica,ab_clf,0.872549,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,,,,,,...,functools.partial(<function mutual_info_classi...,SAMME.R,GaussianNB(),32.0,42.0,,,,,
imp_med,sel_per,fica,kn_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,,,,,ball_tree,16.0,-1.0,3.0,uniform
imp_mv,sel_per,fica,rf_clf,0.896078,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,logcosh,42.0,1.0,16.0,sqrt,2.0,2.0,...,functools.partial(<function mutual_info_classi...,,,,,,,,,
imp_mv,sel_per,fica,ab_clf,0.872549,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,deflation,exp,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,SAMME.R,GaussianNB(),48.0,42.0,,,,,
imp_mv,sel_per,fica,kn_clf,0.872549,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,logcosh,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,,,,,ball_tree,24.0,-1.0,9.0,uniform


In [80]:
### Sort by best_score_.
print('Best scores sorted:\n')
imp_gscvs_df.sort_values(by='best_score_', ascending=False)

Best scores sorted:



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,best_score_,gscv,fica__algorithm,fica__fun,fica__random_state,rf_clf__bootstrap,rf_clf__max_depth,rf_clf__max_features,rf_clf__min_samples_leaf,rf_clf__min_samples_split,...,sel_per__score_func,ab_clf__algorithm,ab_clf__base_estimator,ab_clf__n_estimators,ab_clf__random_state,kn_clf__algorithm,kn_clf__leaf_size,kn_clf__n_jobs,kn_clf__n_neighbors,kn_clf__weights
imp_method,selector,decomp,classifier,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
imp_mv,sel_per,fica,rf_clf,0.896078,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,logcosh,42.0,1.0,16.0,sqrt,2.0,2.0,...,functools.partial(<function mutual_info_classi...,,,,,,,,,
imp_med,sel_per,fica,rf_clf,0.895425,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,deflation,exp,42.0,1.0,16.0,sqrt,3.0,2.0,...,<function chi2 at 0x0000024138DC3D30>,,,,,,,,,
imp0,sel_per,fica,rf_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,1.0,16.0,log2,3.0,2.0,...,<function chi2 at 0x0000024138DC3D30>,,,,,,,,,
imp0,sel_per,fica,ab_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,cube,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,SAMME.R,GaussianNB(),16.0,42.0,,,,,
imp0,sel_per,fica,kn_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,,,,,,...,<function chi2 at 0x0000024138DC3D30>,,,,,ball_tree,8.0,-1.0,3.0,uniform
imp_med,sel_per,fica,kn_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,,,,,ball_tree,16.0,-1.0,3.0,uniform
imp_med,sel_per,fica,ab_clf,0.872549,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,,,,,,...,functools.partial(<function mutual_info_classi...,SAMME.R,GaussianNB(),32.0,42.0,,,,,
imp_mv,sel_per,fica,ab_clf,0.872549,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,deflation,exp,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,SAMME.R,GaussianNB(),48.0,42.0,,,,,
imp_mv,sel_per,fica,kn_clf,0.872549,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,logcosh,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,,,,,ball_tree,24.0,-1.0,9.0,uniform


In [78]:
print('\nClassifier stats:\n')
imp_gscvs_df.groupby(by='classifier')['best_score_'].min().sort_values(ascending=False)
imp_gscvs_df.groupby(by='classifier')['best_score_'].mean().sort_values(ascending=False)
imp_gscvs_df.groupby(by='classifier')['best_score_'].max().sort_values(ascending=False)

imp_gscvs_df.sort_values(by='best_score_', ascending=False).head().groupby(by='classifier')\
    ['best_score_'].count()

### Groupby classifier parameters.
for clf, clf_dict in classifiers.items():
    for param in clf_dict['params'].keys():
        print('Count of', str(clf_dict['clf']), 'best', param, ':')
        print('Possible values:')
        clf_dict['params'][param]
        imp_gscvs_df.groupby(by=param, sort=False)['best_score_'].count()
        imp_gscvs_df.sort_values(by='best_score_', ascending=False).head()\
            .groupby(by=param, sort=False)['best_score_'].count()


Classifier stats:



classifier
rf_clf    0.884314
ab_clf    0.872549
kn_clf    0.872549
Name: best_score_, dtype: float64

classifier
rf_clf    0.891939
kn_clf    0.880392
ab_clf    0.876471
Name: best_score_, dtype: float64

classifier
rf_clf    0.896078
ab_clf    0.884314
kn_clf    0.884314
Name: best_score_, dtype: float64

classifier
ab_clf    1
kn_clf    1
rf_clf    3
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__n_estimators :
Possible values:


[2, 4, 6, 8, 10, 12, 14, 16]

rf_clf__n_estimators
16.0    1
4.0     1
2.0     1
Name: best_score_, dtype: int64

rf_clf__n_estimators
2.0     1
4.0     1
16.0    1
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__max_features :
Possible values:


['sqrt', 'log2']

rf_clf__max_features
log2    1
sqrt    2
Name: best_score_, dtype: int64

rf_clf__max_features
sqrt    2
log2    1
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__max_depth :
Possible values:


[16, 32, 64]

rf_clf__max_depth
16.0    3
Name: best_score_, dtype: int64

rf_clf__max_depth
16.0    3
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__min_samples_split :
Possible values:


[2]

rf_clf__min_samples_split
2.0    3
Name: best_score_, dtype: int64

rf_clf__min_samples_split
2.0    3
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__min_samples_leaf :
Possible values:


[1, 2, 3, 4, 5]

rf_clf__min_samples_leaf
3.0    2
2.0    1
Name: best_score_, dtype: int64

rf_clf__min_samples_leaf
2.0    1
3.0    2
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__bootstrap :
Possible values:


[True, False]

rf_clf__bootstrap
1.0    3
Name: best_score_, dtype: int64

rf_clf__bootstrap
1.0    3
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__random_state :
Possible values:


[42]

rf_clf__random_state
42.0    3
Name: best_score_, dtype: int64

rf_clf__random_state
42.0    3
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__n_jobs :
Possible values:


[-1]

rf_clf__n_jobs
-1.0    3
Name: best_score_, dtype: int64

rf_clf__n_jobs
-1.0    3
Name: best_score_, dtype: int64

Count of AdaBoostClassifier() best ab_clf__base_estimator :
Possible values:


[GaussianNB()]

ab_clf__base_estimator
GaussianNB()    3
Name: best_score_, dtype: int64

ab_clf__base_estimator
GaussianNB()    1
Name: best_score_, dtype: int64

Count of AdaBoostClassifier() best ab_clf__n_estimators :
Possible values:


[16, 32, 48]

ab_clf__n_estimators
16.0    1
32.0    1
48.0    1
Name: best_score_, dtype: int64

ab_clf__n_estimators
16.0    1
Name: best_score_, dtype: int64

Count of AdaBoostClassifier() best ab_clf__algorithm :
Possible values:


['SAMME', 'SAMME.R']

ab_clf__algorithm
SAMME.R    3
Name: best_score_, dtype: int64

ab_clf__algorithm
SAMME.R    1
Name: best_score_, dtype: int64

Count of AdaBoostClassifier() best ab_clf__random_state :
Possible values:


[42]

ab_clf__random_state
42.0    3
Name: best_score_, dtype: int64

ab_clf__random_state
42.0    1
Name: best_score_, dtype: int64

Count of KNeighborsClassifier() best kn_clf__n_neighbors :
Possible values:


[2, 3, 4, 5, 6, 7, 8, 9, 10]

kn_clf__n_neighbors
3.0    2
9.0    1
Name: best_score_, dtype: int64

kn_clf__n_neighbors
3.0    1
Name: best_score_, dtype: int64

Count of KNeighborsClassifier() best kn_clf__weights :
Possible values:


['uniform', 'distance']

kn_clf__weights
uniform    3
Name: best_score_, dtype: int64

kn_clf__weights
uniform    1
Name: best_score_, dtype: int64

Count of KNeighborsClassifier() best kn_clf__algorithm :
Possible values:


['ball_tree']

kn_clf__algorithm
ball_tree    3
Name: best_score_, dtype: int64

kn_clf__algorithm
ball_tree    1
Name: best_score_, dtype: int64

Count of KNeighborsClassifier() best kn_clf__leaf_size :
Possible values:


[8, 16, 24]

kn_clf__leaf_size
8.0     1
16.0    1
24.0    1
Name: best_score_, dtype: int64

kn_clf__leaf_size
8.0    1
Name: best_score_, dtype: int64

Count of KNeighborsClassifier() best kn_clf__n_jobs :
Possible values:


[-1]

kn_clf__n_jobs
-1.0    3
Name: best_score_, dtype: int64

kn_clf__n_jobs
-1.0    1
Name: best_score_, dtype: int64

In [79]:
### Groupby decomp parameters.
print('Count of FastICA best algorithms:')
print('Possible values:')
decomps['fica']['params']['fica__algorithm']
imp_gscvs_df.groupby(by='fica__algorithm')['best_score_'].count()
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head()\
    .groupby(by='fica__algorithm')['best_score_'].count()

print('Count of FastICA best functions:')
print('Possible values:')
decomps['fica']['params']['fica__fun']
imp_gscvs_df.groupby(by='fica__fun')['best_score_'].count()
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head()\
    .groupby(by='fica__fun')['best_score_'].count()

Count of FastICA best algorithms:
Possible values:


['parallel', 'deflation']

fica__algorithm
deflation    2
parallel     7
Name: best_score_, dtype: int64

fica__algorithm
deflation    1
parallel     4
Name: best_score_, dtype: int64

Count of FastICA best functions:
Possible values:


['logcosh', 'exp', 'cube']

fica__fun
cube       1
exp        6
logcosh    2
Name: best_score_, dtype: int64

fica__fun
cube       1
exp        3
logcosh    1
Name: best_score_, dtype: int64

In [69]:
###  Groupby selector parameters.
print('Count of SelectPercentile best score functions:')
print('Possible values:')
selectors['sel_per']['params']['sel_per__score_func']
imp_gscvs_df.groupby(by='sel_per__score_func', sort=False)['best_score_'].count()
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head()\
    .groupby(by='sel_per__score_func', sort=False)['best_score_'].count()

print('Count of SelectPercentile best percentiles:')
print('Possible values:')
selectors['sel_per']['params']['sel_per__percentile']
imp_gscvs_df.groupby(by='sel_per__percentile')['best_score_'].count()
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head()\
    .groupby(by='sel_per__percentile')['best_score_'].count()

Count of SelectPercentile best score functions:
Possible values:


[<function sklearn.feature_selection._univariate_selection.f_classif(X, y)>,
 <function sklearn.feature_selection._univariate_selection.chi2(X, y)>,
 functools.partial(<function mutual_info_classif at 0x00000241390F9040>, random_state=42)]

sel_per__score_func
<function chi2 at 0x0000024138DC3D30>                                                       3
<function f_classif at 0x0000024138DC39D0>                                                  4
functools.partial(<function mutual_info_classif at 0x00000241390F9040>, random_state=42)    2
Name: best_score_, dtype: int64

sel_per__score_func
functools.partial(<function mutual_info_classif at 0x00000241390F9040>, random_state=42)    1
<function chi2 at 0x0000024138DC3D30>                                                       3
<function f_classif at 0x0000024138DC39D0>                                                  1
Name: best_score_, dtype: int64

Count of SelectPercentile best percentiles:
Possible values:


[2, 5, 10, 15, 20, 25, 30]

sel_per__percentile
2.0     1
5.0     1
10.0    3
15.0    1
20.0    2
25.0    1
Name: best_score_, dtype: int64

sel_per__percentile
2.0     1
5.0     1
10.0    1
15.0    1
25.0    1
Name: best_score_, dtype: int64

In [70]:
### Groupby methods/steps and compare count and score min, max, mean.
print('\nImputation method stats:\n')
imp_gscvs_df.groupby(by='imp_method')['best_score_'].min().sort_values(ascending=False)
imp_gscvs_df.groupby(by='imp_method')['best_score_'].mean().sort_values(ascending=False)
imp_gscvs_df.groupby(by='imp_method')['best_score_'].max().sort_values(ascending=False)

imp_gscvs_df.sort_values(by='best_score_', ascending=False).head().groupby(by='imp_method')\
    ['best_score_'].count()


Imputation method stats:



imp_method
imp0       0.884314
imp_med    0.872549
imp_mv     0.872549
Name: best_score_, dtype: float64

imp_method
imp0       0.884314
imp_med    0.884096
imp_mv     0.880392
Name: best_score_, dtype: float64

imp_method
imp_mv     0.896078
imp_med    0.895425
imp0       0.884314
Name: best_score_, dtype: float64

imp_method
imp0       3
imp_med    1
imp_mv     1
Name: best_score_, dtype: int64

In [71]:
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head()\
    [classifiers['kn_clf']['params'].keys()]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,kn_clf__n_neighbors,kn_clf__weights,kn_clf__algorithm,kn_clf__leaf_size,kn_clf__n_jobs
imp_method,selector,decomp,classifier,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
imp_mv,sel_per,fica,rf_clf,,,,,
imp_med,sel_per,fica,rf_clf,,,,,
imp0,sel_per,fica,rf_clf,,,,,
imp0,sel_per,fica,ab_clf,,,,,
imp0,sel_per,fica,kn_clf,3.0,uniform,ball_tree,8.0,-1.0
