In [1]:
#!/usr/bin/python

import sys
# from time import time
import pickle
import pandas as pd
import numpy as np
# import math
from scipy import stats
# import matplotlib.pyplot as plt
# import seaborn as sns
from functools import partial

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer

from sklearn.feature_selection import SelectPercentile, SelectFromModel, f_classif, mutual_info_classif, chi2,\
                                        SelectFpr, SelectFdr, RFECV
from sklearn.decomposition import FastICA, IncrementalPCA, KernelPCA, PCA, TruncatedSVD

from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

### My imports
sys.path.append('tools/')
from dos2unix import crlf_to_lf # Borrowed and modified from multiple sources.
from train_test import run_skl, get_base_perfs, search_em_all
from feature_engineering import set_all_ratios, quant_flag_all, out_flag_all, flag_signs, add_k_means_n

### Udacity imports (may be modified)
# from feature_format import featureFormat, targetFeatureSplit
# from tester import dump_classifier_and_data

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
##########################################################################################
### Load the dictionary containing the dataset, and clean it up.
### Make the dict a dataframe because they're easier to work with.
data_df = None #pd.DataFrame()
fp = crlf_to_lf(f_in_path='data/final_project_dataset.pkl')
with open(fp, 'rb') as data_file:
    data_df = pd.DataFrame(pickle.load(data_file)).T

data/final_project_dataset.pkl saved as data/final_project_dataset_unix.pkl in 6705 bytes.


In [3]:
##########################################################################################
### Task 1: Clean up and select what features and subsets *not* to use.
### (Further feature selection will happen after feature engineering.)
    
### Drop email_address since it's a signature.
data_df.drop(columns='email_address', inplace=True)
### Drop the TOTAL row.
data_df.drop(labels=['TOTAL', 'THE TRAVEL AGENCY IN THE PARK'], inplace=True)

### Handle missing values here.
### Replacing 'NaN' with None had a weird result in which values from some
### rows were copied into the missing values of neighboring rows. No idea why.
### Using np.nan did not have that result as far as I can tell.
### But it is a float missing value and thus casts the column as float,
### or as object when other values are not floats.
data_df.replace(to_replace='NaN', value=np.nan, inplace=True)

### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi"
###    (if using featureFormat(), which I don't).

### All units are in USD.
fin_features = ['salary', 'bonus', 'long_term_incentive', 'deferred_income', 'deferral_payments',
                'loan_advances', 'other', 'expenses', 'director_fees', 'total_payments',
                'exercised_stock_options', 'restricted_stock', 'restricted_stock_deferred', 'total_stock_value']
pay_features = fin_features[:10]
stock_features = fin_features[10:]
    
### Units are number of emails messages;
email_features = ['to_messages', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi',
                  'shared_receipt_with_poi']

### Boolean, represented as integer.
POI_label = ['poi']

### The first feature must be "poi" if using featureFormat().
features_list = POI_label + fin_features + email_features

### Imputation recasts as float, but as object if left as bool, so set it to int for now.
data_df['poi'] = data_df['poi'].astype(dtype=int)

### Belfer's financial data is shifted one column to the right.
### Shift it one to the left, financial data only.
### Make total_stock_value np.nan for consistency until imputation, but could be 0.
### May remove this row for so many NaNs, but fix it now anyway.
data_df.loc[data_df.index == 'BELFER ROBERT', fin_features] \
    = data_df.loc[data_df.index == 'BELFER ROBERT', fin_features].shift(periods=-1, axis='columns',
                                                                        fill_value=np.nan)

### Bhatnagar's financial data is shifted one to the left.
### Shift it one to the right, financial data only.
### Make salary np.nan.
data_df.loc[data_df.index == 'BHATNAGAR SANJAY', fin_features] \
    = data_df.loc[data_df.index == 'BHATNAGAR SANJAY', fin_features].shift(periods=1, axis='columns',
                                                                           fill_value=np.nan)

### Set totals to sum of values where any values are not NaN.
### i.e. don't make 0 totals NaN, even though some NaN values may be included.
### Makes these rows consistent with other rows that include NaNs and numbers yet have a nonNaN total.
data_df.loc[~(data_df[pay_features].isna().all(axis='columns')), 'total_payments'] \
    = data_df[pay_features[:-1]].sum(axis='columns')
data_df.loc[~(data_df[stock_features].isna().all(axis='columns')), 'total_stock_value'] \
    = data_df[stock_features[:-1]].sum(axis='columns')

### Add one to Glisan's to_message to at least equal shared_receipt_with_poi.
data_df.loc['GLISAN JR BEN F', 'to_messages'] = 874

### Drop features that are too sparse.
drop_feats_lst = ['loan_advances']
data_df.drop(columns=drop_feats_lst, inplace=True)
fin_features = [feat for feat in fin_features if feat not in drop_feats_lst]
pay_features = [feat for feat in pay_features if feat not in drop_feats_lst]
stock_features = [feat for feat in stock_features if feat not in drop_feats_lst]
email_features = [feat for feat in email_features if feat not in drop_feats_lst]
features_list = [feat for feat in features_list if feat not in drop_feats_lst]

### Removed 'email' as signature upon loading.

### Drop persons who have NaN payment totals or NaN stock totals or NaN to_messages or NaN from_messages,
### and are missing 70% of their values.
### (Already made sure that all totals are not NaN if they have subvalues.)
nan_limit = 0.7 * len(data_df.columns)
sparse_records_idx_arr = \
    data_df.loc[data_df['total_payments'].isna() \
                | data_df['total_stock_value'].isna() \
                | data_df['to_messages'].isna() \
                | data_df['from_messages'].isna()]\
           .loc[data_df.isna().sum(axis='columns') > nan_limit]\
           .index.values
data_df.drop(labels=sparse_records_idx_arr, inplace=True)

### This leaves 123 records over 19 features.

In [4]:
### Make a quick baseline model for comparison.
### Impute with 0.
imp_0 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0, copy=False)
imp_0 = imp_0.fit(X=data_df)
data_imp0_df = pd.DataFrame(data=imp_0.transform(X=data_df), columns=data_df.columns, index=data_df.index)

### Split now for baseline model, but also before further processing, outlier removal, scaling, engineering,
### or else test set info leaks into training set.
### Even imputation could if using multivariate imputation or median.
### Decision on how to treat the data should not be influenced by test set either.
X_train, X_test, y_train, y_test = train_test_split(data_imp0_df[features_list[1:]], data_imp0_df[['poi']],
                                                    test_size=.3, random_state=42)
### Some algorithms want 1D y data.
y_train_1d = np.ravel(y_train.astype(bool))
y_test_1d = np.ravel(y_test.astype(bool))

### Split train set again for a baseline model that won't touch the final test set.
X_train_base, X_test_base, y_train_base, y_test_base \
    = train_test_split(X_train, y_train, test_size=.3, random_state=42)
y_train_1d_base = np.ravel(y_train_base.astype(bool))
y_test_1d_base = np.ravel(y_test_base.astype(bool))

### For metrics.
ordered_cols_lst = ['nonPOI_prec', 'POI_prec', 'nonPOI_rec', 'POI_rec', 'nonPOI_f', 'POI_f', 'nonPOI_sup',
                    'POI_sup', 't_neg', 'f_neg', 'f_pos', 't_pos', 'train_t', 'predict_t', 'model']
base_perf_df = pd.DataFrame(columns=ordered_cols_lst)

clf_dict = {'dt_clf': DecisionTreeClassifier, 'rf_clf': RandomForestClassifier, 'ab_clf': AdaBoostClassifier,
            'kn_clf': KNeighborsClassifier, 'gnb_clf': GaussianNB, 'svc_clf': svm.SVC}

print('\nBaseline model performance metrics:\n')
for key, method in clf_dict.items():
    _, _, _, _, perf_sr = run_skl(method=method, X_train=X_train_base,
                                  y_train=y_train_1d_base,
                                  X_test=X_test_base,
                                  y_test=y_test_1d_base,
                                  perf_series=key)
    base_perf_df = base_perf_df.append(perf_sr)


Baseline model performance metrics:

DecisionTreeClassifier()
Training time: 0.003 s
Prediction time: 0.002 s
Confusion matrix:
 [[20  3]
 [ 2  1]]
Precision, recall, f beta score, support:
 (array([0.90909091, 0.25      ]), array([0.86956522, 0.33333333]), array([0.88888889, 0.28571429]), array([23,  3], dtype=int64))
RandomForestClassifier()
Training time: 0.194 s
Prediction time: 0.013 s
Confusion matrix:
 [[21  2]
 [ 3  0]]
Precision, recall, f beta score, support:
 (array([0.875, 0.   ]), array([0.91304348, 0.        ]), array([0.89361702, 0.        ]), array([23,  3], dtype=int64))
AdaBoostClassifier()
Training time: 0.084 s
Prediction time: 0.011 s
Confusion matrix:
 [[21  2]
 [ 3  0]]
Precision, recall, f beta score, support:
 (array([0.875, 0.   ]), array([0.91304348, 0.        ]), array([0.89361702, 0.        ]), array([23,  3], dtype=int64))
KNeighborsClassifier()
Training time: 0.002 s
Prediction time: 0.004 s
Confusion matrix:
 [[22  1]
 [ 3  0]]
Precision, recall, f beta

In [5]:
##########################################################################################
### Task 2: Remove/handle outliers

### Dropped ['TOTAL', 'THE TRAVEL AGENCY IN THE PARK'] row upon loading.

### Drop features that are too sparse.
### Drop 'other' because it's ill-defined and seems overly represented within important features. The nebulous nature of it seems like a good fit for fraud, but high gross 'other' amounts are more correlated with nonPOIs than POIs if anything.
drop_feats_lst = ['director_fees', 'restricted_stock_deferred', 'other']

X_train.drop(columns=drop_feats_lst, inplace=True)
X_test.drop(columns=drop_feats_lst, inplace=True)
data_df.drop(columns=drop_feats_lst, inplace=True)

fin_features = [feat for feat in fin_features if feat not in drop_feats_lst]
pay_features = [feat for feat in pay_features if feat not in drop_feats_lst]
stock_features = [feat for feat in stock_features if feat not in drop_feats_lst]
email_features = [feat for feat in email_features if feat not in drop_feats_lst]
features_list = [feat for feat in features_list if feat not in drop_feats_lst]
del drop_feats_lst

In [6]:
### Don't drop records now because it will mess up the split for Udacity.
### Could drop earlier and resplit, but I've already done a lot of EDA behind the scenes.
### NaN his financials.
X_train.loc[['POWERS WILLIAM'], pay_features] = np.nan
data_df.loc[['POWERS WILLIAM'], pay_features] = np.nan

### Bivariate linear regression of the ratios between to/from/shared with POIs and
### total to and from messages revealed that top coding to_messages and from_messages
### may slightly aid nonPOI precision.
### Only top coding the training set in order to bias the model,
### since I am less concerned with accuracy than I am with POI recall,
### and by extension, nonPOI precision.
X_train['to_messages'] = X_train['to_messages'].apply(lambda x: x if x < 12000 or np.isnan(x) else 12000)
X_train['from_messages'] = X_train['from_messages'].apply(lambda x: x if x < 8000 or np.isnan(x) else 8000)
data_df.loc[X_train.index]['to_messages'] \
    = data_df.loc[X_train.index]['to_messages'].apply(lambda x: x if x < 12000 or np.isnan(x) else 12000)
data_df.loc[X_train.index]['from_messages'] \
    = data_df.loc[X_train.index]['from_messages'].apply(lambda x: x if x < 8000 or np.isnan(x) else 8000)

### Not sure whether top coding these will really help or hinder, if anything at all.
### But, it appears to potentially aid POI recall in some cases
### when comparing payments to totals, and it's more in line with best practices.
### Only really affects Frevert.
top = X_train['total_payments'].dropna().sort_values()[-2]
X_train['total_payments'] = X_train['total_payments'].apply(lambda x : x if x < top or np.isnan(x) else top)
data_df.loc[X_train.index]['total_payments'] \
    = data_df.loc[X_train.index]['total_payments'].apply(lambda x : x if x < top or np.isnan(x) else top)

top = X_train['long_term_incentive'].dropna().sort_values()[-2]
X_train['long_term_incentive'] = \
    X_train['long_term_incentive'].apply(lambda x : x if x < top or np.isnan(x) else top)
data_df.loc[X_train.index]['long_term_incentive'] \
    = data_df.loc[X_train.index]['long_term_incentive'].apply(lambda x : x if x < top or np.isnan(x) else top)

In [7]:
### Same story as Powers, NaN all of Belfer instead of simply dropping.
X_train.loc['BELFER ROBERT'] = np.nan
# belfers_poi = data_df.loc['BELFER ROBERT']['poi']
data_df.loc['BELFER ROBERT', features_list[1:]]= np.nan
# data_df.loc['BELFER ROBERT']['poi'] = belfers_poi

### After look at distributions of ratios of features, more top/bottom coding. ###

### Nan Bannantine's salary, and bottom code salary.
X_train.loc['BANNANTINE JAMES M', 'salary'] = np.nan
data_df.loc['BANNANTINE JAMES M', 'salary'] = np.nan
bottom = X_train['salary'].dropna().sort_values(ascending=False)[-2]
X_train['salary'] = X_train['salary'].apply(lambda x : x if x > bottom or np.isnan(x) else bottom)
data_df.loc[X_train.index]['salary'] \
    = data_df.loc[X_train.index]['salary'].apply(lambda x : x if x > bottom or np.isnan(x) else bottom)

### These two only have one, very low payment value.
X_train.loc[['HAYES ROBERT E', 'HAUG DAVID L'], pay_features] = np.nan
data_df.loc[['HAYES ROBERT E', 'HAUG DAVID L'], pay_features] = np.nan

### Top code deferred_income.
top = X_train['deferred_income'].dropna().sort_values(ascending=True)[-3]
X_train['deferred_income'] = X_train['deferred_income'].apply(lambda x : x if x < top or np.isnan(x) else top)
data_df.loc[X_train.index]['deferred_income'] = \
    data_df.loc[X_train.index]['deferred_income'].apply(lambda x : x if x < top or np.isnan(x) else top)
del top
del bottom

In [8]:
##########################################################################################
### Task 3: Create new feature(s)


### Start with all ratios, within respective subspaces (fin:fin, e:e).
### Add financial ratios within subspaces to data sets.
pay_feats_divby_df = set_all_ratios(df=X_train, denoms=pay_features, numers=pay_features)
stock_feats_divby_df = set_all_ratios(df=X_train, denoms=stock_features, numers=stock_features)

### Only plausible email ratios (all reciprocals still, to get the 0s to infs):
to_lst = ['to_messages', 'from_poi_to_this_person', 'shared_receipt_with_poi']
from_lst = ['from_messages', 'from_this_person_to_poi']
email_to_divby_df = set_all_ratios(df=X_train, denoms=to_lst, numers=to_lst)
email_from_divby_df = set_all_ratios(df=X_train, denoms=from_lst, numers=from_lst)

X_train = pd.concat(objs=[X_train, pay_feats_divby_df, stock_feats_divby_df, email_to_divby_df,
                          email_from_divby_df], axis=1)

### Do for test set.
pay_feats_divby_df = set_all_ratios(df=X_test, denoms=pay_features, numers=pay_features)
stock_feats_divby_df = set_all_ratios(df=X_test, denoms=stock_features, numers=stock_features)
email_to_divby_df = set_all_ratios(df=X_test, denoms=to_lst, numers=to_lst)
email_from_divby_df = set_all_ratios(df=X_test, denoms=from_lst, numers=from_lst)
X_test = pd.concat(objs=[X_test, pay_feats_divby_df, stock_feats_divby_df, email_to_divby_df,
                         email_from_divby_df], axis=1)

### Do for full set.
pay_feats_divby_df = set_all_ratios(df=data_df, denoms=pay_features, numers=pay_features)
stock_feats_divby_df = set_all_ratios(df=data_df, denoms=stock_features, numers=stock_features)
email_to_divby_df = set_all_ratios(df=data_df, denoms=to_lst, numers=to_lst)
email_from_divby_df = set_all_ratios(df=data_df, denoms=from_lst, numers=from_lst)
data_df = pd.concat(objs=[data_df, pay_feats_divby_df, stock_feats_divby_df, email_to_divby_df,
                          email_from_divby_df], axis=1)
del to_lst
del from_lst

### Set all np.inf to np.nan.
X_train = X_train.apply(func=(lambda col: col.apply(func=(lambda x: np.nan if abs(x) == abs(np.inf) else x))))
X_test = X_test.apply(func=(lambda col: col.apply(func=(lambda x: np.nan if abs(x) == abs(np.inf) else x))))
data_df = data_df.apply(func=(lambda col: col.apply(func=(lambda x: np.nan if abs(x) == abs(np.inf) else x))))

### Remove all features containing less than 30% training observations.
drop_lst = list(X_train.count().loc[X_train.count() < .3 * len(X_train.index)].index)
X_train.drop(columns=drop_lst, inplace=True)
X_test.drop(columns=drop_lst, inplace=True)
data_df.drop(columns=drop_lst, inplace=True)

pay_feats_divby_lst = [feat for feat in list(pay_feats_divby_df.columns) if not feat in drop_lst]
stock_feats_divby_lst = [feat for feat in list(stock_feats_divby_df.columns) if not feat in drop_lst]
email_feats_divby_lst = [feat for feat in list(email_to_divby_df.columns) if not feat in drop_lst] \
                        + [feat for feat in list(email_from_divby_df.columns) if not feat in drop_lst]
fin_features = [feat for feat in fin_features if feat not in drop_lst] + pay_feats_divby_lst \
    + stock_feats_divby_lst
pay_features = [feat for feat in pay_features if feat not in drop_lst]
stock_features = [feat for feat in stock_features if feat not in drop_lst]
email_features = [feat for feat in email_features if feat not in drop_lst] + email_feats_divby_lst
features_list = [feat for feat in features_list if feat not in drop_lst] + pay_feats_divby_lst \
    + stock_feats_divby_lst + email_feats_divby_lst
del drop_lst

In [9]:
### Create features that flag mambership in various quantiles, outliership, and x > 0.
### Use multiple quantiles: quartiles, quintiles, and deciles.
### Retain np.nans.

to_flag_lst = fin_features + email_features

### Could write a function, but I'll just paste and edit.
### Flag train set.
fin_quant_flags_df = quant_flag_all(df=X_train[fin_features], quant_df=X_train[fin_features])
email_quant_flags_df = quant_flag_all(df=X_train[email_features], quant_df=X_train[email_features])
fin_out_flags_df = out_flag_all(df=X_train[fin_features], quant_df=X_train[fin_features])
email_out_flags_df = out_flag_all(df=X_train[email_features], quant_df=X_train[email_features])
sign_flags_df = flag_signs(df=X_train[to_flag_lst])
X_train = pd.concat(objs=[X_train, fin_quant_flags_df, email_quant_flags_df, fin_out_flags_df,
                          email_out_flags_df, sign_flags_df], axis=1)

### Flag test set.
fin_quant_flags_df = quant_flag_all(df=X_test[fin_features], quant_df=X_train[fin_features])
email_quant_flags_df = quant_flag_all(df=X_test[email_features], quant_df=X_train[email_features])
fin_out_flags_df = out_flag_all(df=X_test[fin_features], quant_df=X_train[fin_features])
email_out_flags_df = out_flag_all(df=X_test[email_features], quant_df=X_train[email_features])
sign_flags_df = flag_signs(df=X_test[to_flag_lst])
X_test = pd.concat(objs=[X_test, fin_quant_flags_df, email_quant_flags_df, fin_out_flags_df,
                          email_out_flags_df, sign_flags_df], axis=1)

### Flag whole set.
fin_quant_flags_df = quant_flag_all(df=data_df[fin_features], quant_df=X_train[fin_features])
email_quant_flags_df = quant_flag_all(df=data_df[email_features], quant_df=X_train[email_features])
fin_out_flags_df = out_flag_all(df=data_df[fin_features], quant_df=X_train[fin_features])
email_out_flags_df = out_flag_all(df=data_df[email_features], quant_df=X_train[email_features])
sign_flags_df = flag_signs(df=data_df[to_flag_lst])
data_df = pd.concat(objs=[data_df, fin_quant_flags_df, email_quant_flags_df, fin_out_flags_df,
                          email_out_flags_df, sign_flags_df], axis=1)

In [10]:
### Create and update feature lists.
fin_quant_flags_lst = list(fin_quant_flags_df.columns)
email_quant_flags_lst = list(email_quant_flags_df.columns)
quant_flags_lst = fin_quant_flags_lst + email_quant_flags_lst

fin_out_flags_lst = list(fin_out_flags_df.columns)
email_out_flags_lst = list(email_out_flags_df.columns)
out_flags_lst = fin_out_flags_lst + email_out_flags_lst

fin_features += fin_quant_flags_lst + fin_out_flags_lst
email_features += email_quant_flags_lst + email_out_flags_lst

sign_flags_lst = list(sign_flags_df.columns)

features_list = features_list + quant_flags_lst + out_flags_lst + sign_flags_lst

del to_flag_lst
del fin_quant_flags_df
del email_quant_flags_df
del fin_out_flags_df
del email_out_flags_df
del sign_flags_df

In [11]:
### Scale features
### Just do min-max on floats, not bools (some are objects for now because np.nan)

float_feats_lst = fin_features + email_features
bool_feats_lst =  sign_flags_lst

scaler = MinMaxScaler()
train_floats = pd.DataFrame(data=scaler.fit_transform(X=X_train[float_feats_lst]),
                            columns=float_feats_lst, index=X_train.index)
X_train_scaled = pd.concat(objs=[train_floats, X_train[bool_feats_lst]], axis=1)

test_floats = pd.DataFrame(data=scaler.transform(X=X_test[float_feats_lst]),
                           columns=float_feats_lst,index=X_test.index)
X_test_scaled = pd.concat(objs=[test_floats, X_test[bool_feats_lst]], axis=1)

all_floats = pd.DataFrame(data=scaler.transform(X=data_df[float_feats_lst]),
                          columns=float_feats_lst, index=data_df.index)
data_df_scaled = pd.concat(objs=[data_df['poi'], all_floats, data_df[bool_feats_lst]], axis=1)

del float_feats_lst
del scaler
del train_floats
del test_floats
del all_floats
del X_train
del X_test
del data_df

In [12]:
### Impute missing values:
### Financial features to 0, email features to median, and bools to mode.
### Restore bools to bool (from object because np.nan)

imp0 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
imp_med = SimpleImputer(missing_values=np.nan, strategy='median')
imp_mod = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

### Financial features to 0.
fin_train_df = pd.DataFrame(data=imp0.fit_transform(X=X_train_scaled[fin_features]),
                        columns=fin_features, index=X_train_scaled.index)
fin_test_df = pd.DataFrame(data=imp0.transform(X=X_test_scaled[fin_features]),
                       columns=fin_features, index=X_test_scaled.index)
fin_all_df = pd.DataFrame(data=imp0.transform(X=data_df_scaled[fin_features]),
                      columns=fin_features, index=data_df_scaled.index)

### email features to median
email_train_df = pd.DataFrame(data=imp_med.fit_transform(X=X_train_scaled[email_features]),
                        columns=email_features, index=X_train_scaled.index)
email_test_df = pd.DataFrame(data=imp_med.transform(X=X_test_scaled[email_features]),
                       columns=email_features, index=X_test_scaled.index)
email_all_df = pd.DataFrame(data=imp_med.transform(X=data_df_scaled[email_features]),
                      columns=email_features, index=data_df_scaled.index)

### Bools to mode.
### Restore bools to bool (from object because np.nan)
bool_train_df = (pd.DataFrame(data=imp_mod.fit_transform(X=X_train_scaled[bool_feats_lst]),
                              columns=bool_feats_lst, index=X_train_scaled.index)).astype(bool)
bool_test_df = pd.DataFrame(data=imp_mod.transform(X=X_test_scaled[bool_feats_lst]),
                            columns=bool_feats_lst, index=X_test_scaled.index).astype(bool)
bool_all_df = pd.DataFrame(data=imp_mod.transform(X=data_df_scaled[bool_feats_lst]),
                           columns=bool_feats_lst, index=data_df_scaled.index).astype(bool)

### Concat
X_train_scaled_imp = pd.concat(objs=[fin_train_df, email_train_df, bool_train_df], axis=1)
X_test_scaled_imp = pd.concat(objs=[fin_test_df, email_test_df, bool_test_df], axis=1)
data_df_scaled_imp = pd.concat(objs=[data_df_scaled['poi'], fin_all_df, email_all_df, bool_all_df], axis=1)

del fin_train_df
del email_train_df
del bool_train_df
del fin_test_df
del email_test_df
del bool_test_df
del fin_all_df
del email_all_df
del bool_all_df
del bool_feats_lst
del X_train_scaled
del X_test_scaled
del data_df_scaled

In [13]:
### sklearn predictions as features

# 1) Kmeans cluster.
train_cluster_subspace, test_cluster_subspace \
    = add_k_means_n(X_train=X_train_scaled_imp, X_test=X_test_scaled_imp)
X_train_scaled_imp_k = pd.concat(objs=[X_train_scaled_imp, train_cluster_subspace], axis=1)
X_test_scaled_imp_k = pd.concat(objs=[X_test_scaled_imp, test_cluster_subspace], axis=1)

train_cluster_subspace, test_cluster_subspace \
    = add_k_means_n(X_train=X_train_scaled_imp, X_test=data_df_scaled_imp[features_list[1:]])
data_df_scaled_imp_k = pd.concat(objs=[data_df_scaled_imp, test_cluster_subspace], axis=1)

k_means_feats_lst = k_means_feats_lst = list(train_cluster_subspace.columns)
features_list += k_means_feats_lst

del train_cluster_subspace
del test_cluster_subspace
del X_train_scaled_imp
del X_test_scaled_imp
del data_df_scaled_imp

In [14]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

### Construct baseline performance with all features before tuning/selection.
### Split train set again for a baseline model that won't touch the final test set.
X_train_base, X_test_base, y_train_base, y_test_base \
    = train_test_split(X_train_scaled_imp_k, y_train, test_size=.3, random_state=42)
y_train_1d_base = np.ravel(y_train_base.astype(bool))
y_test_1d_base = np.ravel(y_test_base.astype(bool))

base_perf_engineered_df = pd.DataFrame(columns=ordered_cols_lst)

base_perfs_dict = {'base_perf_engineered': base_perf_engineered_df}
imp_sets_dict = {'base_perf_engineered': [X_train_base, X_test_base]}

### Modifies the base_perfs_dict in place, since dict has no deep copy method.
get_base_perfs(base_perfs_dict=base_perfs_dict, imp_sets_dict=imp_sets_dict, clf_dict=clf_dict, y_train=y_train_1d_base,
               y_test=y_test_1d_base)

base_perfs_dict['first_base'] = base_perf_df


 base_perf_engineered

 dt_clf
DecisionTreeClassifier()
Training time: 0.022 s
Prediction time: 0.006 s
Confusion matrix:
 [[21  2]
 [ 3  0]]
Precision, recall, f beta score, support:
 (array([0.875, 0.   ]), array([0.91304348, 0.        ]), array([0.89361702, 0.        ]), array([23,  3], dtype=int64))

 rf_clf
RandomForestClassifier()
Training time: 0.182 s
Prediction time: 0.017 s
Confusion matrix:
 [[21  2]
 [ 3  0]]
Precision, recall, f beta score, support:
 (array([0.875, 0.   ]), array([0.91304348, 0.        ]), array([0.89361702, 0.        ]), array([23,  3], dtype=int64))

 ab_clf
AdaBoostClassifier()
Training time: 0.178 s
Prediction time: 0.044 s
Confusion matrix:
 [[20  3]
 [ 3  0]]
Precision, recall, f beta score, support:
 (array([0.86956522, 0.        ]), array([0.86956522, 0.        ]), array([0.86956522, 0.        ]), array([23,  3], dtype=int64))

 kn_clf
KNeighborsClassifier()
Training time: 0.007 s
Prediction time: 0.008 s
Confusion matrix:
 [[23  0]
 [ 3  0]]
Prec

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
imp_gscvs_dict = None
with open('imp_gscvs_dict3.pkl', 'rb') as file:
    imp_gscvs_dict = pickle.load(file=file)

In [16]:
imp_gscvs_dict

{'mixed_impute': {'sel_per_fica_rf_clf': GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                         ('fica', FastICA()),
                                         ('rf_clf', RandomForestClassifier())],
                                  verbose=True),
               n_jobs=-1,
               param_grid={'fica__algorithm': ['parallel', 'deflation'],
                           'fica__fun': ['logcosh', 'exp', 'cube'],
                           'fica__random_state': [42],
                           'rf_clf__bootstrap': [True, False],
                           'rf_clf__max_depth': [16, 32, 64],
                           'rf_clf__max_featu...
                           'rf_clf__n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
                           'rf_clf__n_jobs': [-1], 'rf_clf__random_state': [42],
                           'sel_per__percentile': [2, 5, 10, 15, 20, 25, 30],
                           'sel_per__score_func': [<function

In [17]:
get_f = lambda precision, recall: 2 * ((precision * recall) / (precision + recall))

for name, gscv in imp_gscvs_dict['mixed_impute'].items():
    print(name, '\n')
    print('Best score:\n', gscv.best_score_, '\n')
    print('Best estimator:\n', gscv.best_estimator_, '\n')
    clf = gscv.best_estimator_.fit(X=X_train_scaled_imp_k, y=y_train_1d)
    pred = clf.predict(X_test_scaled_imp_k)
    conf = confusion_matrix(y_true=y_test_1d, y_pred=pred)
    print('Confusion matrix:\n', conf, '\n')
    prf = precision_recall_fscore_support(y_true=y_test_1d, y_pred=pred)
    print('Precision, recall, f beta score, support:\n', prf, '\n')
    print('Custom F beta using nonPOI precision and POI recall:\n', get_f(prf[0][0], prf[1][1]), '\n')
    print('\n')

sel_per_fica_rf_clf 

Best score:
 0.884967320261438 

Best estimator:
 Pipeline(steps=[('sel_per', SelectPercentile(percentile=15)),
                ('fica',
                 FastICA(algorithm='deflation', fun='exp', random_state=42)),
                ('rf_clf',
                 RandomForestClassifier(max_depth=16, max_features='log2',
                                        min_samples_leaf=4, n_estimators=4,
                                        n_jobs=-1, random_state=42))],
         verbose=True) 

[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s


 334 339 340 346 347 350 351 352] are constant.
  f = msb / msw


[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.2s
[Pipeline] ............ (step 3 of 3) Processing rf_clf, total=   0.0s
Confusion matrix:
 [[28  3]
 [ 6  0]] 

Precision, recall, f beta score, support:
 (array([0.82352941, 0.        ]), array([0.90322581, 0.        ]), array([0.86153846, 0.        ]), array([31,  6], dtype=int64)) 

Custom F beta using nonPOI precision and POI recall:
 0.0 



sel_per_fica_ab_clf 

Best score:
 0.8725490196078433 

Best estimator:
 Pipeline(steps=[('sel_per',
                 SelectPercentile(score_func=functools.partial(<function mutual_info_classif at 0x0000018150CFC280>, random_state=42))),
                ('fica',
                 FastICA(algorithm='deflation', fun='exp', random_state=42)),
                ('ab_clf',
                 AdaBoostClassifier(base_estimator=GaussianNB(),
                                    n_estimators=16, random_state=42))],
         verbose=True) 

[Pipeline] ........... (step 1 of 3) Processing se

  _warn_prf(average, modifier, msg_start, len(result))


[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   1.2s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.9s
[Pipeline] ............ (step 3 of 3) Processing kn_clf, total=   0.0s
Confusion matrix:
 [[31  0]
 [ 6  0]] 

Precision, recall, f beta score, support:
 (array([0.83783784, 0.        ]), array([1., 0.]), array([0.91176471, 0.        ]), array([31,  6], dtype=int64)) 

Custom F beta using nonPOI precision and POI recall:
 0.0 





  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
clf = Pipeline(steps=[('sel_per', SelectPercentile(percentile=15)),
                ('fica',
                 FastICA(algorithm='deflation', fun='exp', random_state=42)),
                ('rf_clf',
                 RandomForestClassifier(max_depth=16, max_features='log2',
                                        min_samples_leaf=4, n_estimators=4,
                                        n_jobs=-1, random_state=42))],
               verbose=True).fit(X=X_train_scaled_imp_k, y=y_train_1d)
pred = clf.predict(X=X_test_scaled_imp_k)
pred = gscv.predict(X_test_scaled_imp_k)
conf = confusion_matrix(y_true=y_test_1d, y_pred=pred)
print('Confusion matrix:\n', conf, '\n')
prf = precision_recall_fscore_support(y_true=y_test_1d, y_pred=pred)
print('Precision, recall, f beta score, support:\n', prf, '\n')
print('Custom F beta using nonPOI precision and POI recall:\n', get_f(prf[0][0], prf[1][1]), '\n')

 334 339 340 346 347 350 351 352] are constant.
  f = msb / msw


[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.2s
[Pipeline] ............ (step 3 of 3) Processing rf_clf, total=   0.0s
Confusion matrix:
 [[31  0]
 [ 6  0]] 

Precision, recall, f beta score, support:
 (array([0.83783784, 0.        ]), array([1., 0.]), array([0.91176471, 0.        ]), array([31,  6], dtype=int64)) 

Custom F beta using nonPOI precision and POI recall:
 0.0 



  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

### Search 'em all, round 1: See first_gridsearch.ipynb in supplemental_notebooks folder.
### Search 'em all, round 2: See second_gridsearch.ipynb in supplemental_notebooks folder.

n_jobs = -1

mutual_info_classif_partial = partial(mutual_info_classif, random_state=42)
DecisionTreeClassifier_partial = partial(DecisionTreeClassifier, random_state=42)
RandomForestClassifier_partial = partial(RandomForestClassifier, random_state=42, n_jobs=n_jobs)
AdaBoostClassifier_partial = partial(AdaBoostClassifier, random_state=42)
svm_SVC_partial = partial(svm.SVC, random_state=42)
KNeighborsClassifier_partial = partial(KNeighborsClassifier, n_jobs=n_jobs)

selectors = {
    'sel_per': {
        'sel': SelectPercentile(),
        'params': {
            'sel_per__score_func': [f_classif, chi2, mutual_info_classif_partial],
            'sel_per__percentile': [2, 5, 10, 15, 20, 25, 30]
        }
    }
}

decomps = {
    'empty' : None
#     'fica': {
#         'dec': FastICA(),
#         'params': {
#             'fica__algorithm': ['parallel', 'deflation'],
#             'fica__fun': ['logcosh', 'exp', 'cube'],
#             'fica__random_state': [42]
#         }
#     }
}

classifiers = {
    'rf_clf': {
        'clf': RandomForestClassifier(),
        'params': {
            'rf_clf__n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
            'rf_clf__max_features': ['sqrt', 'log2'],
            'rf_clf__max_depth': [16, 32, 64],
            'rf_clf__min_samples_split': [2],
            'rf_clf__min_samples_leaf': [1, 2, 3, 4, 5],
            'rf_clf__bootstrap': [True, False],
            'rf_clf__random_state': [42],
            'rf_clf__n_jobs': [n_jobs]
        }
    },
    'ab_clf': {
        'clf': AdaBoostClassifier(),
        'params': {
            'ab_clf__base_estimator': [
                DecisionTreeClassifier_partial(),
                RandomForestClassifier_partial(),
                AdaBoostClassifier_partial(),
                svm_SVC_partial(),
                KNeighborsClassifier_partial(),
                GaussianNB()
            ],
            'ab_clf__n_estimators': [8, 16, 24, 32, 40, 48, 56],
            'ab_clf__algorithm': ['SAMME', 'SAMME.R'],
            'ab_clf__random_state': [42]
        }
    },
    'kn_clf': {
        'clf': KNeighborsClassifier(),
        'params': {
            'kn_clf__n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
            'kn_clf__weights': ['uniform', 'distance'],
            'kn_clf__algorithm': ['ball_tree', 'kd_tree', 'brute'],
            'kn_clf__leaf_size': [4, 8, 12, 16, 20, 24, 30],
            'kn_clf__n_jobs': [n_jobs]
        }
    },
    'gnb_clf': {
        'clf': GaussianNB(),
        'params': {
            # Defaults
        }
    },
}

imp_gscvs_dict = {}
imp_gscvs_dict['mixed_impute'] \
    = search_em_all(X_train=X_train_scaled_imp_k, y_train=y_train_1d, selectors=selectors,
                    decomps=decomps, classifiers=classifiers, pipe_verbose=True,
                    scoring='recall_weighted', n_jobs=-1)


 0 sel_per_empty_rf_clf 

Fitting 5 folds for each of 10080 candidates, totalling 50400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   53.5s
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | e

[Pipeline] ........... (step 1 of 2) Processing sel_per, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing rf_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('rf_clf', RandomForestClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'rf_clf__bootstrap': [True, False],
                         'rf_clf__max_depth': [16, 32, 64],
                         'rf_clf__max_features': ['sqrt', 'log2'],
                         'rf_clf__min_samples_leaf': [1, 2, 3, 4, 5],
                         'rf_clf__min_samples_split': [2],
                         'rf_clf__n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
                         'rf_clf__n_jobs': [-1], 'rf_clf__random_state': [42],
                         'sel_per__percentile': [2, 5, 10, 15, 20, 25, 30],
                         'sel_per__score_func': [<function f_classif at 0x0000018150

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 336 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 560 tasks      | elapsed:   28.7s
[Parallel(n_jobs=-1)]: Done 848 tasks      | elapsed:   54.6s
[Parallel(n_jobs=-1)]: Done 1200 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1616 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 2096 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 2640 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 3248 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 3920 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 4656 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 5456 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 6320 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 7248 tasks      | e

[Pipeline] ........... (step 1 of 2) Processing sel_per, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing ab_clf, total=   6.9s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('ab_clf', AdaBoostClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'ab_clf__algorithm': ['SAMME', 'SAMME.R'],
                         'ab_clf__base_estimator': [DecisionTreeClassifier(random_state=42),
                                                    RandomForestClassifier(n_jobs=-1,
                                                                           random_state=42),
                                                    AdaBoostClassifier(random_state=42),
                                                    SVC(random_state=42...
                                                    GaussianNB()],
                         'ab_clf__n_estimators': [8, 16, 24, 32, 40,

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 224 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:   22.1s
[Parallel(n_jobs=-1)]: Done 608 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-1)]: Done 896 tasks      | elapsed:   52.4s
[Parallel(n_jobs=-1)]: Done 1248 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1664 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 2144 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2688 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 3296 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 3968 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 4704 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 5504 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 6368 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 7296 tasks      | e

[Pipeline] ........... (step 1 of 2) Processing sel_per, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing kn_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('kn_clf', KNeighborsClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'kn_clf__algorithm': ['ball_tree', 'kd_tree', 'brute'],
                         'kn_clf__leaf_size': [4, 8, 12, 16, 20, 24, 30],
                         'kn_clf__n_jobs': [-1],
                         'kn_clf__n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'kn_clf__weights': ['uniform', 'distance'],
                         'sel_per__percentile': [2, 5, 10, 15, 20, 25, 30],
                         'sel_per__score_func': [<function f_classif at 0x0000018150CD83A0>,
                                                 <function chi2 at 0x0000018150CD8700>,
                             

[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  90 out of 105 | elapsed:    5.4s remaining:    0.8s


[Pipeline] ........... (step 1 of 2) Processing sel_per, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing gnb_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('gnb_clf', GaussianNB())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'sel_per__percentile': [2, 5, 10, 15, 20, 25, 30],
                         'sel_per__score_func': [<function f_classif at 0x0000018150CD83A0>,
                                                 <function chi2 at 0x0000018150CD8700>,
                                                 functools.partial(<function mutual_info_classif at 0x0000018150CFC280>, random_state=42)]},
             scoring='recall_weighted', verbose=3)

best_score_: 0.7673202614379085

best_params_: {'sel_per__percentile': 2, 'sel_per__score_func': <function chi2 at 0x0000018150CD8700>}


[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:    7.6s finished


In [21]:
with open('data/imp_gscvs_dict4.pkl', 'wb') as file:
    pickle.dump(obj=imp_gscvs_dict, file=file)

In [22]:
imp_gscvs_dict

{'mixed_impute': {'sel_per_empty_rf_clf': GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                         ('rf_clf', RandomForestClassifier())],
                                  verbose=True),
               n_jobs=-1,
               param_grid={'rf_clf__bootstrap': [True, False],
                           'rf_clf__max_depth': [16, 32, 64],
                           'rf_clf__max_features': ['sqrt', 'log2'],
                           'rf_clf__min_samples_leaf': [1, 2, 3, 4, 5],
                           'rf_clf__min_samples_split': [2],
                           'rf_clf__n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
                           'rf_clf__n_jobs': [-1], 'rf_clf__random_state': [42],
                           'sel_per__percentile': [2, 5, 10, 15, 20, 25, 30],
                           'sel_per__score_func': [<function f_classif at 0x0000018150CD83A0>,
                                                   <function chi2 at 

In [23]:
get_f = lambda precision, recall: 2 * ((precision * recall) / (precision + recall))

for name, gscv in imp_gscvs_dict['mixed_impute'].items():
    print(name, '\n')
    print('Best score:\n', gscv.best_score_, '\n')
    print('Best estimator:\n', gscv.best_estimator_, '\n')
    clf = gscv.best_estimator_.fit(X=X_train_scaled_imp_k, y=y_train_1d)
    pred = clf.predict(X_test_scaled_imp_k)
    conf = confusion_matrix(y_true=y_test_1d, y_pred=pred)
    print('Confusion matrix:\n', conf, '\n')
    prf = precision_recall_fscore_support(y_true=y_test_1d, y_pred=pred)
    print('Precision, recall, f beta score, support:\n', prf, '\n')
    print('Custom F beta using nonPOI precision and POI recall:\n', get_f(prf[0][0], prf[1][1]), '\n')
    print('\n')

sel_per_empty_rf_clf 

Best score:
 0.872549019607843 

Best estimator:
 Pipeline(steps=[('sel_per',
                 SelectPercentile(percentile=20,
                                  score_func=<function chi2 at 0x0000018150CD8700>)),
                ('rf_clf',
                 RandomForestClassifier(max_depth=16, max_features='log2',
                                        n_estimators=10, n_jobs=-1,
                                        random_state=42))],
         verbose=True) 

[Pipeline] ........... (step 1 of 2) Processing sel_per, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing rf_clf, total=   0.0s
Confusion matrix:
 [[29  2]
 [ 5  1]] 

Precision, recall, f beta score, support:
 (array([0.85294118, 0.33333333]), array([0.93548387, 0.16666667]), array([0.89230769, 0.22222222]), array([31,  6], dtype=int64)) 

Custom F beta using nonPOI precision and POI recall:
 0.27884615384615385 



sel_per_empty_ab_clf 

Best score:
 0.861437908496732 

Best estimator:
 P

 334 339 340 346 347 350 351 352] are constant.
  f = msb / msw


In [55]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

selected_features_list = []

#############################
### For Udacity. Not sure I need it.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, selected_features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
###############################

dump_classifier_and_data(clf, my_dataset, selected_features_list)

with open('X_train_scaled_imp0.pkl', 'wb') as file:
    pickle.dump(obj=X_train_scaled_imp0, file=file)
with open('X_test_scaled_imp0.pkl', 'wb') as file:
    pickle.dump(obj=X_test_scaled_imp0, file=file)
with open('data_df_scaled_imp0.pkl', 'wb') as file:
    pickle.dump(obj=data_df_scaled_imp0, file=file)
    
with open('X_train_scaled_imp_med.pkl', 'wb') as file:
    pickle.dump(obj=X_train_scaled_imp_med, file=file)
with open('X_test_scaled_imp_med.pkl', 'wb') as file:
    pickle.dump(obj=X_test_scaled_imp_med, file=file)
with open('data_df_scaled_imp_med.pkl', 'wb') as file:
    pickle.dump(obj=data_df_scaled_imp_med, file=file)
    
with open('X_train_scaled_imp_mv.pkl', 'wb') as file:
    pickle.dump(obj=X_train_scaled_imp_mv, file=file)
with open('X_test_scaled_imp_mv.pkl', 'wb') as file:
    pickle.dump(obj=X_test_scaled_imp_mv, file=file)
with open('data_df_scaled_imp_mv.pkl', 'wb') as file:
    pickle.dump(obj=data_df_scaled_imp_mv, file=file)

with open('full_features_list.pkl', 'wb') as file:
    pickle.dump(obj=full_features_list, file=file)
with open('selected_features_list.pkl', 'wb') as file:
    pickle.dump(obj=selected_features_list, file=file)
with open('fin_features.pkl', 'wb') as file:
    pickle.dump(obj=fin_features, file=file)
with open('pay_features.pkl', 'wb') as file:
    pickle.dump(obj=pay_features, file=file)
with open('stock_features.pkl', 'wb') as file:
    pickle.dump(obj=stock_features, file=file)
with open('email_features.pkl', 'wb') as file:
    pickle.dump(obj=email_features, file=file)
with open('pay_feats_divby_lst.pkl', 'wb') as file:
    pickle.dump(obj=pay_feats_divby_lst, file=file)
with open('stock_feats_divby_lst.pkl', 'wb') as file:
    pickle.dump(obj=stock_feats_divby_lst, file=file)
with open('email_feats_divby_lst.pkl', 'wb') as file:
    pickle.dump(obj=email_feats_divby_lst, file=file)
with open('quant_flags_lst.pkl', 'wb') as file:
    pickle.dump(obj=quant_flags_lst, file=file)
with open('sign_flags_lst.pkl', 'wb') as file:
    pickle.dump(obj=sign_flags_lst, file=file)
    
with open('base_perfs_dict.pkl', 'wb') as file:
    pickle.dump(obj=base_perfs_dict, file=file)


Imputation method stats:



imp_method
imp_mv     0.814379
imp_med    0.791503
imp0       0.732026
Name: best_score_, dtype: float64

imp_method
imp_mv     0.859869
imp_med    0.859673
imp0       0.850196
Name: best_score_, dtype: float64

imp_method
imp0       0.884314
imp_med    0.884314
imp_mv     0.883660
Name: best_score_, dtype: float64

imp_method
imp0       4
imp_med    5
imp_mv     6
Name: best_score_, dtype: int64

In [61]:
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head(15)\
    [classifiers['kn_clf']['params'].keys()]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,kn_clf__n_neighbors,kn_clf__weights,kn_clf__algorithm,kn_clf__leaf_size,kn_clf__n_jobs
imp_method,selector,decomp,classifier,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
imp0,sel_per,fica,kn_clf,4.0,distance,ball_tree,16.0,-1.0
imp_med,sel_per,fica,kn_clf,2.0,uniform,ball_tree,16.0,-1.0
imp_mv,sel_per,fica,rf_clf,,,,,
imp_med,sel_per,ipca,rf_clf,,,,,
imp_med,sel_per,fica,rf_clf,,,,,
imp_med,sel_per,fica,ab_clf,,,,,
imp0,sel_per,fica,rf_clf,,,,,
imp_mv,sel_per,ipca,kn_clf,8.0,uniform,ball_tree,16.0,-1.0
imp0,sel_per,ipca,rf_clf,,,,,
imp0,sel_per,ipca,kn_clf,4.0,uniform,ball_tree,16.0,-1.0


In [62]:
### Final search.

n_jobs = -1

mutual_info_classif_partial = partial(mutual_info_classif, random_state=42)
DecisionTreeClassifier_partial = partial(DecisionTreeClassifier, random_state=42)
RandomForestClassifier_partial = partial(RandomForestClassifier, random_state=42, n_jobs=n_jobs)
AdaBoostClassifier_partial = partial(AdaBoostClassifier, random_state=42)
svm_SVC_partial = partial(svm.SVC, random_state=42)
KNeighborsClassifier_partial = partial(KNeighborsClassifier, n_jobs=n_jobs)

selectors = {
    'sel_per': {
        'sel': SelectPercentile(),
        'params': {
            'sel_per__score_func': [f_classif, chi2, mutual_info_classif_partial],
            'sel_per__percentile': [2, 5, 10, 15, 20, 25, 30]
        }
    }
}

decomps = {
    'fica': {
        'dec': FastICA(),
        'params': {
            'fica__algorithm': ['parallel', 'deflation'],
            'fica__fun': ['logcosh', 'exp', 'cube'],
            'fica__random_state': [42]
        }
    },
}

classifiers = {
    'rf_clf': {
        'clf': RandomForestClassifier(),
        'params': {
            'rf_clf__n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
            'rf_clf__max_features': ['sqrt', 'log2'],
            'rf_clf__max_depth': [16, 32, 64],
            'rf_clf__min_samples_split': [2],
            'rf_clf__min_samples_leaf': [1, 2, 3, 4, 5],
            'rf_clf__bootstrap': [True, False],
            'rf_clf__random_state': [42],
            'rf_clf__n_jobs': [n_jobs]
        }
    },
    'ab_clf': {
        'clf': AdaBoostClassifier(),
        'params': {
            'ab_clf__base_estimator': [GaussianNB()],
            'ab_clf__n_estimators': [16, 32, 48],
            'ab_clf__algorithm': ['SAMME', 'SAMME.R'],
            'ab_clf__random_state': [42]
        }
    },
    'kn_clf': {
        'clf': KNeighborsClassifier(),
        'params': {
            'kn_clf__n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
            'kn_clf__weights': ['uniform', 'distance'],
            'kn_clf__algorithm': ['ball_tree'],
            'kn_clf__leaf_size': [8, 16, 24],
            'kn_clf__n_jobs': [n_jobs]
        }
    },
}

imp_gscvs_dict = {}
print('\nimp0\n')
imp_gscvs_dict['imp0'] = search_em_all(X_train=X_train_scaled_imp0)
print('\nimp_med\n')
imp_gscvs_dict['imp_med'] = search_em_all(X_train=X_train_scaled_imp_med)
print('\nimp_mv\n')
imp_gscvs_dict['imp_mv'] = search_em_all(X_train=X_train_scaled_imp_mv)


imp0


 0 sel_per_fica_rf_clf 

Fitting 5 folds for each of 60480 candidates, totalling 302400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   35.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | e

[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.1s
[Pipeline] ............ (step 3 of 3) Processing rf_clf, total=   0.0s



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('rf_clf', RandomForestClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'rf_clf__bootstrap': [True, False],
                         'rf_clf__max_depth': [16, 32, 64],
                         'rf_clf__max_featu...
                         'rf_clf__n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
                         'rf_clf__n_jobs': [-1], 'rf_clf__random_state': [42],
                         'sel_per__percentile': [2, 5, 10, 15, 20, 25, 30],
                         'sel_per__score_func': [<function f_classif at 0x0000024138DC39D0>,
                                 

[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:   52.0s
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1592 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 2072 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3224 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 3780 out of 3780 | elapsed:  4.2min finished
 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
 271 272 273 274 275 278 279 280] are constant.
  f = msb / msw
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.0s
[Pipeline] ............ (step 3 of 3) Processing ab_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('ab_clf', AdaBoostClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'ab_clf__algorithm': ['SAMME', 'SAMME.R'],
                         'ab_clf__base_estimator': [GaussianNB()],
                         'ab_clf__n_estimators': [16, 32, 48],
                         'ab_clf__random_state': [42],
                         'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'sel_per__percentile': [2, 5, 10, 15, 20

[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 360 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done 584 tasks      | elapsed:   36.9s
[Parallel(n_jobs=-1)]: Done 872 tasks      | elapsed:   55.3s
[Parallel(n_jobs=-1)]: Done 1224 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1640 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2120 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 2664 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3272 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 3944 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 4680 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 5480 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 6344 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 7272 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 8264 tasks      | elapsed:  8.5m

[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.1s
[Pipeline] ............ (step 3 of 3) Processing kn_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('kn_clf', KNeighborsClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'kn_clf__algorithm': ['ball_tree'],
                         'kn_clf__leaf_size': [8, 16, 24],
                         'kn_clf__n_jobs': [-1],
                         'kn_clf__n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'kn_clf__weights': ['uniform', 'distance'],


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   35.6s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:   56.1s
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1592 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2072 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 3224 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 3896 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 4632 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 5432 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 6296 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 7224 tasks      | e

[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.1s
[Pipeline] ............ (step 3 of 3) Processing rf_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('rf_clf', RandomForestClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'rf_clf__bootstrap': [True, False],
                         'rf_clf__max_depth': [16, 32, 64],
                         'rf_clf__max_featu...
                         'rf_clf__n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
                         'rf_clf__n_jobs': [-1], 'rf_clf__random_st

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:   54.7s
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1592 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2072 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 3224 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 3780 out of 3780 | elapsed:  4.5min finished


[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.6s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.1s
[Pipeline] ............ (step 3 of 3) Processing ab_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('ab_clf', AdaBoostClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'ab_clf__algorithm': ['SAMME', 'SAMME.R'],
                         'ab_clf__base_estimator': [GaussianNB()],
                         'ab_clf__n_estimators': [16, 32, 48],
                         'ab_clf__random_state': [42],
                         'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'sel_per__percentile': [2, 5, 10, 15, 20

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   43.1s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1592 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2072 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 3224 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 3896 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 4632 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 5432 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 6296 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 7224 tasks      | e

[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.1s
[Pipeline] ............ (step 3 of 3) Processing kn_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('kn_clf', KNeighborsClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'kn_clf__algorithm': ['ball_tree'],
                         'kn_clf__leaf_size': [8, 16, 24],
                         'kn_clf__n_jobs': [-1],
                         'kn_clf__n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'kn_clf__weights': ['uniform', 'distance'],




Fitting 5 folds for each of 60480 candidates, totalling 302400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:   50.5s
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1592 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 2072 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 3224 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 3896 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 4632 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 5432 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 6296 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 7224 tasks      | e

[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.6s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.3s
[Pipeline] ............ (step 3 of 3) Processing rf_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('rf_clf', RandomForestClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'rf_clf__bootstrap': [True, False],
                         'rf_clf__max_depth': [16, 32, 64],
                         'rf_clf__max_featu...
                         'rf_clf__n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
                         'rf_clf__n_jobs': [-1], 'rf_clf__random_state': [42],
                         'sel_per__percentile': [2, 5, 10, 

[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:   48.2s
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1592 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 2072 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 3224 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 3780 out of 3780 | elapsed:  3.9min finished
 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
 271 272 273 274 275 278 279 280] are constant.
  f = msb / msw
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.1s
[Pipeline] ............ (step 3 of 3) Processing ab_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('ab_clf', AdaBoostClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'ab_clf__algorithm': ['SAMME', 'SAMME.R'],
                         'ab_clf__base_estimator': [GaussianNB()],
                         'ab_clf__n_estimators': [16, 32, 48],
                         'ab_clf__random_state': [42],
                         'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'sel_per__percentile': [2, 5, 10, 15, 20

[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:   47.2s
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1592 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 2072 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3224 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 3896 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 4632 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 5432 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 6296 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 7224 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 8216 tasks      | elapsed:  7.6m

[Pipeline] ........... (step 1 of 3) Processing sel_per, total=   0.0s
[Pipeline] .............. (step 2 of 3) Processing fica, total=   0.1s
[Pipeline] ............ (step 3 of 3) Processing kn_clf, total=   0.0s

 GridSearchCV(estimator=Pipeline(steps=[('sel_per', SelectPercentile()),
                                       ('fica', FastICA()),
                                       ('kn_clf', KNeighborsClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'fica__algorithm': ['parallel', 'deflation'],
                         'fica__fun': ['logcosh', 'exp', 'cube'],
                         'fica__random_state': [42],
                         'kn_clf__algorithm': ['ball_tree'],
                         'kn_clf__leaf_size': [8, 16, 24],
                         'kn_clf__n_jobs': [-1],
                         'kn_clf__n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'kn_clf__weights': ['uniform', 'distance'],




In [64]:
with open('imp_gscvs_dict_2.pkl', 'wb') as file:
    pickle.dump(obj=imp_gscvs_dict, file=file)

In [65]:
### Well, that was silly of me to return my search data in this structure.
### Reformat and expand data into a dataframe.
imp_gscvs_df = pd.DataFrame(columns=['imp_method', 'selector', 'decomp', 'classifier', 'best_score_',
                                     'gscv'])

for imp, gscv_dict in imp_gscvs_dict.items():
    for steps_name, gscv in gscv_dict.items():
        row_dict = {'imp_method': imp}
        
        steps_lst = steps_name.split('_')
        if len(steps_lst) == 5:
            row_dict['selector'] = '_'.join(steps_lst[:2])
            row_dict['decomp'] = steps_lst[2]
            row_dict['classifier'] = '_'.join(steps_lst[3:])
            row_dict['best_score_'] = gscv.best_score_
            row_dict['gscv'] = gscv
        
        row_dict.update(gscv.best_params_)
        imp_gscvs_df = imp_gscvs_df.append(row_dict, ignore_index=True)
        
imp_gscvs_df = imp_gscvs_df.set_index(keys=['imp_method', 'selector', 'decomp', 'classifier'])
imp_gscvs_df.info()
imp_gscvs_df

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 9 entries, ('imp0', 'sel_per', 'fica', 'rf_clf') to ('imp_mv', 'sel_per', 'fica', 'kn_clf')
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   best_score_                9 non-null      float64
 1   gscv                       9 non-null      object 
 2   fica__algorithm            9 non-null      object 
 3   fica__fun                  9 non-null      object 
 4   fica__random_state         9 non-null      float64
 5   rf_clf__bootstrap          3 non-null      float64
 6   rf_clf__max_depth          3 non-null      float64
 7   rf_clf__max_features       3 non-null      object 
 8   rf_clf__min_samples_leaf   3 non-null      float64
 9   rf_clf__min_samples_split  3 non-null      float64
 10  rf_clf__n_estimators       3 non-null      float64
 11  rf_clf__n_jobs             3 non-null      float64
 12  rf_clf__random_state       3 non-nul

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,best_score_,gscv,fica__algorithm,fica__fun,fica__random_state,rf_clf__bootstrap,rf_clf__max_depth,rf_clf__max_features,rf_clf__min_samples_leaf,rf_clf__min_samples_split,...,sel_per__score_func,ab_clf__algorithm,ab_clf__base_estimator,ab_clf__n_estimators,ab_clf__random_state,kn_clf__algorithm,kn_clf__leaf_size,kn_clf__n_jobs,kn_clf__n_neighbors,kn_clf__weights
imp_method,selector,decomp,classifier,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
imp0,sel_per,fica,rf_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,1.0,16.0,log2,3.0,2.0,...,<function chi2 at 0x0000024138DC3D30>,,,,,,,,,
imp0,sel_per,fica,ab_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,cube,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,SAMME.R,GaussianNB(),16.0,42.0,,,,,
imp0,sel_per,fica,kn_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,,,,,,...,<function chi2 at 0x0000024138DC3D30>,,,,,ball_tree,8.0,-1.0,3.0,uniform
imp_med,sel_per,fica,rf_clf,0.895425,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,deflation,exp,42.0,1.0,16.0,sqrt,3.0,2.0,...,<function chi2 at 0x0000024138DC3D30>,,,,,,,,,
imp_med,sel_per,fica,ab_clf,0.872549,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,,,,,,...,functools.partial(<function mutual_info_classi...,SAMME.R,GaussianNB(),32.0,42.0,,,,,
imp_med,sel_per,fica,kn_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,,,,,ball_tree,16.0,-1.0,3.0,uniform
imp_mv,sel_per,fica,rf_clf,0.896078,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,logcosh,42.0,1.0,16.0,sqrt,2.0,2.0,...,functools.partial(<function mutual_info_classi...,,,,,,,,,
imp_mv,sel_per,fica,ab_clf,0.872549,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,deflation,exp,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,SAMME.R,GaussianNB(),48.0,42.0,,,,,
imp_mv,sel_per,fica,kn_clf,0.872549,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,logcosh,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,,,,,ball_tree,24.0,-1.0,9.0,uniform


In [80]:
### Sort by best_score_.
print('Best scores sorted:\n')
imp_gscvs_df.sort_values(by='best_score_', ascending=False)

Best scores sorted:



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,best_score_,gscv,fica__algorithm,fica__fun,fica__random_state,rf_clf__bootstrap,rf_clf__max_depth,rf_clf__max_features,rf_clf__min_samples_leaf,rf_clf__min_samples_split,...,sel_per__score_func,ab_clf__algorithm,ab_clf__base_estimator,ab_clf__n_estimators,ab_clf__random_state,kn_clf__algorithm,kn_clf__leaf_size,kn_clf__n_jobs,kn_clf__n_neighbors,kn_clf__weights
imp_method,selector,decomp,classifier,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
imp_mv,sel_per,fica,rf_clf,0.896078,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,logcosh,42.0,1.0,16.0,sqrt,2.0,2.0,...,functools.partial(<function mutual_info_classi...,,,,,,,,,
imp_med,sel_per,fica,rf_clf,0.895425,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,deflation,exp,42.0,1.0,16.0,sqrt,3.0,2.0,...,<function chi2 at 0x0000024138DC3D30>,,,,,,,,,
imp0,sel_per,fica,rf_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,1.0,16.0,log2,3.0,2.0,...,<function chi2 at 0x0000024138DC3D30>,,,,,,,,,
imp0,sel_per,fica,ab_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,cube,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,SAMME.R,GaussianNB(),16.0,42.0,,,,,
imp0,sel_per,fica,kn_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,,,,,,...,<function chi2 at 0x0000024138DC3D30>,,,,,ball_tree,8.0,-1.0,3.0,uniform
imp_med,sel_per,fica,kn_clf,0.884314,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,,,,,ball_tree,16.0,-1.0,3.0,uniform
imp_med,sel_per,fica,ab_clf,0.872549,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,exp,42.0,,,,,,...,functools.partial(<function mutual_info_classi...,SAMME.R,GaussianNB(),32.0,42.0,,,,,
imp_mv,sel_per,fica,ab_clf,0.872549,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,deflation,exp,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,SAMME.R,GaussianNB(),48.0,42.0,,,,,
imp_mv,sel_per,fica,kn_clf,0.872549,GridSearchCV(estimator=Pipeline(steps=[('sel_p...,parallel,logcosh,42.0,,,,,,...,<function f_classif at 0x0000024138DC39D0>,,,,,ball_tree,24.0,-1.0,9.0,uniform


In [78]:
print('\nClassifier stats:\n')
imp_gscvs_df.groupby(by='classifier')['best_score_'].min().sort_values(ascending=False)
imp_gscvs_df.groupby(by='classifier')['best_score_'].mean().sort_values(ascending=False)
imp_gscvs_df.groupby(by='classifier')['best_score_'].max().sort_values(ascending=False)

imp_gscvs_df.sort_values(by='best_score_', ascending=False).head().groupby(by='classifier')\
    ['best_score_'].count()

### Groupby classifier parameters.
for clf, clf_dict in classifiers.items():
    for param in clf_dict['params'].keys():
        print('Count of', str(clf_dict['clf']), 'best', param, ':')
        print('Possible values:')
        clf_dict['params'][param]
        imp_gscvs_df.groupby(by=param, sort=False)['best_score_'].count()
        imp_gscvs_df.sort_values(by='best_score_', ascending=False).head()\
            .groupby(by=param, sort=False)['best_score_'].count()


Classifier stats:



classifier
rf_clf    0.884314
ab_clf    0.872549
kn_clf    0.872549
Name: best_score_, dtype: float64

classifier
rf_clf    0.891939
kn_clf    0.880392
ab_clf    0.876471
Name: best_score_, dtype: float64

classifier
rf_clf    0.896078
ab_clf    0.884314
kn_clf    0.884314
Name: best_score_, dtype: float64

classifier
ab_clf    1
kn_clf    1
rf_clf    3
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__n_estimators :
Possible values:


[2, 4, 6, 8, 10, 12, 14, 16]

rf_clf__n_estimators
16.0    1
4.0     1
2.0     1
Name: best_score_, dtype: int64

rf_clf__n_estimators
2.0     1
4.0     1
16.0    1
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__max_features :
Possible values:


['sqrt', 'log2']

rf_clf__max_features
log2    1
sqrt    2
Name: best_score_, dtype: int64

rf_clf__max_features
sqrt    2
log2    1
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__max_depth :
Possible values:


[16, 32, 64]

rf_clf__max_depth
16.0    3
Name: best_score_, dtype: int64

rf_clf__max_depth
16.0    3
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__min_samples_split :
Possible values:


[2]

rf_clf__min_samples_split
2.0    3
Name: best_score_, dtype: int64

rf_clf__min_samples_split
2.0    3
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__min_samples_leaf :
Possible values:


[1, 2, 3, 4, 5]

rf_clf__min_samples_leaf
3.0    2
2.0    1
Name: best_score_, dtype: int64

rf_clf__min_samples_leaf
2.0    1
3.0    2
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__bootstrap :
Possible values:


[True, False]

rf_clf__bootstrap
1.0    3
Name: best_score_, dtype: int64

rf_clf__bootstrap
1.0    3
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__random_state :
Possible values:


[42]

rf_clf__random_state
42.0    3
Name: best_score_, dtype: int64

rf_clf__random_state
42.0    3
Name: best_score_, dtype: int64

Count of RandomForestClassifier() best rf_clf__n_jobs :
Possible values:


[-1]

rf_clf__n_jobs
-1.0    3
Name: best_score_, dtype: int64

rf_clf__n_jobs
-1.0    3
Name: best_score_, dtype: int64

Count of AdaBoostClassifier() best ab_clf__base_estimator :
Possible values:


[GaussianNB()]

ab_clf__base_estimator
GaussianNB()    3
Name: best_score_, dtype: int64

ab_clf__base_estimator
GaussianNB()    1
Name: best_score_, dtype: int64

Count of AdaBoostClassifier() best ab_clf__n_estimators :
Possible values:


[16, 32, 48]

ab_clf__n_estimators
16.0    1
32.0    1
48.0    1
Name: best_score_, dtype: int64

ab_clf__n_estimators
16.0    1
Name: best_score_, dtype: int64

Count of AdaBoostClassifier() best ab_clf__algorithm :
Possible values:


['SAMME', 'SAMME.R']

ab_clf__algorithm
SAMME.R    3
Name: best_score_, dtype: int64

ab_clf__algorithm
SAMME.R    1
Name: best_score_, dtype: int64

Count of AdaBoostClassifier() best ab_clf__random_state :
Possible values:


[42]

ab_clf__random_state
42.0    3
Name: best_score_, dtype: int64

ab_clf__random_state
42.0    1
Name: best_score_, dtype: int64

Count of KNeighborsClassifier() best kn_clf__n_neighbors :
Possible values:


[2, 3, 4, 5, 6, 7, 8, 9, 10]

kn_clf__n_neighbors
3.0    2
9.0    1
Name: best_score_, dtype: int64

kn_clf__n_neighbors
3.0    1
Name: best_score_, dtype: int64

Count of KNeighborsClassifier() best kn_clf__weights :
Possible values:


['uniform', 'distance']

kn_clf__weights
uniform    3
Name: best_score_, dtype: int64

kn_clf__weights
uniform    1
Name: best_score_, dtype: int64

Count of KNeighborsClassifier() best kn_clf__algorithm :
Possible values:


['ball_tree']

kn_clf__algorithm
ball_tree    3
Name: best_score_, dtype: int64

kn_clf__algorithm
ball_tree    1
Name: best_score_, dtype: int64

Count of KNeighborsClassifier() best kn_clf__leaf_size :
Possible values:


[8, 16, 24]

kn_clf__leaf_size
8.0     1
16.0    1
24.0    1
Name: best_score_, dtype: int64

kn_clf__leaf_size
8.0    1
Name: best_score_, dtype: int64

Count of KNeighborsClassifier() best kn_clf__n_jobs :
Possible values:


[-1]

kn_clf__n_jobs
-1.0    3
Name: best_score_, dtype: int64

kn_clf__n_jobs
-1.0    1
Name: best_score_, dtype: int64

In [79]:
### Groupby decomp parameters.
print('Count of FastICA best algorithms:')
print('Possible values:')
decomps['fica']['params']['fica__algorithm']
imp_gscvs_df.groupby(by='fica__algorithm')['best_score_'].count()
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head()\
    .groupby(by='fica__algorithm')['best_score_'].count()

print('Count of FastICA best functions:')
print('Possible values:')
decomps['fica']['params']['fica__fun']
imp_gscvs_df.groupby(by='fica__fun')['best_score_'].count()
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head()\
    .groupby(by='fica__fun')['best_score_'].count()

Count of FastICA best algorithms:
Possible values:


['parallel', 'deflation']

fica__algorithm
deflation    2
parallel     7
Name: best_score_, dtype: int64

fica__algorithm
deflation    1
parallel     4
Name: best_score_, dtype: int64

Count of FastICA best functions:
Possible values:


['logcosh', 'exp', 'cube']

fica__fun
cube       1
exp        6
logcosh    2
Name: best_score_, dtype: int64

fica__fun
cube       1
exp        3
logcosh    1
Name: best_score_, dtype: int64

In [69]:
###  Groupby selector parameters.
print('Count of SelectPercentile best score functions:')
print('Possible values:')
selectors['sel_per']['params']['sel_per__score_func']
imp_gscvs_df.groupby(by='sel_per__score_func', sort=False)['best_score_'].count()
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head()\
    .groupby(by='sel_per__score_func', sort=False)['best_score_'].count()

print('Count of SelectPercentile best percentiles:')
print('Possible values:')
selectors['sel_per']['params']['sel_per__percentile']
imp_gscvs_df.groupby(by='sel_per__percentile')['best_score_'].count()
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head()\
    .groupby(by='sel_per__percentile')['best_score_'].count()

Count of SelectPercentile best score functions:
Possible values:


[<function sklearn.feature_selection._univariate_selection.f_classif(X, y)>,
 <function sklearn.feature_selection._univariate_selection.chi2(X, y)>,
 functools.partial(<function mutual_info_classif at 0x00000241390F9040>, random_state=42)]

sel_per__score_func
<function chi2 at 0x0000024138DC3D30>                                                       3
<function f_classif at 0x0000024138DC39D0>                                                  4
functools.partial(<function mutual_info_classif at 0x00000241390F9040>, random_state=42)    2
Name: best_score_, dtype: int64

sel_per__score_func
functools.partial(<function mutual_info_classif at 0x00000241390F9040>, random_state=42)    1
<function chi2 at 0x0000024138DC3D30>                                                       3
<function f_classif at 0x0000024138DC39D0>                                                  1
Name: best_score_, dtype: int64

Count of SelectPercentile best percentiles:
Possible values:


[2, 5, 10, 15, 20, 25, 30]

sel_per__percentile
2.0     1
5.0     1
10.0    3
15.0    1
20.0    2
25.0    1
Name: best_score_, dtype: int64

sel_per__percentile
2.0     1
5.0     1
10.0    1
15.0    1
25.0    1
Name: best_score_, dtype: int64

In [70]:
### Groupby methods/steps and compare count and score min, max, mean.
print('\nImputation method stats:\n')
imp_gscvs_df.groupby(by='imp_method')['best_score_'].min().sort_values(ascending=False)
imp_gscvs_df.groupby(by='imp_method')['best_score_'].mean().sort_values(ascending=False)
imp_gscvs_df.groupby(by='imp_method')['best_score_'].max().sort_values(ascending=False)

imp_gscvs_df.sort_values(by='best_score_', ascending=False).head().groupby(by='imp_method')\
    ['best_score_'].count()


Imputation method stats:



imp_method
imp0       0.884314
imp_med    0.872549
imp_mv     0.872549
Name: best_score_, dtype: float64

imp_method
imp0       0.884314
imp_med    0.884096
imp_mv     0.880392
Name: best_score_, dtype: float64

imp_method
imp_mv     0.896078
imp_med    0.895425
imp0       0.884314
Name: best_score_, dtype: float64

imp_method
imp0       3
imp_med    1
imp_mv     1
Name: best_score_, dtype: int64

In [71]:
imp_gscvs_df.sort_values(by='best_score_', ascending=False).head()\
    [classifiers['kn_clf']['params'].keys()]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,kn_clf__n_neighbors,kn_clf__weights,kn_clf__algorithm,kn_clf__leaf_size,kn_clf__n_jobs
imp_method,selector,decomp,classifier,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
imp_mv,sel_per,fica,rf_clf,,,,,
imp_med,sel_per,fica,rf_clf,,,,,
imp0,sel_per,fica,rf_clf,,,,,
imp0,sel_per,fica,ab_clf,,,,,
imp0,sel_per,fica,kn_clf,3.0,uniform,ball_tree,8.0,-1.0
