## Imports

In [None]:
# Py Data Stack
import numpy as np
import pandas as pd

# Visualization
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Classifiers
from tpot import TPOTClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Sklearn Auxilarty Functions
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# SMOTE
from imblearn.over_sampling import SMOTE

# Auxilary
from tqdm import tqdm

## 1. Loading the Dataset

In [None]:
col_names = ['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our', 'word_freq_over',
             'word_freq_remove', 'word_freq_internet', 'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will',
             'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free', 'word_freq_business', 'word_freq_email',
             'word_freq_you', 'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money', 'word_freq_hp',
             'word_freq_hpl', 'word_freq_george', 'word_freq_650', 'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
             'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology', 'word_freq_1999', 'word_freq_parts', 'word_freq_pm',
             'word_freq_direct', 'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project', 'word_freq_re', 'word_freq_edu',
             'word_freq_table', 'word_freq_conference', 'char_freq_;',  'char_freq_(', 'char_freq_[', 'char_freq_!', 'char_freq_$',
             'char_freq_#', 'capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total', 'LABEL']
len(col_names)

In [None]:
df = pd.read_csv('spambase.data', header=None, names=col_names)
print(f'df.shape: {df.shape}')
df.head()

In [None]:
# Check if there are any missing values and if all data types were properly loaded
df.info()

### 1.1. Train-Test Split

In [None]:
df_train, df_test= train_test_split(df, test_size=0.2, stratify=df.iloc[:,-1])
print(f'df_train.shape: {df_train.shape}')
print(f'df_test.shape: {df_test.shape}')

### 1.2. Exploratory Analysis

Considering only the training data so as to leave testing data untouched

In [None]:
# Check the distribution of datapoint
df_train.describe()

In [None]:
# Check the distribution of values using a boxplot
fix, ax = plt.subplots(figsize=(15,15))
df_train.boxplot()
plt.show()

The skewness in this boxplot indicated the need to implement a MinMaxScaler during the model training pipeline.

In [None]:
sns.boxplot(df_train)

In [None]:
# Correlation Matrix
fig, ax = plt.subplots(figsize=(40,40))
g = sns.heatmap(df_train.corr(), vmin=-1, vmax=1, cmap='coolwarm_r', square=True)
plt.show()

In [None]:
# Coefficient of Variation
var_mean = df_train.mean(axis=0)
var_std = df_train.std(axis=0)
CV = var_std/var_mean
CV

In [None]:
# Identify the features with the highest CV
num_picks = int(np.floor(np.sqrt(df.shape[1])))
print(f'Number of features chosen: {num_picks}')
chosen_features = CV.sort_values(ascending=False).head(num_picks).index
chosen_features = [i for i in chosen_features]
print(f'List of features chosen: {chosen_features}')

In [None]:
# Scatterplots using target variable as hue
g = sns.pairplot(data=df_train[chosen_features], plot_kws=dict(hue=df_train.iloc[:,-1] , palette='coolwarm'), diag_kws=dict(color='black'))
g.add_legend(title="Target Variable")
plt.show()

In [None]:
# Boxplots
fig = plt.figure(figsize=(20,10))
g = sns.boxplot(data=df_train[chosen_features], orient='h')
plt.show()

## 2. Building the Classifier

### 2.1 Defining Grid Searches to Optimize the Classifiers

#### Support Vector Classifier

In [23]:
# Pipeline to standardize then run the classifier
svc =  Pipeline([("standardize", StandardScaler()),
                 ("svc", SVC(kernel="rbf", decision_function_shape='ovr'))])

# Grid with parameters to be tested via CV
svc_param_grid_ = {'svc__C': np.logspace(-3, 3, 5),
                   'svc__gamma': np.logspace(-3, 3, 5)}

# Instantiate GridSearchCV using accuracy as the scorer
svc_gridCV = GridSearchCV(svc, svc_param_grid_, cv=5, n_jobs=-1, scoring='accuracy')

#### Random Forest Classifier

In [24]:
# Pipeline to standardize then run the classifier
rfc = Pipeline([("standardize", StandardScaler()),
                 ("rfc", RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1))]) 

# Grid with parameters to be tested via CV
rfc_param_grid_ = {'rfc__min_samples_split': [2,3],
                   'rfc__min_samples_leaf': [1,2,3]}

# Instantiate GridSearchCV using accuracy as the scorer
rfc_gridCV = GridSearchCV(rfc, rfc_param_grid_, cv=5, n_jobs=-1, scoring='accuracy')

#### KNN Classifier

In [25]:
knn = Pipeline([("standardize", StandardScaler()),
                ("knn", KNeighborsClassifier(metric='minkowski', leaf_size=30, weights='distance', n_jobs=-1))]) 

# Grid with parameters to be tested via CV
knn_param_grid_ = {'knn__n_neighbors': [3,5,7,9]}

# Instantiate GridSearchCV using accuracy as the scorer
knn_gridCV = GridSearchCV(knn, knn_param_grid_, cv=5, n_jobs=-1, scoring='accuracy')

#### Gaussian Process Classifier

In [26]:
gpc = Pipeline([("standardize", StandardScaler()),
                ("gpc", GaussianProcessClassifier(optimizer='fmin_l_bfgs_b', max_iter_predict=100, n_jobs=-1))]) 

# Grid with parameters to be tested via CV
gpc_param_grid_ = {'gpc__n_restarts_optimizer': [0,5,10]}

# Instantiate GridSearchCV using accuracy as the scorer
gpc_gridCV = GridSearchCV(gpc, gpc_param_grid_, cv=5, n_jobs=-1, scoring='accuracy')

#### Linear Discriminant Analysis

In [27]:
lda = Pipeline([("standardize", StandardScaler()),
                ("lda", LinearDiscriminantAnalysis(solver='svd'))]) 

# Grid with parameters to be tested via CV
lda_param_grid_ = {'lda__tol': [1.0e-2, 1.0e-4, 1.0e-6]}

# Instantiate GridSearchCV using accuracy as the scorer
lda_gridCV = GridSearchCV(lda, lda_param_grid_, cv=5, n_jobs=-1, scoring='accuracy')

### 2.2 Finding Optimal Hyperparameter with Grid Search & K-Fold Cross-Validation

In [32]:
%%time
# Instantiate SMOTE
smote = SMOTE(n_jobs=-1)

# KFold
kf = KFold(n_splits=5, shuffle=True)
kfold_intermediate_results = pd.DataFrame()
for fold_num, (idx_train, idx_valid) in enumerate(kf.split(df_train), 1):

    # Print current label and fold
    print(f'Working on Fold: {fold_num}')

    # Select all folds to be smoted except for the validation fold
    x_train, y_train = smote.fit_sample(df_train.iloc[idx_train,:-1], df_train.iloc[idx_train,-1])
    x_valid = df_train.iloc[idx_valid,:-1] 
    y_valid = df_train.iloc[idx_valid,-1] 

    # Fit using grid search to find the best params
    svc_gridCV.fit(x_train, y_train)
    rfc_gridCV.fit(x_train, y_train)
    knn_gridCV.fit(x_train, y_train)
    gpc_gridCV.fit(x_train, y_train)
    lda_gridCV.fit(x_train, y_train)

    # Predict on the train and validation folds to calculate metrics
    pred_svc_train = svc_gridCV.predict(x_train)   
    pred_svc_valid = svc_gridCV.predict(x_valid)
    pred_rfc_train = rfc_gridCV.predict(x_train)   
    pred_rfc_valid = rfc_gridCV.predict(x_valid)   
    pred_knn_train = knn_gridCV.predict(x_train)   
    pred_knn_valid = knn_gridCV.predict(x_valid)
    pred_gpc_train = gpc_gridCV.predict(x_train)   
    pred_gpc_valid = gpc_gridCV.predict(x_valid)
    pred_lda_train = lda_gridCV.predict(x_train)   
    pred_lda_valid = lda_gridCV.predict(x_valid)
    
    # Store best params of each classifier for each fold
    kfold_intermediate_results.at['SVC_C', f'{fold_num}'] = svc_gridCV.best_params_['svc__C']
    kfold_intermediate_results.at['SVC_gamma', f'{fold_num}'] = svc_gridCV.best_params_['svc__gamma']
    kfold_intermediate_results.at['RFC_split', f'{fold_num}'] = rfc_gridCV.best_params_['rfc__min_samples_split']
    kfold_intermediate_results.at['RFC_leaf', f'{fold_num}'] = rfc_gridCV.best_params_['rfc__min_samples_leaf']
    kfold_intermediate_results.at['KNN_N', f'{fold_num}'] = knn_gridCV.best_params_['knn__n_neighbors']
    kfold_intermediate_results.at['GPC_restarts', f'{fold_num}'] = gpc_gridCV.best_params_['gpc__n_restarts_optimizer']
    kfold_intermediate_results.at['LDA_tol', f'{fold_num}'] = lda_gridCV.best_params_['lda__tol']
   
    # Store K-Fold intermedaite results
    kfold_intermediate_results.at['SVC_train', f'{fold_num}'] = 1-accuracy_score(y_true=y_train, y_pred=pred_svc_train)
    kfold_intermediate_results.at['SVC_valid', f'{fold_num}'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_svc_valid)
    kfold_intermediate_results.at['RFC_train', f'{fold_num}'] = 1-accuracy_score(y_true=y_train, y_pred=pred_rfc_train)
    kfold_intermediate_results.at['RFC_valid', f'{fold_num}'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_rfc_valid)
    kfold_intermediate_results.at['KNN_train', f'{fold_num}'] = 1-accuracy_score(y_true=y_train, y_pred=pred_knn_train)
    kfold_intermediate_results.at['KNN_valid', f'{fold_num}'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_knn_valid)
    kfold_intermediate_results.at['GPC_train', f'{fold_num}'] = 1-accuracy_score(y_true=y_train, y_pred=pred_gpc_train)
    kfold_intermediate_results.at['GPC_valid', f'{fold_num}'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_gpc_valid)    
    kfold_intermediate_results.at['LDA_train', f'{fold_num}'] = 1-accuracy_score(y_true=y_train, y_pred=pred_lda_train)
    kfold_intermediate_results.at['LDA_valid', f'{fold_num}'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_lda_valid)
    
# After running all K-Folds get average results for each classifier
kfold_intermediate_results['mean'] = kfold_intermediate_results.mean(axis=1)

print()
print('--- GridSearch Best Parameters ---')
print(f'Mean SVC C Parameter: {kfold_intermediate_results["mean"]["SVC_C"]}')
print(f'Mean SVC gamma Parameter: {kfold_intermediate_results["mean"]["SVC_gamma"]}')
print(f'Mean RFC min_samples_split Parameter: {kfold_intermediate_results["mean"]["RFC_split"]}')
print(f'Mean RFC min_samples_leaf Parameter: {kfold_intermediate_results["mean"]["RFC_leaf"]}')
print(f'Mean KNN N Parameter: {kfold_intermediate_results["mean"]["RFC_split"]}')
print(f'Mean GPA n_restarts_optimizer Parameter: {kfold_intermediate_results["mean"]["GPC_restarts"]}')
print(f'Mean LDA tol Parameter: {kfold_intermediate_results["mean"]["LDA_tol"]}')
print()
print('--- K-Fold Cross-Validation Results ---')
print(f'SVC Error | Training : {kfold_intermediate_results["mean"]["SVC_train"]}')
print(f'RFC Error | Training : {kfold_intermediate_results["mean"]["RFC_train"]}')
print(f'KNN Error | Training : {kfold_intermediate_results["mean"]["KNN_train"]}')
print(f'GPC Error | Training : {kfold_intermediate_results["mean"]["GPC_train"]}')
print(f'LDA Error | Training : {kfold_intermediate_results["mean"]["LDA_train"]}')
print()
print(f'SVC Error | Validation : {kfold_intermediate_results["mean"]["SVC_valid"]}')
print(f'RFC Error | Validation : {kfold_intermediate_results["mean"]["RFC_valid"]}')
print(f'KNN Error | Validation : {kfold_intermediate_results["mean"]["KNN_valid"]}')
print(f'GPC Error | Validation : {kfold_intermediate_results["mean"]["GPC_valid"]}')
print(f'LDA Error | Validation : {kfold_intermediate_results["mean"]["LDA_valid"]}')
print()

Working on Fold: 1
Working on Fold: 2
Working on Fold: 3
Working on Fold: 4
Working on Fold: 5

--- GridSearch Best Parameters ---
Mean SVC C Parameter: 1000.0
Mean SVC gamma Parameter: 0.001
Mean RFC min_samples_split Parameter: 2.2
Mean RFC min_samples_leaf Parameter: 1.0
Mean KNN N Parameter: 2.2
Mean GPA n_restarts_optimizer Parameter: 0.0
Mean LDA tol Parameter: 0.01

--- K-Fold Cross-Validation Results ---
SVC Error | Training : 0.03329469338255566
RFC Error | Training : 0.0002801126074668536
KNN Error | Training : 0.00016806778953968316
GPC Error | Training : 0.01059442754648281
LDA Error | Training : 0.09220863331581039

SVC Error | Validation : 0.0605978260869565
RFC Error | Validation : 0.044836956521739135
KNN Error | Validation : 0.08804347826086956
GPC Error | Validation : 0.0798913043478261
LDA Error | Validation : 0.09429347826086956

Wall time: 12min 48s


Considering the validation data the Random Forest Classifier is ahead, but none of the other algorithms is too far behind!
Let me now use the best hyperparameters from each K-Fold iteration to build a final version of each model to be tested on the test dataset, so that I can select the best model.

### 2.3 Comparing Results of the Optimized Classifiers & Selecting the Best

In [35]:
# Dataframe to store results
summary = pd.DataFrame()

# Create new classifiers using the average of the best parameters in each k-fold
svc_kfold = SVC(kernel="rbf", decision_function_shape='ovr',
                C=kfold_intermediate_results.at['SVC_C', 'mean'],
                gamma=kfold_intermediate_results.at['SVC_gamma', 'mean'])

rfc_kfold = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1,
                                   min_samples_split=round(kfold_intermediate_results.at['RFC_split', 'mean']),
                                   min_samples_leaf=round(kfold_intermediate_results.at['RFC_leaf', 'mean']))

knn_kfold = KNeighborsClassifier(metric='minkowski', leaf_size=30, weights='distance', n_jobs=-1,
                                 n_neighbors=round(kfold_intermediate_results.at['KNN_N', 'mean']))

gpc_kfold = GaussianProcessClassifier(optimizer='fmin_l_bfgs_b', max_iter_predict=100, n_jobs=-1,
                                      n_restarts_optimizer=round(kfold_intermediate_results["mean"]["GPC_restarts"]))
                                       
lda_kfold = LinearDiscriminantAnalysis(solver='svd',
                                       tol=kfold_intermediate_results.at['LDA_tol', 'mean'])


# Get X's and Y's - This time using the full datasets for trainin and testing
x_train, y_train = smote.fit_sample(df_train.iloc[:,:-1], df_train.iloc[:,-1])
x_test = df_test.iloc[:, :-1].copy()
y_test = df_test.iloc[:, -1].copy()

# Fit using the optimized model created with the mean hyperparameters from K-Fold cross-validation
svc_kfold.fit(x_train, y_train)
rfc_kfold.fit(x_train, y_train)
knn_kfold.fit(x_train, y_train)
gpc_kfold.fit(x_train, y_train)
lda_kfold.fit(x_train, y_train)                                     
                                       
# Predict
pred_svc_train = svc_kfold.predict(x_train)
pred_svc_test = svc_kfold.predict(x_test)
pred_rfc_train = rfc_kfold.predict(x_train)
pred_rfc_test = rfc_kfold.predict(x_test)
pred_knn_train = knn_kfold.predict(x_train)
pred_knn_test = knn_kfold.predict(x_test)
pred_gpc_train = gpc_kfold.predict(x_train)
pred_gpc_test = gpc_kfold.predict(x_test)
pred_lda_train = gpc_kfold.predict(x_train)
pred_lda_test = gpc_kfold.predict(x_test)
                                       
# Model Error
summary.at['train', f'SVC'] = 1-accuracy_score(y_true=y_train, y_pred=pred_svc_train)
summary.at['test', f'SVC'] = 1-accuracy_score(y_true=y_test, y_pred=pred_svc_test)
summary.at['train', f'RFC'] = 1-accuracy_score(y_true=y_train, y_pred=pred_rfc_train)
summary.at['test', f'RFC'] = 1-accuracy_score(y_true=y_test, y_pred=pred_rfc_test)
summary.at['train', f'KNN'] = 1-accuracy_score(y_true=y_train, y_pred=pred_knn_train)
summary.at['test', f'KNN'] = 1-accuracy_score(y_true=y_test, y_pred=pred_knn_test)
summary.at['train', f'GPC'] = 1-accuracy_score(y_true=y_train, y_pred=pred_gpc_train)
summary.at['test', f'GPC'] = 1-accuracy_score(y_true=y_test, y_pred=pred_gpc_test)
summary.at['train', f'LDA'] = 1-accuracy_score(y_true=y_train, y_pred=pred_lda_train)
summary.at['test', f'LDA'] = 1-accuracy_score(y_true=y_test, y_pred=pred_lda_test)
                                        
# Print model results for optimized classifiers model
print(f'------------------------------ MODEL OVERALL ------------------------------') 
print('SVC Error | Training: ', summary.at['train', f'SVC'])
print('RFC Error | Training: ', summary.at['train', f'RFC'])                                       
print('KNN Error | Training: ', summary.at['train', f'KNN'])
print('GPC Error | Training: ', summary.at['train', f'GPC'])                                       
print('LDA Error | Training: ', summary.at['train', f'LDA'])                                     
print()                                       
print('SVC Error | Testing: ', summary.at['test', f'SVC'])
print('RFC Error | Testing: ', summary.at['test', f'RFC'])
print('KNN Error | Testing: ', summary.at['test', f'KNN'])
print('GPC Error | Testing: ', summary.at['test', f'GPC'])
print('LDA Error | Testing: ', summary.at['test', f'LDA'])

------------------------------ MODEL OVERALL ------------------------------
SVC Error | Training:  0.009641255605381205
RFC Error | Training:  0.00022421524663673864
KNN Error | Training:  0.00022421524663673864
GPC Error | Training:  0.00022421524663673864
LDA Error | Training:  0.00022421524663673864

SVC Error | Testing:  0.11834961997828453
RFC Error | Testing:  0.053203040173724236
KNN Error | Testing:  0.2041259500542888
GPC Error | Testing:  0.21172638436482083
LDA Error | Testing:  0.21172638436482083


In [36]:
# View results in the dataframe
summary

Unnamed: 0,SVC,RFC,KNN,gpc,LDA,GPC
train,0.009641,0.000224,0.000224,0.000224,0.000224,0.000224
test,0.11835,0.053203,0.204126,0.206298,0.211726,0.211726


Among the optimized models there were many ties in the training set, but when looking at the testing set, the Random Forecast Classifier is the clear winner!

But before I move ahead to calculating the performance metrics for our winning algorithm, let's introduce one final challenger: TPOT's AutoML.

### Bonus: AutoML

Attempting to improve on the performance of the best classifier by using TPOT's AutoML tool to search for an ideal classifier pipeline.

TPOT AutoML Documentation: http://epistasislab.github.io/tpot/

In [46]:
# Split X and Y
x_train = df_train.iloc[:,:-1]
y_train= df_train.iloc[:,-1]
x_test = df_test.iloc[:,:-1]
y_test= df_test.iloc[:,-1]

In [47]:
%%time
#Instantiate and run the AutoML classifier
AutoML = TPOTClassifier(generations=5, population_size=20, cv=5, verbosity=2, n_jobs=-1)
AutoML.fit(x_train, y_train)

Version 0.11.6.post3 of tpot is outdated. Version 0.11.7 was released Wednesday January 06, 2021.


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9535326086956522

Generation 2 - Current best internal CV score: 0.9535326086956522

Generation 3 - Current best internal CV score: 0.9535326086956522

Generation 4 - Current best internal CV score: 0.9535326086956522

Generation 5 - Current best internal CV score: 0.9546195652173914

Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.5, max_depth=10, max_features=0.7000000000000001, min_samples_leaf=18, min_samples_split=7, n_estimators=100, subsample=0.8500000000000001)


TPOTClassifier(generations=5, n_jobs=-1, population_size=20, verbosity=2)

In [76]:
# And the best model is:
pipeline_optimizer.fitted_pipeline_.steps[-1][1]

GradientBoostingClassifier(learning_rate=0.5, max_depth=10,
                           max_features=0.7000000000000001, min_samples_leaf=18,
                           min_samples_split=7, subsample=0.8500000000000001)

In [78]:
# Check performance
print("AutoML Error | Training: ", 1-AutoML.score(x_train, y_train))
print("AutoML Error | Testing: ", 1-AutoML.score(x_test, y_test))

AutoML Error | Training:  0.0005434782608695343
AutoML Error | Testing:  0.0456026058631922


Looks like TPOT proved no match for our Random Forest Classifier, so let's move ahead with extracting performance metrics for our champion.

## 3. Evaluating the Results

Now the the best model has been identified, rerun a K-Fold cross-validation without using SMOTE, so that more representative FPR, FNR and Error values can be obtained

In [126]:
# Now let's run a K-Fold cross-validation with the winner model and see how it performs over each fold
kf = KFold(n_splits=5, shuffle=True)
scores_train, scores_valid, scores_test = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
for fold_num, (idx_train, idx_valid) in enumerate(kf.split(df_train), 1):
    
    # Print current label and fold
    print(f'Working on Fold: {fold_num}')

    # Select all folds to be smoted except for the validation fold
    x_train = df_train.iloc[idx_train,:-1].copy() 
    y_train = df_train.iloc[idx_train,-1].copy() 
    x_valid = df_train.iloc[idx_valid,:-1].copy() 
    y_valid = df_train.iloc[idx_valid,-1].copy() 
    x_test = df_test.iloc[:, :-1].copy()
    y_test = df_test.iloc[:, -1].copy()
    
    # Fit a Random Forest Classifier using the best parameters
    winner_model = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1,
                                   min_samples_split=round(kfold_intermediate_results.at['RFC_split', 'mean']),
                                   min_samples_leaf=round(kfold_intermediate_results.at['RFC_leaf', 'mean']))
    
    winner_model.fit(x_train, y_train)
    
    # Predict on the train and validation folds to calculate metrics (with the winnier rfc_kfold model)
    pred_train = winner_model.predict(x_train)   
    pred_valid = winner_model.predict(x_valid)
    pred_test = winner_model.predict(x_test)
    
    # Train dataset metrics
    scores_train.at[f'{fold_num}', 'Error'] = 1-accuracy_score(y_true=y_train, y_pred=pred_train)
    cm_train = confusion_matrix(y_train, pred_train)
    TN, FP, FN, TP = cm_train.ravel()
    scores_train.at[f'{fold_num}', 'FPR'] = FP/(FP+TN)
    scores_train.at[f'{fold_num}', 'FNR'] = FN/(TP+FN)
    
    # Valid dataset metrics
    scores_valid.at[f'{fold_num}', 'Error'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_valid)
    cm_valid = confusion_matrix(y_valid, pred_valid)
    TN, FP, FN, TP = cm_valid.ravel()
    scores_valid.at[f'{fold_num}', 'FPR'] = FP/(FP+TN)
    scores_valid.at[f'{fold_num}', 'FNR'] = FN/(TP+FN)
    
    # Test dataset metrics
    scores_test.at[f'{fold_num}', 'Error'] = 1-accuracy_score(y_true=y_test, y_pred=pred_test)
    cm_test = confusion_matrix(y_test, pred_test)
    TN, FP, FN, TP = cm_test.ravel()
    scores_test.at[f'{fold_num}', 'FPR'] = FP/(FP+TN)
    scores_test.at[f'{fold_num}', 'FNR'] = FN/(TP+FN)

# Get mean scores over all K-folds
scores_train.at['mean', 'Error'] = scores_train['Error'].mean()
scores_train.at['mean', 'FPR'] = scores_train['FPR'].mean()
scores_train.at['mean', 'FNR'] = scores_train['FNR'].mean()
scores_valid.at['mean', 'Error'] = scores_valid['Error'].mean()
scores_valid.at['mean', 'FPR'] = scores_valid['FPR'].mean()
scores_valid.at['mean', 'FNR'] = scores_valid['FNR'].mean()
scores_test.at['mean', 'Error'] = scores_test['Error'].mean()
scores_test.at['mean', 'FPR'] = scores_test['FPR'].mean()
scores_test.at['mean', 'FNR'] = scores_test['FNR'].mean()

Working on Fold: 1
Working on Fold: 2
Working on Fold: 3
Working on Fold: 4
Working on Fold: 5


In [127]:
scores_train

Unnamed: 0,Error,FPR,FNR
1,0.0,0.0,0.0
2,0.00034,0.0,0.00086
3,0.000679,0.0,0.001715
4,0.000679,0.000561,0.000862
5,0.00034,0.0,0.000871
mean,0.000408,0.000112,0.000862


In [128]:
scores_valid

Unnamed: 0,Error,FPR,FNR
1,0.057065,0.048998,0.069686
2,0.050272,0.024499,0.090592
3,0.039402,0.026549,0.059859
4,0.044837,0.024664,0.075862
5,0.052989,0.034562,0.07947
mean,0.048913,0.031854,0.075094


In [129]:
scores_test

Unnamed: 0,Error,FPR,FNR
1,0.052117,0.039427,0.071625
2,0.04886,0.03405,0.071625
3,0.051031,0.03405,0.077135
4,0.04886,0.030466,0.077135
5,0.052117,0.030466,0.085399
mean,0.050597,0.033692,0.076584


Note: It is possible to obtain all the metrics (error, FPR, FNR) while comparing all optimized models.

I've chosen to re-train the winning model and calculate the metrics only for that model for the sake of clarity, as I believe this apporach makes the step-by-step of the pipeline I've implemented clearer.

In [130]:
# Function to plot confusion matrix
def show_confusion_matrix(confusion_matrix):
    hmap = sns.heatmap(confusion_matrix, annot = True, fmt = "d", cmap = "Blues", square=True)
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation = 0, ha = "right")
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation = 0, ha = "right")
    plt.title('Confusion Matrix', pad=20, fontweight='bold', fontsize=15)
    plt.ylabel('True Class')
    plt.xlabel('Predicted Class')

In [132]:
# Function to plot ROC-AUC Curve
def show_roc_curve(roc_curve):
    auc_score = auc(roc_curve[0], roc_curve[1])
    plt.axis('square')
    plt.plot(roc_curve[0], roc_curve[1], color='tomato', lw=3, label='AUC: ' + str(auc_score))
    plt.plot([0, 1], [0, 1], 'k--', lw=1.5)
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve', pad=20, fontweight='bold', fontsize=15)
    legend = plt.legend('AUC Score')
    legend._legend_box.align = "right"
    plt.legend(loc="lower right")

In [131]:
show_confusion_matrix(cm_test)

AttributeError: module 'seaborn' has no attribute 'heatmap'

AttributeError: module 'seaborn' has no attribute '__version__'