# A template covering the basic aspects of an initial classification experiment on structured data

This is the collection of ideas I use for personal projects. I am happy to share it and to get feedback so that I can continually improve this workflow.

An assembly of common functions, libraries, graphs with personal configuration preferences

(regression is similar but with slight modifications to the model and preprocessing) 

In [7]:
# install packages
%%capture
!{sys.executable} -m pip install shap
!{sys.executable} -m pip install hvplot
!{sys.executable} -m pip install catboost
!{sys.executable} -m pip install imblearn


In [8]:
# Import Core Libraries
from enum import Enum
from pathlib import Path
from time import time
import sys
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 100)

# Plotting 
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns
import hvplot.pandas

# sklearn
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from imblearn.over_sampling import RandomOverSampler

# specific model and explainability
from catboost import CatBoostClassifier, Pool
import shap

# ignore warnings to improve readability
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Business Problem Definition

## Data Science Problem Definition

## Data Description

In [32]:
# load paths with Enum
class Paths(Enum):
    # Each member wraps a pathlib.Path; read the underlying path with `.value`,
    # e.g. `Paths.TRAIN_DATA.value`.
    BASE_PATH = Path()  # relative path ".", i.e. relative to the working directory
    DATA_PATH = BASE_PATH / "sample_data"
    TRAIN_DATA = DATA_PATH / "train.csv"
    TEST_DATA = DATA_PATH / "test.csv"
    SUBMISSION_FILE = DATA_PATH / "submission.csv"

# OR depending on requirements and path complexity

# load paths with pathlib
# (absolute paths anchored at the current working directory)
TRAIN_DATA =  Path.cwd() / 'sample_data' / 'train.csv'

TEST_DATA =  Path.cwd() / 'sample_data' / 'test.csv'

In [34]:
# Loading Train data
# If the file contains date columns, add e.g. parse_dates=['date_column'] to read_csv.
# NOTE(review): the column names displayed below ('6', '0', '0.1', ...) suggest the sample
# file may have no header row - if so, pass header=None. Confirm against the data.
train_df = pd.read_csv(TRAIN_DATA)

# OR

# train_df = pd.read_csv(Paths.TRAIN_DATA.value)#, parse_dates = ['date_column'])
print(train_df.shape)
train_df.head()

(19999, 785)


Unnamed: 0,6,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.10,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.20,0.21,0.22,0.23,0.24,0.25,0.26,0.27,0.28,0.29,0.30,0.31,0.32,0.33,0.34,0.35,0.36,0.37,0.38,0.39,0.40,0.41,0.42,0.43,0.44,0.45,0.46,0.47,0.48,...,0.541,0.542,0.543,0.544,0.545,0.546,0.547,0.548,0.549,0.550,0.551,0.552,0.553,0.554,0.555,0.556,0.557,0.558,0.559,0.560,0.561,0.562,0.563,0.564,0.565,0.566,0.567,0.568,0.569,0.570,0.571,0.572,0.573,0.574,0.575,0.576,0.577,0.578,0.579,0.580,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589,0.590
0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,91,225,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## 1. Data Wrangling & Visualization

In [35]:
# list the column labels of the training frame
train_df.columns

Index(['6', '0', '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8',
       ...
       '0.581', '0.582', '0.583', '0.584', '0.585', '0.586', '0.587', '0.588',
       '0.589', '0.590'],
      dtype='object', length=785)

In [36]:
# summary statistics, including upper-tail percentiles as a first look at extreme values
train_df.describe(percentiles = [0.05,0.5,0.95,0.975,0.99])

Unnamed: 0,6,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.10,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.20,0.21,0.22,0.23,0.24,0.25,0.26,0.27,0.28,0.29,0.30,0.31,0.32,0.33,0.34,0.35,0.36,0.37,0.38,0.39,0.40,0.41,0.42,0.43,0.44,0.45,0.46,0.47,0.48,...,0.541,0.542,0.543,0.544,0.545,0.546,0.547,0.548,0.549,0.550,0.551,0.552,0.553,0.554,0.555,0.556,0.557,0.558,0.559,0.560,0.561,0.562,0.563,0.564,0.565,0.566,0.567,0.568,0.569,0.570,0.571,0.572,0.573,0.574,0.575,0.576,0.577,0.578,0.579,0.580,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589,0.590
count,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,...,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0
mean,4.470124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0005,0.010801,0.010801,0.00045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012401,0.028451,0.058303,0.065503,0.127556,0.19236,0.241562,0.19466,0.20941,0.19816,0.157658,0.173959,0.165008,0.066853,0.030752,...,0.546877,1.19221,2.300515,3.504675,4.80004,6.103155,6.752638,6.39627,5.458473,4.561978,3.714036,2.627231,1.718486,1.020101,0.553228,0.247412,0.097755,0.020751,0.0014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00035,0.010651,0.036052,0.088304,0.112306,0.158508,0.276914,0.40607,0.546827,0.572079,0.696235,0.671684,0.545927,0.366318,0.215011,0.087704,0.036502,0.013651,0.032602,0.006,0.0,0.0,0.0,0.0
std,2.892807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070712,1.527389,1.527389,0.063641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.233369,1.986589,3.473328,3.100786,5.003077,5.989394,6.812301,5.880979,6.453814,6.029114,5.134903,5.875138,5.74611,3.149948,2.531629,...,10.194395,15.017723,21.405696,26.297483,30.406274,34.319075,36.34226,35.244748,31.964435,29.752297,26.815104,22.334578,18.262801,14.000786,10.463422,6.750766,4.079112,1.322117,0.197995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049499,1.095106,2.22082,3.900144,4.749952,5.406774,7.0053,8.719149,10.379141,10.254843,11.457391,11.297264,10.05733,8.255546,6.314821,3.921664,2.712527,0.950818,2.718102,0.600333,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95%,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97.5%,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,59.0,117.05,137.0,131.0,99.0,45.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99%,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,104.06,189.02,221.0,240.0,251.0,245.0,222.0,220.0,188.02,131.06,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,216.0,216.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,132.0,212.0,253.0,230.0,255.0,255.0,255.0,255.0,255.0,255.0,253.0,254.0,255.0,248.0,229.0,...,255.0,254.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,254.0,255.0,135.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,132.0,231.0,253.0,253.0,253.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,253.0,254.0,253.0,79.0,254.0,62.0,0.0,0.0,0.0,0.0


In [37]:
# number of missing values per column
train_df.isna().sum()

6        0
0        0
0.1      0
0.2      0
0.3      0
        ..
0.586    0
0.587    0
0.588    0
0.589    0
0.590    0
Length: 785, dtype: int64

 - graph for outliers - density histogram and boxplots, grubbs test  
 (train_data.describe)  
 - missing values  
 - skew and kurtosis  
 ()  
 - Highlight:   
    -  Features to exclude  
    - Qualitative Features  
    - Quantitative Features  

## Define the datatypes


### Qualitative/Categorical Data  
**nominal:** (variable labels, no meaningful order, no quantised value, e.g. hair colour, marital status)  
**ordinal:** (ordered by position on a scale, cannot do arithmetic, i.e.. customer satisfaction, education level)  


remove outliers, impute missing data  



### Quantitative/Scalar Data  
**discrete:** distinct, separate, integer/whole number (number of days since an event, number of items, numbers with fixed data values determined by counting)

**continuous:** fractional numbers, temperature, weight, width, speed, software version, height. complex numbers and fluctuating data, measured over a defined timeframe

bar graph, number line, freq table  



PLOTTING REFERENCE: https://seaborn.pydata.org/tutorial/categorical.html

In [38]:
# helps determine which categoricals are suitable for One-Hot Encoding
# (number of distinct values per column)
train_df.nunique()

6        10
0         1
0.1       1
0.2       1
0.3       1
         ..
0.586     3
0.587     1
0.588     1
0.589     1
0.590     1
Length: 785, dtype: int64

In [None]:
# target
# name of the column to predict
TARGET = 'Target Variable'

# non-feature columns
# a single column name (str), unlike the list-valued constants below
TO_DROP ='observation unique identifier'

# datetime features

DATETIME_COLUMNS = ['datetime_column']

# categorical features

ORDINAL_FEATURES = ['age_bracket','education_attained']

NOMINAL_FEATURES = ['job_type']

BINARY_FEATURES = ['has_loan','has_car']

# scalar features

CONTINUOUS_FEATURES = ['contact_duration']

DISCRETE_FEATURES = ['num_previous_contacts','days_since_last_contact']




# combined groups; note that BINARY_FEATURES is not part of CATEGORICAL_FEATURES
CATEGORICAL_FEATURES = ORDINAL_FEATURES + NOMINAL_FEATURES

SCALAR_FEATURES = CONTINUOUS_FEATURES + DISCRETE_FEATURES



In [None]:
# define graphing functions

def prepare_ordinal_features_for_graphing(dataframe):
    '''
    Order the values so that ordinal categories make sense graphically.

    Returns a copy of `dataframe` sorted by 'comparison_variable_name' with two
    derived columns: "rank" (category label -> position) and "Target Variable"
    (-1/0/1 -> 'failure'/'unknown'/'success'). Values missing from a lookup
    become NaN. The input frame is not modified.
    '''
    # ordinal ranked categories
    rank_by_category = {'category one':1, 'category two':2, 'category three':3}
    # binary with unknowns
    outcome_by_code = {-1:'failure',0:'unknown',1:'success'}

    ordered_df = dataframe.sort_values(['comparison_variable_name'])
    comparison_values = ordered_df["comparison_variable_name"]
    ordered_df["rank"] = comparison_values.map(rank_by_category)
    ordered_df["Target Variable"] = comparison_values.map(outcome_by_code)
    return ordered_df

def plot_categorical_var_against_target(dataframe, feature = 'Comparison Variable Name'):
    '''
    Count plot of one categorical feature, split by the 'Target Variable' column.

    Args:
        dataframe: frame passed through prepare_ordinal_features_for_graphing first
        feature: name of the column shown on the x axis
    '''
    dataframe = prepare_ordinal_features_for_graphing(dataframe)
    fig, axs = plt.subplots(figsize=(30, 20))
    # Draw explicitly on the axes created above. Tick labels are derived from the
    # categories by seaborn, so they are not set by hand from the raw rows.
    sns.countplot(x=feature, data=dataframe, hue='Target Variable', palette='Blues', ax=axs)
    axs.set_title("Target Variable Outcome Count Plot", fontsize=55)
    axs.set_xlabel("Comparison Input Variable Outcome", fontsize=40)
    axs.set_ylabel("Comparison Variable Name", fontsize=40)
    axs.tick_params(labelsize=35)
    # There is no legend when the hue column holds no plottable values
    legend = axs.get_legend()
    if legend is not None:
        plt.setp(legend.get_texts(), fontsize='35') # for legend text
        plt.setp(legend.get_title(), fontsize='40')
    plt.show()

def multi_graph_plot(dataframe, categorical_features):
    '''
    Draw one count plot per categorical feature on a grid three panels wide.

    The number of rows grows with the number of features (nine features give the
    3x3 layout), and panels left over in the last row are hidden.

    Args:
        dataframe: frame holding the feature columns
        categorical_features: column names to plot; nothing is drawn when empty
    '''
    n_features = len(categorical_features)
    if n_features == 0:
        return
    n_cols = 3
    n_rows = -(-n_features // n_cols)  # ceiling division
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows),
                            facecolor='w', edgecolor='k', squeeze=False)
    axs = axs.ravel()
    for idx, feature in enumerate(categorical_features):
        ax = sns.countplot(x=feature, data=dataframe, ax=axs[idx],palette='Blues')
        ax.set_title(f"{feature} count")
    for unused_ax in axs[n_features:]:
        unused_ax.set_visible(False)
    fig.suptitle("Categorical Feature Counts")
    # layout is adjusted once, after every panel has been drawn
    fig.tight_layout()
    fig.subplots_adjust(hspace=0.2, wspace=0.15, top=0.92)
    plt.show()


def plot_target_value_counts(dataframe, feature='Target Variable'):
    '''
    Count plot of the target column.

    Args:
        dataframe: frame holding the target column
        feature: name of the column to count (defaults to 'Target Variable')
    '''
    fig, axs = plt.subplots(figsize=(30, 20))
    # seaborn derives the tick labels from the categories it counts
    sns.countplot(x=feature, data=dataframe, palette='Blues', ax=axs)
    axs.set_title("Target Variable Count Plot", fontsize=55)
    axs.set_xlabel("Target Variable Outcome", fontsize=40)
    axs.set_ylabel("Target Variable Count", fontsize=40)
    axs.tick_params(labelsize=35)
    plt.show()

## 2. Data preprocessing

### 2.1 Feature Engineer Categorical Variables

### Regarding decision trees, random forest and boosted decision trees, different preprocessing may be desirable  


The modularity of the preprocessing allows us to select which feature engineering processes to apply per model. This is important because train, test and validation data should have the same preprocessing applied (except for class imbalance correction).  

The modules can be called separately, multiple times and independently on the raw data - allowing multiple stable iterations to isolate specific feature engineering modules or models to hone performance.

In [None]:
def encode_categorical_features(dataframe: pd.DataFrame, ordinal_features: list,binary_features: list,nominal_features:list) -> tuple[pd.DataFrame,dict]:
    '''
    Run the three categorical encoders in sequence, each on the previous result.

    Args:
        dataframe: frame holding the raw categorical columns
        ordinal_features: columns to replace with integer category codes
        binary_features: columns to map to -1 / 0 / 1
        nominal_features: columns to one-hot encode
    Returns:
        (encoded dataframe, dict mapping each ordinal feature to its
        {code: original value} lookup)
    '''
    # ordinal encoding (the helper returns the frame AND the code lookup)
    encoded_ordinal_features, categorical_features_map = encode_ordinal_features(dataframe, ordinal_features)
    # Encode binary variables
    encoded_binary_features = encode_binary_features(encoded_ordinal_features, binary_features)
    # One-Hot Encoding for nominal features, applied to the already encoded frame
    encoded_nominal_features = encode_nominal_features(encoded_binary_features, nominal_features)
    return encoded_nominal_features, categorical_features_map

In [None]:
def encode_ordinal_features(dataframe, ordinal_features):
    '''
    Categorical encoding of Ordinal Features

    Works on a copy, so the caller's frame keeps its original labels. Encoding in
    place would make a second run encode the codes themselves and lose the labels.

    Args:
        dataframe: frame holding the ordinal columns
        ordinal_features: names of the columns to encode
    Returns:
        Dataframe with categorical encodings
        dictionary linking encodings with original values
    '''
    encoded_df = dataframe.copy()
    categorical_features_map = {}
    for feature in ordinal_features:
        # convert once; category order is the sorted order of the distinct values
        as_category = encoded_df[feature].astype('category')
        categorical_features_map[feature] = dict(enumerate(as_category.cat.categories))
        encoded_df[feature] = as_category.cat.codes
    return encoded_df, categorical_features_map

In [None]:
def encode_binary_features(dataframe, binary_features):
    '''
    Categorical encoding of Binary Features
    A special case where -1 and 1 will indicate the binary encoding
    values indicating missing data can be encoded as 0

    Only the listed columns are touched, and values that are not in the lookup
    are left as they are. The caller's frame is not modified.

    Args:
        dataframe: frame holding the binary columns
        binary_features: names of the columns to encode
    Returns:
        Dataframe with categorical binary encodings
    '''
    features = {'failure': -1,'no': -1, 'nonexistent':0,'unknown':  0, 'success': 1,'yes': 1}
    # nested mapping {column: {old: new}} restricts the replacement to each column
    per_column_mapping = {col: features for col in binary_features}
    if not per_column_mapping:
        return dataframe.copy()
    return dataframe.replace(per_column_mapping)

In [None]:
def encode_nominal_features(dataframe, nominal_features):
    '''
    Apply OHE to selected nominal features (lower cardinality nominal features)

    Higher cardinality nominal features may be better served by categorical or
    mean target encoding; that is not implemented here.

    Args:
        dataframe: frame holding the nominal columns
        nominal_features: column name or list of column names to one-hot encode
    Returns:
        Dataframe with nominal encodings (each encoded column is replaced by
        integer indicator columns named "<column>_<value>")
    '''
    print(f"OHE for feature: {nominal_features}")
    if isinstance(nominal_features, str):
        nominal_features = [nominal_features]
    columns_to_encode = list(nominal_features)
    if not columns_to_encode:
        return dataframe.copy()
    # get_dummies takes the frame as `data` and a flat list of column names
    encoded_nominal_data = pd.get_dummies(data=dataframe, columns=columns_to_encode, dtype=int)
    return encoded_nominal_data

In [None]:
# encode the training frame and unpack the (DataFrame, dict) pair the function is annotated to return
encoded_categoricals, cat_feature_map = encode_categorical_features(train_df,ORDINAL_FEATURES,BINARY_FEATURES, NOMINAL_FEATURES)
encoded_categoricals

### 2.2 Feature Engineer Scalar Variables


In [None]:
# one histogram per continuous feature
train_df[CONTINUOUS_FEATURES].hist(figsize=(10,10), bins=15)
plt.show()

In [None]:
def density_boxplot(dataframe, feature):
    '''
    Box plot stacked on top of a histogram of the same feature, sharing the x axis.

    Args:
        dataframe: frame holding the feature column
        feature: name of the column to plot
    '''
    sns.set(style="darkgrid")
    # creating a figure composed of two matplotlib.Axes objects (ax_box and ax_hist)
    f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
    # assigning a graph to each ax
    # `x=` keeps the box horizontal so that it lines up with the shared x axis
    sns.boxplot(x=dataframe[feature], ax=ax_box)
    sns.histplot(data=dataframe, x=feature, ax=ax_hist)
    # Remove x axis name for the boxplot
    ax_box.set(xlabel='')
    plt.show()

Outliers



* It is worth noting again though that, if using an algorithm that can deal with outliers (e.g. GBDT, Decision Trees, Random Forests, etc.) then this step isn't completely necessary, however with outliers removed the feature search space is simplified and model complexity is reduced

In [None]:
# distribution of every continuous feature (histogram plus density estimate)
for feature in CONTINUOUS_FEATURES:
    sns.histplot(data=train_df, x=feature, stat="density", kde=True)
    plt.title(f"{feature} distribution")
    plt.show()

In [None]:
def deal_with_outliers(dataframe,CONTINUOUS_FEATURES):
    '''
    Blank out values outside the 2.5% - 97.5% quantile band of each continuous
    feature and fill the resulting gaps with the feature's median.

    Args:
        dataframe: frame holding the continuous columns (not modified)
        CONTINUOUS_FEATURES: names of the columns to treat; may be empty
    Returns:
        A new dataframe with the listed columns cleaned
    '''
    UPPER_QUANTILE_VALUE = 0.975
    LOWER_QUANTILE_VALUE = 0.025
    # start from a copy so the result is defined even for an empty feature list
    cleaned_df = dataframe.copy()
    for feature in CONTINUOUS_FEATURES:
        cleaned_df = remove_upper_outliers(cleaned_df, feature, UPPER_QUANTILE_VALUE)
        cleaned_df = remove_lower_outliers(cleaned_df, feature, LOWER_QUANTILE_VALUE)
        # Fill NaN values with Median value
        cleaned_df[feature] = cleaned_df[feature].fillna(cleaned_df[feature].median())
        # Introduce tests along the way to make sure we maintain data integrity as we data wrangle
        assert cleaned_df[feature].isna().sum() == 0, f"{feature} still contains missing values"
    return cleaned_df


In [None]:
def remove_upper_outliers(dataframe, feature, UPPER_QUANTILE_VALUE):
    '''
    Replace values above the given quantile of `feature` with NaN.

    Args:
        dataframe: frame holding the column (not modified)
        feature: name of the column to trim
        UPPER_QUANTILE_VALUE: quantile in [0, 1] used as the upper cutoff
    Returns:
        A new dataframe in which values above the cutoff are NaN
    '''
    UPPER_CUTOFF_VALUE = dataframe[feature].quantile(UPPER_QUANTILE_VALUE)
    trimmed_df = dataframe.copy()
    # mask() yields NaN where the condition holds and handles integer columns
    trimmed_df[feature] = trimmed_df[feature].mask(trimmed_df[feature] > UPPER_CUTOFF_VALUE)
    return trimmed_df

In [None]:
def remove_lower_outliers(dataframe, feature, LOWER_QUANTILE_VALUE):
    '''
    Replace values below the given quantile of `feature` with NaN.

    Args:
        dataframe: frame holding the column (not modified)
        feature: name of the column to trim
        LOWER_QUANTILE_VALUE: quantile in [0, 1] used as the lower cutoff
    Returns:
        A new dataframe in which values below the cutoff are NaN
    '''
    LOWER_CUTOFF_VALUE = dataframe[feature].quantile(LOWER_QUANTILE_VALUE)
    trimmed_df = dataframe.copy()
    # lower outliers are the values BELOW the cutoff
    trimmed_df[feature] = trimmed_df[feature].mask(trimmed_df[feature] < LOWER_CUTOFF_VALUE)
    return trimmed_df

In [None]:
# distribution of every continuous feature; to see the effect of the outlier treatment,
# plot the frame returned by deal_with_outliers instead of train_df
for feature in CONTINUOUS_FEATURES:
    sns.histplot(data=train_df, x=feature, stat="density", kde=True)
    plt.title(f"{feature} distribution")
    plt.show()

In [None]:
def impute_missing_data(dataframe, max_missing_fraction=0.2):
    '''
    Drop sparse columns and rows, then fill the remaining gaps.

    Steps, in order:
      1. drop columns whose share of missing values exceeds `max_missing_fraction`
      2. drop rows whose share of missing values (over the remaining columns)
         exceeds `max_missing_fraction`
      3. fill what is left: median for numeric columns, most frequent value otherwise

    Args:
        dataframe: frame to clean (not modified)
        max_missing_fraction: highest tolerated share of missing values
    Returns:
        A new dataframe; the original index labels of the kept rows are preserved
    '''
    if dataframe.empty:
        return dataframe.copy()

    column_missing_share = dataframe.isna().mean(axis=0)
    kept = dataframe.loc[:, ~(column_missing_share > max_missing_fraction)]

    row_missing_share = kept.isna().mean(axis=1)
    kept = kept.loc[~(row_missing_share > max_missing_fraction)].copy()

    for feature in kept.columns:
        if not kept[feature].isna().any():
            continue
        if pd.api.types.is_numeric_dtype(kept[feature]):
            fill_value = kept[feature].median()
        else:
            # the median is undefined for labels, so fall back to the most frequent value
            most_frequent = kept[feature].mode(dropna=True)
            if most_frequent.empty:
                continue
            fill_value = most_frequent.iloc[0]
        kept[feature] = kept[feature].fillna(fill_value)
    return kept


## 2.3 Datetime Feature Engineering

In [None]:
def get_day_of_the_week(date_timestamp):
    """Return the weekday index of the passed timestamp via its `.weekday()` method.

    Objects without a usable `.weekday()` (an AttributeError is raised) yield NaN.
    """
    try:
        weekday_index = date_timestamp.weekday()
    except AttributeError:
        return np.nan
    return weekday_index

In [None]:
# Parse every datetime column; pd.to_datetime converts one column (Series) at a time
for datetime_column in DATETIME_COLUMNS:
    train_df[datetime_column] = pd.to_datetime(train_df[datetime_column], format="%Y-%m-%dT%H:%M:%S")

# Helper columns used by the exploratory plots below, derived from the first datetime column
train_df["day_num"] = train_df[DATETIME_COLUMNS[0]].apply(get_day_of_the_week)
train_df["is_weekday"] = (train_df["day_num"] < 5).astype(float).where(train_df["day_num"].notna())
train_df[DATETIME_COLUMNS].head()

def process_datetime_feature(train_df,DATETIME_COLUMNS):
    '''
    Add an "is_weekday" flag derived from the first datetime column.

    Args:
        train_df: frame holding the datetime column (not modified)
        DATETIME_COLUMNS: column name, or list of names of which the first is used
    Returns:
        A new dataframe with "is_weekday" set to 1 for Monday-Friday, 0 for
        Saturday/Sunday and NaN where the timestamp is missing or unparseable.
        No "day_num" helper column is left in the result.
    '''
    dataframe = train_df.copy()
    datetime_column = DATETIME_COLUMNS if isinstance(DATETIME_COLUMNS, str) else DATETIME_COLUMNS[0]
    # weekday index: Monday=0 ... Sunday=6, NaN for missing timestamps
    day_num = pd.to_datetime(dataframe[datetime_column], errors="coerce").dt.weekday
    dataframe["is_weekday"] = (day_num < 5).astype(float).where(day_num.notna())
    return dataframe.drop(columns=["day_num"], errors="ignore")


In [None]:
# one bin per weekday index (0=mon ... 6=sun), centred on the tick positions
train_df["day_num"].hist(bins=np.arange(8) - 0.5)
plt.xticks(range(7), ["mon", "tues", "wed", "thurs", "fri", "sat", "sun"])
plt.xlabel('Days of the week', fontsize=10)
plt.ylabel('Count', fontsize=10)
plt.title("Day index count")
plt.show()

Which day of the week is most associated with the target and features

In [None]:
# share of observations on weekdays vs weekend days
# (the wedge colours belong to the plot call; plt.ylabel does not accept `colors`)
train_df.groupby(['is_weekday'])['is_weekday'].count().plot(kind='pie',
  ylim=0, title= 'Calls made on weekdays/working days', autopct='%1.1f%%',
  colors=["#68BBE3", "#0E86D4"])
plt.ylabel('Number of weekdays', fontsize = 10);
plt.show()


In [None]:
# Target for Weekdays/Work days
# Pie chart of the TARGET counts restricted to rows where is_weekday == 1.
# NOTE(review): `labels` are applied positionally to the groupby result, which is ordered
# by TARGET value - confirm "Positive Outcome" really matches the first (smallest) value.
fig, axs = plt.subplots(figsize=(10, 10))
train_df[train_df['is_weekday'] == 1].groupby([TARGET])[TARGET].count().plot(kind='pie',  
  ylim=0, autopct='%1.1f%%', labels=["Positive Outcome", "Negative Outcome"], fontsize=32, colors=["#055C9D", "#003060"])
plt.ylabel('');
plt.subplots_adjust(hspace=0, wspace=0.45, top=0.92)
plt.show()

In [None]:
# Target for Weekend days
# Pie chart of the TARGET counts restricted to rows where is_weekday == 0.
# NOTE(review): `labels` are applied positionally to the groupby result, which is ordered
# by TARGET value - confirm "Positive Outcome" really matches the first (smallest) value.
fig, axs = plt.subplots(figsize=(10, 10))
train_df[train_df['is_weekday'] == 0].groupby([TARGET])[TARGET].count().plot(kind='pie',  
  ylim=0, autopct='%1.1f%%', labels=["Positive Outcome", "Negative Outcome"], fontsize=32, colors=["#055C9D", "#003060"])
plt.ylabel('');
plt.subplots_adjust(hspace=0, wspace=0.45, top=0.92)
plt.show()

In [None]:
# add categorical variables as they are created
# (guarded so that re-running the cell does not append a duplicate entry)
if 'is_weekday' not in CATEGORICAL_FEATURES:
    CATEGORICAL_FEATURES.append('is_weekday')

In [None]:
def preprocess_data(dataframe_list: list, TO_DROP, ORDINAL_FEATURES,BINARY_FEATURES,NOMINAL_FEATURES,CONTINUOUS_FEATURES, DATETIME_COLUMNS):
    '''
    Combine all preprocessing steps once the
    basic workflow is set
    Can be model/stage of development specific by modifying this
    preprocess orchestration function

    Each frame goes through: imputation -> categorical encoding -> outlier
    treatment of the continuous features -> datetime feature engineering.

    Args:
        dataframe_list: frames to process (e.g. [train, test])
        TO_DROP: accepted for interface symmetry; not used inside this function
        ORDINAL_FEATURES, BINARY_FEATURES, NOMINAL_FEATURES: columns for the encoders
        CONTINUOUS_FEATURES: columns passed to the outlier treatment
        DATETIME_COLUMNS: columns passed to the datetime feature engineering
    Returns:
        List of processed dataframes, in the same order as `dataframe_list`
    '''
    # NOTE(review): every frame is encoded on its own, so category codes and one-hot
    # columns follow the values present in that frame - confirm train and test agree.
    processed_dataframe_list = []
    for dataframe in dataframe_list:
        imputed_data_df = impute_missing_data(dataframe)
        # the encoder returns (dataframe, code lookup); only the frame is carried on
        encoded_categorical_features, _ = encode_categorical_features(imputed_data_df,ORDINAL_FEATURES,BINARY_FEATURES,NOMINAL_FEATURES)
        continuous_corrected_df = deal_with_outliers(encoded_categorical_features,CONTINUOUS_FEATURES)
        processed_datetime_feature = process_datetime_feature(continuous_corrected_df,DATETIME_COLUMNS)
        processed_dataframe_list.append(processed_datetime_feature)
    return processed_dataframe_list



### 2.5 Correlation analysis

In [None]:
# pairwise correlations between the scalar features and the target
corr_values = train_df[SCALAR_FEATURES + [TARGET]].corr()
# hide the upper triangle (including the diagonal): it mirrors the lower one
mask = np.zeros_like(corr_values, dtype=bool)
mask[np.triu_indices_from(mask)] = True
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(corr_values, mask=mask, ax=ax, cmap="BuPu", annot=True)
plt.show()

In [None]:
# slices of the continuous features; only feature_subset_1 is plotted in this cell
feature_subset_1 = CONTINUOUS_FEATURES[:3]
feature_subset_2 = CONTINUOUS_FEATURES[:8]

# pairwise scatter plots of the first subset together with the target
sns.pairplot(train_df[feature_subset_1 + [TARGET]])

## 3 Modelling

In [None]:
CV_NUM_FOLDS = 5         # number of cross-validation folds
TRAIN_TEST_SPLIT = 0.15  # share of the data held out as test set
RANDOM_SEED=42

# set raw dataframes for modelling
# test_df is not created by any of the cells above, so it is loaded here
test_df = pd.read_csv(TEST_DATA)
dataframe_list = [train_df, test_df]

### Utility Functions

In [None]:
#  for k-folds plot
def moving_average(a, n=3) :
    """Simple moving average of `a` over a window of `n` values.

    Args:
        a: 1-D sequence of numbers
        n: window length, a positive integer
    Returns:
        Array of length len(a) - n + 1 (empty when `a` is shorter than `n`)
    Raises:
        ValueError: if `n` is smaller than 1
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    ret = np.cumsum(a, dtype=float)
    # difference of running sums n apart gives the sum of each window
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n

def get_precision_recall_scores(y_test, y_pred):
    """Print the precision and recall of `y_pred` against `y_test`, to two decimals."""
    print(f"Precision score: {precision_score(y_test, y_pred):.2f}")
    print(f"Recall score: {recall_score(y_test, y_pred):.2f}")

def plot_average_kfolds_learning_curves(gridsearch_results_cv):
    """Plot the smoothed mean test and train score across folds.

    For every parameter candidate the score is averaged over all
    `split<i>_test_score` / `split<i>_train_score` entries found in `cv_results_`
    (however many folds were used), then smoothed with a moving average along the
    sequence of candidates.

    Args:
        gridsearch_results_cv: fitted search object exposing `cv_results_`;
            it must have been run with return_train_score=True
    Raises:
        KeyError: if no per-split test or train scores are present
    """
    cv_results = gridsearch_results_cv.cv_results_

    def _mean_over_splits(kind):
        # collect split0_*, split1_*, ... until the next split key is missing
        split_scores = []
        split_idx = 0
        while f'split{split_idx}_{kind}_score' in cv_results:
            split_scores.append(cv_results[f'split{split_idx}_{kind}_score'])
            split_idx += 1
        if not split_scores:
            raise KeyError(f"split0_{kind}_score")
        return np.mean(np.column_stack(split_scores), axis=1)

    kfolds_test_score_mean = _mean_over_splits('test')
    kfolds_train_score_mean = _mean_over_splits('train')

    # 15-candidate window, shrunk for small grids so that the curve is not empty
    window = min(15, max(1, len(kfolds_test_score_mean) // 2))
    plt.plot(moving_average(kfolds_test_score_mean, n=window), label='test')
    plt.plot(moving_average(kfolds_train_score_mean, n=window), label='train')
    plt.legend(loc='best')
    plt.title("Test vs Train Learning Curves for K-Folds CV")
    plt.show()


def create_confusion_matrix(y_test, y_pred, figsize=(15,10)):
    """Plot an annotated confusion matrix for the passed y_test and y_pred arrays.

    Each cell shows its count and its share of all observations. For a 2x2 matrix
    the cells are also named (True Negatives, ...) and the ticks read False/True;
    for any other size the ticks show the sorted class labels.

    Args:
        y_test: true labels
        y_pred: predicted labels
        figsize: size of the matplotlib figure
    """
    cf_matrix = confusion_matrix(y_test, y_pred)
    is_binary = cf_matrix.shape == (2, 2)
    total = np.sum(cf_matrix)
    group_counts = ["{0:0.0f}".format(value) for value in
                    cf_matrix.flatten()]
    group_percentages = ["{0:.2%}".format(value / total if total else 0.0) for value in
                         cf_matrix.flatten()]
    if is_binary:
        group_names = ['True Negatives','False Positives','False Negatives','True Positives']
        labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
                  zip(group_names,group_counts,group_percentages)]
    else:
        labels = [f"{v2}\n{v3}" for v2, v3 in
                  zip(group_counts,group_percentages)]

    labels = np.asarray(labels).reshape(cf_matrix.shape)
    plt.figure(figsize=figsize)
    ax = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues',annot_kws={"size":25})

    ax.set_title('##Model## Confusion Matrix\n\n', size=45);
    ax.set_xlabel('\nPredicted Values', size=30)
    ax.set_ylabel('Actual Values ', size=30);
    if is_binary:
        ax.xaxis.set_ticklabels(['False','True'],size=30)
        ax.yaxis.set_ticklabels(['False','True'],size=30)
    else:
        # confusion_matrix orders rows/columns by the sorted labels seen in either array
        class_labels = np.unique(np.concatenate((np.asarray(y_test).ravel(), np.asarray(y_pred).ravel())))
        ax.xaxis.set_ticklabels(class_labels, size=30)
        ax.yaxis.set_ticklabels(class_labels, size=30)
    plt.show()

In [None]:
# TO_DROP may be a single column name (str) or a list of names; normalise it to a list
# before concatenating, because str + list raises a TypeError
COMPLETE_TO_DROP = ([TO_DROP] if isinstance(TO_DROP, str) else list(TO_DROP)) + DATETIME_COLUMNS + NOMINAL_FEATURES
# nominal features have OHE encodings and are unnecessary
# datetime has had an engineered feature column created 

In [None]:
def prep_datasets(dataframe,TARGET,COMPLETE_TO_DROP):
    '''
    Split a preprocessed frame into train/test features and targets, plus a
    class-balanced version of the training split.

    Args:
        dataframe: preprocessed frame holding the features and the target
        TARGET: name of the target column
        COMPLETE_TO_DROP: column name or iterable of column names to leave out
    Returns:
        x_train, x_test, y_train, y_test, x_train_balanced, y_train_balanced
    '''
    columns_to_drop = {COMPLETE_TO_DROP} if isinstance(COMPLETE_TO_DROP, str) else set(COMPLETE_TO_DROP)
    excluded_columns = columns_to_drop | {TARGET}

    # first let's make sure we remove any columns or rows before starting any processing
    no_missing_data = impute_missing_data(dataframe)

    # Features are picked AFTER imputation and in the frame's own column order,
    # so the feature order is reproducible between runs.
    model_features = [column for column in no_missing_data.columns if column not in excluded_columns]

    x_all = no_missing_data[model_features]
    y_all = no_missing_data[TARGET]
    # train test split
    x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=TRAIN_TEST_SPLIT, shuffle=True, random_state=RANDOM_SEED)
    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

    # balance classes (training split only, so the test split keeps its class ratio)
    oversample = RandomOverSampler(sampling_strategy='minority', random_state=RANDOM_SEED)
    x_train_balanced, y_train_balanced = oversample.fit_resample(x_train, y_train)
    print(f"Before imbalance correction: {x_train.shape, y_train.shape}")
    print(f"After imbalance correction: {x_train_balanced.shape, y_train_balanced.shape}")
    return x_train, x_test, y_train, y_test,x_train_balanced, y_train_balanced



In [None]:

# run the preprocessing on [train, test], then split the processed training frame
preprocessed_train_data,preprocessed_test_data = preprocess_data(dataframe_list, TO_DROP, ORDINAL_FEATURES,BINARY_FEATURES,NOMINAL_FEATURES,CONTINUOUS_FEATURES, DATETIME_COLUMNS)
x_train, x_test, y_train, y_test,x_train_balanced, y_train_balanced = prep_datasets(preprocessed_train_data,TARGET,COMPLETE_TO_DROP)




---



---



## Model comparisons

In [None]:
# COLLECT THE METRICS OF
# INTEREST FOR COMPARISON
# each dict is keyed by model name, e.g. 'RandomForest'

top_accuracy = {}
top_f1 = {}
train_time = {}
# (sic) spelled "interence" here; the same name is used where inference time is recorded
interence_time = {}


### Random Forest

In [None]:
CV_NUM_FOLDS = 5

# constructor-style parameters (not passed to the grid search below)
rf_params = {
#     'n_samples':[1000], 
#     "n_features": [100],
#     "n_informative":[2],
#     "class_sep":[0.5],
    "random_state":[RANDOM_SEED]
}

# grid explored by GridSearchCV
rf_params_gs = {
    'n_estimators':[10, 20, 50], # , 20, 50 
    # 'auto' was an alias of 'sqrt' for classifiers and is no longer accepted by
    # current scikit-learn releases, so it is left out of the grid
    "max_features": ['sqrt', 'log2'],
    "criterion":['gini', 'entropy'],
    "max_depth":[2, 5, 10]
}

In [None]:
rf_model = RandomForestClassifier(random_state=RANDOM_SEED)

start = time()
gridsearch_rf = GridSearchCV(rf_model, rf_params_gs, cv=CV_NUM_FOLDS, scoring="f1", return_train_score=True, refit=True)
gridsearch_rf.fit(x_train, y_train)
# baseline: cross-validated F1 of the untuned model on the same training split
score = cross_val_score(rf_model, x_train, y_train, scoring='f1', cv=CV_NUM_FOLDS, n_jobs=-1)
# the recorded time covers both the grid search and the baseline cross-validation
train_time['RandomForest'] = np.round(time() - start, 3)

In [None]:
# averaged train/test scores of the random forest grid search
plot_average_kfolds_learning_curves(gridsearch_rf)

In [None]:
# mean cross-validated score of the best parameter combination
gridsearch_rf.best_score_

In [None]:
# model refit with the best parameters (the search was created with refit=True)
best_estimator_rf = gridsearch_rf.best_estimator_

In [None]:
# time the prediction on the held-out test split
start = time()

y_pred = best_estimator_rf.predict(x_test)
interence_time['RandomForest'] = np.round(time() - start, 3)

In [None]:
# keep the best cross-validation score of the random forest search
top_score = gridsearch_rf.best_score_

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order
top_f1['RandomForest'] = f1_score(y_test, best_estimator_rf.predict(x_test))

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top
from sklearn.metrics import accuracy_score
# accuracy of the best random forest on the held-out test split
top_accuracy['RandomForest'] = accuracy_score(y_test, best_estimator_rf.predict(x_test))

In [None]:
# A ROC curve needs continuous scores: hard 0/1 predictions give a single threshold only.
# Column 1 of predict_proba is the probability of the second entry of `classes_`.
y_score = best_estimator_rf.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
# draw on an explicit axes so that no empty extra figure is created
fig, ax = plt.subplots(figsize=(8,10))
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax)

In [None]:
# one row per parameter combination tried by the grid search
df = pd.DataFrame(gridsearch_rf.cv_results_)
# score columns aggregated for the validation curves
results = ['mean_test_score',
           'mean_train_score',
           'std_test_score', 
           'std_train_score']

In [None]:
def pooled_var(stds, n=5):
    """Return the pooled standard deviation of equally sized groups.

    Implements the equal-group-size case of
    https://en.wikipedia.org/wiki/Pooled_variance#Pooled_standard_deviation:
    ``sqrt(sum((n - 1) * s_i**2) / (k * (n - 1)))`` for ``k`` groups.

    Args:
        stds: Per-group standard deviations (any array-like, e.g. a pandas
            Series as passed by ``DataFrame.groupby(...).agg``).
        n: Number of observations in each group; must be at least 2.

    Returns:
        The pooled standard deviation.

    Raises:
        ValueError: If ``n`` is smaller than 2.
    """
    if n < 2:
        raise ValueError("n must be at least 2 to pool standard deviations")
    stds = np.asarray(stds, dtype=float)
    # The whole denominator must be parenthesised: the original
    # `/ len(stds)*(n-1)` divided by len(stds) and then multiplied by (n-1).
    return np.sqrt(np.sum((n - 1) * stds**2) / (len(stds) * (n - 1)))

# Validation curves: one panel per searched hyper-parameter, showing the mean
# train / cross-validation score (with a pooled-std band) for each value of
# that parameter, aggregated over all other parameter combinations.
fig, axes = plt.subplots(1, len(rf_params_gs),
                         figsize=(5 * len(rf_params_gs), 7),
                         sharey='row',
                         squeeze=False)  # keep a 2-D array even for one parameter
axes = axes[0]
axes[0].set_ylabel("Score", fontsize=25)

lw = 2
for idx, (param_name, param_range) in enumerate(rf_params_gs.items()):
    # groupby sorts its keys, so realign the rows to the order of
    # `param_range`; otherwise string-valued parameters (e.g. criterion)
    # are drawn against the wrong x label.
    grouped_df = (
        df.groupby(f'param_{param_name}')[results]
        .agg({'mean_train_score': 'mean',
              'mean_test_score': 'mean',
              'std_train_score': pooled_var,
              'std_test_score': pooled_var})
        .reindex(param_range)
    )

    axes[idx].set_xlabel(param_name, fontsize=30)
    axes[idx].set_ylim(0.0, 1.1)
    axes[idx].plot(param_range, grouped_df['mean_train_score'], label="Training score",
                   color="darkorange", lw=lw)
    axes[idx].fill_between(param_range,
                           grouped_df['mean_train_score'] - grouped_df['std_train_score'],
                           grouped_df['mean_train_score'] + grouped_df['std_train_score'],
                           alpha=0.2, color="darkorange", lw=lw)
    axes[idx].plot(param_range, grouped_df['mean_test_score'], label="Cross-validation score",
                   color="navy", lw=lw)
    axes[idx].fill_between(param_range,
                           grouped_df['mean_test_score'] - grouped_df['std_test_score'],
                           grouped_df['mean_test_score'] + grouped_df['std_test_score'],
                           alpha=0.2, color="navy", lw=lw)

# One shared legend below the panels (the labels are identical on every axes).
handles, labels = axes[0].get_legend_handles_labels()
fig.suptitle('Validation curves', fontsize=40)
fig.legend(handles, labels, loc=8, ncol=2, fontsize=20)

fig.subplots_adjust(bottom=0.25, top=0.85)
plt.show()

### Vanilla Gradient Boosting


In [None]:
# Grid explored by GridSearchCV for scikit-learn's gradient boosting.
vgbm_params_gs = {
    "learning_rate": [0.1, 0.2],
    "random_state": [RANDOM_SEED],
    "n_estimators": [5, 10],
}  # the closing brace was missing, which made this cell a SyntaxError

In [None]:
# Tune gradient boosting with a cross-validated grid search on the training split.
# The seed is supplied through the parameter grid (`random_state`).
vgb_model = GradientBoostingClassifier()

start = time()
gridsearch_vgbm = GridSearchCV(
    vgb_model,
    vgbm_params_gs,
    cv=CV_NUM_FOLDS,
    scoring="f1",
    return_train_score=True,
    refit=True,
)
# The splits in this notebook are named x_train / x_test (lower case);
# `X_train` is not defined by any visible cell.
gridsearch_vgbm.fit(x_train, y_train)
# Stop the clock here so the recorded time covers only the search and refit.
train_time['VanillaGradientBoosting'] = np.round(time() - start, 3)

# Baseline: cross-validated F1 of the untuned model. This runs on the training
# split; the original cross-validated on the test split.
score = cross_val_score(vgb_model, x_train, y_train, scoring='f1', cv=CV_NUM_FOLDS, n_jobs=-1)


In [None]:
# Mean cross-validated F1 of the best gradient-boosting parameter combination.
gridsearch_vgbm.best_score_ 

In [None]:
# Refit winner of the gradient-boosting search. No visible cell assigned
# `best_estimator` before it was used here.
best_estimator = gridsearch_vgbm.best_estimator_

# Time one prediction pass over the test split (`x_test`; `X_test` is undefined).
start = time()
y_pred = best_estimator.predict(x_test)
interence_time['VanillaGradientBoosting'] = np.round(time() - start, 3)

In [None]:
# Best cross-validated F1 of the gradient-boosting search.
top_score = gridsearch_vgbm.best_score_

In [None]:
# Test-set F1 of the tuned gradient-boosting model.
# Reads the estimator straight from the search (the `best_estimator` name is
# not assigned by any visible cell), uses the lower-case `x_test` split, and
# passes the metric arguments in (y_true, y_pred) order.
top_f1['VanillaGradientBoosting'] = f1_score(y_test, gridsearch_vgbm.best_estimator_.predict(x_test))


In [None]:
# Test-set accuracy of the tuned gradient-boosting model.
# Reads the estimator straight from the search and uses the lower-case
# `x_test` split; `best_estimator` / `X_test` are not defined by any visible cell.
top_accuracy['VanillaGradientBoosting'] = accuracy_score(y_test, gridsearch_vgbm.best_estimator_.predict(x_test))


In [None]:
# ROC curve of the tuned gradient-boosting model on the test split.
# Score with the positive-class probability instead of whatever `y_pred`
# currently holds: hard labels collapse the curve to one operating point.
# Assumes a binary target whose positive class is the second entry of `classes_`.
vgbm_test_scores = gridsearch_vgbm.best_estimator_.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, vgbm_test_scores)
# Draw on an explicitly sized axes; RocCurveDisplay.plot() otherwise opens its
# own figure and leaves the one created by plt.figure() empty.
fig, ax = plt.subplots(figsize=(8, 10))
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax)

In [39]:
# Column positions of the ordinal features, passed to CatBoost as `cat_features`.
# CatBoost interprets integer cat_features as positions in the data given to
# fit(), so they are looked up in the training matrix rather than in the raw
# `train_df` (whose column layout can differ after preprocessing).
# NOTE(review): assumes the ORDINAL_FEATURES columns keep their names through
# preprocessing -- confirm.
cols_index = [x_train.columns.get_loc(col) for col in ORDINAL_FEATURES]

NameError: ignored

### CatBoost

In [None]:
# Grid explored by GridSearchCV for CatBoost. Single-element lists are fixed
# settings; only learning_rate and depth are actually searched.
cb_params_gs = {
    "thread_count": [2],
    "random_state": [RANDOM_SEED],
    "logging_level": ["Silent"],
    "learning_rate":[0.01,0.3],
    "depth":[1,3,6,10], # < 17 recommended
    "l2_leaf_reg": [10], # L2 regularisation coefficient, any positive int is OK
    "cat_features": [cols_index], # column indices of the ordinal features
    "one_hot_max_size":[255], # one-hot encode every feature whose number of distinct values is <= this value (max: 255)
    # controlling training/inference time
    "rsm":[0.5], # random subspace method - fraction of features used at each split selection
    "iterations": [100], # max number of trees that can be built (too many can lead to overfitting)
    }

In [None]:
# Tune CatBoost with a cross-validated grid search on the training split.
cb_model = CatBoostClassifier(random_state=RANDOM_SEED)
gridsearch_cb = GridSearchCV(
    cb_model,
    cb_params_gs,
    cv=CV_NUM_FOLDS,
    scoring="f1",
    return_train_score=True,
    refit=True,
)
start = time()

# The splits in this notebook are named x_train / x_test (lower case);
# `X_train` is not defined by any visible cell.
gridsearch_cb.fit(x_train, y_train)
# Stop the clock here so the recorded time covers only the search and refit.
train_time['CatBoost'] = np.round(time() - start, 3)

# Nested cross-validation of the whole search. This runs on the training
# split; the original cross-validated on the test split.
# NOTE(review): this repeats the full grid search once per fold and is costly.
score = cross_val_score(gridsearch_cb, x_train, y_train, scoring='f1', cv=CV_NUM_FOLDS, n_jobs=-1)

In [None]:
# Confusion matrix of the tuned CatBoost model on the test split.
# Predictions are computed here: at this point `y_pred` still holds the
# output of the previously evaluated model.
create_confusion_matrix(y_test, gridsearch_cb.best_estimator_.predict(x_test))

In [None]:
# Mean cross-validated F1 of the best CatBoost parameter combination.
gridsearch_cb.best_score_

In [None]:
# CatBoost model refit with the best parameters found (the search ran with refit=True).
cb_best_estimator = gridsearch_cb.best_estimator_

In [None]:
# Time one prediction pass of the tuned CatBoost model over the test split.
# Uses `x_test`; `X_test` is not defined by any visible cell.
start = time()

y_pred = cb_best_estimator.predict(x_test)
interence_time['CatBoost'] = np.round(time() - start, 3)

In [None]:
# Best cross-validated F1 of the CatBoost search.
top_score = gridsearch_cb.best_score_

In [None]:
# Test-set F1 of the tuned CatBoost model.
# Uses the lower-case `x_test` split and the (y_true, y_pred) argument order.
top_f1['CatBoost'] = f1_score(y_test, cb_best_estimator.predict(x_test))

In [None]:
# Test-set accuracy of the tuned CatBoost model.
# Uses `x_test`; `X_test` is not defined by any visible cell.
top_accuracy['CatBoost'] = accuracy_score(y_test, cb_best_estimator.predict(x_test))

In [None]:
# ROC curve of the tuned CatBoost model on the test split.
# Score with the positive-class probability: hard labels collapse the curve
# to a single operating point. Assumes a binary target whose positive class
# is the second probability column.
cb_test_scores = cb_best_estimator.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, cb_test_scores)
# Draw on an explicitly sized axes; RocCurveDisplay.plot() otherwise opens its
# own figure and leaves the one created by plt.figure() empty.
fig, ax = plt.subplots(figsize=(8, 10))
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax)

# 4. Model Comparison

Random Forest vs Gradient Boosting (both scikit-learn) vs CatBoost

Compare the results: build a summary table and graphs.

In [None]:
# One summary line per algorithm; the same key is looked up in each metric dict.
for algo in top_accuracy:
    print(f"{algo:{20}}: top_accuracy: {top_accuracy[algo]}, top_f1: {top_f1[algo]}, train_time: {train_time[algo]}, interence_time: {interence_time[algo]}")

In [None]:
# NOTE(review): this cell repeats the previous cell's summary loop verbatim; `result` is unused.
for algo, result in top_accuracy.items():
    print(f"{algo:{20}}: top_accuracy: {top_accuracy[algo]}, top_f1: {top_f1[algo]}, train_time: {train_time[algo]}, interence_time: {interence_time[algo]}")

In [None]:
# Turn each metric registry into a two-column frame (Algorithm, <metric>)
# for the bar charts below.
accuracy_df, top_f1_df, train_time_df, interence_time_df = (
    pd.DataFrame(list(metric.items()), columns=['Algorithm', label])
    for metric, label in (
        (top_accuracy, 'top_accuracy'),
        (top_f1, 'top_f1'),
        (train_time, 'train_time'),
        (interence_time, 'interence_time'),
    )
)

In [None]:
# Horizontal bar chart of test-set accuracy per algorithm.
accuracy_df.hvplot.barh(x='Algorithm', y='top_accuracy')

In [None]:
# Horizontal bar chart of test-set F1 per algorithm.
top_f1_df.hvplot.barh(x='Algorithm', y='top_f1')

In [None]:
# Horizontal bar chart of recorded training time (seconds) per algorithm.
train_time_df.hvplot.barh(x='Algorithm', y='train_time')

In [None]:
# Horizontal bar chart of recorded inference time (seconds) per algorithm.
interence_time_df.hvplot.barh(x='Algorithm', y='interence_time')

## Model Explainability

In [None]:
# SHAP values of the tuned CatBoost model on the test split, obtained through
# CatBoost's own feature-importance API.
cb_shap_matrix = cb_best_estimator.get_feature_importance(
    Pool(x_test, y_test),
    type="ShapValues",
)
# The last column is taken as the expected (base) value; the remaining
# columns are kept as the per-feature contributions.
expected_value = cb_shap_matrix[0, -1]
shap_values = cb_shap_matrix[:, :-1]

In [None]:
# Restore matplotlib's default rcParams before drawing the SHAP plots.
mpl.rcParams.update(mpl.rcParamsDefault)

In [None]:
shap.initjs()
# Summary plot restricted to the first five features.
# summary_plot is a function of the `shap` module; the original called it on
# the `shap_values` array, which has no such attribute.
shap.summary_plot(shap_values[:, :5], x_test.iloc[:, :5], max_display=20)

In [None]:
shap.initjs()
# Summary plot over every feature of the test split (max_display = number of columns).
shap.summary_plot(shap_values, x_test, max_display=x_test.shape[1])

In [None]:
# Switch matplotlib to the dark style.
# NOTE(review): the next cell immediately restores the default rcParams.
plt.style.use('dark_background')

In [None]:
# Restore matplotlib's default rcParams.
mpl.rcParams.update(mpl.rcParamsDefault)

In [None]:
# Explain the tuned CatBoost model -- the model the rest of this section and
# the final inference use. The original passed `best_estimator`, which no
# visible cell assigns.
explainer = shap.TreeExplainer(cb_best_estimator)
# NOTE: from here on `shap_values` refers to the training split.
shap_values = explainer.shap_values(x_train)

In [None]:
# Feature names in column order, excluding the target.
# A set difference (as in the original) does not preserve order, which would
# mislabel the SHAP plot that uses these names alongside `x_train`.
# NOTE(review): names are taken from `x_train` because that is the frame the
# SHAP values are plotted against; the original read an unseen `data` frame.
model_features = [col for col in x_train.columns if col != TARGET]

In [None]:
# Bar summary plot for the training split, labelled with x_train's own column names.
shap.summary_plot(shap_values, x_train, plot_type="bar", feature_names = list(x_train.columns))

In [None]:
# visualize the first prediction's explanation
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test.iloc[0,:])

In [None]:
# Explain a single prediction (row 10) as a static matplotlib figure.
# `shap_values` was computed by `explainer` on x_train, so the base value and
# the feature row are taken from that same explainer / frame.
shap.force_plot(explainer.expected_value, shap_values[10], x_train.iloc[10],
                matplotlib=True, show=False)
# Give the figure that force_plot just drew a white background (the original
# opened a new, empty figure instead), then render it.
plt.gcf().set_facecolor('w')
plt.show()

In [None]:
# Rank features by their mean absolute SHAP value.
# NOTE(review): the original read `shap_results_df`, which no visible cell
# defines; the current `shap_values` array is assumed to be the intended
# source -- confirm.
shap_results_mean = np.abs(shap_values).mean(axis=0)
shap_importance = pd.DataFrame(
    list(zip(cb_best_estimator.feature_names_, shap_results_mean)),
    columns=['feature', 'shap_importance_value'],
).sort_values(by=['shap_importance_value'], ascending=False)  # no in-place mutation
shap_importance

In [None]:
# Keep the 20 highest-ranked rows and list their feature names.
top_20_features = shap_importance.head(20)
top_20_features["feature"].to_list()

In [None]:
# Bar summary plot for the training split, labelled with `model_features`.
# NOTE(review): `model_features` must be in the same order as x_train's columns for the labels to be correct.
shap.summary_plot(shap_values, x_train, plot_type="bar", feature_names = model_features)

In [None]:
# Explain a single prediction (row 10) as a static matplotlib figure.
# `shap_values` was computed by `explainer` on x_train, so the base value and
# the feature row are taken from that same explainer / frame.
shap.force_plot(explainer.expected_value, shap_values[10], x_train.iloc[10],
                matplotlib=True, show=False)
# Give the figure that force_plot just drew a white background (the original
# opened a new, empty figure instead), then render it.
plt.gcf().set_facecolor('w')
plt.show()

## 5. Run inference with the best model on a holdout test set

In [None]:
# Predict the holdout set with the tuned CatBoost model.
test_results = cb_best_estimator.predict(preprocessed_test_data)

In [None]:
# Client identifiers taken from the raw holdout frame.
# NOTE(review): presumably row-aligned with preprocessed_test_data -- confirm that preprocessing drops or reorders no rows.
client_ids = test_df['client_id']
client_ids.shape

In [None]:
# Pair each client id with its predicted label.
# The prediction column is named 'subs_deposit' because that is the column the
# save cell selects; the original name "target" made that selection raise a KeyError.
test_results = pd.DataFrame({"client_id": client_ids, "subs_deposit": test_results})

## Save results


In [None]:
# Write the submission file without the DataFrame index.
# Requires `test_results` to carry both a 'client_id' and a 'subs_deposit' column.
test_results[['client_id','subs_deposit']].to_csv('submissions.csv',index=False)

---
# 6. Summary and Key Findings
---

## Potential Improvements