# Machine Learning Experiments

In this notebook, we experiment with various machine learning models using AutoML tools, select one model, fine-tune its parameters, optimize the threshold, evaluate the model using both classic performance metrics and domain metrics, and save the model. We also explore feature importance.

In [1]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from xgboost import plot_importance

## Import data

In [2]:
# The raw files are expected under ../data, relative to the notebook's working directory.
data_path = os.path.join(os.getcwd(), '../data')
df = pd.read_csv(
    os.path.join(data_path, 'data_extracted', 'bank-additional', 'bank-additional-full.csv'),
    sep=';',  # the file is semicolon-separated
)
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


## Train an XGBoost model

Ideally, we would test many different algorithms and select the one that performs best. This process could be sped up with tools like AutoML. However, within the scope of this project, we train a single model using XGBoost, an implementation of the gradient-boosted trees algorithm. This implementation is known to provide good results efficiently.

We start with preparing the data to train an XGBoost model.

In [3]:
# Columns used as model inputs.
# `duration` is left out: the length of a call is not known before the call
# is made, so it is not included in the feature set.
required_features = [
    'age', 'job', 'marital', 'education', 'default',
    'housing', 'loan', 'contact', 'month', 'day_of_week',
    'campaign', 'pdays', 'previous', 'poutcome',
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
    'euribor3m', 'nr.employed',
]

In [4]:
# Model inputs: only the columns listed in required_features.
df_x = df.filter(items=required_features)
# Target: kept as a one-column DataFrame holding `y`.
df_y = df.loc[:, ['y']]

In [5]:
# encoding for xgb
# convert objects to categories
def objects_to_categories(df, obj_col):
    """Cast ``obj_col`` of ``df`` to the pandas ``category`` dtype.

    The column is overwritten on the DataFrame that was passed in (the frame
    is modified in place), and that same DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to convert.
    obj_col : str
        Label of the column to convert.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``obj_col`` stored as a categorical.
    """
    as_category = df[obj_col].astype('category')
    df[obj_col] = as_category
    return df

In [6]:
# Every object-typed feature is turned into a pandas categorical, one column
# at a time, so the frame can later be handed to xgb.DMatrix with
# enable_categorical=True.
obj_cols = df_x.select_dtypes(include='object').columns.tolist()

for col in obj_cols:
    df_x = objects_to_categories(df_x, col)

In [7]:
# Encode the string class values as integers.
# LabelEncoder expects a 1-d array of labels; handing it the one-column
# DataFrame makes scikit-learn emit a DataConversionWarning, so the target is
# flattened first. Classes are sorted before encoding, i.e. 'no' -> 0 and
# 'yes' -> 1. After this cell df_y is a 1-d integer NumPy array.
label_encoder = LabelEncoder()
df_y = label_encoder.fit_transform(np.ravel(df_y))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [None]:
# Alternative, manual encoding of the target ('yes' -> 1, 'no' -> 0).
# It only applies while df_y is still the raw one-column DataFrame: once the
# LabelEncoder cell has replaced df_y with an integer array, df_y['y'] would
# raise, so the cell is a no-op in that case and the notebook still runs
# top to bottom.
if isinstance(df_y, pd.DataFrame):
    df_y = df_y.assign(y=df_y['y'].map({'yes': 1, 'no': 0}).astype(int))

In [8]:
# The classes are highly imbalanced, so every split is stratified on the target
# to keep the class distribution equal across the train, validation and test samples.

# Step 1: hold out 10% of the data as the test set.
tr_val_x, test_x, tr_val_y, test_y = train_test_split(
    df_x,
    df_y,
    test_size=0.10,
    random_state=42,
    stratify=df_y,
)

# Step 2: take 10% of the remaining rows as the validation set.
tr_x, val_x, tr_y, val_y = train_test_split(
    tr_val_x,
    tr_val_y,
    test_size=0.10,
    random_state=42,
    stratify=tr_val_y,
)

In [9]:
# Column labels of the feature frame, in order; reused when building the DMatrix objects.
feature_names = df_x.columns.tolist()

In [10]:
# Wrap each split in an XGBoost DMatrix. enable_categorical=True lets XGBoost
# consume the pandas `category` columns directly.
dtrain = xgb.DMatrix(
    tr_x,
    label=tr_y,
    feature_names=feature_names,
    enable_categorical=True,
)
dval = xgb.DMatrix(
    val_x,
    label=val_y,
    feature_names=feature_names,
    enable_categorical=True,
)
dtest = xgb.DMatrix(
    test_x,
    label=test_y,
    feature_names=feature_names,
    enable_categorical=True,
)

: 

In [None]:
# Report the dimensions of every split (features and labels) as a sanity check.
for caption, split in (
    ('Training Features Shape:', tr_x),
    ('Training Labels Shape:', tr_y),
    ('Testing Features Shape:', test_x),
    ('Testing Labels Shape:', test_y),
    ('Validation Features Shape:', val_x),
    ('Validation Labels Shape:', val_y),
):
    print(caption, split.shape)

In [None]:
def f1_eval(predt: np.ndarray, dtrain: xgb.DMatrix):
    """Custom evaluation metric: F1 score at a 0.5 cut-off.

    Parameters
    ----------
    predt : np.ndarray
        Predicted scores for the rows of ``dtrain``. Values above 0.5 are
        counted as the positive class (assumes the scores are probabilities --
        confirm against how the metric is passed to XGBoost).
    dtrain : xgb.DMatrix
        Matrix whose labels (``get_label()``) serve as the ground truth.

    Returns
    -------
    tuple
        ``("F1_score", value)`` -- the metric name and its value.
    """
    # The annotation must use the `xgb` alias: the file only does
    # `import xgboost as xgb`, so the bare name `xgboost` is undefined and
    # would raise NameError when this function is defined.
    y = dtrain.get_label()
    predt_binary = np.where(predt > 0.5, 1, 0)
    return "F1_score", metrics.f1_score(y_true=y, y_pred=predt_binary)

In [None]:
# Booster parameters for the first XGBoost run.
params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'eta': 0.3,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective': 'binary:logistic',
    'disable_default_eval_metric': 1,
    'seed': 42,
    'tree_method': 'hist',
}

# NOTE(review): the default eval metric is disabled above and no custom metric
# is passed to xgb.train, so nothing is scored on the `evals` sets and
# `maximize` presumably has no effect here. `f1_eval` looks like it was meant
# to be supplied as the custom metric -- confirm.
# Early stopping is currently not enabled (early_stopping_rounds=10 was tried
# and commented out).
xgboost_test_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=[(dtrain, 'dtrain'), (dval, 'dval')],
    maximize=True,
)

In [None]:
# Score the held-out test matrix with the trained booster; the model was trained
# with the 'binary:logistic' objective, so these are continuous scores, not 0/1 labels.
y_pred = xgboost_test_model.predict(dtest)

In [None]:
# Distribution of the predicted scores on the test set, one histogram per true label.
pd.DataFrame({"pred": y_pred, "label": test_y}).hist(
    column="pred",
    by="label",
    bins=30,
)

In [None]:
# Turn the test-set scores into hard 0/1 predictions using a 0.1 cut-off:
# every score above 0.1 is labelled as the positive class.
xgboost_test_model_pred = (xgboost_test_model.predict(dtest) > 0.1).astype(int)

In [None]:
# Test-set performance of xgboost_test_model.
# F1, precision, recall, accuracy and the confusion matrix are computed on the
# thresholded 0/1 predictions. ROC AUC is a ranking metric, so it is computed
# on the continuous scores returned by predict(); scoring it on the 0/1
# predictions would collapse the ROC curve to a single operating point.
print('Performance of xgboost_test_model, test:')
print('F1:', round(metrics.f1_score(dtest.get_label(), xgboost_test_model_pred), 2))
print('Precision:', round(metrics.precision_score(dtest.get_label(), xgboost_test_model_pred), 2))
print('Recall:', round(metrics.recall_score(dtest.get_label(), xgboost_test_model_pred), 2))
print('AUC:', round(metrics.roc_auc_score(dtest.get_label(), xgboost_test_model.predict(dtest)), 2))
print('Accuracy:', round(metrics.accuracy_score(dtest.get_label(), xgboost_test_model_pred), 2))
# confusion_matrix() puts actual classes on rows and predicted classes on columns;
# the transpose printed here therefore has rows = predicted, columns = actual.
print('Confusion matrix:\n',
      metrics.confusion_matrix(dtest.get_label(), xgboost_test_model_pred).transpose())

In [None]:
# Confusion matrix of the thresholded test predictions.
# scikit-learn layout: rows = actual class, columns = predicted class.
cf_matrix = metrics.confusion_matrix(dtest.get_label(), xgboost_test_model_pred)
TN = cf_matrix[0][0] # correctly predicted useless calls | SAVED 8$ | 0$
FN = cf_matrix[1][0] # worthy calls predicted as useless calls | MINIMIZE TO 0 !!! | OPPORTUNITY COST -72$
TP = cf_matrix[1][1] # correctly predicted worthy calls | WIN 72$ |
FP = cf_matrix[0][1] # useless calls predicted as worthy calls | LOSS -8$ |

# Cell order of the plot, row by row: [[TN, FN], [FP, TP]], i.e. rows = predicted,
# columns = actual. This is the transpose of cf_matrix.
fl = [TN, FN, FP, TP]
fig, ax = plt.subplots(figsize=(8,6))

sns.set(font_scale=2)
group_names = ['TN','FN','FP','TP']
group_counts = ["{0:0.0f}".format(value) for value in fl]
group_percentages = ["{0:.2%}".format(value) for value in
                     np.asarray(fl) / np.sum(cf_matrix)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
          zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)

# Plot the transposed matrix so the cell colours match the annotations and the
# axis labels below (y = predicted, x = actual). Plotting cf_matrix itself would
# colour the FN cell by the FP count and vice versa.
sns.heatmap(cf_matrix.T, annot=labels, fmt='', cmap="BuPu", ax=ax)
ax.set_ylabel('Predicted', fontsize=20)
ax.set_xlabel('Actual', fontsize=20)
# Pin the y-range to both rows, then flip it so row 0 is drawn at the top.
ax.set_ylim([0,2])
ax.invert_yaxis()