# 0.0 Imports

In [1]:
import pandas as pd
import numpy as np
import random
import pickle

from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit as sss

import xgboost as xgb

## 0.1 Load Data

In [2]:
# Load the pickled training frame, validation frame and selected feature-column names.
df_train_res = pd.read_pickle("data_in_progress/df_train_res_nn.pkl")

df_val = pd.read_pickle("data_in_progress/df_val.pkl")

# NOTE(review): unpickling can execute arbitrary code -- only load files this project wrote itself.
# The context manager closes the file handle that a bare open() call would leave dangling.
with open("data_in_progress/cols_selected_boruta_resampled_nn.pkl", "rb") as cols_file:
    cols_selected_boruta_resampled = pickle.load(cols_file)

In [3]:
# Selected feature columns followed by the target column; used below to slice the frames.
resp = ['response']
cols_selected_boruta_resampled_full = [*cols_selected_boruta_resampled, *resp]

## 0.2 Helper Functions

In [4]:
def metric_scores(y_true, y_pred):
    """Return a one-row DataFrame of binary-classification metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    pandas.DataFrame
        Single row (index 0) with the columns ``accuracy``, ``balanced_accuracy``,
        ``precision``, ``precision_0``, ``recall``, ``specificity``, ``F1``,
        ``F1_weighted`` and ``G_mean``. ``precision_0`` and ``specificity`` are the
        precision and recall computed with ``pos_label=0``; ``G_mean`` is the square
        root of ``recall * specificity``.
    """
    # Both per-class recalls feed G_mean as well, so compute each of them only once.
    recall = recall_score(y_true, y_pred)
    specificity = recall_score(y_true, y_pred, pos_label=0)

    return pd.DataFrame({'accuracy': accuracy_score(y_true, y_pred),
                         'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
                         'precision': precision_score(y_true, y_pred),
                         'precision_0': precision_score(y_true, y_pred, pos_label=0),
                         'recall': recall,
                         'specificity': specificity,
                         'F1': f1_score(y_true, y_pred),
                         'F1_weighted': f1_score(y_true, y_pred, average='weighted'),
                         'G_mean': np.sqrt(recall * specificity)},
                        index=[0])


def cross_validation_xgb(training_data, kfolds, model, model_name, verbose=False, random_state=None):
    """Evaluate ``model`` on repeated stratified shuffle splits of ``training_data``.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Frame holding the features plus a ``response`` column used as the target.
    kfolds : int
        Number of splits requested from ``StratifiedShuffleSplit``.
        NOTE(review): these are independent random splits with sklearn's default
        test size, not disjoint k-fold partitions -- confirm this is intended.
    model : estimator
        Object exposing ``fit(X, y, eval_metric=...)`` and ``predict(X)``; it is
        refitted on every split.
    model_name : str
        Label written to the ``Model name`` column of the result.
    verbose : bool, default False
        When true, print the metrics of each split as it finishes.
    random_state : int or None, default None
        Forwarded to ``StratifiedShuffleSplit`` so the splits can be reproduced.

    Returns
    -------
    pandas.DataFrame
        Single row (index 0) with ``Model name`` followed by one column per metric
        returned by ``metric_scores``, each formatted as ``"<mean>+/-<std>"`` with
        both numbers rounded to 4 decimals (population standard deviation).
    """
    xtraining = training_data.drop(['response'], axis=1)
    ytraining = training_data.response

    cv = sss(n_splits=kfolds, random_state=random_state)

    # One single-row score table per split; stacked afterwards instead of
    # maintaining a separate list for every metric.
    fold_tables = []
    for fold_nr, (train_index, prim_val_index) in enumerate(cv.split(xtraining, ytraining), start=1):
        X_training, X_prim_val = xtraining.iloc[train_index], xtraining.iloc[prim_val_index]
        y_training, y_prim_val = ytraining.iloc[train_index], ytraining.iloc[prim_val_index]

        m = model.fit(X_training, y_training, eval_metric='logloss')
        yhat = m.predict(X_prim_val)

        score_table = metric_scores(y_prim_val, yhat)
        fold_tables.append(score_table)
        if verbose:
            print(f'Split {fold_nr}/{kfolds}: {score_table.iloc[0].to_dict()}')

    scores = pd.concat(fold_tables, ignore_index=True)

    summary = {'Model name': model_name}
    for metric in scores.columns:
        values = scores[metric].to_numpy()
        # np.std defaults to ddof=0, matching the original per-list computation.
        summary[metric] = (np.round(np.mean(values), 4).astype(str)
                           + '+/-'
                           + np.round(np.std(values), 4).astype(str))
    return pd.DataFrame(summary, index=[0])

# 8.0 Hyperparameter Fine-Tuning

In [5]:
# Keep only the selected features plus the target; copy so later edits leave df_train_res untouched.
df8 = df_train_res.loc[:, cols_selected_boruta_resampled_full].copy()

In [6]:
# Split the training frame into the feature matrix and the target vector.
y_train_res = df8['response']
X_train_res = df8.drop(columns=['response'])

In [7]:
# Same column selection as for the training frame, applied to the validation frame.
df8_val = df_val.loc[:, cols_selected_boruta_resampled_full].copy()

In [8]:
# Split the validation frame into the feature matrix and the target vector.
y_val = df8_val['response']
X_val = df8_val.drop(columns=['response'])

## 8.1 Random Search

In [9]:
# Candidate values per XGBoost hyperparameter; the random search draws one value from each list.
param = dict(
    n_estimators=[1500, 1700, 2500, 3000, 3500],
    eta=[0.01, 0.03],
    max_depth=[3, 5, 9],
    subsample=[0.1, 0.5, 0.7],
    colsample_bytree=[0.3, 0.7, 0.9],
    min_child_weight=[3, 8, 15],
)

# Number of random hyperparameter combinations to evaluate.
MAX_EVAL = 5

In [10]:
# Seed both RNGs so the search can be reproduced: `random` drives the hyperparameter
# draws, and StratifiedShuffleSplit without a random_state uses NumPy's global RNG.
SEARCH_SEED = 42
random.seed(SEARCH_SEED)
np.random.seed(SEARCH_SEED)

hyperparam_lst = []
iteration_results = []

for i in range(MAX_EVAL):
    # draw one candidate value per hyperparameter
    hp = {k: random.choice(v) for k, v in param.items()}
    # print the selected ones
    print(hp)
    # store in list so the winning set can be looked up by iteration number later
    hyperparam_lst.append(hp.copy())
    # model: the keys of `param` are exactly the sampled XGBClassifier arguments
    model_xgb = xgb.XGBClassifier(objective='binary:logistic',
                                  use_label_encoder=False,
                                  n_jobs=-1,
                                  **hp)
    result = cross_validation_xgb(df8, 5, model_xgb, 'XGBoost Classifier', verbose=False)
    # tag the row with its iteration number; rows are stacked once after the loop
    iteration_results.append(result.assign(Iter_nr=i))

final_result = pd.concat(iteration_results, ignore_index=True)

# The metric columns hold "mean+/-std" strings, so rank on the numeric mean rather
# than on the lexicographic order of the formatted text.
final_result = final_result.sort_values(
    by=['F1'],
    ascending=False,
    ignore_index=True,
    key=lambda col: col.map(lambda cell: float(cell.split('+/-')[0])),
)
final_result

{'n_estimators': 2500, 'eta': 0.01, 'max_depth': 5, 'subsample': 0.7, 'colsample_bytree': 0.9, 'min_child_weight': 15}
{'n_estimators': 3000, 'eta': 0.03, 'max_depth': 9, 'subsample': 0.5, 'colsample_bytree': 0.9, 'min_child_weight': 3}
{'n_estimators': 3500, 'eta': 0.03, 'max_depth': 5, 'subsample': 0.5, 'colsample_bytree': 0.3, 'min_child_weight': 8}
{'n_estimators': 1700, 'eta': 0.01, 'max_depth': 9, 'subsample': 0.1, 'colsample_bytree': 0.9, 'min_child_weight': 3}
{'n_estimators': 2500, 'eta': 0.01, 'max_depth': 3, 'subsample': 0.1, 'colsample_bytree': 0.9, 'min_child_weight': 15}


Unnamed: 0,Model name,accuracy,balanced_accuracy,precision,precision_0,recall,specificity,F1,F1_weighted,G_mean,Iter_nr
0,XGBoost Classifier,0.8217+/-0.0007,0.8258+/-0.0007,0.7681+/-0.001,0.8838+/-0.0013,0.8842+/-0.0016,0.7673+/-0.0015,0.822+/-0.0007,0.8217+/-0.0007,0.8237+/-0.0007,1
1,XGBoost Classifier,0.8202+/-0.0019,0.8243+/-0.002,0.7655+/-0.0019,0.8838+/-0.0028,0.8847+/-0.0031,0.7639+/-0.0023,0.8208+/-0.002,0.8201+/-0.0019,0.8221+/-0.0019,3
2,XGBoost Classifier,0.8179+/-0.0016,0.8223+/-0.0015,0.7617+/-0.0019,0.8844+/-0.0014,0.8862+/-0.0014,0.7585+/-0.0024,0.8193+/-0.0014,0.8178+/-0.0016,0.8198+/-0.0016,0
3,XGBoost Classifier,0.8142+/-0.0012,0.8189+/-0.0011,0.7558+/-0.0015,0.8845+/-0.0008,0.8876+/-0.0008,0.7502+/-0.002,0.8164+/-0.001,0.814+/-0.0012,0.816+/-0.0012,4
4,XGBoost Classifier,0.8122+/-0.0012,0.8171+/-0.0012,0.753+/-0.0016,0.8844+/-0.0014,0.8881+/-0.0015,0.7461+/-0.0022,0.815+/-0.0011,0.812+/-0.0013,0.814+/-0.0013,2


In [11]:
# Persist the ranked random-search results.
pd.to_pickle(final_result, 'data_in_progress/xgb_tunning_cv')

## 8.2 Model Validation

In [12]:
# Iteration number stored in row 0, the top row after sorting by F1.
final_result['Iter_nr'].loc[0]

1

In [13]:
# Look up the hyperparameters that were drawn in the top-ranked iteration.
best_param_nr = final_result['Iter_nr'].loc[0]
param_tuned = hyperparam_lst[best_param_nr]

In [16]:
# Build the classifier from the tuned hyperparameters, then fit it on X_train_res / y_train_res.
model_xgb_tuned = xgb.XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    **{name: param_tuned[name]
       for name in ('n_estimators', 'eta', 'max_depth',
                    'subsample', 'colsample_bytree', 'min_child_weight')},
)
model_xgb_tuned.fit(X_train_res, y_train_res, eval_metric='logloss')

# Score the fitted model on the validation frame.
yhat_xgb_tuned = model_xgb_tuned.predict(X_val)
xgb_result = metric_scores(y_val, yhat_xgb_tuned)
xgb_result

Unnamed: 0,accuracy,balanced_accuracy,precision,precision_0,recall,specificity,F1,F1_weighted,G_mean
0,0.666658,0.705064,0.233919,0.950469,0.755941,0.654187,0.35728,0.723781,0.703226


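Highly overfitted: cross-validation on the resampled training data reports F1 ≈ 0.82 and precision ≈ 0.77, but on the validation set F1 falls to 0.36 and precision to 0.23.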

In [17]:
# Persist the validation metrics of the tuned model.
pd.to_pickle(xgb_result, 'data_in_progress/xgb_result')

In [18]:
# Save the fitted classifier to xgb_tuned.json, relative to the current working directory.
model_xgb_tuned.save_model('xgb_tuned.json')