# Config

In [None]:
# System
import os
import json
import pickle
    
# Data processing
import pandas as pd
import numpy as np
from scipy.stats import zscore #Z-score
from sklearn.model_selection import train_test_split

# XGBoost
import xgboost as xgb 
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, KFold
from sklearn.model_selection import train_test_split

# Hyperparameters Optimisation
import hyperopt
from hyperopt import hp, fmin, tpe, Trials, partial
from hyperopt.early_stop import no_progress_loss

# Accuracy
from sklearn.metrics import accuracy_score

# Plot
import graphviz
%matplotlib ipympl
import matplotlib.pyplot as plt

model_path = 'best_paramas.pickle'

# Data preparation

In [148]:
class data_processing:
    def __init__(self, file, index_col):
        self.data = pd.read_csv(file, index_col=index_col)
        self.one_hot_columns_name = []

    def onehot_category(self, columns2onehot):
        self.one_hot_columns = pd.get_dummies(self.data[columns2onehot], drop_first=True)
        self.one_hot_columns_name = self.one_hot_columns.columns
        self.data = pd.concat([self.data, self.one_hot_columns], axis=1)
        self.data = self.data.drop(columns=columns2onehot)

    def z_score(self, no_zscore_columns):
        self.columns_category = pd.DataFrame(self.data, columns=no_zscore_columns)
        columns_4_Z_score = self.data.drop(columns=no_zscore_columns)
        self.columns_4_Z_score = columns_4_Z_score.apply(zscore)
        self.data = pd.concat([self.columns_category, self.columns_4_Z_score], axis=1)
        
def data_train_test_split (data_label, dataset):
    X = dataset.drop(columns=[data_label])
    y = dataset[data_label]
    
    # Splitting train test dataset
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.3,random_state=1412)
    dtrain = xgb.DMatrix(Xtrain,Ytrain)
    dtest = xgb.DMatrix(Xtest,Ytest)
    
    return dtrain, Xtrain, Xtest, dtest, Xtest, Ytest

# Model Construction and Tuning

In [263]:
def size_param_grid(param_grid):
    # inspecting the size of param space
    total =1
    for key, value in param_grid.items():
        total = total *len(value)
    print("the size of param space is",int(total))

In [228]:
def hyperopt_objective(params):
    paramsforxgb = {"eta":params["eta"]
                    ,"booster":params["booster"]
                    ,"colsample_bytree":params["colsample_bytree"]
                    ,"colsample_bynode":params["colsample_bynode"]
                    ,"gamma":params["gamma"]
                    ,"lambda":params["lambda"]
                    ,"min_child_weight":params["min_child_weight"]
                    ,"max_depth":int(params["max_depth"])
                    ,"subsample":params["subsample"]
                    ,"objective":params["objective"]
                    ,"rate_drop":params["rate_drop"]
                    ,"nthread":14
                    ,"verbosity":0
                    ,"seed":1412}
    result = xgb.cv(params,data_xgb, seed=1412, metrics=("error")
                    ,num_boost_round=int(params["num_boost_round"]))
    return result.iloc[-1,2]

In [231]:
# # Stadarlizer format of parameters using hp.choice for both category and numeric values and np.arange(start, end, interval) for both int and float value.
# Version 1
# param_grid_simple = {"booster":hp.choice("booster",["gbtree","dart"])
#                      ,"objective":hp.choice("objective",["binary:logistic"])
#                      ,"max_depth":hp.choice("max_depth",[*np.arange(2,30,2)])
#                      ,'num_boost_round': hp.choice("num_boost_round",[*np.arange(50,200,10)])
#                      ,"eta": hp.choice("eta",[*np.arange(0.05,2.05,0.05)])
#                      ,"colsample_bytree":hp.choice("colsample_bytree",[*np.arange(0.3,1,0.1)])
#                      ,"colsample_bynode":hp.choice("colsample_bynode",[*np.arange(0.1,1,0.1)])
#                      ,"gamma":hp.choice("gamma",[*np.arange(0,10,0.3)])
#                      ,"lambda":hp.choice("lambda",[*np.arange(0,3,0.2)])
#                      ,"min_child_weight":hp.choice("min_child_weight",[*np.arange(0,50,2)])
#                      ,"subsample":hp.choice("subsample",[*np.arange(0.1,1,0.1)])
#                      ,"rate_drop":hp.choice("rate_drop",[*np.arange(0.1,1,0.1)])
#                     }

In [229]:
# # Stadarlizer format of parameters using hp.choice for both category and numeric values and np.arange(start, end, interval) for both int and float value.
# Version 2
# param_grid_simple = {"booster":hp.choice("booster",param_grid["booster"])
#                      ,"objective":hp.choice("objective",param_grid["objective"])
#                      ,"max_depth":hp.choice("max_depth",param_grid["max_depth"])
#                      ,'num_boost_round': hp.choice("num_boost_round",param_grid["num_boost_round"])
#                      ,"eta": hp.choice("eta",param_grid["eta"])
#                      ,"colsample_bytree":hp.choice("colsample_bytree",param_grid["colsample_bytree"])
#                      ,"colsample_bynode":hp.choice("colsample_bynode",param_grid["colsample_bynode"])
#                      ,"gamma":hp.choice("gamma",param_grid["gamma"])
#                      ,"lambda":hp.choice("lambda",param_grid["lambda"])
#                      ,"min_child_weight":hp.choice("min_child_weight",param_grid["min_child_weight"])
#                      ,"subsample":hp.choice("subsample",param_grid["subsample"])
#                      ,"rate_drop":hp.choice("rate_drop",param_grid["rate_drop"])
#                     }
# param_grid_simple

In [None]:
# Version 3
# param_grid_simple = {key:hp.choice(key,value) for key, value in param_grid.items()}

In [265]:
def param_hyperopt(max_evals=100):
    trials = Trials()
    
    early_stop_fn = no_progress_loss(30)

    params_best = fmin(hyperopt_objective
                       , space = param_grid_simple
                       , algo = tpe.suggest
                       , max_evals = max_evals
                       , verbose=True
                       , trials = trials
                       , early_stop_fn = early_stop_fn
                      )
    best_paramas = dict([(param, value[params_best[param]]) for param, value in param_grid.items()])
    print(best_paramas)
    return best_paramas, trials

In [None]:
# xgb.set_config(verbosity=0) # hide bizzare warning
# params_best, trials = param_hyperopt(3) 

# Matching the best settings from the original param range by Bayesian optimization results
# best_paramas = []
# for param in param_grid:
#     best_paramas.append((param, param_grid.get(param)[params_best[param]]))
# best_paramas = dict(best_paramas)

# oneline VERSION ↓
# best_paramas = dict([(param, value[params_best[param]]) for param, value in param_grid.items()])
# best_paramas

# Execute

## Data prepartion

In [289]:
dataset = data_processing("Data/Financial-Data.csv", "Entry_id")
dataset.z_score(no_zscore_columns=["home_owner", "has_debt", "e_signed", "pay_schedule"])
dataset.onehot_category(["pay_schedule"])
data = dataset.data
dtrain, Xtrain, Xtest, dtest, Xtest, Ytest = data_train_test_split("e_signed", data)

## Train

In [291]:
train = True
if train:
    param_grid = {"booster":["gbtree","dart"]
                 ,"objective":["binary:logistic"]
                 ,"max_depth":[*np.arange(2,30,2)]
                 ,"num_boost_round":[*np.arange(50,200,10)]
                 ,"eta":[*np.arange(0.05,2.05,0.05)]
                 ,"colsample_bytree":[*np.arange(0.3,1,0.1)]
                 ,"colsample_bynode":[*np.arange(0.1,1,0.1)]
                 ,"gamma":[*np.arange(0,10,0.3)]
                 ,"lambda":[*np.arange(0,3,0.2)]
                 ,"min_child_weight":[*np.arange(0,50,2)]
                 ,"subsample":[*np.arange(0.1,1,0.1)]
                 ,"rate_drop":[*np.arange(0.1,1,0.1)]
    }
    size_param_grid(param_grid)
    param_grid_simple = {key:hp.choice(key,value) for key, value in param_grid.items()}
    xgb.set_config(verbosity=0) # hide bizzare warning
    best_paramas, trials = param_hyperopt(3)

    # save params to local
    modelfile = open(model_path, 'wb')
    pickle.dump(best_paramas, modelfile)
    modelfile.close()


the size of param space is 1093062600000
100%|███████████████████████████████████████████████| 3/3 [00:20<00:00,  6.85s/trial, best loss: 0.3887275705457524]
{'booster': 'dart', 'objective': 'binary:logistic', 'max_depth': 26, 'num_boost_round': 140, 'eta': 1.7500000000000002, 'colsample_bytree': 0.6000000000000001, 'colsample_bynode': 0.8, 'gamma': 8.4, 'lambda': 0.8, 'min_child_weight': 22, 'subsample': 0.5, 'rate_drop': 0.8}


## Prediction and Evaluation

In [292]:
dtrain, Xtrain, Xtest, dtest, Xtest, Ytest = data_train_test_split("e_signed", data)
Xtest = xgb.DMatrix(Xtest)

# Load Params from local
modelfile = open(model_path, 'rb')
best_paramas = pickle.load(modelfile)

# Fit
xgb_model = xgb.train(best_paramas, dtrain)

Ypredict = (xgb_model.predict(Xtest) > 0.5).astype("int") # convert probability to category
print(accuracy_score(Ytest, Ypredict))

0.6128792108691606


In [293]:
ploton_tree = False
if ploton_tree:
    xgb.plot_importance(xgb_model)
    fig, ax = plt.subplots(figsize=(30, 30))
    xgb.plot_tree(xgb_model, num_trees=0, ax=ax)
    plt.show()
    # plt.savefig("temp.pdf")