In [11]:
#Packages
import pandas as pd
import numpy as np
import itertools
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, plot_confusion_matrix, f1_score, accuracy_score, matthews_corrcoef
from xgboost import cv
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# XGBoost functions

In [12]:
#Function to perform cross-validation with XGBoost for a grid of parameters.
def xgboost_tune_class(param_comb, X, y, num_iterations, row_subsample, colsample, tree_build, class_imbal, sparse, learning_rate, cv_metric):
    """Cross-validate one XGBoost binary classifier per row of a parameter grid.

    Parameters
    ----------
    param_comb : array-like, shape (n_combinations, 4)
        Columns are read as gamma, max_depth, reg_alpha, reg_lambda.
        A single combination may also be given as a 1-D sequence.
    X, y :
        Features and labels handed to ``xgb.DMatrix``. The class-imbalance
        weight counts the labels equal to 0 and to 1.
    num_iterations : int
        Upper bound on the boosting rounds (``num_boost_round``).
    row_subsample, colsample :
        Values for ``subsample`` and ``colsample_bytree``.
    tree_build :
        Value for ``tree_method``.
    class_imbal : bool
        If true, ``scale_pos_weight`` is (#labels == 0) / (#labels == 1),
        otherwise 1.
    sparse : bool
        If true, the DMatrix is built with ``missing=0``.
    learning_rate :
        Value for ``eta``.
    cv_metric : str
        Metric handed to ``xgb.cv``. 'auc', 'aucpr', 'pre', 'map' and 'ndcg'
        (including their '@n' / '-' variants) are maximised; any other
        metric is minimised.

    Returns
    -------
    numpy.ndarray, shape (n_combinations, 6)
        The grid with two extra columns: column 4 is the 1-based boosting
        round with the best mean test score, column 5 is that score.

    Raises
    ------
    ValueError
        If ``class_imbal`` is true and ``y`` has no label equal to 1.
    """
    # hstack allocates a new array, so the caller's grid is never modified.
    grid = np.atleast_2d(np.asarray(param_comb))
    results = np.hstack((grid, np.zeros((grid.shape[0], 2))))

    #Checking if class imbalance
    if class_imbal:
        labels = np.asarray(y)
        n_pos = np.sum(labels == 1)
        if n_pos == 0:
            raise ValueError("class_imbal=True requires at least one label equal to 1 in y")
        weight_pos = np.sum(labels == 0) / n_pos
    else:
        weight_pos = 1

    #Checking if sparse matrix. If so, then 0 entries are treated as missing.
    if sparse:
        xgb_dat = xgb.DMatrix(data=X, label=y, missing=0)
    else:
        xgb_dat = xgb.DMatrix(data=X, label=y)

    # Strip '@n' and the trailing '-' modifier so 'ndcg@5-' is recognised as 'ndcg'.
    metric_family = cv_metric.split('@')[0].rstrip('-') if isinstance(cv_metric, str) else ''
    maximise = metric_family in ('auc', 'aucpr', 'pre', 'map', 'ndcg')

    for i in range(results.shape[0]):

        #Setting up parameters.
        xgb_params = {
            'eta' : learning_rate,
            'gamma' : results[i, 0],
            'max_depth' : int(results[i, 1]),
            'subsample' : row_subsample,
            'colsample_bytree' : colsample,
            'reg_alpha': results[i, 2],
            'reg_lambda' : results[i, 3],
            'tree_method' : tree_build,
            'scale_pos_weight' : weight_pos,
            'objective' : 'binary:logistic',
            'n_jobs' : 1
        }

        #Performing cross-validation
        xgb_cv = xgb.cv(params = xgb_params,
                        dtrain = xgb_dat,
                        num_boost_round = num_iterations,
                        nfold = 5,
                        stratified = True,
                        metrics = cv_metric,
                        early_stopping_rounds = 200,
                        seed = 75,
                        verbose_eval = 100)

        # Column 2 is taken as the mean test score (per the xgboost docs the
        # single-metric layout is train-mean, train-std, test-mean, test-std).
        # A plain array keeps argmax/argmin positional.
        test_mean = np.asarray(xgb_cv.iloc[:, 2])
        best_row = int(np.argmax(test_mean)) if maximise else int(np.argmin(test_mean))

        results[i, 4:6] = best_row + 1, test_mean[best_row]

        print(i)

    return results

In [13]:
#Function to fit an XGBoost model with the optimal set of parameter values determined from cross-validation.
def xgboost_fit_class(param_opt, X, y, row_subsample, colsample, tree_build, class_imbal, sparse, learning_rate):
    """Fit an ``xgb.XGBClassifier`` with one chosen parameter combination.

    Parameters
    ----------
    param_opt : sequence
        Indices 0-4 are read as gamma, max_depth, reg_alpha, reg_lambda and
        n_estimators (the layout of a row returned by ``xgboost_tune_class``).
    X, y :
        Training features and labels passed to ``fit``.
    row_subsample, colsample :
        Values for ``subsample`` and ``colsample_bytree``.
    tree_build :
        Value for ``tree_method``.
    class_imbal : bool
        If true, ``scale_pos_weight`` is (#labels == 0) / (#labels == 1),
        otherwise 1.
    sparse : bool
        If true, 0 is used as the missing-value marker, otherwise NaN.
    learning_rate :
        Boosting learning rate.

    Returns
    -------
    xgb.XGBClassifier
        The fitted classifier.

    Raises
    ------
    ValueError
        If ``class_imbal`` is true and ``y`` has no label equal to 1.
    """
    #Checking if sparse matrix. If so, then 0 entries are treated as missing.
    missing_val = 0 if sparse else np.nan

    #Checking if user specified class imbalance.
    if class_imbal:
        labels = np.asarray(y)
        n_pos = np.sum(labels == 1)
        if n_pos == 0:
            raise ValueError("class_imbal=True requires at least one label equal to 1 in y")
        weight_pos = np.sum(labels == 0) / n_pos
    else:
        weight_pos = 1

    # learning_rate / random_state are the declared scikit-learn parameters;
    # `eta` and `seed` only reach the booster through **kwargs.
    clf_xgb = xgb.XGBClassifier(learning_rate = learning_rate,
                                gamma = param_opt[0],
                                max_depth = int(param_opt[1]),
                                subsample = row_subsample,
                                colsample_bytree = colsample,
                                reg_alpha = param_opt[2],
                                reg_lambda = param_opt[3],
                                tree_method = tree_build,
                                scale_pos_weight = weight_pos,
                                objective = 'binary:logistic',
                                random_state = 33,
                                n_estimators = int(param_opt[4]),
                                missing = missing_val,
                                verbosity = 1,
                                n_jobs = 1)
    #Fitting model
    clf_xgb.fit(X, y)

    return clf_xgb

In [14]:
#Function to build the parameter grid and drop every combination that has no regularisation.
def regularisation_param(gamma, max_depth, l1, l2):
    """Return the Cartesian grid of (gamma, max_depth, l1, l2) as a 2-D array.

    Rows where gamma, l1 and l2 are all zero are left out; max_depth plays
    no part in that filter.
    """
    grid = np.array([combo for combo in itertools.product(gamma, max_depth, l1, l2)])

    #A row is unregularised only when gamma == 0 & l1 == 0 & l2 == 0
    unregularised = (grid[:, 0] == 0) & (grid[:, 2] == 0) & (grid[:, 3] == 0)
    kept = grid[~unregularised, :]

    return np.array(kept)

In [15]:
#Function to score a fitted model on held-out data.
def errors_model(mod, X_test, y_test):
    """Return [accuracy, F1, Matthews correlation coefficient, AUC] as a list.

    The first three scores use ``mod.predict``; the AUC uses the second
    column of ``mod.predict_proba``.
    """
    labels_hat = mod.predict(X_test) #Class predictions
    class_scores = [score(y_test, labels_hat)
                    for score in (accuracy_score, f1_score, matthews_corrcoef)]

    prob_hat = mod.predict_proba(X_test)[:, 1] #Probability predictions
    return class_scores + [roc_auc_score(y_test, prob_hat)]

# AdaBoost (Overlap data)

In [19]:
#loading dataset and setting up data 
import pickle
file_name = 'Overlap_data.pickle'
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
# The context manager closes the file even if unpickling fails.
with open(file_name,'rb') as file:
    Overlap_data = pickle.load(file)

X_train_over = Overlap_data['X_train']
y_train_over = Overlap_data['y_train']
X_test_over = Overlap_data['X_test']
y_test_over = Overlap_data['y_test']

In [16]:
#Parameter grid no regularisation.
# gamma, l1 and l2 stay at zero; only the tree depth (1 to 10) varies.
gamma, l1, l2 = [0], [0], [0]
max_depth = [depth for depth in range(1, 11)]
param_noreg_over = np.array([*itertools.product(gamma, max_depth, l1, l2)])

In [None]:
#Tuning model with no regularisation.
# Log-loss is the CV metric here, so the best row is the one with the smallest score.
cv_noreg_over = xgboost_tune_class(
    param_comb=param_noreg_over,
    X=X_train_over,
    y=y_train_over,
    num_iterations=10000,
    learning_rate=0.05,
    row_subsample=1,
    colsample=1,
    tree_build='exact',
    class_imbal=False,
    sparse=False,
    cv_metric='logloss',
)

In [10]:
#Fitting optimal XGBoost model with no regularisation.
# Column 5 of the tuning output is the CV score (log-loss, lower is better).
opt_ind_noreg_over = cv_noreg_over[:, 5].argmin()
xgmod_noreg_over = xgboost_fit_class(
    param_opt=cv_noreg_over[opt_ind_noreg_over],
    X=X_train_over,
    y=y_train_over,
    row_subsample=1,
    colsample=1,
    tree_build='exact',
    class_imbal=False,
    sparse=False,
    learning_rate=0.05,
)

#Calculating errors of model
errs_noreg_over = errors_model(xgmod_noreg_over, X_test_over, y_test_over)

In [54]:
#Parameter grid regularisation.

#Parameters 1
###################################################################################
gamma = [0, 0.5, 1, 2, 3]
l1, l2 = list(gamma), list(gamma)  # same candidate values, separate list objects
max_depth = [1, 2, 3]
param_reg_over = regularisation_param(gamma=gamma, max_depth=max_depth, l1=l1, l2=l2)
###################################################################################

#Parameters 2 (refined)
###################################################################################
#gamma = [0]
#max_depth = [1]
#l1 = [0.5]
#l2 = [3, 3.5, 4, 4.5, 5]
#param_reg_over = regularisation_param(gamma, max_depth, l1, l2)
###################################################################################

In [2]:
#Tuning model with regularisation.
# Same settings as the unregularised run; only the parameter grid differs.
cv_reg_over = xgboost_tune_class(
    param_comb=param_reg_over,
    X=X_train_over,
    y=y_train_over,
    num_iterations=10000,
    learning_rate=0.05,
    row_subsample=1,
    colsample=1,
    tree_build='exact',
    class_imbal=False,
    sparse=False,
    cv_metric='logloss',
)

In [12]:
#Fitting optimal XGBoost model with regularisation.
# Column 5 of the tuning output is the CV score (log-loss, lower is better).
opt_ind_reg_over = cv_reg_over[:, 5].argmin()
xgmod_reg_over = xgboost_fit_class(
    param_opt=cv_reg_over[opt_ind_reg_over],
    X=X_train_over,
    y=y_train_over,
    row_subsample=1,
    colsample=1,
    tree_build='exact',
    class_imbal=False,
    sparse=False,
    learning_rate=0.05,
)

#Calculating errors of model
errs_reg_over = errors_model(xgmod_reg_over, X_test_over, y_test_over)

In [None]:
#Table of parameters and test errors. 
# Each row: gamma, max_depth, reg_alpha, reg_lambda, number of trees, then accuracy, F1, MCC, AUC.
test_err_noreg = np.append(cv_noreg_over[opt_ind_noreg_over,0:5],errs_noreg_over)
test_err_reg = np.append(cv_reg_over[opt_ind_reg_over,0:5],errs_reg_over)
# np.vstack replaces np.row_stack, which is a deprecated alias of it (NumPy 2.0).
test_err_over = np.vstack((test_err_noreg,test_err_reg))
np.round(test_err_over,3)

# Phoneme

In [6]:
#Importing and setting up data.
file_path_ph = 'C:\\Users\\Matt\\Documents\\Python code thesis\\Datasets\\phoneme.csv'
df_ph = pd.read_csv(file_path_ph)

# Separate the label column from the features, then hold out a stratified 20% test set.
y_ph = df_ph['target'].copy()
X_ph = df_ph.drop(columns='target').copy()
X_train_ph, X_test_ph, y_train_ph, y_test_ph = train_test_split(
    X_ph, y_ph, test_size=0.2, stratify=y_ph, random_state=65
)

In [25]:
#Parameter grid no regularisation.
# gamma, l1 and l2 stay at zero; only the tree depth (1 to 10) varies.
gamma, l1, l2 = [0], [0], [0]
max_depth = [depth for depth in range(1, 11)]
param_noreg_ph = np.array([*itertools.product(gamma, max_depth, l1, l2)])

In [19]:
#Tuning model with no regularisation.
# AUC is the CV metric here, so the best row is the one with the largest score.
cv_noreg_ph = xgboost_tune_class(
    param_comb=param_noreg_ph,
    X=X_train_ph,
    y=y_train_ph,
    num_iterations=10000,
    learning_rate=0.05,
    row_subsample=0.8,
    colsample=1,
    tree_build='exact',
    class_imbal=True,
    sparse=False,
    cv_metric='auc',
)

In [8]:
#Fitting optimal XGBoost model with no regularisation.
# Column 5 of the tuning output is the CV score (AUC, higher is better).
opt_ind_noreg_ph = cv_noreg_ph[:, 5].argmax()
xgmod_noreg_ph = xgboost_fit_class(
    param_opt=cv_noreg_ph[opt_ind_noreg_ph],
    X=X_train_ph,
    y=y_train_ph,
    row_subsample=0.8,
    colsample=1,
    tree_build='exact',
    class_imbal=True,
    sparse=False,
    learning_rate=0.05,
)

#Calculating errors of model
errs_noreg_ph = errors_model(xgmod_noreg_ph, X_test_ph, y_test_ph)

In [62]:
#Parameter grid regularisation.

#Parameters 1
#####################################################################
gamma = [0, 0.5, 1, 2, 3]
l1, l2 = list(gamma), list(gamma)  # same candidate values, separate list objects
max_depth = [10]
param_reg_ph = regularisation_param(gamma=gamma, max_depth=max_depth, l1=l1, l2=l2)
#####################################################################

In [20]:
#Tuning model with regularisation.
# Same settings as the unregularised run; only the parameter grid differs.
cv_reg_ph = xgboost_tune_class(
    param_comb=param_reg_ph,
    X=X_train_ph,
    y=y_train_ph,
    num_iterations=10000,
    learning_rate=0.05,
    row_subsample=0.8,
    colsample=1,
    tree_build='exact',
    class_imbal=True,
    sparse=False,
    cv_metric='auc',
)

In [12]:
#Fitting optimal XGBoost model with regularisation.
# Column 5 of the tuning output is the CV score (AUC, higher is better).
opt_ind_reg_ph = cv_reg_ph[:, 5].argmax()
xgmod_reg_ph = xgboost_fit_class(
    param_opt=cv_reg_ph[opt_ind_reg_ph],
    X=X_train_ph,
    y=y_train_ph,
    row_subsample=0.8,
    colsample=1,
    tree_build='exact',
    class_imbal=True,
    sparse=False,
    learning_rate=0.05,
)

#Calculating errors of model
errs_reg_ph = errors_model(xgmod_reg_ph, X_test_ph, y_test_ph)

In [None]:
#Table of parameters and test errors. 
# Each row: gamma, max_depth, reg_alpha, reg_lambda, number of trees, then accuracy, F1, MCC, AUC.
test_err_noreg = np.append(cv_noreg_ph[opt_ind_noreg_ph,0:5],errs_noreg_ph)
test_err_reg = np.append(cv_reg_ph[opt_ind_reg_ph,0:5],errs_reg_ph)
# np.vstack replaces np.row_stack, which is a deprecated alias of it (NumPy 2.0).
test_err_ph = np.vstack((test_err_noreg,test_err_reg))
np.round(test_err_ph,3)

# Adult

In [3]:
#Importing and setting up data.
# The two CSV files have no header row; they are stacked and then given column names.
file_path = 'C:\\Users\\Matt\\Documents\\Python code thesis\\Datasets\\adult1.csv'
df_adult1 = pd.read_csv(file_path, header=None)
file_path = 'C:\\Users\\Matt\\Documents\\Python code thesis\\Datasets\\adult2.csv'
df_adult2 = pd.read_csv(file_path, header=None)

df_adult = pd.concat([df_adult1, df_adult2])
df_adult.columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "martial-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "target",
]

# Remove every space inside string values.
df_adult = df_adult.replace(' ', '', regex=True)
# Strip trailing '.' from the label, then encode "<=50K" as 0 and anything else as 1.
df_adult['target'] = df_adult['target'].map(lambda label: 0 if label.rstrip('.') == "<=50K" else 1)
df_adult = df_adult.reset_index(drop=True).drop('education', axis=1)

In [7]:
#Creating dummy variables and Training/Test split
y_adult = df_adult['target'].copy()
X_adult = df_adult.drop('target', axis=1).copy()

# One-hot encode the object-dtype columns, dropping the first level of each.
object_columns = X_adult.columns[X_adult.dtypes == object]
X_adult = pd.get_dummies(X_adult, columns=object_columns, drop_first=True)

X_train_adult, X_test_adult, y_train_adult, y_test_adult = train_test_split(
    X_adult, y_adult, test_size=0.2, stratify=y_adult, random_state=65
)

In [8]:
#Parameter grid no regularisation.
# gamma, l1 and l2 stay at zero; only the tree depth (1 to 10) varies.
gamma, l1, l2 = [0], [0], [0]
max_depth = [depth for depth in range(1, 11)]
param_noreg_adult = np.array([*itertools.product(gamma, max_depth, l1, l2)])

In [14]:
#Tuning model with no regularisation.
# AUC is the CV metric here, so the best row is the one with the largest score.
# sparse=True makes the tuning function treat 0 entries as missing.
cv_noreg_adult = xgboost_tune_class(
    param_comb=param_noreg_adult,
    X=X_train_adult,
    y=y_train_adult,
    num_iterations=10000,
    learning_rate=0.05,
    row_subsample=0.5,
    colsample=0.5,
    tree_build='approx',
    class_imbal=True,
    sparse=True,
    cv_metric='auc',
)

In [9]:
#Fitting optimal XGBoost model with no regularisation.
# Column 5 of the tuning output is the CV score (AUC, higher is better).
opt_ind_noreg_adult = cv_noreg_adult[:, 5].argmax()
xgmod_noreg_adult = xgboost_fit_class(
    param_opt=cv_noreg_adult[opt_ind_noreg_adult],
    X=X_train_adult,
    y=y_train_adult,
    row_subsample=0.5,
    colsample=0.5,
    tree_build='approx',
    class_imbal=True,
    sparse=True,
    learning_rate=0.05,
)

#Calculating errors of model
errs_noreg_adult = errors_model(xgmod_noreg_adult, X_test_adult, y_test_adult)

In [16]:
#Parameter grid regularisation.
# NOTE(review): nothing in this notebook defines `param_reg_adult`, so the tuning
# call below raised NameError on a fresh kernel. This grid reuses the first-pass
# gamma/l1/l2 values of the other datasets and fixes max_depth at the best depth
# found without regularisation. That is an assumption: confirm it against the
# grid that was originally used.
gamma = [0, 0.5, 1, 2, 3]
max_depth = [int(cv_noreg_adult[opt_ind_noreg_adult, 1])]
l1 = [0, 0.5, 1, 2, 3]
l2 = [0, 0.5, 1, 2, 3]
param_reg_adult = regularisation_param(gamma, max_depth, l1, l2)

#Tuning model with regularisation.
cv_reg_adult = xgboost_tune_class(param_comb = param_reg_adult,
                                  X = X_train_adult, 
                                  y = y_train_adult, 
                                  learning_rate = 0.05, 
                                  num_iterations = 10000, 
                                  row_subsample = 0.5, 
                                  colsample = 0.5, 
                                  tree_build = 'approx',
                                  class_imbal = True,
                                  sparse = True,
                                  cv_metric = 'auc')

In [11]:
#Fitting optimal XGBoost model with regularisation.
# Column 5 of the tuning output is the CV score (AUC, higher is better).
opt_ind_reg_adult = cv_reg_adult[:, 5].argmax()
xgmod_reg_adult = xgboost_fit_class(
    param_opt=cv_reg_adult[opt_ind_reg_adult],
    X=X_train_adult,
    y=y_train_adult,
    row_subsample=0.5,
    colsample=0.5,
    tree_build='approx',
    class_imbal=True,
    sparse=True,
    learning_rate=0.05,
)

#Calculating errors of model
errs_reg_adult = errors_model(xgmod_reg_adult, X_test_adult, y_test_adult)

In [None]:
#Table of parameters and test errors. 
# Each row: gamma, max_depth, reg_alpha, reg_lambda, number of trees, then accuracy, F1, MCC, AUC.
test_err_noreg = np.append(cv_noreg_adult[opt_ind_noreg_adult,0:5],errs_noreg_adult)
test_err_reg = np.append(cv_reg_adult[opt_ind_reg_adult,0:5],errs_reg_adult)
# np.vstack replaces np.row_stack, which is a deprecated alias of it (NumPy 2.0).
test_err_adult = np.vstack((test_err_noreg,test_err_reg))
np.round(test_err_adult,3)

# Santander Customer Satisfaction

In [21]:
#loading dataset and setting up data.
import pickle
file_name = 'customersatformat.pickle'
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
# The context manager closes the file even if unpickling fails.
with open(file_name,'rb') as file:
    df_sat = pickle.load(file)

# Separate the label column from the features, then hold out a stratified 20% test set.
X_sat = df_sat.drop('TARGET', axis = 1).copy()
y_sat = df_sat['TARGET'].copy()
X_train_sat, X_test_sat, y_train_sat, y_test_sat = train_test_split(X_sat, y_sat, random_state=71, stratify=y_sat, test_size=0.2)

In [8]:
#Parameter grid no regularisation.
# gamma, l1 and l2 stay at zero; only the tree depth (1 to 10) varies.
gamma, l1, l2 = [0], [0], [0]
max_depth = [depth for depth in range(1, 11)]
param_noreg_sat = np.array([*itertools.product(gamma, max_depth, l1, l2)])

In [None]:
#Tuning model with no regularisation.
# AUC is the CV metric here, so the best row is the one with the largest score.
# sparse=True makes the tuning function treat 0 entries as missing.
cv_noreg_sat = xgboost_tune_class(
    param_comb=param_noreg_sat,
    X=X_train_sat,
    y=y_train_sat,
    num_iterations=10000,
    learning_rate=0.05,
    row_subsample=0.3,
    colsample=0.3,
    tree_build='approx',
    class_imbal=True,
    sparse=True,
    cv_metric='auc',
)

In [16]:
#Fitting optimal XGBoost model with no regularisation.
# Column 5 of the tuning output is the CV score (AUC, higher is better).
opt_ind_noreg_sat = cv_noreg_sat[:, 5].argmax()
xgmod_noreg_sat = xgboost_fit_class(
    param_opt=cv_noreg_sat[opt_ind_noreg_sat],
    X=X_train_sat,
    y=y_train_sat,
    row_subsample=0.3,
    colsample=0.3,
    tree_build='approx',
    class_imbal=True,
    sparse=True,
    learning_rate=0.05,
)

#Calculating errors of model
errs_noreg_sat = errors_model(xgmod_noreg_sat, X_test_sat, y_test_sat)

In [47]:
#Parameter grid regularisation

#Parameters 1
###################################################################################
gamma = [0, 0.5, 1, 2, 3]
max_depth = [2, 3, 4]
l1 = [0, 0.5, 1, 2, 3]
l2 = [0, 0.5, 1, 2, 3]
param_reg_sat = regularisation_param(gamma, max_depth, l1, l2)
###################################################################################

#Parameters 2
###################################################################################
# Disabled alternative grid: uncomment all five lines together to use it.
# The call was previously left active while its values were commented out,
# so it only rebuilt the "Parameters 1" grid a second time.
#gamma = [2]
#max_depth = [3]
#l1 = [2]
#l2 = [3, 3.5, 4, 4.5, 5]
#param_reg_sat = regularisation_param(gamma, max_depth, l1, l2)
###################################################################################

In [14]:
#Tuning model with regularisation.
# Same settings as the unregularised run; only the parameter grid differs.
cv_reg_sat = xgboost_tune_class(
    param_comb=param_reg_sat,
    X=X_train_sat,
    y=y_train_sat,
    num_iterations=10000,
    learning_rate=0.05,
    row_subsample=0.3,
    colsample=0.3,
    tree_build='approx',
    class_imbal=True,
    sparse=True,
    cv_metric='auc',
)

In [10]:
#Fitting optimal XGBoost model with regularisation.
# Column 5 of the tuning output is the CV score (AUC, higher is better).
opt_ind_reg_sat = cv_reg_sat[:, 5].argmax()
xgmod_reg_sat = xgboost_fit_class(
    param_opt=cv_reg_sat[opt_ind_reg_sat],
    X=X_train_sat,
    y=y_train_sat,
    row_subsample=0.3,
    colsample=0.3,
    tree_build='approx',
    class_imbal=True,
    sparse=True,
    learning_rate=0.05,
)

#Calculating errors of model
errs_reg_sat = errors_model(xgmod_reg_sat, X_test_sat, y_test_sat)

In [None]:
#Table of parameters and test errors. 
# Each row: gamma, max_depth, reg_alpha, reg_lambda, number of trees, then accuracy, F1, MCC, AUC.
test_err_noreg = np.append(cv_noreg_sat[opt_ind_noreg_sat,0:5],errs_noreg_sat)
test_err_reg = np.append(cv_reg_sat[opt_ind_reg_sat,0:5],errs_reg_sat)
# np.vstack replaces np.row_stack, which is a deprecated alias of it (NumPy 2.0).
test_err_sat = np.vstack((test_err_noreg,test_err_reg))
np.round(test_err_sat,3)