In [1]:
#Packages 
import pandas as pd
import numpy as np
import itertools
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, plot_confusion_matrix, f1_score, accuracy_score, matthews_corrcoef
from xgboost import cv
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [3]:
#Function for performing cross-validation with LightGBM model
def _cv_mean_scores(cv_result, metric_key):
    """Return the per-iteration mean CV scores for ``metric_key`` from an ``lgb.cv`` result dict."""
    #Older LightGBM releases name the entry '<metric>-mean'; newer ones prefix it with 'valid '
    for key in (metric_key + '-mean', 'valid ' + metric_key + '-mean'):
        if key in cv_result:
            return np.asarray(cv_result[key])
    raise KeyError("No mean CV score for metric '%s' among keys %s" % (metric_key, sorted(cv_result)))


def lightgbm_tune_class(param_comb, subsample_type, X, y, learning_rate, num_iterations, colsample, max_depth, class_imbal, efb_enable, cv_metric):
    """Run 5-fold stratified CV with LightGBM for every row of ``param_comb``.

    Parameters
    ----------
    param_comb : 2-D array
        One candidate per row. For ``subsample_type='goss'`` columns 0 and 1 are
        used as ``top_rate`` and ``other_rate``; for ``'gbdt'`` column 0 is used
        as ``bagging_fraction`` (with ``bagging_freq`` fixed to 1).
    subsample_type : {'goss', 'gbdt'}
        Passed to LightGBM as ``boosting_type``.
    X, y : training features and binary labels handed to ``lgb.Dataset``.
    learning_rate, colsample, max_depth, efb_enable :
        Forwarded as ``learning_rate``, ``feature_fraction``, ``max_depth`` and
        ``enable_bundle``.
    num_iterations : int
        Number of boosting rounds evaluated by ``lgb.cv``.
    class_imbal : bool
        If true, ``scale_pos_weight`` is set to (#y==0) / (#y==1), otherwise 1.
    cv_metric : str
        Forwarded to ``lgb.cv`` as ``metrics``. 'AUC' (any letter case) is
        maximised; anything else is treated as 'binary_logloss' and minimised.

    Returns
    -------
    numpy.ndarray
        ``param_comb`` with two extra columns appended: the 1-based boosting
        round with the best mean CV score, and that score.

    Raises
    ------
    ValueError
        If ``subsample_type`` is not 'goss' or 'gbdt', or if ``class_imbal`` is
        requested but ``y`` contains no positive samples.
    """
    #Fail fast, before any dataset is built
    if subsample_type not in ('goss', 'gbdt'):
        raise ValueError("Must give subsampling type")

    #Compare case-insensitively so 'auc' is not silently handled as a loss to minimise
    maximise = str(cv_metric).lower() == 'auc'
    metric_key = 'auc' if maximise else 'binary_logloss'

    param_comb = np.asarray(param_comb)
    p = param_comb.shape[1]
    #hstack returns a new array, so the caller's grid is left untouched
    param_comb = np.hstack((param_comb, np.zeros((param_comb.shape[0], 2))))

    #Checking if class imbalance
    if class_imbal:
        y_arr = np.asarray(y)
        n_pos = int(np.sum(y_arr == 1))
        if n_pos == 0:
            raise ValueError("class_imbal requires at least one positive (y == 1) sample")
        weight_pos = np.sum(y_arr == 0) / n_pos
    else:
        weight_pos = 1

    #Creating lightgbm dataset
    lgb_dat = lgb.Dataset(data = X, label = y)

    #Parameters shared by every candidate
    lgb_params = {
            'boosting_type' : subsample_type,
            'learning_rate' : learning_rate,
            'max_depth' : max_depth,
            'feature_fraction' : colsample,
            'num_leaves' : 1100, #High so number of leaves does not affect tree building
            'scale_pos_weight' : weight_pos,
            'objective' : 'binary',
            'enable_bundle' : efb_enable,
            'n_jobs' : 1,
            'verbose' : -1
        }

    for i in range(param_comb.shape[0]):

        #Fresh copy per candidate so earlier parameter dicts are never mutated
        params_i = dict(lgb_params)
        if subsample_type == 'goss':
            params_i.update({'top_rate' : param_comb[i, 0], 'other_rate' : param_comb[i, 1]})
        else:
            params_i.update({'bagging_fraction' : param_comb[i, 0], 'bagging_freq' : 1})

        lgb_cv = lgb.cv(params = params_i,
                        train_set = lgb_dat,
                        num_boost_round = num_iterations,
                        nfold = 5,
                        stratified = True,
                        metrics = cv_metric,
                        seed = 86)

        scores = _cv_mean_scores(lgb_cv, metric_key)
        best_iter = int(np.argmax(scores)) if maximise else int(np.argmin(scores))

        #Store number of trees (1-based) and the corresponding mean CV score
        param_comb[i, p:(p+2)] = best_iter + 1, scores[best_iter]

        print(i) #progress indicator

    return param_comb

In [4]:
#Function for fitting a single LightGBM classifier with the selected subsampling settings
def lightgbm_fit_class(param_vals, subsample_type, X, y, learning_rate, colsample, max_depth, class_imbal, efb_enable):
    """Fit an ``lgb.LGBMClassifier`` with GOSS or standard row subsampling.

    ``param_vals`` is indexed positionally:
      * 'goss': [0] -> top_rate, [1] -> other_rate, [2] -> n_estimators
      * 'gbdt': [0] -> bagging_fraction, [1] -> n_estimators (bagging_freq is 1)
    Any further entries are ignored.

    ``learning_rate``, ``colsample``, ``max_depth`` and ``efb_enable`` are passed
    on as ``learning_rate``, ``feature_fraction``, ``max_depth`` and
    ``enable_bundle``. When ``class_imbal`` equals True, ``scale_pos_weight`` is
    the ratio of ``y == 0`` to ``y == 1`` counts, otherwise 1.

    Returns the classifier after ``fit(X, y)``. Raises ``ValueError`` for any
    other ``subsample_type``.
    """
    #Weight of the positive class
    weight_pos = sum(y==0)/sum(y==1) if class_imbal == True else 1

    #Settings common to both subsampling schemes
    shared_args = dict(boosting_type = subsample_type,
                       learning_rate = learning_rate,
                       max_depth = max_depth,
                       feature_fraction = colsample,
                       num_leaves = 1100,
                       scale_pos_weight = weight_pos,
                       objective = 'binary',
                       enable_bundle = efb_enable,
                       n_jobs = 1,
                       verbose = -1,
                       seed = 79)

    #Settings that depend on the subsampling scheme
    if subsample_type == 'goss':
        sampling_args = dict(top_rate = param_vals[0],
                             other_rate = param_vals[1],
                             n_estimators = int(param_vals[2]))
    elif subsample_type == 'gbdt':
        sampling_args = dict(bagging_fraction = param_vals[0],
                             bagging_freq = 1,
                             n_estimators = int(param_vals[1]))
    else:
        raise ValueError("Must give subsampling type")

    clf_lgb = lgb.LGBMClassifier(**shared_args, **sampling_args)
    clf_lgb.fit(X, y)

    return clf_lgb

In [5]:
#Function for calculating accuracy, F1, MCC and AUC of a fitted model
def errors_model(mod, X_test, y_test):
    """Score ``mod`` on ``(X_test, y_test)``.

    Accuracy, F1 and the Matthews correlation coefficient are computed from
    ``mod.predict``; AUC is computed from column 1 of ``mod.predict_proba``.

    Returns a list ``[accuracy, f1, mcc, auc]``.
    """
    #Metrics based on hard class predictions
    labels_hat = mod.predict(X_test)
    class_metrics = [metric(y_test, labels_hat)
                     for metric in (accuracy_score, f1_score, matthews_corrcoef)]

    #AUC based on the second probability column
    scores_pos = mod.predict_proba(X_test)[:, 1]

    return class_metrics + [roc_auc_score(y_test, scores_pos)]

In [6]:
#Returns the test erros of the selected LightGBM models from cross-validation
def test_errs(cv_sub, cv_goss, learning_rate, colsample, max_depth, class_imbal, efb_enable, X_train, y_train, X_test, y_test,cv_metric):
    
    lgbmod_test_err = {}
    for i,x in enumerate(cv_goss):
    
        if(cv_metric=='AUC'):
            ind_opt = np.argmax(cv_goss[x][:,3])
            cv_goss_opt = cv_goss[x][ind_opt,:]
        else:
            ind_opt = np.argmin(cv_goss[x][:,3])
            cv_goss_opt = cv_goss[x][ind_opt,:]
            
        cv_sub_ent = cv_sub[i,]
    
        params_fit = {
            'cv_sub' : [list(cv_sub_ent),'gbdt'],
            'cv_goss' : [list(cv_goss_opt),'goss']
        }  
    
        errs_lgb = [[],[]]
        for j, y in enumerate(params_fit):
        
            lgb_mod = lightgbm_fit_class(param_vals = params_fit[y][0], 
                                         subsample_type = params_fit[y][1], 
                                         X = X_train, 
                                         y = y_train, 
                                         learning_rate = learning_rate, 
                                         colsample = colsample, 
                                         max_depth = max_depth, 
                                         class_imbal = class_imbal, 
                                         efb_enable = efb_enable)
        
        
            errs_lgb[j] = errors_model(mod = lgb_mod, X_test = X_test, y_test = y_test)
    
        lgbmod_test_err[x] = np.array(errs_lgb)
        
    return(lgbmod_test_err)

In [7]:
#Row-subsampling fractions 0.2, 0.3, ..., 0.8 as a column vector (one candidate per row)
param_sub = np.round(np.linspace(0.2, 0.8, num=7), decimals=1)[:, np.newaxis]

In [8]:
#Setting parameter values for GOSS
#For every total ratio in 0.2, 0.3, ..., 0.8 build a 2-row array: row 0 runs over
#0.05, 0.10, ... (below the total) and row 1 is row 0 reversed, so each column sums to the total
param_goss = {}

for sub_rat in np.round(np.linspace(0.2, 0.8, num=7), 1):
    n_steps = int(np.round(sub_rat / 0.05))
    first_rates = np.round([step * 0.05 for step in range(1, n_steps)], 4)
    param_goss[sub_rat] = np.vstack((first_rates, first_rates[::-1]))

In [None]:
#No scientific notation: print NumPy floating point values in fixed-point form
np.set_printoptions(suppress=True)

# AdaBoost (Overlap data)

In [9]:
#Loading the Overlap dataset and unpacking its train/test parts
import pickle
file_name = 'Overlap_data.pickle'
#NOTE: pickle.load can execute arbitrary code - only load pickle files from a trusted source
#The with-block closes the file even if unpickling fails
with open(file_name,'rb') as file:
    Overlap_data = pickle.load(file)

X_train_over = Overlap_data['X_train']
y_train_over = Overlap_data['y_train']
X_test_over = Overlap_data['X_test']
y_test_over = Overlap_data['y_test']

In [None]:
#Cross-validation with standard row subsampling (Overlap data, depth-1 trees, log-loss)
cv_sub_over = lightgbm_tune_class(
    param_comb=param_sub,
    subsample_type='gbdt',
    X=X_train_over,
    y=y_train_over,
    learning_rate=0.05,
    num_iterations=10000,
    colsample=1,
    max_depth=1,
    class_imbal=False,
    efb_enable=False,
    cv_metric='binary_logloss',
)

In [None]:
#Cross-validation with GOSS (Overlap data): one result table per key of param_goss
cv_goss_over = {}
for sub_rat, rate_pairs in param_goss.items():
    #rate_pairs is transposed so that each row is one parameter combination
    cv_goss_over[sub_rat] = lightgbm_tune_class(
        param_comb=rate_pairs.T,
        subsample_type='goss',
        X=X_train_over,
        y=y_train_over,
        learning_rate=0.05,
        num_iterations=10000,
        colsample=1,
        max_depth=1,
        class_imbal=False,
        efb_enable=False,
        cv_metric='binary_logloss',
    )

In [None]:
#Fitting the selected models and calculating Accuracy, F1, MCC and AUC on the Overlap test data
test_errs_over = test_errs(
    cv_sub=cv_sub_over,
    cv_goss=cv_goss_over,
    learning_rate=0.05,
    colsample=1,
    max_depth=1,
    class_imbal=False,
    efb_enable=False,
    X_train=X_train_over,
    y_train=y_train_over,
    X_test=X_test_over,
    y_test=y_test_over,
    cv_metric='binary_logloss',
)

# Phoneme

In [8]:
#Importing the Phoneme data, separating features from the 'target' column
#and making a stratified 80/20 train/test split
file_path_ph = 'C:\\Users\\Matt\\Documents\\Python code thesis\\Datasets\\phoneme.csv'
df_ph = pd.read_csv(file_path_ph)
X_ph = df_ph.drop(columns='target').copy()
y_ph = df_ph['target'].copy()
split_ph = train_test_split(X_ph, y_ph, test_size=0.2, stratify=y_ph, random_state=65)
X_train_ph, X_test_ph, y_train_ph, y_test_ph = split_ph

In [None]:
#Cross-validation with standard row subsampling (Phoneme data, AUC, class-imbalance weighting)
cv_sub_ph = lightgbm_tune_class(
    param_comb=param_sub,
    subsample_type='gbdt',
    X=X_train_ph,
    y=y_train_ph,
    learning_rate=0.05,
    num_iterations=10000,
    colsample=1,
    max_depth=10,
    class_imbal=True,
    efb_enable=False,
    cv_metric='AUC',
)

In [97]:
#Cross-validation with GOSS (Phoneme data): one result table per key of param_goss
cv_goss_ph = {}
for sub_rat, rate_pairs in param_goss.items():
    #rate_pairs is transposed so that each row is one parameter combination
    cv_goss_ph[sub_rat] = lightgbm_tune_class(
        param_comb=rate_pairs.T,
        subsample_type='goss',
        X=X_train_ph,
        y=y_train_ph,
        learning_rate=0.05,
        num_iterations=10000,
        colsample=1,
        max_depth=10,
        class_imbal=True,
        efb_enable=False,
        cv_metric='AUC',
    )

In [None]:
#Fitting the selected models and calculating Accuracy, F1, MCC and AUC on the Phoneme test data
test_errs_ph = test_errs(
    cv_sub=cv_sub_ph,
    cv_goss=cv_goss_ph,
    learning_rate=0.05,
    colsample=1,
    max_depth=10,
    class_imbal=True,
    efb_enable=False,
    X_train=X_train_ph,
    y_train=y_train_ph,
    X_test=X_test_ph,
    y_test=y_test_ph,
    cv_metric='AUC',
)

# Adult

In [32]:
#Importing the two Adult files (no header row), stacking them and cleaning the result
file_path = 'C:\\Users\\Matt\\Documents\\Python code thesis\\Datasets\\adult1.csv'
df_adult1 = pd.read_csv(file_path, header=None)
file_path = 'C:\\Users\\Matt\\Documents\\Python code thesis\\Datasets\\adult2.csv'
df_adult2 = pd.read_csv(file_path, header=None)

df_adult = pd.concat([df_adult1, df_adult2])
df_adult.columns = ["age","workclass","fnlwgt","education","education-num","martial-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","target"]

#Remove every space inside string values
df_adult = df_adult.replace(' ', '', regex=True)

#Binary target: strip trailing '.' characters, then "<=50K" -> 0 and everything else -> 1
def _encode_adult_target(label):
    return 0 if label.rstrip('.') == "<=50K" else 1

df_adult['target'] = df_adult['target'].apply(_encode_adult_target)

#Drop the 'education' column and renumber the rows of the stacked frame
df_adult = df_adult.drop('education', axis=1).reset_index(drop=True)

In [33]:
#Defining the categorical features: every object-dtype column is cast to 'category'
ind_cat = df_adult.dtypes.eq(object)
cat_feat = list(df_adult.columns[ind_cat])
df_adult = df_adult.astype({col: 'category' for col in cat_feat})

In [34]:
#Stratified 80/20 training/test split of the Adult data
y_adult = df_adult['target']
X_adult = df_adult.drop(columns='target')
split_adult = train_test_split(X_adult, y_adult, test_size=0.2, stratify=y_adult, random_state=65)
X_train_adult, X_test_adult, y_train_adult, y_test_adult = split_adult

In [13]:
#Cross-validation with standard row subsampling (Adult data, AUC, class-imbalance weighting)
cv_sub_adult = lightgbm_tune_class(
    param_comb=param_sub,
    subsample_type='gbdt',
    X=X_train_adult,
    y=y_train_adult,
    learning_rate=0.05,
    num_iterations=10000,
    colsample=1,
    max_depth=8,
    class_imbal=True,
    efb_enable=False,
    cv_metric='AUC',
)

In [11]:
#Cross-validation with GOSS (Adult data): one result table per key of param_goss
cv_goss_adult = {}
for sub_rat, rate_pairs in param_goss.items():
    #rate_pairs is transposed so that each row is one parameter combination
    cv_goss_adult[sub_rat] = lightgbm_tune_class(
        param_comb=rate_pairs.T,
        subsample_type='goss',
        X=X_train_adult,
        y=y_train_adult,
        learning_rate=0.05,
        num_iterations=10000,
        colsample=1,
        max_depth=8,
        class_imbal=True,
        efb_enable=False,
        cv_metric='AUC',
    )

In [None]:
#Fitting the selected models and calculating Accuracy, F1, MCC and AUC on the Adult test data
test_errs_adult = test_errs(
    cv_sub=cv_sub_adult,
    cv_goss=cv_goss_adult,
    learning_rate=0.05,
    colsample=1,
    max_depth=8,
    class_imbal=True,
    efb_enable=False,
    X_train=X_train_adult,
    y_train=y_train_adult,
    X_test=X_test_adult,
    y_test=y_test_adult,
    cv_metric='AUC',
)

# Santander Customer Satisfaction

In [41]:
#Loading the Santander Customer Satisfaction dataset
import pickle
file_name = 'customersatformat.pickle'
#NOTE: pickle.load can execute arbitrary code - only load pickle files from a trusted source
#The with-block closes the file even if unpickling fails
with open(file_name,'rb') as file:
    df_sat = pickle.load(file)

In [42]:
#Stratified 80/20 training/test split of the Santander data
y_sat = df_sat['TARGET'].copy()
X_sat = df_sat.drop(columns='TARGET').copy()
split_sat = train_test_split(X_sat, y_sat, test_size=0.2, stratify=y_sat, random_state=71)
X_train_sat, X_test_sat, y_train_sat, y_test_sat = split_sat

In [104]:
#Cross-validation with standard row subsampling (Santander data, AUC, class-imbalance weighting, EFB on)
cv_sub_sat = lightgbm_tune_class(
    param_comb=param_sub,
    subsample_type='gbdt',
    X=X_train_sat,
    y=y_train_sat,
    learning_rate=0.05,
    num_iterations=10000,
    colsample=0.3,
    max_depth=2,
    class_imbal=True,
    efb_enable=True,
    cv_metric='AUC',
)

In [105]:
#Cross-validation with GOSS (Santander data): one result table per key of param_goss
cv_goss_sat = {}
for sub_rat, rate_pairs in param_goss.items():
    #rate_pairs is transposed so that each row is one parameter combination
    cv_goss_sat[sub_rat] = lightgbm_tune_class(
        param_comb=rate_pairs.T,
        subsample_type='goss',
        X=X_train_sat,
        y=y_train_sat,
        learning_rate=0.05,
        num_iterations=10000,
        colsample=0.3,
        max_depth=2,
        class_imbal=True,
        efb_enable=True,
        cv_metric='AUC',
    )

In [None]:
#Fitting the selected models and calculating Accuracy, F1, MCC and AUC on the Santander test data
test_errs_sat = test_errs(
    cv_sub=cv_sub_sat,
    cv_goss=cv_goss_sat,
    learning_rate=0.05,
    colsample=0.3,
    max_depth=2,
    class_imbal=True,
    efb_enable=True,
    X_train=X_train_sat,
    y_train=y_train_sat,
    X_test=X_test_sat,
    y_test=y_test_sat,
    cv_metric='AUC',
)