# AMSA: Light Gradient Boosting Machine Classification

## Relevant libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold  # for creating k-fold cv and deal with class imbalanceissue

from lightgbm import LGBMClassifier # for the Light Gradient Boosting Machine classification model

from sklearn.compose import make_column_transformer  # for applying appropriate transformations for each columns
from category_encoders.cat_boost import CatBoostEncoder # for Encoding categorical variables in a way that makes tree-based method able to efficiently evaluate splits

from sklearn.pipeline import Pipeline # for generating the pipeline

from skopt import BayesSearchCV # for grid search and tuning the hyperparameters
from sklearn.model_selection import RandomizedSearchCV

  from pandas.core import (


## Data

In [2]:
default_train_complete = pd.read_csv('../data/default_train_complete.csv')
X_train = default_train_complete.drop(columns = ['Default'])
y_train = default_train_complete['Default']

In [3]:
print(default_train_complete.shape)
print(X_train.shape)
print(y_train.shape)

(29060, 48)
(29060, 47)
(29060,)


In [4]:
default_train_complete.head(n = 10)

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,...,AverageCreditScore_MaritalStatus,AverageCreditScore_LoanPurpose,AverageAge_Education,AverageAge_EmploymentType,AverageAge_MaritalStatus,AverageAge_LoanPurpose,AverageNumCreditLines_Education,AverageNumCreditLines_EmploymentType,AverageNumCreditLines_MaritalStatus,AverageNumCreditLines_LoanPurpose
0,H371G1RDCW,52,79745,157128,384,70,4,20.74,24,0.69,...,574.025699,573.365874,43.458939,43.518016,43.575062,43.33324,2.4979,2.500192,2.501677,2.504109
1,D0UEW34SHK,18,42939,101325,552,31,1,3.91,36,0.17,...,575.347396,573.973738,43.445776,43.518016,43.483009,43.522576,2.496773,2.500192,2.495573,2.491924
2,RC2RK8N9A2,48,114601,41214,433,106,1,22.35,24,0.11,...,575.347396,574.332077,43.458939,43.482213,43.483009,43.686002,2.4979,2.49855,2.495573,2.492704
3,CW5Y12WNTS,44,140505,177916,803,100,3,23.04,48,0.38,...,575.347396,573.365874,43.458939,43.482213,43.483009,43.33324,2.4979,2.49855,2.495573,2.504109
4,1Z7KJ9098R,24,31937,241481,777,22,3,20.14,36,0.45,...,574.025699,574.332077,43.51916,43.418105,43.575062,43.686002,2.500047,2.502993,2.501677,2.492704
5,IPC8D4QK5U,28,36010,18168,831,32,1,16.5,12,0.38,...,574.025699,574.363546,43.458939,43.418105,43.575062,43.474134,2.4979,2.502993,2.501677,2.503345
6,G0GQOHUJHQ,67,36128,64652,719,13,1,10.23,36,0.14,...,575.347396,573.973738,43.47835,43.418105,43.483009,43.522576,2.501302,2.502993,2.495573,2.491924
7,C3CW34SMSF,52,123163,145716,598,75,4,21.6,36,0.41,...,573.231535,574.363546,43.51916,43.485026,43.368479,43.474134,2.500047,2.494253,2.499772,2.503345
8,X4T7X0R9HN,35,85193,135223,844,28,1,18.66,48,0.62,...,573.231535,573.973738,43.47835,43.482213,43.368479,43.522576,2.501302,2.49855,2.499772,2.491924
9,LAY7JB7P4U,66,34226,143052,555,27,2,13.87,12,0.52,...,573.231535,574.363546,43.51916,43.485026,43.368479,43.474134,2.500047,2.494253,2.499772,2.503345


## Cross validation

In [5]:
cv_folds = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 99999)

## Assessment metrics

In [6]:
metrics_set = {
    'recall': 'recall', 
    'bal_accuracy': 'balanced_accuracy', 
    'precision': 'precision', 
    'f1': 'f1', 
    'roc_auc': 'roc_auc'
}

## Modelling

Recall that Boosting is an ensemble method that are developed on top of decision trees which are one of the based-/weak-learners. More precisely, with boosting trees are built sequentially where the new trees try to take into account the errors or mistaes that rose from the previous tree. Notably, penalty term is applied when the subsequent trees try to correct the mistakes of the previous tree to prevent the algorithm from overfitting. 

Consequently, we will explore Light Gradient Boosting Algorithm (LightGBM) instead of normal gradient boosting machine algorithm and extreme gradient boosting machine because it is a much more efficient algorithm rather than XGBoost. Furthermore, similar to the XGBoost, in the case where there are missing values in the data, we won't have to worry about them when implementing the LightGBM algorithm beccause the algorithm deals with the missing values on its own. 

Model specification

In [7]:
lightgbm_class_model = LGBMClassifier(random_state = 33999, n_jobs = -1)

Preprocessing transformation pipeplines

In [8]:
preprocessor = make_column_transformer(
    (CatBoostEncoder(), [str(col) for col in X_train.select_dtypes(['category', 'object'])]),
    remainder = 'passthrough'
)

Notably, since boosting is a tree-based method because it is an ensemble of decision trees, we will use CatBoostEncoder() instead of OneHotEncoder() as tree-based method does not perform well with sparse matrices. Additionally, since tree-based methods are insensitive to scaling, we will also not apply any feature scaling.

Subsequently, we specify the model's main pipeline

In [9]:
lightgbm_class_pipeline = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('lightgbm_class_model', lightgbm_class_model)
    ]
)

Setting up the tuning grid

In [10]:
lightgbm_class_tune_grid = {
    'lightgbm_class_model__learning_rate': [0.05, 0.1, 0.15, 0.2],
    'lightgbm_class_model__max_depth': [int(x) for x in range(2, 11, 2)],
    'lightgbm_class_model__n_estimators': [int(x) for x in range(100, 2001, 50)]
}
lightgbm_class_tune_grid

{'lightgbm_class_model__learning_rate': [0.05, 0.1, 0.15, 0.2],
 'lightgbm_class_model__max_depth': [2, 4, 6, 8, 10],
 'lightgbm_class_model__n_estimators': [100,
  150,
  200,
  250,
  300,
  350,
  400,
  450,
  500,
  550,
  600,
  650,
  700,
  750,
  800,
  850,
  900,
  950,
  1000,
  1050,
  1100,
  1150,
  1200,
  1250,
  1300,
  1350,
  1400,
  1450,
  1500,
  1550,
  1600,
  1650,
  1700,
  1750,
  1800,
  1850,
  1900,
  1950,
  2000]}

Then intiatiate the grid search object. 

Notably, we will use BayesSearchCV instead of GridSearchCV and RandomizedGridSearchCV because of the benefits that BayesSearchCV offers. Particularly, the BayesSearchCV use the Bayesian Optimization approach that tries to find the minimum of an objective function that is very expensive to evaluate. It uses a probabilistic model of a function and uses it to select the most promising hyperparameters to evaluate the true function. BayesSearchCV are much more efficient than GridSearchCV and RandomizedGridSearchCV because it uses the results (combinations of hyperparameters) of past evaluations to inform the choice about the next set of promising hyperparameters to evaluate. Ultimately, this leads to a better model performance at a much lower computational cost. 

In [14]:
lightgbm_class_randomized_grid_search = RandomizedSearchCV(
    estimator = lightgbm_class_pipeline,
    param_distributions = lightgbm_class_tune_grid,
    scoring = metrics_set, 
    refit = 'recall',
    cv = cv_folds,
    error_score = 'raise',
    random_state = 44444,
    n_iter = 20
)

In [15]:
lightgbm_class_randomized_grid_search.fit(X = X_train, y = y_train)

[LightGBM] [Info] Number of positive: 13077, number of negative: 13077
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002003 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5718
[LightGBM] [Info] Number of data points in the train set: 26154, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 13077, number of negative: 13077
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5718
[LightGBM] [Info] Number of data points in the train set: 26154, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 13077, number of neg

In [16]:
lightgbm_class_tune_results = pd.DataFrame(lightgbm_class_randomized_grid_search.cv_results_)
lightgbm_class_tune_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lightgbm_class_model__n_estimators,param_lightgbm_class_model__max_depth,param_lightgbm_class_model__learning_rate,params,split0_test_recall,split1_test_recall,...,split3_test_roc_auc,split4_test_roc_auc,split5_test_roc_auc,split6_test_roc_auc,split7_test_roc_auc,split8_test_roc_auc,split9_test_roc_auc,mean_test_roc_auc,std_test_roc_auc,rank_test_roc_auc
0,4.655918,1.327911,0.209211,0.009433,550,8,0.05,"{'lightgbm_class_model__n_estimators': 550, 'l...",0.654508,0.650379,...,0.750488,0.741749,0.761609,0.730786,0.733519,0.719626,0.73265,0.737183,0.011556,6
1,1.60832,0.140464,0.154379,0.026604,650,2,0.15,"{'lightgbm_class_model__n_estimators': 650, 'l...",0.653131,0.642808,...,0.760593,0.74801,0.763643,0.736061,0.738088,0.725428,0.74184,0.743361,0.010992,2
2,2.487487,0.366985,0.169867,0.007792,1100,2,0.15,"{'lightgbm_class_model__n_estimators': 1100, '...",0.63042,0.62629,...,0.756886,0.743944,0.760373,0.730888,0.734113,0.722693,0.736355,0.739546,0.011184,3
3,3.994969,1.349356,0.201149,0.023015,1550,2,0.15,"{'lightgbm_class_model__n_estimators': 1550, '...",0.621473,0.617343,...,0.753844,0.740155,0.758328,0.728261,0.73028,0.718606,0.731122,0.735959,0.011814,7
4,4.33741,1.439581,0.202896,0.015312,1850,2,0.1,"{'lightgbm_class_model__n_estimators': 1850, '...",0.61872,0.63042,...,0.755406,0.743307,0.760299,0.731137,0.732894,0.720522,0.736182,0.73888,0.011295,4
5,7.588259,1.367886,0.267722,0.026274,1650,4,0.15,"{'lightgbm_class_model__n_estimators': 1650, '...",0.635926,0.612526,...,0.726,0.712864,0.729002,0.706593,0.704,0.688392,0.707658,0.709162,0.012119,9
6,0.881735,0.16252,0.134004,0.018061,150,2,0.05,"{'lightgbm_class_model__n_estimators': 150, 'l...",0.667584,0.671025,...,0.754559,0.746063,0.758773,0.738885,0.745021,0.732138,0.743881,0.744113,0.008303,1
7,8.468587,1.034464,0.287302,0.0566,1750,4,0.15,"{'lightgbm_class_model__n_estimators': 1750, '...",0.631796,0.615967,...,0.725145,0.711737,0.727601,0.707566,0.7029,0.687421,0.707034,0.708374,0.012055,10
8,2.454003,0.913817,0.165306,0.012844,200,10,0.1,"{'lightgbm_class_model__n_estimators': 200, 'l...",0.65382,0.657949,...,0.749873,0.743851,0.758952,0.73212,0.733222,0.719981,0.738861,0.738162,0.010447,5
9,7.307411,0.877058,0.250154,0.023839,1350,10,0.2,"{'lightgbm_class_model__n_estimators': 1350, '...",0.622161,0.636614,...,0.736424,0.715442,0.728975,0.71513,0.709125,0.70002,0.716288,0.715923,0.010302,8


In [17]:
# First, create a data frame that contains the number of neighbors, the mean test estimates and the standard error of each mean test estimates. 
n_cv = 10
learning_rates = []
n_estimators = []
max_depth = []
mean_estimates = []
std_err = []

for i in range(len(lightgbm_class_tune_results)):
    n_estimators.append(lightgbm_class_tune_results.loc[i, 'param_lightgbm_class_model__n_estimators'])
    max_depth.append(lightgbm_class_tune_results.loc[i, 'param_lightgbm_class_model__max_depth'])
    learning_rates.append(lightgbm_class_tune_results.loc[i, 'param_lightgbm_class_model__learning_rate'])
    mean_estimates.append(lightgbm_class_tune_results.loc[i, 'mean_test_recall'])
    std_err_i = lightgbm_class_tune_results.loc[i, 'std_test_recall']/np.sqrt(n_cv)
    std_err.append(std_err_i)

lightgbm_class_tune_results_cleaned = pd.DataFrame({
    'n_estimators': n_estimators,
    'max_depth': max_depth, 
    'learning_rate': learning_rates,
    "mean_test_recall": mean_estimates, 
    "std_err_test_recall": std_err
})

lightgbm_class_tune_results_cleaned = lightgbm_class_tune_results_cleaned.sort_values(by = "mean_test_recall", ascending = False)

lightgbm_class_tune_results_cleaned

Unnamed: 0,n_estimators,max_depth,learning_rate,mean_test_recall,std_err_test_recall
6,150,2,0.05,0.675155,0.00458
8,200,10,0.1,0.660083,0.010704
1,650,2,0.15,0.655471,0.01042
0,550,8,0.05,0.649966,0.008266
2,1100,2,0.15,0.642739,0.011493
4,1850,2,0.1,0.638679,0.011291
3,1550,2,0.15,0.637164,0.009948
9,1350,10,0.2,0.628011,0.008905
5,1650,4,0.15,0.622987,0.013378
7,1750,4,0.15,0.622092,0.013411
