# Import Libraries / Defining Functions

In [1]:
import os
import logging
import sys
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from lightgbm import early_stopping
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
from optuna.samplers import TPESampler

random_state = 68

`cat_enc`: Encodes categorical variables using either One Hot Encoding or Ordinal Encoding.

`lir_impute`: Uses Linear Regression to impute missing values.

`create_var`: Creates new features that are hand-made.

`feature_engineering`: Combines above 3 functions to perform feature engineering on dataset.


Creating a seperate function for feature engineering is important as this feature engineering includes imputation of data, and therefore, it should be done after cv is splitted to reduce contamination of cv data.

In [5]:
def cat_enc(df, ohe, ordi, cats):
    other_var = ["person_age", "person_income", "person_emp_length", "loan_amnt", "loan_int_rate", "loan_percent_income", 
                 "cb_person_cred_hist_length", "Type", "loan_status"]
    df_cat1 = pd.DataFrame(ohe.transform(df[cats[0]]), columns = [x for xs in ohe.categories_ for x in xs]).reset_index(drop=True)
    df_cat2 = pd.DataFrame(ordi.transform(df[cats[1]]), columns = cats[1]).reset_index(drop=True)
    df_cat = pd.concat([df_cat1, df_cat2], axis=1)
    df_enc = pd.concat([df[other_var].reset_index(drop=True),df_cat], axis=1)
    return df_enc

def lir_impute(lr, df_nmv, df_mv, features):
    df_mv['loan_int_rate'] = lr.predict(df_mv[features])
    return pd.concat([df_nmv, df_mv]).reset_index(drop=True)

def create_var(data, aita, aite):
    df = data.copy()
    df['loan_int_rate'] = df['loan_int_rate']/100
    df['loan_int_rate_month'] = (1 + df['loan_int_rate'])**(1/12) - 1
    k =  1 + df['loan_int_rate_month']
    P = df['person_income']*df['loan_percent_income']/12
    L = df['loan_amnt']
    tmp = (np.log(P/(P-L*(k-1)))/np.log(k)).replace([np.inf,-np.inf], np.nan)
    df['est_payback_time_in_month'] = tmp.fillna(-999)
    df['avg_income_by_age'] = [aita[x] if x in aita.index else aita[aita.index[abs(aita.index - x).argmin()]] for x in df["person_age"]]
    df['avg_income_by_emp'] = [aite[x] if x in aite.index else aite[aite.index[abs(aite.index - x).argmin()]] for x in df["person_age"]]
    df['income_diff_avg_age'] = df['person_income'] - df['avg_income_by_age']
    df['income_diff_avg_emp'] = df['person_income'] - df['avg_income_by_emp']
    df['risk_flag'] = ((df['Y'] == 1) & (df['loan_grade'].replace(["D","E","F","G"], "low") == "low")).astype("int")
    return df
    
def feature_engineering(data, data_cv, impute=False, min_max = False):
    df = data.copy()
    df_cv = data_cv.copy()
    
    # Step 1: Encoding Categorical Variables
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    ordi = OrdinalEncoder(handle_unknown='error')
    cats = [['person_home_ownership','loan_intent','cb_person_default_on_file'], ['loan_grade']]
    ohe.fit(df[cats[0]])
    ordi.fit(df[cats[1]])
    df_enc = cat_enc(df, ohe, ordi, cats)
    df_cv_enc = cat_enc(df_cv, ohe, ordi, cats)
    
    if impute:
        # Step 2.1: Imputing Missing Values - `loan_int_rate`
        df_mv = df_enc[(df_enc.isna()["loan_int_rate"])]
        df_nmv = df_enc[~(df_enc.isna()["loan_int_rate"])]
        df_cv_mv = df_cv_enc[(df_cv_enc.isna()["loan_int_rate"])]
        df_cv_nmv = df_cv_enc[~(df_cv_enc.isna()["loan_int_rate"])]
        features_infer = df_nmv.columns[~((df_nmv.columns == 'person_emp_length') | (df_nmv.columns == 'loan_int_rate') | (df_nmv.columns == 'loan_status') | (df_nmv.columns == 'Type'))]
        df_nmv_train, df_nmv_infer = df_nmv[features_infer], df_nmv['loan_int_rate']
        lr = LinearRegression().fit(df_nmv_train, df_nmv_infer)
        df_enc = lir_impute(lr, df_nmv, df_mv, features_infer)
        df_cv_enc = lir_impute(lr, df_cv_nmv, df_cv_mv, features_infer)
        
        # Step 2.2: Imputing Missing Values - `person_emp_length`
        df_mv = df_enc[(df_enc.isna()["person_emp_length"])]
        df_nmv = df_enc[~(df_enc.isna()["person_emp_length"])]
        diff_scores = [2.205876158220027e-52, 2.8034878798264012e-36, 7.216755273392921e-12]
        pel_u = []
        for col in ['person_income', 'loan_amnt', 'loan_percent_income']:
            mean = np.mean(df_mv[col])
            std = np.std(df_mv[col])
            d = df_nmv["person_emp_length"][(df_nmv[col] > mean-3*std) & (df_nmv[col] < mean+3*std)]
            pel_u.append(np.mean(d))
        pel_weights = np.log(1/np.array(diff_scores)) # Stronger weights for variables with higher confidence + Softened
        pel_weights = pel_weights / np.sum(pel_weights)
        pel_mean = np.sum(pel_u*pel_weights)
        df_enc['person_emp_length'] = df_enc['person_emp_length'].fillna(pel_mean)
        df_cv_enc['person_emp_length'] = df_cv_enc['person_emp_length'].fillna(pel_mean)
    
    # Step 3: Creating New Variables
    aita = df_enc.groupby("person_age")["person_income"].mean()
    aite = df_enc.groupby("person_emp_length")["person_income"].mean()
    df_fe = create_var(df_enc, aita, aite)
    df_cv_fe = create_var(df_cv_enc, aita, aite)
    
    # Step 4: Scale Normalizing with Scaler
    train_cols = df_fe.columns[[x not in ['loan_status'] for x in df_fe.columns]]
    Xdf_fe = df_fe[train_cols]
    ydf_fe = df_fe['loan_status']
    Xdf_cv_fe = df_cv_fe[train_cols]
    ydf_cv_fe = df_cv_fe['loan_status']
    if min_max:
        scaler = MinMaxScaler().fit(Xdf_fe)
    else:
        scaler = StandardScaler().fit(Xdf_fe)
    cols = Xdf_fe.columns
    Xdf_fe = pd.DataFrame(scaler.transform(Xdf_fe), columns = cols)
    Xdf_cv_fe = pd.DataFrame(scaler.transform(Xdf_cv_fe), columns = cols)
    
    return [Xdf_fe, ydf_fe, Xdf_cv_fe, ydf_cv_fe]

In [None]:
# Set this path to the directory of your data file.
path = "/kaggle/input/loan-approval-prediction-pre/"

# Models - LGBM
Note: For each model trained, the best set of hyperparameters and the best 5 cv iteration rounds will be kept to later create five models, which will be used in aggregate to make final prediction. (For generalization.)

## 1. lbm11
* Data: Train (Unmerged)
* Model: LGBM

In [6]:
X = pd.read_csv(path + "/trains.csv")
y = X['loan_status']
opt_lgbm11 = False
lgbm11_exist = True

if lgbm11_exist: # This is for kaggle (hyperparameter tuning had to be broken down as there weren't enough computation resources)
    os.system('cp /kaggle/input/lgbmoptuna/lgbm11.db /kaggle/working/lgbm11.db')

def objective(trial):
    params = {
        'num_leaves': trial.suggest_categorical('num_leaves',[2,4,8,16,32,64,128,256,512,1024]),
        'subsample': trial.suggest_float('subsample', 0.5,1,step=0.05),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.001,1,log=True),
        'reg_lambda': trial.suggest_int('reg_lambda', 5, 100),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 50),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 70),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1),
        'verbose': trial.suggest_categorical('verbose', [-1])
    }
    model = lgb.LGBMClassifier(random_state = random_state, **params)
    cv = StratifiedKFold(5, shuffle=True, random_state=random_state)
    cv_splits = cv.split(X, y)
    scores = []
    for train_idx, val_idx in cv_splits:
        Xtrain, Xval = X.loc[train_idx], X.loc[val_idx] # y will be separated later, as feature engineering mix the indices greatly.
        Xtrain, ytrain, Xval, yval = feature_engineering(Xtrain, Xval)
        callbacks = [early_stopping(stopping_rounds=200, verbose=0), lgb.log_evaluation(period=0)]
        model.fit(Xtrain, ytrain, eval_set=[(Xval, yval)], callbacks=callbacks, eval_metric = "auc")
        pred = model.predict_proba(Xval)[:,1]
        score = roc_auc_score(yval, pred)
        scores.append(score)
    return np.mean(scores)
sqlite_db = "sqlite:///lgbm11.db"
study_name = "unmerged_ss"
if opt_lgbm11:
    study = optuna.create_study(storage = sqlite_db, study_name = study_name, 
                               sampler = TPESampler(n_startup_trials=35, multivariate=True, seed=random_state),
                               direction="maximize", load_if_exists=True)
    study.optimize(objective, n_trials = 100, gc_after_trial=True)

In [7]:
# Analyze Store (Best) Study Outcome
if opt_lgbm11:
    lgbm11_out = study.trials_dataframe().sort_values(by="value", ascending=False)
    lgbm11_out.columns = [x[7:] if x[:7] == "params_" else x for x in lgbm11_out.columns]
    focus_col = ['value', 'colsample_bytree', 'learning_rate', 'min_child_weight', 'n_estimators',
                 'num_leaves', 'reg_alpha', 'reg_lambda', 'subsample', 'verbose', 'state']
    lgbm11_out = lgbm11_out[focus_col]
    display(xgb11_out.head())
if lgbm11_exist:
    lgbm11_out = optuna.load_study(study_name = study_name, storage=sqlite_db).trials_dataframe().sort_values(by="value", ascending=False)
    lgbm11_out.columns = [x[7:] if x[:7] == "params_" else x for x in lgbm11_out.columns]
    focus_col = ['value', 'colsample_bytree', 'learning_rate', 'min_child_weight', 'n_estimators',
                 'num_leaves', 'reg_alpha', 'reg_lambda', 'subsample', 'verbose', 'state']
    lgbm11_out = lgbm11_out[focus_col]
    best_param = lgbm11_out.iloc[0,:]
    lgbm11_params = {
        'num_leaves': best_param["num_leaves"],
        'subsample': best_param['subsample'],
        'n_estimators': best_param['n_estimators'],
        'learning_rate': best_param['learning_rate'],
        'reg_lambda': best_param['reg_lambda'],
        'reg_alpha': best_param['reg_alpha'],
        'min_child_weight': best_param['min_child_weight'],
        'colsample_bytree': best_param['colsample_bytree'],
        'verbose': best_param['verbose']
    }
    print("Best Parameters and Score: ")
    display(best_param)
    print("Best CV Iterations: ")
    lgbm11_iter = [1808, 2284, 1532, 1567, 1811]
    print(lgbm11_iter)

Best Parameters and Score: 


value               0.958349
colsample_bytree         1.0
learning_rate       0.035401
min_child_weight           5
n_estimators            6900
num_leaves                 8
reg_alpha                  1
reg_lambda                 6
subsample                0.8
verbose                   -1
state               COMPLETE
Name: 96, dtype: object

Best CV Iterations: 
[1808, 2284, 1532, 1567, 1811]


## 2. lgbm21
* Data: Train (Merged / Dropped)
* Model: LGBM

In [8]:
X = pd.read_csv(path + "/merge_rmvd.csv")
y = X['loan_status']
opt_lgbm21 = False
lgbm21_exist = True

if lgbm21_exist: # This is for kaggle (hyperparameter tuning had to be broken down as there weren't enough computation resources)
    os.system('cp /kaggle/input/lgbmoptuna/lgbm21.db /kaggle/working/lgbm21.db')

def objective(trial):
    params = {
        'num_leaves': trial.suggest_categorical('num_leaves',[2,4,8,16,32,64,128,256,512,1024]),
        'subsample': trial.suggest_float('subsample', 0.5,1,step=0.05),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.001,1,log=True),
        'reg_lambda': trial.suggest_int('reg_lambda', 5, 100),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 50),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 70),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1),
        'verbose': trial.suggest_categorical('verbose', [-1])
    }
    model = lgb.LGBMClassifier(random_state = random_state, **params)
    cv = StratifiedKFold(5, shuffle=True, random_state=random_state)
    cv_splits = cv.split(X, y)
    scores = []
    for train_idx, val_idx in cv_splits:
        Xtrain, Xval = X.loc[train_idx], X.loc[val_idx] # y will be separated later, as feature engineering mix the indices greatly.
        Xtrain, ytrain, Xval, yval = feature_engineering(Xtrain, Xval)
        callbacks = [early_stopping(stopping_rounds=200, verbose=0), lgb.log_evaluation(period=0)]
        model.fit(Xtrain, ytrain, eval_set=[(Xval, yval)], callbacks=callbacks, eval_metric = "auc")
        pred = model.predict_proba(Xval)[:,1]
        score = roc_auc_score(yval, pred)
        scores.append(score)
    return np.mean(scores)
sqlite_db = "sqlite:///lgbm21.db"
study_name = "mergervmd_ss"
if opt_lgbm21:
    study = optuna.create_study(storage = sqlite_db, study_name = study_name, 
                               sampler = TPESampler(n_startup_trials=35, multivariate=True, seed=random_state),
                               direction="maximize", load_if_exists=True)
    study.optimize(objective, n_trials = 100, gc_after_trial=True)

In [9]:
# Analyze Store (Best) Study Outcome
if opt_lgbm21:
    lgbm21_out = study.trials_dataframe().sort_values(by="value", ascending=False)
    lgbm21_out.columns = [x[7:] if x[:7] == "params_" else x for x in lgbm21_out.columns]
    focus_col = ['value', 'colsample_bytree', 'learning_rate', 'min_child_weight', 'n_estimators',
                 'num_leaves', 'reg_alpha', 'reg_lambda', 'subsample', 'verbose', 'state']
    lgbm21_out = lgbm21_out[focus_col]
    display(lgbm21_out.head())
if lgbm21_exist:
    lgbm21_out = optuna.load_study(study_name = study_name, storage=sqlite_db).trials_dataframe().sort_values(by="value", ascending=False)
    lgbm21_out.columns = [x[7:] if x[:7] == "params_" else x for x in lgbm21_out.columns]
    focus_col = ['value', 'colsample_bytree', 'learning_rate', 'min_child_weight', 'n_estimators',
                 'num_leaves', 'reg_alpha', 'reg_lambda', 'subsample', 'verbose', 'state']
    lgbm21_out = lgbm21_out[focus_col]
    best_param = lgbm21_out.iloc[0,:]
    lgbm21_params = {
        'num_leaves': best_param["num_leaves"],
        'subsample': best_param['subsample'],
        'n_estimators': best_param['n_estimators'],
        'learning_rate': best_param['learning_rate'],
        'reg_lambda': best_param['reg_lambda'],
        'reg_alpha': best_param['reg_alpha'],
        'min_child_weight': best_param['min_child_weight'],
        'colsample_bytree': best_param['colsample_bytree'],
        'verbose': best_param['verbose']
    }
    print("Best Parameters and Score: ")
    display(best_param)
    print("Best CV Iterations: ")
    lgbm21_iter = [1645, 1630, 1708, 2023, 1659]
    print(lgbm21_iter)

Best Parameters and Score: 


value               0.960164
colsample_bytree         0.6
learning_rate       0.070034
min_child_weight          31
n_estimators            8500
num_leaves                16
reg_alpha                  1
reg_lambda                57
subsample                0.9
verbose                   -1
state               COMPLETE
Name: 80, dtype: object

Best CV Iterations: 
[1645, 1630, 1708, 2023, 1659]


## 3. lgbm31
* Data: Train (Merged / Imputed)
* LGBM

In [10]:
X = pd.read_csv(path + "/merged.csv")
y = X['loan_status']
opt_lgbm31 = False
lgbm31_exist = True

if lgbm31_exist: # This is for kaggle (hyperparameter tuning had to be broken down as there weren't enough computation resources)
    os.system('cp /kaggle/input/lgbmoptuna/lgbm31.db /kaggle/working/lgbm31.db')

def objective(trial):
    params = {
        'num_leaves': trial.suggest_categorical('num_leaves',[2,4,8,16,32,64,128,256,512,1024]),
        'subsample': trial.suggest_float('subsample', 0.5,1,step=0.05),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.001,1,log=True),
        'reg_lambda': trial.suggest_int('reg_lambda', 5, 100),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 50),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 70),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1),
        'verbose': trial.suggest_categorical('verbose', [-1])
    }
    model = lgb.LGBMClassifier(random_state = random_state, **params)
    cv = StratifiedKFold(5, shuffle=True, random_state=random_state)
    cv_splits = cv.split(X, y)
    scores = []
    for train_idx, val_idx in cv_splits:
        Xtrain, Xval = X.loc[train_idx], X.loc[val_idx] # y will be separated later, as feature engineering mix the indices greatly.
        Xtrain, ytrain, Xval, yval = feature_engineering(Xtrain, Xval)
        callbacks = [early_stopping(stopping_rounds=200, verbose=0), lgb.log_evaluation(period=0)]
        model.fit(Xtrain, ytrain, eval_set=[(Xval, yval)], callbacks=callbacks, eval_metric = "auc")
        pred = model.predict_proba(Xval)[:,1]
        score = roc_auc_score(yval, pred)
        scores.append(score)
    return np.mean(scores)
sqlite_db = "sqlite:///lgbm31.db"
study_name = "merged_ss"
if opt_lgbm31:
    study = optuna.create_study(storage = sqlite_db, study_name = study_name, 
                               sampler = TPESampler(n_startup_trials=35, multivariate=True, seed=random_state),
                               direction="maximize", load_if_exists=True)
    study.optimize(objective, n_trials = 100, gc_after_trial=True)

In [11]:
# Analyze Store (Best) Study Outcome
if opt_lgbm31:
    lgbm31_out = study.trials_dataframe().sort_values(by="value", ascending=False)
    lgbm31_out.columns = [x[7:] if x[:7] == "params_" else x for x in lgbm31_out.columns]
    focus_col = ['value', 'colsample_bytree', 'learning_rate', 'min_child_weight', 'n_estimators',
                 'num_leaves', 'reg_alpha', 'reg_lambda', 'subsample', 'verbose', 'state']
    lgbm31_out = lgbm31_out[focus_col]
    display(lgbm31_out.head())
if lgbm31_exist:
    lgbm31_out = optuna.load_study(study_name = study_name, storage=sqlite_db).trials_dataframe().sort_values(by="value", ascending=False)
    lgbm31_out.columns = [x[7:] if x[:7] == "params_" else x for x in lgbm31_out.columns]
    focus_col = ['value', 'colsample_bytree', 'learning_rate', 'min_child_weight', 'n_estimators',
                 'num_leaves', 'reg_alpha', 'reg_lambda', 'subsample', 'verbose', 'state']
    lgbm31_out = lgbm31_out[focus_col]
    best_param = lgbm31_out.iloc[0,:]
    lgbm31_params = {
        'num_leaves': best_param["num_leaves"],
        'subsample': best_param['subsample'],
        'n_estimators': best_param['n_estimators'],
        'learning_rate': best_param['learning_rate'],
        'reg_lambda': best_param['reg_lambda'],
        'reg_alpha': best_param['reg_alpha'],
        'min_child_weight': best_param['min_child_weight'],
        'colsample_bytree': best_param['colsample_bytree'],
        'verbose': best_param['verbose']
    }
    print("Best Parameters and Score: ")
    display(best_param)
    print("Best CV Iterations: ")
    lgbm31_iter = [2155, 2299, 1789, 1743, 1905]
    print(lgbm31_iter)

Best Parameters and Score: 


value                0.95906
colsample_bytree         0.9
learning_rate       0.013924
min_child_weight          13
n_estimators            5200
num_leaves               512
reg_alpha                  3
reg_lambda                48
subsample               0.65
verbose                   -1
state               COMPLETE
Name: 43, dtype: object

Best CV Iterations: 
[2155, 2299, 1789, 1743, 1905]


# LGBM Models Test Outcome (Individual)

## 0. Importing Test Data

In [12]:
test = pd.read_csv(path + "cv.csv")
display(test.head())

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,Type
0,25,45000,RENT,6.0,MEDICAL,C,17000,12.99,0.36,Y,3,1,1
1,33,65000,RENT,3.0,DEBTCONSOLIDATION,B,5000,11.49,0.08,N,5,0,1
2,26,90000,MORTGAGE,9.0,DEBTCONSOLIDATION,A,6500,7.14,0.07,N,2,0,1
3,26,67000,RENT,10.0,PERSONAL,B,10000,12.21,0.15,N,2,0,1
4,22,60000,RENT,6.0,MEDICAL,C,12500,14.22,0.21,Y,2,0,1


In [13]:
def lgbm_pred(models, Xtest):
    y_pred = []
    for model in models:
        y_pred.append(model.predict_proba(Xtest)[:,1])
    y_pred = np.array(y_pred).sum(axis=0)/len(models)
    return y_pred

## 1. `lgbm11` Model

In [14]:
lgbm11_model = []
Xtest = test.copy()
Xtrain = pd.read_csv(path + "/trains.csv")
Xtrain, ytrain, Xtest, ytest = feature_engineering(Xtrain, Xtest)
for i in range(5):
    lgbm11_params['n_estimators'] = lgbm11_iter[i]
    model = lgb.LGBMClassifier(random_state = random_state, **lgbm11_params)
    callbacks = [lgb.log_evaluation(period=0)]
    model.fit(Xtrain, ytrain, callbacks=callbacks, eval_metric = "auc")
    lgbm11_model.append(model)

In [15]:
score = roc_auc_score(ytest, lgbm_pred(lgbm11_model, Xtest))
print("Test ROC AUC Score: " + str(score))

Test ROC AUC Score: 0.9553367977691114


## 2. `lgbm21` Model

In [16]:
lgbm21_model = []
Xtest = test.copy()
Xtrain = pd.read_csv(path + "/merge_rmvd.csv")
Xtrain, ytrain, Xtest, ytest = feature_engineering(Xtrain, Xtest)
for i in range(5):
    lgbm21_params['n_estimators'] = lgbm21_iter[i]
    model = lgb.LGBMClassifier(random_state = random_state, **lgbm21_params)
    callbacks = [lgb.log_evaluation(period=0)]
    model.fit(Xtrain, ytrain, callbacks=callbacks, eval_metric = "auc")
    lgbm21_model.append(model)

In [17]:
score = roc_auc_score(ytest, lgbm_pred(lgbm21_model, Xtest))
print("Test ROC AUC Score: " + str(score))

Test ROC AUC Score: 0.9582726406935467


## 3. `lgbm31` Model

In [18]:
lgbm31_model = []
Xtest = test.copy()
Xtrain = pd.read_csv(path + "/merged.csv")
Xtrain, ytrain, Xtest, ytest = feature_engineering(Xtrain, Xtest)
for i in range(5):
    lgbm31_params['n_estimators'] = lgbm31_iter[i]
    model = lgb.LGBMClassifier(random_state = random_state, **lgbm31_params)
    callbacks = [lgb.log_evaluation(period=0)]
    model.fit(Xtrain, ytrain, callbacks=callbacks, eval_metric = "auc")
    lgbm31_model.append(model)

In [19]:
score = roc_auc_score(ytest, lgbm_pred(lgbm31_model, Xtest))
print("Test ROC AUC Score: " + str(score))

Test ROC AUC Score: 0.9571304615050887


# Save Models

The three trained models `1gbm11`, `lgbm21`, `lgbm31` are stored as they will be later aggregated by diverse methods in the effort to increase the score.

In [20]:
# import joblib
# all_models = [lgbm11_model, lgbm21_model, lgbm31_model]
# models_name = ["lgbm11", "lgbm21", "lgbm31"]
# for i in range(3):
#     for j in range(5):
#         name = models_name[i] + "_" + str(j) + ".pkl"
#         joblib.dump(all_models[i][j], name)