# Import Libraries / Functions

In [1]:
import os
import logging
import sys
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
from optuna.samplers import TPESampler

random_state = 68

`cat_enc`: Encodes categorical variables using either One Hot Encoding or Ordinal Encoding.

`lir_impute`: Uses Linear Regression to impute missing values.

`create_var`: Creates new features that are hand-made.

`feature_engineering`: Combines above 3 functions to perform feature engineering on dataset.


Creating a seperate function for feature engineering is important as this feature engineering includes imputation of data, and therefore, it should be done after cv is splitted to reduce contamination of cv data.

In [5]:
def cat_enc(df, ohe, ordi, cats):
    other_var = ["person_age", "person_income", "person_emp_length", "loan_amnt", "loan_int_rate", "loan_percent_income", 
                 "cb_person_cred_hist_length", "Type", "loan_status"]
    df_cat1 = pd.DataFrame(ohe.transform(df[cats[0]]), columns = [x for xs in ohe.categories_ for x in xs]).reset_index(drop=True)
    df_cat2 = pd.DataFrame(ordi.transform(df[cats[1]]), columns = cats[1]).reset_index(drop=True)
    df_cat = pd.concat([df_cat1, df_cat2], axis=1)
    df_enc = pd.concat([df[other_var].reset_index(drop=True),df_cat], axis=1)
    return df_enc

def lir_impute(lr, df_nmv, df_mv, features):
    df_mv['loan_int_rate'] = lr.predict(df_mv[features])
    return pd.concat([df_nmv, df_mv]).reset_index(drop=True)

def create_var(data, aita, aite):
    df = data.copy()
    df['loan_int_rate'] = df['loan_int_rate']/100
    df['loan_int_rate_month'] = (1 + df['loan_int_rate'])**(1/12) - 1
    k =  1 + df['loan_int_rate_month']
    P = df['person_income']*df['loan_percent_income']/12
    L = df['loan_amnt']
    tmp = (np.log(P/(P-L*(k-1)))/np.log(k)).replace([np.inf,-np.inf], np.nan)
    df['est_payback_time_in_month'] = tmp.fillna(-999)
    df['avg_income_by_age'] = [aita[x] if x in aita.index else aita[aita.index[abs(aita.index - x).argmin()]] for x in df["person_age"]]
    df['avg_income_by_emp'] = [aite[x] if x in aite.index else aite[aite.index[abs(aite.index - x).argmin()]] for x in df["person_age"]]
    df['income_diff_avg_age'] = df['person_income'] - df['avg_income_by_age']
    df['income_diff_avg_emp'] = df['person_income'] - df['avg_income_by_emp']
    df['risk_flag'] = ((df['Y'] == 1) & (df['loan_grade'].replace(["D","E","F","G"], "low") == "low")).astype("int")
    return df
    
def feature_engineering(data, data_cv, impute=False, min_max = False):
    df = data.copy()
    df_cv = data_cv.copy()
    
    # Step 1: Encoding Categorical Variables
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    ordi = OrdinalEncoder(handle_unknown='error')
    cats = [['person_home_ownership','loan_intent','cb_person_default_on_file'], ['loan_grade']]
    ohe.fit(df[cats[0]])
    ordi.fit(df[cats[1]])
    df_enc = cat_enc(df, ohe, ordi, cats)
    df_cv_enc = cat_enc(df_cv, ohe, ordi, cats)
    
    if impute:
        # Step 2.1: Imputing Missing Values - `loan_int_rate`
        df_mv = df_enc[(df_enc.isna()["loan_int_rate"])]
        df_nmv = df_enc[~(df_enc.isna()["loan_int_rate"])]
        df_cv_mv = df_cv_enc[(df_cv_enc.isna()["loan_int_rate"])]
        df_cv_nmv = df_cv_enc[~(df_cv_enc.isna()["loan_int_rate"])]
        features_infer = df_nmv.columns[~((df_nmv.columns == 'person_emp_length') | (df_nmv.columns == 'loan_int_rate') | (df_nmv.columns == 'loan_status') | (df_nmv.columns == 'Type'))]
        df_nmv_train, df_nmv_infer = df_nmv[features_infer], df_nmv['loan_int_rate']
        lr = LinearRegression().fit(df_nmv_train, df_nmv_infer)
        df_enc = lir_impute(lr, df_nmv, df_mv, features_infer)
        df_cv_enc = lir_impute(lr, df_cv_nmv, df_cv_mv, features_infer)
        
        # Step 2.2: Imputing Missing Values - `person_emp_length`
        df_mv = df_enc[(df_enc.isna()["person_emp_length"])]
        df_nmv = df_enc[~(df_enc.isna()["person_emp_length"])]
        diff_scores = [2.205876158220027e-52, 2.8034878798264012e-36, 7.216755273392921e-12]
        pel_u = []
        for col in ['person_income', 'loan_amnt', 'loan_percent_income']:
            mean = np.mean(df_mv[col])
            std = np.std(df_mv[col])
            d = df_nmv["person_emp_length"][(df_nmv[col] > mean-3*std) & (df_nmv[col] < mean+3*std)]
            pel_u.append(np.mean(d))
        pel_weights = np.log(1/np.array(diff_scores)) # Stronger weights for variables with higher confidence + Softened
        pel_weights = pel_weights / np.sum(pel_weights)
        pel_mean = np.sum(pel_u*pel_weights)
        df_enc['person_emp_length'] = df_enc['person_emp_length'].fillna(pel_mean)
        df_cv_enc['person_emp_length'] = df_cv_enc['person_emp_length'].fillna(pel_mean)
    
    # Step 3: Creating New Variables
    aita = df_enc.groupby("person_age")["person_income"].mean()
    aite = df_enc.groupby("person_emp_length")["person_income"].mean()
    df_fe = create_var(df_enc, aita, aite)
    df_cv_fe = create_var(df_cv_enc, aita, aite)
    
    # Step 4: Scale Normalizing with Scaler
    train_cols = df_fe.columns[[x not in ['loan_status'] for x in df_fe.columns]]
    Xdf_fe = df_fe[train_cols]
    ydf_fe = df_fe['loan_status']
    Xdf_cv_fe = df_cv_fe[train_cols]
    ydf_cv_fe = df_cv_fe['loan_status']
    if min_max:
        scaler = MinMaxScaler().fit(Xdf_fe)
    else:
        scaler = StandardScaler().fit(Xdf_fe)
    cols = Xdf_fe.columns
    Xdf_fe = pd.DataFrame(scaler.transform(Xdf_fe), columns = cols)
    Xdf_cv_fe = pd.DataFrame(scaler.transform(Xdf_cv_fe), columns = cols)
    
    return [Xdf_fe, ydf_fe, Xdf_cv_fe, ydf_cv_fe]

In [None]:
# Set this path to the directory of your data file.
path = "/kaggle/input/loan-approval-prediction-pre/"

# Models - XGB

## 1. xgb11
* Data: Train (Unmerged)
* Model: XGB

In [6]:
X = pd.read_csv(path + "/trains.csv")
y = X['loan_status']
opt_xgb11 = False
xgb11_exist = True

if xgb11_exist:
    os.system('cp /kaggle/input/xgboptuna/xgb11.db /kaggle/working/xgb11.db')

def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth',2,15) ,
        'subsample': trial.suggest_float('subsample', 0.5,1,step=0.05),
        'n_estimators': 10000,
        'learning_rate': trial.suggest_float('learning_rate', 0.001,1,log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 50, log=True),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 50, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 70),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1),
    }
    model = XGBClassifier(random_state = random_state, **params)
    cv = StratifiedKFold(5, shuffle=True, random_state=random_state)
    cv_splits = cv.split(X, y)
    scores = []
    for train_idx, val_idx in cv_splits:
        Xtrain, Xval = X.loc[train_idx], X.loc[val_idx] # y will be separated later, as feature engineering mix the indices greatly.
        Xtrain, ytrain, Xval, yval = feature_engineering(Xtrain, Xval)
        model.fit(Xtrain, ytrain, eval_set=[(Xval, yval)], early_stopping_rounds = 200, verbose=False, eval_metric = "auc")
        pred = model.predict_proba(Xval)[:,1]
        score = roc_auc_score(yval, pred)
        scores.append(score)
    return np.mean(scores)
sqlite_db = "sqlite:///xgb11.db"
study_name = "unmerged_ss"
if opt_xgb11:
    study = optuna.create_study(storage = sqlite_db, study_name = study_name, 
                               sampler = TPESampler(n_startup_trials=35, multivariate=True, seed=random_state),
                               direction="maximize", load_if_exists=True)
    study.optimize(objective, n_trials = 10, gc_after_trial=True)

In [7]:
# Analyze Store (Best) Study Outcome
if opt_xgb11:
    xgb11_out = study.trials_dataframe().sort_values(by="value", ascending=False)
    xgb11_out.columns = [x[7:] if x[:7] == "params_" else x for x in xgb11_out.columns]
    focus_col = ['value', 'colsample_bytree', 'learning_rate', 'max_depth', 'min_child_weight',
                 'n_estimators', 'reg_alpha', 'reg_lambda', 'subsample']
    xgb11_out = xgb11_out[focus_col]
    display(xgb11_out.head())
if xgb11_exist:
    xgb11_out = optuna.load_study(study_name = study_name, storage=sqlite_db).trials_dataframe().sort_values(by="value", ascending=False)
    xgb11_out.columns = [x[7:] if x[:7] == "params_" else x for x in xgb11_out.columns]
    focus_col = ['value', 'colsample_bytree', 'learning_rate', 'max_depth', 'min_child_weight',
                 'n_estimators', 'reg_alpha', 'reg_lambda', 'subsample']
    xgb11_out = xgb11_out[focus_col]
    best_param = xgb11_out.iloc[0,:]
    xgb11_params = {
        'max_depth': best_param['max_depth'].astype("int"),
        'subsample': best_param['subsample'],
        'n_estimators': best_param['n_estimators'].astype("int"),
        'learning_rate': best_param['learning_rate'],
        'reg_lambda': best_param['reg_lambda'],
        'reg_alpha': best_param['reg_alpha'],
        'min_child_weight': best_param['min_child_weight'].astype("int"),
        'colsample_bytree': best_param['colsample_bytree'],
    }
    print("Best Parameters and Score: ")
    display(best_param)
    print("Best CV Iterations: ")
    xgb11_iter = [472, 562, 548, 522, 508]
    print(xgb11_iter)

Best Parameters and Score: 


value                  0.956410
colsample_bytree       0.900000
learning_rate          0.119795
max_depth              6.000000
min_child_weight      13.000000
n_estimators        8500.000000
reg_alpha              2.000000
reg_lambda            57.000000
subsample              1.000000
Name: 73, dtype: float64

Best CV Iterations: 
[472, 562, 548, 522, 508]


## 2. xgb21
* Data: Train (Merged / Dropped)
* Model: XGB

In [8]:
X = pd.read_csv(path + "/merge_rmvd.csv")
y = X['loan_status']
opt_xgb21 = False
xgb21_exist = True

if xgb21_exist: # This is for kaggle (hyperparameter tuning had to be broken down as there weren't enough computation resources)
    os.system('cp /kaggle/input/xgboptuna/xgb21.db /kaggle/working/xgb21.db')

def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth',2,15) ,
        'subsample': trial.suggest_float('subsample', 0.5,1,step=0.05),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.001,1,log=True),
        'reg_lambda': trial.suggest_int('reg_lambda', 5, 100),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 50),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 70),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1),
    }
    model = XGBClassifier(random_state = random_state, **params)
    cv = StratifiedKFold(5, shuffle=True, random_state=random_state)
    cv_splits = cv.split(X, y)
    scores = []
    for train_idx, val_idx in cv_splits:
        Xtrain, Xval = X.loc[train_idx], X.loc[val_idx] # y will be separated later, as feature engineering mix the indices greatly.
        Xtrain, ytrain, Xval, yval = feature_engineering(Xtrain, Xval)
        model.fit(Xtrain, ytrain, eval_set=[(Xval, yval)], early_stopping_rounds = 200, verbose=False, eval_metric = "auc")
        pred = model.predict_proba(Xval)[:,1]
        score = roc_auc_score(yval, pred)
        scores.append(score)
    return np.mean(scores)
sqlite_db = "sqlite:///xgb21.db"
study_name = "mergermvd_ss"
if opt_xgb21:
    study = optuna.create_study(storage = sqlite_db, study_name = study_name, 
                               sampler = TPESampler(n_startup_trials=35, multivariate=True, seed=random_state),
                               direction="maximize", load_if_exists=True)
    study.optimize(objective, n_trials = 100, gc_after_trial=True)

In [9]:
# Analyze Store (Best) Study Outcome
if opt_xgb21:
    xgb21_out = study.trials_dataframe().sort_values(by="value", ascending=False)
    xgb21_out.columns = [x[7:] if x[:7] == "params_" else x for x in xgb21_out.columns]
    focus_col = ['value', 'colsample_bytree', 'learning_rate', 'max_depth', 'min_child_weight',
                 'n_estimators', 'reg_alpha', 'reg_lambda', 'subsample']
    xgb21_out = xgb21_out[focus_col]
    display(xgb21_out.head())
if xgb21_exist:
    xgb21_out = optuna.load_study(study_name = study_name, storage=sqlite_db).trials_dataframe().sort_values(by="value", ascending=False)
    xgb21_out.columns = [x[7:] if x[:7] == "params_" else x for x in xgb21_out.columns]
    focus_col = ['value', 'colsample_bytree', 'learning_rate', 'max_depth', 'min_child_weight',
                 'n_estimators', 'reg_alpha', 'reg_lambda', 'subsample']
    xgb21_out = xgb21_out[focus_col]
    best_param = xgb21_out.iloc[0,:]
    xgb21_params = {
        'max_depth': best_param['max_depth'].astype("int"),
        'subsample': best_param['subsample'],
        'n_estimators': best_param['n_estimators'].astype("int"),
        'learning_rate': best_param['learning_rate'],
        'reg_lambda': best_param['reg_lambda'],
        'reg_alpha': best_param['reg_alpha'],
        'min_child_weight': best_param['min_child_weight'].astype("int"),
        'colsample_bytree': best_param['colsample_bytree'],
    }
    print("The Best Barameters and Score: ")
    display(best_param)
    print("Best CV Iterations: ")
    xgb21_iter = [1028, 856, 651, 926, 825]
    print(xgb21_iter)

The Best Barameters and Score: 


value                  0.959692
colsample_bytree       0.900000
learning_rate          0.070143
max_depth              6.000000
min_child_weight      19.000000
n_estimators        6400.000000
reg_alpha              1.000000
reg_lambda            12.000000
subsample              1.000000
Name: 83, dtype: float64

Best CV Iterations: 
[1028, 856, 651, 926, 825]


## 3. xgb31
* Data: Train (Merged / Imputed)
* Model: XGB

In [10]:
X = pd.read_csv(path + "/merged.csv")
y = X['loan_status']
opt_xgb31 = False
xgb31_exist = True

if xgb31_exist: # This is for kaggle (hyperparameter tuning had to be broken down as there weren't enough computation resources)
    os.system('cp /kaggle/input/xgboptuna/xgb31.db /kaggle/working/xgb31.db')

def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth',2,15) ,
        'subsample': trial.suggest_float('subsample', 0.5,1,step=0.05),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.001,1,log=True),
        'reg_lambda': trial.suggest_int('reg_lambda', 5, 100),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 50),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 70),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1),
    }
    model = XGBClassifier(random_state = random_state, **params)
    cv = StratifiedKFold(5, shuffle=True, random_state=random_state)
    cv_splits = cv.split(X, y)
    scores = []
    for train_idx, val_idx in cv_splits:
        Xtrain, Xval = X.loc[train_idx], X.loc[val_idx] # y will be separated later, as feature engineering mix the indices greatly.
        Xtrain, ytrain, Xval, yval = feature_engineering(Xtrain, Xval, impute=True)
        model.fit(Xtrain, ytrain, eval_set=[(Xval, yval)], early_stopping_rounds = 200, verbose=False, eval_metric = "auc")
        pred = model.predict_proba(Xval)[:,1]
        score = roc_auc_score(yval, pred)
        scores.append(score)
    return np.mean(scores)
sqlite_db = "sqlite:///xgb31.db"
study_name = "merged_ss"
if opt_xgb31:
    study = optuna.create_study(storage = sqlite_db, study_name = study_name, 
                               sampler = TPESampler(n_startup_trials=35, multivariate=True, seed=random_state),
                               direction="maximize", load_if_exists=True)
    study.optimize(objective, n_trials = 100, gc_after_trial=True)

In [11]:
# Analyze Store (Best) Study Outcome
if opt_xgb31:
    xgb31_out = study.trials_dataframe().sort_values(by="value", ascending=False)
    xgb31_out.columns = [x[7:] if x[:7] == "params_" else x for x in xgb31_out.columns]
    focus_col = ['value', 'colsample_bytree', 'learning_rate', 'max_depth', 'min_child_weight',
                 'n_estimators', 'reg_alpha', 'reg_lambda', 'subsample']
    xgb31_out = xgb31_out[focus_col]
    display(xgb31_out.head())
if xgb31_exist:
    xgb31_out = optuna.load_study(study_name = study_name, storage=sqlite_db).trials_dataframe().sort_values(by="value", ascending=False)
    xgb31_out.columns = [x[7:] if x[:7] == "params_" else x for x in xgb31_out.columns]
    focus_col = ['value', 'colsample_bytree', 'learning_rate', 'max_depth', 'min_child_weight',
                 'n_estimators', 'reg_alpha', 'reg_lambda', 'subsample']
    xgb31_out = xgb31_out[focus_col]
    best_param = xgb31_out.iloc[0,:]
    xgb31_params = {
        'max_depth': best_param['max_depth'].astype("int"),
        'subsample': best_param['subsample'],
        'n_estimators': best_param['n_estimators'].astype("int"),
        'learning_rate': best_param['learning_rate'],
        'reg_lambda': best_param['reg_lambda'],
        'reg_alpha': best_param['reg_alpha'],
        'min_child_weight': best_param['min_child_weight'].astype("int"),
        'colsample_bytree': best_param['colsample_bytree'],
    }
    print("The best parameters and score: ")
    display(best_param)
    print("Best CV Iterations: ")
    xgb31_iter = [1119, 693, 781, 1024, 834]
    print(xgb31_iter)

The best parameters and score: 


value                  0.960076
colsample_bytree       0.600000
learning_rate          0.012685
max_depth              8.000000
min_child_weight       5.000000
n_estimators        6500.000000
reg_alpha              2.000000
reg_lambda            17.000000
subsample              1.000000
Name: 95, dtype: float64

Best CV Iterations: 
[1119, 693, 781, 1024, 834]


# XGB Models Test Outcome (Individual)

## 0. Importing Test Data (Splitted from Xtrain)

In [12]:
test = pd.read_csv(path + "/cv.csv")
display(test.head())

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,Type
0,25,45000,RENT,6.0,MEDICAL,C,17000,12.99,0.36,Y,3,1,1
1,33,65000,RENT,3.0,DEBTCONSOLIDATION,B,5000,11.49,0.08,N,5,0,1
2,26,90000,MORTGAGE,9.0,DEBTCONSOLIDATION,A,6500,7.14,0.07,N,2,0,1
3,26,67000,RENT,10.0,PERSONAL,B,10000,12.21,0.15,N,2,0,1
4,22,60000,RENT,6.0,MEDICAL,C,12500,14.22,0.21,Y,2,0,1


In [13]:
def xgb_pred(models, Xtest):
    y_pred = []
    for model in models:
        y_pred.append(model.predict_proba(Xtest)[:,1])
    y_pred = np.array(y_pred).sum(axis=0)/len(models)
    return y_pred

## 1. `xgb11` Model

In [14]:
xgb11_model = []
Xtest = test.copy()
Xtrain = pd.read_csv(path + "/trains.csv")
Xtrain, ytrain, Xtest, ytest = feature_engineering(Xtrain, Xtest)
for i in range(5):
    xgb11_params['n_estimators'] = xgb11_iter[i]
    model = XGBClassifier(random_state = random_state, **xgb11_params)
    model.fit(Xtrain, ytrain, verbose=False, eval_metric = "auc")
    xgb11_model.append(model)

In [15]:
score = roc_auc_score(ytest, xgb_pred(xgb11_model, Xtest))
print("Test ROC AUC Score: " + str(score))

Test ROC AUC Score: 0.9515211296014218


## 2. `xgb21` Model

In [16]:
xgb21_model = []
Xtest = test.copy()
Xtrain = pd.read_csv(path + "/merge_rmvd.csv")
Xtrain, ytrain, Xtest, ytest = feature_engineering(Xtrain, Xtest)
for i in range(5):
    xgb21_params['n_estimators'] = xgb21_iter[i]
    model = XGBClassifier(random_state = random_state, **xgb21_params)
    model.fit(Xtrain, ytrain, verbose=False, eval_metric = "auc")
    xgb21_model.append(model)

In [17]:
score = roc_auc_score(ytest, xgb_pred(xgb21_model, Xtest))
print("Test ROC AUC Score: " + str(score))

Test ROC AUC Score: 0.9559031302655127


## 3. `xgb31` Model

In [18]:
xgb31_model = []
Xtest = test.copy()
Xtrain = pd.read_csv(path + "/merged.csv")
Xtrain, ytrain, Xtest, ytest = feature_engineering(Xtrain, Xtest)
for i in range(5):
    xgb31_params['n_estimators'] = xgb31_iter[i]
    model = XGBClassifier(random_state = random_state, **xgb31_params)
    model.fit(Xtrain, ytrain, verbose=False, eval_metric = "auc")
    xgb31_model.append(model)

In [19]:
score = roc_auc_score(ytest, xgb_pred(xgb31_model, Xtest))
print("Test ROC AUC Score: " + str(score))

Test ROC AUC Score: 0.9544531668475948


# Save Models
The three trained models `1gbm11`, `lgbm21`, `lgbm31` are stored as they will be later aggregated by diverse methods in the effort to increase the score.

In [20]:
# all_models = [xgb11_model, xgb21_model, xgb31_model]
# models_name = ["xgb11", "xgb21", "xgb31"]
# for i in range(3):
#     for j in range(5):
#         name = models_name[i] + "_" + str(j) + ".json"
#         all_models[i][j].save_model(name)