# <center> **Home Credit Default Risk Assessment**
# <center> **Final Modeling**

# **Introduction**

In this part of the project, I compared three models: Logistic Regression, Random Forest and LightGBM and compared their performance using ROC-AUC socres. LightGBM performed better than the other two models. I used Optuna for hyperparameter tuning of the LightGBM model and increaed performance slightly.

# **Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.encoding import WoEEncoder
from feature_engine.imputation import CategoricalImputer

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.feature_selection import RFECV

import optuna
import pickle
import warnings
import time

# **Display**

In [2]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")

warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

# **Load Data**

In [24]:
pd.set_option('use_inf_as_na', True)

data = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\Data\data 27.csv",
    index_col=False
)

data = data.drop('SK_ID_CURR', axis=1)

## **Variables**

In [4]:
random_state = 101
target = 'TARGET'

## **Random Sampling**

In [25]:
data = data.sample(frac=0.2, random_state=42)

## **Remove Infinity Values**

In [26]:
data = data.replace([np.inf, -np.inf], np.nan)

## **Imputation**

In [27]:
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
ani.fit(data)
data = ani.transform(data)

In [28]:
ci = CategoricalImputer(imputation_method='missing', fill_value='UNKNOWN')
ci.fit(data)
data = ci.transform(data)

## **Modeling**

In [29]:
X = data.drop(target, axis=1)
y = data[target]

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

In [11]:
columns_to_scale = ['ANNUITY_TO_CREDIT_RATIO', 
                    'EXT_SOURCE_3',
                    'EXT_SOURCE_2',
                    'EXT_SOURCE_1',
                    'EXT_SOURCE_MEAN',
                    'ANNUAL_PAYMENT_TO_CREDIT_RATIO',
                    'AGE',
                    'YEARS_ID_PUBLISH',
                    'AMT_ANNUITY',
                    'AMT_GOODS_PRICE',
                    'ANNUITY_TO_INCOME_RATIO',
                    'YEARS_REGISTRATION',
                    'YEARS_LAST_PHONE_CHANGE',
                    'YEARS_EMPLOYED_AGE_PRODUCT',
                    'INCOME_TO_AGE_RATIO',
                    'REGION_POPULATION_RELATIVE',
                    'AVG_MAX_DPD',
                    'TOTAL_DEBIT',
                    'TOTAL_CREDIT_AMT',
                    'DEBT_CREDIT_RATIO',
                    'AVG_ANNUITY_AMOUNT',
                    'AVG_DAYS_DECISION',
                    'RANGE_DAYS_FIRST_DUE',
                    'RANGE_DAYS_LAST_DUE',
                    'SUM_AMT_INSTALMENT',
                    'AVG_AMT_INSTALMENT',
                    'SUM_AMT_PAYMENT',
                    'AVG_AMT_PAYMENT',
                    'MAX_AMT_PAYMENT',
                    'MIN_AMT_PAYMENT',
                    'SUM_AMT_PAYMENT/SUM_AMT_INSTALMENT',
                    'MEAN_AMT_PAYMENT-MEAN_AMT_INSTALMENT'
                    ]

columns_to_encode =  ['ORGANIZATION_TYPE']

preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), columns_to_scale),
        ('encoder', WoEEncoder(fill_value=.000001), columns_to_encode)
    ],
    remainder='passthrough' 
)


lg_model = LogisticRegression(class_weight='balanced', random_state=random_state, max_iter=5000)
lg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lg', lg_model)
])


rf_model = RandomForestClassifier(class_weight='balanced', random_state=random_state)
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random_forest', rf_model)
])


lgbm_model = LGBMClassifier(class_weight='balanced', random_state=random_state, verbose=0)
lgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lgbm', lgbm_model)
])


pipelines = {
    "Logistic Regression": lg_pipeline,
    "Random Forest": rf_pipeline,
    "LightGBM": lgbm_pipeline
}

for name, pipeline in pipelines.items():
    start_time = time.time()

    y_pred_proba = cross_val_predict(pipeline, X, y, cv=10, method='predict_proba')[:, 1]
    
    roc_auc = roc_auc_score(y, y_pred_proba)
    
    end_time = time.time()
    elapsed_time = (end_time - start_time) / 60

    print(f"{name}: ROC AUC = {roc_auc:.2f} ({elapsed_time:.2f} minutes)")


Logistic Regression: ROC AUC = 0.65 (8.42 minutes)
Random Forest: ROC AUC = 0.73 (4.75 minutes)
LightGBM: ROC AUC = 0.75 (0.31 minutes)


# **Optuna**   

In [30]:
woe = WoEEncoder(fill_value=0.0001)
woe.fit(data, data[target])
encoded_data = woe.transform(data)

In [31]:
X = encoded_data.drop(target, axis=1)
y = encoded_data[target]

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

In [32]:
def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'featur e_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)  # Validation data for early stopping

   
    gbm = lgb.train(
        param,
        train_data,
        valid_sets=[valid_data],  
        num_boost_round=100,
        callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)]
    )

   
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

    roc_auc = roc_auc_score(y_test, y_pred)
    
    return roc_auc


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)


print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


[I 2024-10-18 18:25:38,652] A new study created in memory with name: no-name-5f6e8c55-6549-4050-93cc-7d308c233570




[I 2024-10-18 18:25:41,413] Trial 0 finished with value: 0.7319250548874464 and parameters: {'lambda_l1': 0.00028183867526759804, 'lambda_l2': 2.6937723008149318e-08, 'num_leaves': 220, 'max_depth': 7, 'feature_fraction': 0.9518664159657169, 'bagging_fraction': 0.6638799912726244, 'bagging_freq': 3, 'learning_rate': 0.00117656029227979, 'min_child_samples': 59}. Best is trial 0 with value: 0.7319250548874464.




[I 2024-10-18 18:25:43,743] Trial 1 finished with value: 0.7306503115187386 and parameters: {'lambda_l1': 0.007179994462105893, 'lambda_l2': 3.79759399744031e-08, 'num_leaves': 116, 'max_depth': 6, 'feature_fraction': 0.7946323175148067, 'bagging_fraction': 0.6536001747843604, 'bagging_freq': 6, 'learning_rate': 1.2809076575064419e-06, 'min_child_samples': 12}. Best is trial 0 with value: 0.7319250548874464.




[I 2024-10-18 18:25:47,057] Trial 2 finished with value: 0.7372331842500077 and parameters: {'lambda_l1': 1.782281125608089e-05, 'lambda_l2': 3.077462255256243e-08, 'num_leaves': 286, 'max_depth': 8, 'feature_fraction': 0.828563591827784, 'bagging_fraction': 0.8124637007802207, 'bagging_freq': 1, 'learning_rate': 0.03942474766228542, 'min_child_samples': 49}. Best is trial 2 with value: 0.7372331842500077.




[I 2024-10-18 18:25:48,875] Trial 3 finished with value: 0.7404769464205945 and parameters: {'lambda_l1': 0.11087578642262091, 'lambda_l2': 8.541642229938166e-06, 'num_leaves': 31, 'max_depth': 7, 'feature_fraction': 0.6794565799558816, 'bagging_fraction': 0.6837156959051145, 'bagging_freq': 4, 'learning_rate': 0.022499172246945087, 'min_child_samples': 17}. Best is trial 3 with value: 0.7404769464205945.




[I 2024-10-18 18:25:50,871] Trial 4 finished with value: 0.7284794494755767 and parameters: {'lambda_l1': 0.005410233385964971, 'lambda_l2': 7.158678953336779e-05, 'num_leaves': 155, 'max_depth': 6, 'feature_fraction': 0.6145188687394914, 'bagging_fraction': 0.6723034204085299, 'bagging_freq': 4, 'learning_rate': 4.534637411885632e-05, 'min_child_samples': 83}. Best is trial 3 with value: 0.7404769464205945.




[I 2024-10-18 18:25:51,503] Trial 5 finished with value: 0.6976274651073171 and parameters: {'lambda_l1': 2.1563851384471643e-06, 'lambda_l2': 1.6547919889771818e-07, 'num_leaves': 149, 'max_depth': 8, 'feature_fraction': 0.77368201503738, 'bagging_fraction': 0.7025396001860689, 'bagging_freq': 1, 'learning_rate': 0.5648758237935443, 'min_child_samples': 77}. Best is trial 3 with value: 0.7404769464205945.




[I 2024-10-18 18:25:53,092] Trial 6 finished with value: 0.7223167344305238 and parameters: {'lambda_l1': 0.015621875829999715, 'lambda_l2': 0.003322285458669088, 'num_leaves': 42, 'max_depth': 4, 'feature_fraction': 0.6494090906839344, 'bagging_fraction': 0.7342240103050879, 'bagging_freq': 4, 'learning_rate': 4.540626847784506e-08, 'min_child_samples': 66}. Best is trial 3 with value: 0.7404769464205945.




[I 2024-10-18 18:25:56,562] Trial 7 finished with value: 0.7293754971534636 and parameters: {'lambda_l1': 0.09583847867469601, 'lambda_l2': 3.55468409814322e-05, 'num_leaves': 157, 'max_depth': 8, 'feature_fraction': 0.7134312209847173, 'bagging_fraction': 0.7523989383458118, 'bagging_freq': 5, 'learning_rate': 5.678586898494803e-05, 'min_child_samples': 50}. Best is trial 3 with value: 0.7404769464205945.




[I 2024-10-18 18:25:57,098] Trial 8 finished with value: 0.7440057598095247 and parameters: {'lambda_l1': 0.000397160459511651, 'lambda_l2': 0.0032087489960323508, 'num_leaves': 116, 'max_depth': 3, 'feature_fraction': 0.49501757343571523, 'bagging_fraction': 0.6660972940262576, 'bagging_freq': 6, 'learning_rate': 0.43834679654316316, 'min_child_samples': 80}. Best is trial 8 with value: 0.7440057598095247.




[I 2024-10-18 18:25:59,643] Trial 9 finished with value: 0.7210801872151086 and parameters: {'lambda_l1': 2.657057923097299e-07, 'lambda_l2': 3.948165792399839, 'num_leaves': 62, 'max_depth': 8, 'feature_fraction': 0.4554214166344962, 'bagging_fraction': 0.9571839273164114, 'bagging_freq': 4, 'learning_rate': 0.00010003748651265107, 'min_child_samples': 20}. Best is trial 8 with value: 0.7440057598095247.


Best trial:
  Value: 0.7440057598095247
  Params: 
    lambda_l1: 0.000397160459511651
    lambda_l2: 0.0032087489960323508
    num_leaves: 116
    max_depth: 3
    feature_fraction: 0.49501757343571523
    bagging_fraction: 0.6660972940262576
    bagging_freq: 6
    learning_rate: 0.43834679654316316
    min_child_samples: 80


### **LGBM Pipeline Optuna Optimized**

In [33]:
X = data.drop(target, axis=1)
y = data[target]

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

In [34]:
columns_to_scale = ['ANNUITY_TO_CREDIT_RATIO', 
                    'EXT_SOURCE_3',
                    'EXT_SOURCE_2',
                    'EXT_SOURCE_1',
                    'EXT_SOURCE_MEAN',
                    'ANNUAL_PAYMENT_TO_CREDIT_RATIO',
                    'AGE',
                    'YEARS_ID_PUBLISH',
                    'AMT_ANNUITY',
                    'AMT_GOODS_PRICE',
                    'ANNUITY_TO_INCOME_RATIO',
                    'YEARS_REGISTRATION',
                    'YEARS_LAST_PHONE_CHANGE',
                    'YEARS_EMPLOYED_AGE_PRODUCT',
                    'INCOME_TO_AGE_RATIO',
                    'REGION_POPULATION_RELATIVE',
                    'AVG_MAX_DPD',
                    'TOTAL_DEBIT',
                    'TOTAL_CREDIT_AMT',
                    'DEBT_CREDIT_RATIO',
                    'AVG_ANNUITY_AMOUNT',
                    'AVG_DAYS_DECISION',
                    'RANGE_DAYS_FIRST_DUE',
                    'RANGE_DAYS_LAST_DUE',
                    'SUM_AMT_INSTALMENT',
                    'AVG_AMT_INSTALMENT',
                    'SUM_AMT_PAYMENT',
                    'AVG_AMT_PAYMENT',
                    'MAX_AMT_PAYMENT',
                    'MIN_AMT_PAYMENT',
                    'SUM_AMT_PAYMENT/SUM_AMT_INSTALMENT',
                    'MEAN_AMT_PAYMENT-MEAN_AMT_INSTALMENT'
                    ]

columns_to_encode = ['ORGANIZATION_TYPE']

preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), columns_to_scale),
        ('encoder', WoEEncoder(fill_value=.000001), columns_to_encode)
    ],
    remainder='passthrough'
)

lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           lambda_l1=9.076532104622193e-07,
                           lambda_l2=2.674112916972291e-08,
                           num_leaves=223,
                           max_depth=4, 
                           feature_fraction=0.8114544291399193,
                           bagging_fraction=0.980792477313469,
                           bagging_freq=2,
                           learning_rate= 0.027659573358461607,
                           min_child_samples=66,
                           verbose=-1,
                           class_weight='balanced')

lgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lgbm', lgbm_model)
])

pipelines = {
    "lgbm": lgbm_pipeline,
}


for name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    y_prob = pipeline.predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_prob)
    print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.74


# **Create a Pickle File for Streamlit Deployment**

In [None]:
with open('lgbm_pipeline.pkl', 'wb') as file:
    pickle.dump(lgbm_pipeline, file)

## **Default Prediction Web Application**

This deployment is made for demonstration purposes using 3 features only.

# <center> [Default Prediction](https://k7wwqra2pgijjzzhg3pakz.streamlit.app/)

# **Summary**

> * **Modeling** — The best performing model was LightGBM with a ROC-AUC score of over 75%. 
> * **Hyperparameter Tuning** — After hyperparameter tuning using Optuna, I was able to improve the ROC-AUC score by about 1%. 
> * **Recursive Feature Elimination** — This tool basically proved that the 48 features chosen earlier are the right 48 features. 
> * **Pickle File** — I created a Pickle file for Streamlit deployment. 