# <center> **Home Credit Default Risk Assessment**
# <center> **Final Modeling**

# **Introduction**

In this part of the project, I compared three models (Logistic Regression, Random Forest and LightGBM) using ROC-AUC scores. LightGBM performed better than the other two models. I then used Optuna to tune the hyperparameters of the LightGBM model, which increased performance slightly.

# **Libraries**

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.encoding import WoEEncoder
from feature_engine.imputation import CategoricalImputer

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import RFECV

import optuna
import pickle
import warnings
import time

# **Display**

In [22]:
%matplotlib inline

# Notebook display limits. `display.max_rows` used to be set twice (300000,
# then 200); only the last assignment ever took effect, so it is set once here.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', 500)

# Blanket filter: every warning category is silenced for the session. A
# separate FutureWarning filter is therefore unnecessary and was dropped.
warnings.filterwarnings("ignore")

# **Load Data**

In [23]:
# NOTE: the former `pd.set_option('use_inf_as_na', True)` call was removed.
# That option is deprecated since pandas 2.1 and no longer exists in pandas 3,
# where setting it raises. Infinite values are converted to NaN explicitly in
# the "Remove Infinity Values" step further down, so nothing is lost.

# Absolute path to the prepared modelling table on the author's machine.
DATA_PATH = r"C:\Users\Dell\Documents\AI\Risk\Data\Data\data 27.csv"

# index_col=False keeps pandas from promoting the first column to the index.
data = pd.read_csv(DATA_PATH, index_col=False)

# SK_ID_CURR (presumably the applicant identifier - confirm) must not be used
# as a model feature, so it is dropped right after loading.
data = data.drop(columns='SK_ID_CURR')

## **Variables**

In [26]:
# Name of the label column, and the seed reused by the shuffle, the
# train/test split and the estimators below.
target, random_state = 'TARGET', 101

## **Random Sampling**

In [24]:
# Keep a random 20% of the rows; the fixed seed makes the subsample repeatable.
data = data.sample(
    frac=0.2,
    random_state=42,
)

## **Remove Infinity Values**

In [25]:
# Turn +inf / -inf into NaN so the imputers below treat them as missing values.
data = data.replace(to_replace=[np.inf, -np.inf], value=np.nan)

## **Imputation**

In [27]:
# Fill missing numeric values with the sentinel -99999. The fitted imputer is
# kept in `ani`.
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
data = ani.fit_transform(data)

In [28]:
# Fill missing categorical values with the explicit label 'UNKNOWN'. The fitted
# imputer is kept in `ci`.
ci = CategoricalImputer(
    imputation_method='missing',
    fill_value='UNKNOWN',
)
data = ci.fit_transform(data)

## **Train Test Split**

In [29]:
# Separate the label column from the feature matrix.
y = data[target]
X = data.drop(columns=target)

# Shuffle features and labels together. X and y are also used directly by the
# 10-fold cross-validation later on, so their row order matters.
X, y = shuffle(X, y, random_state=random_state)

# Stratified 80/20 hold-out split, so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

## **Recursive Feature Elimination**

In [None]:
# LightGBM classifier used by RFECV to rank the features.
lgb_clf = LGBMClassifier(verbosity=-1)

# Scale with statistics learned on the training split only; the test split is
# transformed with the same fitted scaler.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Recursive feature elimination, one feature per step, scored by 5-fold ROC-AUC.
rfecv = RFECV(estimator=lgb_clf, step=1, cv=5, scoring='roc_auc')
rfecv.fit(X_train_scaled, y_train)

print("Optimal number of features (LightGBM): ", rfecv.n_features_)

# Older scikit-learn releases expose the CV curve as `grid_scores_`; newer ones
# only provide it through `cv_results_`.
if hasattr(rfecv, 'grid_scores_'):
    cv_curve = rfecv.grid_scores_
else:
    cv_curve = rfecv.cv_results_['mean_test_score']
feature_counts = range(1, len(cv_curve) + 1)

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score (roc_auc)")
plt.plot(feature_counts, cv_curve)
plt.show()

## **Modeling**

In [30]:
# Columns that the preprocessing ColumnTransformer sends through StandardScaler.
columns_to_scale = [
    "ANNUITY_TO_CREDIT_RATIO",
    "EXT_SOURCE_3",
    "EXT_SOURCE_2",
    "EXT_SOURCE_1",
    "EXT_SOURCE_MEAN",
    "ANNUAL_PAYMENT_TO_CREDIT_RATIO",
    "AGE",
    "YEARS_ID_PUBLISH",
    "AMT_ANNUITY",
    "AMT_GOODS_PRICE",
    "ANNUITY_TO_INCOME_RATIO",
    "YEARS_REGISTRATION",
    "YEARS_LAST_PHONE_CHANGE",
    "YEARS_EMPLOYED_AGE_PRODUCT",
    "INCOME_TO_AGE_RATIO",
    "REGION_POPULATION_RELATIVE",
    "AVG_MAX_DPD",
    "TOTAL_DEBIT",
    "TOTAL_CREDIT_AMT",
    "DEBT_CREDIT_RATIO",
    "AVG_ANNUITY_AMOUNT",
    "AVG_DAYS_DECISION",
    "RANGE_DAYS_FIRST_DUE",
    "RANGE_DAYS_LAST_DUE",
    "SUM_AMT_INSTALMENT",
    "AVG_AMT_INSTALMENT",
    "SUM_AMT_PAYMENT",
    "AVG_AMT_PAYMENT",
    "MAX_AMT_PAYMENT",
    "MIN_AMT_PAYMENT",
    "SUM_AMT_PAYMENT/SUM_AMT_INSTALMENT",
    "MEAN_AMT_PAYMENT-MEAN_AMT_INSTALMENT",
]

# Columns that the same transformer sends through the WoE encoder.
columns_to_encode = ["ORGANIZATION_TYPE"]

# Standardise the listed numeric columns, WoE-encode the categorical ones and
# pass every remaining column through unchanged.
scaling_step = ('scaler', StandardScaler(), columns_to_scale)
# fill_value is handed to WoEEncoder as-is (presumably to avoid a zero inside
# the log ratio - confirm against the feature_engine docs).
encoding_step = ('encoder', WoEEncoder(fill_value=.000001), columns_to_encode)

preprocessor = ColumnTransformer(
    transformers=[scaling_step, encoding_step],
    remainder='passthrough',
)


# Settings shared by the three baseline classifiers: balanced class weights and
# a common seed.
baseline_kwargs = {'class_weight': 'balanced', 'random_state': random_state}

lg_model = LogisticRegression(max_iter=5000, **baseline_kwargs)
rf_model = RandomForestClassifier(**baseline_kwargs)
lgbm_model = LGBMClassifier(verbose=0, **baseline_kwargs)

# Each estimator sits behind the same preprocessing step.
lg_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('lg', lg_model)])
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('random_forest', rf_model)])
lgbm_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('lgbm', lgbm_model)])

# Display name -> pipeline, in the order the models are evaluated.
pipelines = {
    "Logistic Regression": lg_pipeline,
    "Random Forest": rf_pipeline,
    "LightGBM": lgbm_pipeline,
}

# Score each pipeline with out-of-fold predictions from 10-fold
# cross-validation, and report ROC-AUC together with the wall-clock time.
for name, pipeline in pipelines.items():
    start_time = time.time()

    # Column 1 of predict_proba holds the positive-class probability.
    fold_probabilities = cross_val_predict(
        pipeline, X, y, cv=10, method='predict_proba'
    )
    roc_auc = roc_auc_score(y, fold_probabilities[:, 1])

    elapsed_time = (time.time() - start_time) / 60

    print(f"{name}: ROC AUC = {roc_auc:.2f} ({elapsed_time:.2f} minutes)")


Logistic Regression: ROC AUC = 0.65 (18.73 minutes)
Random Forest: ROC AUC = 0.73 (18.33 minutes)
LightGBM: ROC AUC = 0.75 (1.30 minutes)


## **Optuna**   

In [None]:
def objective(trial):
    """Train one LightGBM candidate and return its validation ROC-AUC.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC-AUC of the candidate on a validation fold carved out of the
        training split.

    The validation fold is taken from ``X_train`` / ``y_train``, not from the
    hold-out test set. Early stopping and trial scoring previously used
    ``X_test``, which is also the set the tuned pipeline is later evaluated
    on, so the reported test AUC was optimistically biased.
    """
    # suggest_float (with log=True where needed) replaces the deprecated
    # suggest_uniform / suggest_loguniform; the sampled ranges are unchanged.
    param = {
        'objective': 'binary',
        # Early stopping monitors this metric; the trial itself is scored by AUC.
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'learning_rate': trial.suggest_float('learning_rate', 1e-8, 1.0, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    # The fixed seed gives every trial the same fit/validation rows, so trial
    # scores are comparable with each other.
    X_fit, X_valid, y_fit, y_valid = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        stratify=y_train,
        random_state=random_state,
    )

    train_data = lgb.Dataset(X_fit, label=y_fit)
    # reference=train_data makes the validation set reuse the training binning.
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

    gbm = lgb.train(
        param,
        train_data,
        valid_sets=[valid_data],
        num_boost_round=100,
        callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)]
    )

    # Predict with the best iteration found by early stopping.
    y_pred = gbm.predict(X_valid, num_iteration=gbm.best_iteration)

    return roc_auc_score(y_valid, y_pred)


# Seed the sampler so the same trials are proposed on every run. TPE is already
# Optuna's default sampler, so only reproducibility changes.
sampler = optuna.samplers.TPESampler(seed=random_state)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=10)

# Report the best trial: its objective value and the hyperparameters it used.
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


### **LGBM Pipeline Optuna Optimized**

In [32]:
# Columns standardised by the preprocessing step of the tuned pipeline.
columns_to_scale = [
    "ANNUITY_TO_CREDIT_RATIO",
    "EXT_SOURCE_3",
    "EXT_SOURCE_2",
    "EXT_SOURCE_1",
    "EXT_SOURCE_MEAN",
    "ANNUAL_PAYMENT_TO_CREDIT_RATIO",
    "AGE",
    "YEARS_ID_PUBLISH",
    "AMT_ANNUITY",
    "AMT_GOODS_PRICE",
    "ANNUITY_TO_INCOME_RATIO",
    "YEARS_REGISTRATION",
    "YEARS_LAST_PHONE_CHANGE",
    "YEARS_EMPLOYED_AGE_PRODUCT",
    "INCOME_TO_AGE_RATIO",
    "REGION_POPULATION_RELATIVE",
    "AVG_MAX_DPD",
    "TOTAL_DEBIT",
    "TOTAL_CREDIT_AMT",
    "DEBT_CREDIT_RATIO",
    "AVG_ANNUITY_AMOUNT",
    "AVG_DAYS_DECISION",
    "RANGE_DAYS_FIRST_DUE",
    "RANGE_DAYS_LAST_DUE",
    "SUM_AMT_INSTALMENT",
    "AVG_AMT_INSTALMENT",
    "SUM_AMT_PAYMENT",
    "AVG_AMT_PAYMENT",
    "MAX_AMT_PAYMENT",
    "MIN_AMT_PAYMENT",
    "SUM_AMT_PAYMENT/SUM_AMT_INSTALMENT",
    "MEAN_AMT_PAYMENT-MEAN_AMT_INSTALMENT",
]

# Columns WoE-encoded by the same preprocessing step.
columns_to_encode = ["ORGANIZATION_TYPE"]

# Fresh preprocessing step for the tuned model: scale the numeric columns,
# WoE-encode the categorical ones, and leave all other columns untouched.
tuned_transformers = [
    ('scaler', StandardScaler(), columns_to_scale),
    ('encoder', WoEEncoder(fill_value=.000001), columns_to_encode),
]
preprocessor = ColumnTransformer(transformers=tuned_transformers, remainder='passthrough')

# LightGBM configured with the hyperparameters hard-coded from an Optuna run.
# random_state is now passed explicitly, as every baseline estimator already
# does, so the seed of the fitted (and later pickled) model is pinned.
lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt',
                           lambda_l1=9.076532104622193e-07,
                           lambda_l2=2.674112916972291e-08,
                           num_leaves=223,
                           max_depth=4,
                           feature_fraction=0.8114544291399193,
                           bagging_fraction=0.980792477313469,
                           bagging_freq=2,
                           learning_rate=0.027659573358461607,
                           min_child_samples=66,
                           verbose=-1,
                           class_weight='balanced',
                           random_state=random_state)

# Preprocessing and the tuned classifier bundled into one estimator.
lgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lgbm', lgbm_model)
])

pipelines = {
    "lgbm": lgbm_pipeline,
}

# Fit on the training split and report ROC-AUC on the hold-out split.
for name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    # Column 1 of predict_proba holds the positive-class probability.
    y_prob = pipeline.predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_prob)
    print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.74


# **Create a Pickle File for Streamlit Deployment**

In [None]:
# Serialise the fitted pipeline (preprocessing + model) to the working
# directory so the deployment app can load it.
with open('lgbm_pipeline.pkl', mode='wb') as pickle_file:
    pickle.dump(lgbm_pipeline, pickle_file)

# **Summary**

> * **Modeling** — The best performing model was LightGBM with a ROC-AUC score of over 75%. 
> * **Hyperparameter Tuning** — After hyperparameter tuning using Optuna, I was able to improve the ROC-AUC score by about 1%. 
> * **Recursive Feature Elimination** — This tool basically proved that the 48 features chosen earlier are the right 48 features. 
> * **Pickle File** — I created a Pickle file for Streamlit deployment. 