# **Libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from feature_engine.imputation import ArbitraryNumberImputer

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import RFECV

import pickle
import warnings
import time

# **Display**

In [2]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")

warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

# **Load Data**

In [3]:
pd.set_option('use_inf_as_na', True)

data = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\Data\data 27.csv",
    index_col=False
)

data = data.drop('SK_ID_CURR', axis=1)

## **Random Sampling**

In [4]:
data = data.sample(frac=0.2, random_state=42)

In [6]:
data.head()


Unnamed: 0,ANNUITY_TO_CREDIT_RATIO,EXT_SOURCE_3,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_MEAN,ANNUAL_PAYMENT_TO_CREDIT_RATIO,AGE,YEARS_ID_PUBLISH,AMT_ANNUITY,AMT_GOODS_PRICE,ANNUITY_TO_INCOME_RATIO,ORGANIZATION_TYPE,YEARS_REGISTRATION,YEARS_LAST_PHONE_CHANGE,YEARS_EMPLOYED_AGE_PRODUCT,INCOME_TO_AGE_RATIO,REGION_POPULATION_RELATIVE,TARGET,NUM_LOANS,TOTAL_NUM_MONTHS,TOTAL_SUM_STATUSES,AVG_MAX_DPD,TOTAL_NUM_CLOSED,TOTAL_NUM_UNKNOWN,NUM_ACTIVE_LOANS,TOTAL_DEBIT,TOTAL_CREDIT_AMT,DEBT_CREDIT_RATIO,TOTAL_OVERDUE,MAX_OVERDUE,AVG_DAYS_OVERDUE,NUM_PROLONGED_LOANS,NUM_PREVIOUS_APPLICATIONS_x,AVG_ANNUITY_AMOUNT,AVG_DAYS_DECISION,MAX_DAYS_DECISION,MIN_DAYS_DECISION,SUM_CNT_PAYMENT,RANGE_DAYS_FIRST_DUE,RANGE_DAYS_LAST_DUE,NUM_PREVIOUS_APPLICATIONS_y,SUM_AMT_INSTALMENT,AVG_AMT_INSTALMENT,SUM_AMT_PAYMENT,AVG_AMT_PAYMENT,MAX_AMT_PAYMENT,MIN_AMT_PAYMENT,SUM_AMT_PAYMENT/SUM_AMT_INSTALMENT,MEAN_AMT_PAYMENT-MEAN_AMT_INSTALMENT
211936,0.073,0.597,,0.748,0.673,0.8765,29.48,9.41,37197.0,450000.0,0.2362,0.3486,1.416,1.611,43.53,5341.8125,0.00685,1,3,27.0,0.0,0.0,0.0,24.0,2,1166760.0,1557450.0,0.749148,0.0,-99999.0,0.0,0,3,-22820.701,-806.5,-400,-1431,,99441.0,99651.0,28,593655.7,21201.988,548003.56,19571.557,224739.45,1389.825,0.9231,-1630.4316
220980,0.0485,0.6626,,0.4172,0.54,0.582,48.47,3.396,15808.5,247500.0,0.1405,-0.05948,17.12,5.535,915.5,2321.0833,0.01015,0,11,267.0,0.0,0.0,5.0,182.0,4,-96979.5,1414291.5,-0.068571,0.0,-99999.0,0.0,0,3,-28943.58,-1031.0,-447,-2021,,99404.0,99614.0,32,375341.97,11729.437,375341.97,11729.437,169096.64,6465.6,1.0,0.0
235552,0.1063,0.3108,,0.3352,0.323,1.275,50.56,5.438,10525.5,99000.0,0.0731,0.2329,25.14,7.32,501.2,2847.9604,0.0264,0,2,67.0,12.0,1.0,0.0,1.0,1,76488.97,266724.0,0.286772,0.0,9.0,0.0,0,9,-4076.4106,-1294.0,-32,-2673,,99503.0,99623.0,110,350550.75,3186.825,321055.2,2918.6836,33578.1,8.685,0.915859,-268.14136
125976,0.05,0.501,0.543,0.701,0.5815,0.6,23.02,2.924,9000.0,180000.0,0.05713,0.1725,22.03,0.77,36.72,6843.1772,0.01662,0,1,0.0,0.0,,0.0,0.0,1,102820.5,112500.0,0.91396,0.0,-99999.0,0.0,0,2,9962.415,-281.0,-281,-281,24.0,99749.0,99989.0,9,113552.77,12616.975,113552.77,12616.975,37295.773,9532.125,1.0,0.0
150485,0.06158,0.519,0.1931,0.0848,0.2656,0.7393,26.5,6.42,19264.5,270000.0,0.1157,0.1725,9.69,0.02191,159.5,6283.019,0.01522,0,2,47.0,0.0,0.0,0.0,27.0,1,-99999.0,250416.0,-0.399332,0.0,-99999.0,0.0,0,6,-27922.396,-559.0,-168,-1348,,99861.0,99857.0,28,259566.7,9270.239,259566.7,9270.239,47520.63,4527.315,1.0,0.0


## **Remove Infinity Values**

In [7]:
data = data.replace([np.inf, -np.inf], np.nan)

## **Variables**

In [8]:
random_state = 101
target = 'TARGET'

## **Imputation**

In [9]:
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
ani.fit(data)
data = ani.transform(data)

## **Train Test Split**

In [10]:
X = data.drop(target, axis=1)
y = data[target]

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

## **Recursive Feature Elimination**

In [None]:
lgb_clf = LGBMClassifier(verbosity=-1)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

rfecv = RFECV(estimator=lgb_clf, step=1, cv=5, scoring='roc_auc')

rfecv.fit(X_train_scaled, y_train)

print("Optimal number of features (LightGBM): ", rfecv.n_features_)

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score (roc_auc)")

if hasattr(rfecv, 'grid_scores_'):
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
else:
    plt.plot(range(1, len(rfecv.cv_results_['mean_test_score']) + 1), rfecv.cv_results_['mean_test_score'])

plt.show()

## **Modeling**

In [13]:
lg_model = LogisticRegression(class_weight='balanced', random_state=random_state, max_iter=5000)
lg_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lg', lg_model)
])

rf_model = RandomForestClassifier(class_weight='balanced', random_state=random_state)
rf_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('random_forest', rf_model)
])

lgbm_model = LGBMClassifier(class_weight='balanced', random_state=random_state, verbose=0)
lgbm_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lgbm', lgbm_model)
])

pipelines = {
    "Logistic Regression": lg_pipeline,
    "Random Forest": rf_pipeline,
    "LightGBM": lgbm_pipeline
}

for name, pipeline in pipelines.items():
    start_time = time.time()

    y_pred_proba = cross_val_predict(pipeline, X, y, cv=10, method='predict_proba')[:, 1]
    
    roc_auc = roc_auc_score(y, y_pred_proba)
    
    end_time = time.time()
    elapsed_time = (end_time - start_time) / 60

    print(f"{name}: ROC AUC = {roc_auc:.2f} ({elapsed_time:.2f} minutes)")


Logistic Regression: ROC AUC = 0.69 (0.09 minutes)
Random Forest: ROC AUC = 0.73 (4.67 minutes)
LightGBM: ROC AUC = 0.75 (0.28 minutes)


## **Optuna**   

In [16]:
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)  # Validation data for early stopping

   
    gbm = lgb.train(
        param,
        train_data,
        valid_sets=[valid_data],  
        num_boost_round=100,
        callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)]
    )

   
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

    roc_auc = roc_auc_score(y_test, y_pred)
    
    return roc_auc


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)


print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


[I 2024-10-17 10:48:32,052] A new study created in memory with name: no-name-04f8935b-4f12-4479-aba6-b7e2fec8f7d6




[I 2024-10-17 10:48:33,540] Trial 0 finished with value: 0.7178982630547779 and parameters: {'lambda_l1': 3.2527331998400304, 'lambda_l2': 0.30370329973625193, 'num_leaves': 250, 'max_depth': 3, 'feature_fraction': 0.8405336589548799, 'bagging_fraction': 0.9254540845831286, 'bagging_freq': 3, 'learning_rate': 7.630483037247582e-08, 'min_child_samples': 20}. Best is trial 0 with value: 0.7178982630547779.




[I 2024-10-17 10:48:36,716] Trial 1 finished with value: 0.7426374429649075 and parameters: {'lambda_l1': 5.830750934894436, 'lambda_l2': 0.06662172185537019, 'num_leaves': 149, 'max_depth': 8, 'feature_fraction': 0.9067699404823414, 'bagging_fraction': 0.4305211624513066, 'bagging_freq': 1, 'learning_rate': 0.019887486799100823, 'min_child_samples': 37}. Best is trial 1 with value: 0.7426374429649075.




[I 2024-10-17 10:48:38,513] Trial 2 finished with value: 0.7310591807507291 and parameters: {'lambda_l1': 6.5420352831118045e-06, 'lambda_l2': 1.051241236083444, 'num_leaves': 57, 'max_depth': 5, 'feature_fraction': 0.5775161479070663, 'bagging_fraction': 0.7099581944142175, 'bagging_freq': 6, 'learning_rate': 6.63829303666979e-07, 'min_child_samples': 89}. Best is trial 1 with value: 0.7426374429649075.




[I 2024-10-17 10:48:39,686] Trial 3 finished with value: 0.7414712533478433 and parameters: {'lambda_l1': 0.003936766897283086, 'lambda_l2': 1.4486769877455457e-07, 'num_leaves': 285, 'max_depth': 5, 'feature_fraction': 0.4523256169682906, 'bagging_fraction': 0.7203237608344277, 'bagging_freq': 6, 'learning_rate': 0.08185208993380218, 'min_child_samples': 89}. Best is trial 1 with value: 0.7426374429649075.




[I 2024-10-17 10:48:40,906] Trial 4 finished with value: 0.7324223148459219 and parameters: {'lambda_l1': 0.00046158575095740523, 'lambda_l2': 2.2040743334691565e-06, 'num_leaves': 48, 'max_depth': 5, 'feature_fraction': 0.445529233319159, 'bagging_fraction': 0.609592058044137, 'bagging_freq': 6, 'learning_rate': 8.708059202223738e-06, 'min_child_samples': 50}. Best is trial 1 with value: 0.7426374429649075.




[I 2024-10-17 10:48:42,947] Trial 5 finished with value: 0.7296602643855374 and parameters: {'lambda_l1': 1.6210241824699174, 'lambda_l2': 2.7369826740638055, 'num_leaves': 103, 'max_depth': 6, 'feature_fraction': 0.9297251015219034, 'bagging_fraction': 0.7773554039502439, 'bagging_freq': 7, 'learning_rate': 0.00010414893895568219, 'min_child_samples': 93}. Best is trial 1 with value: 0.7426374429649075.
[I 2024-10-17 10:48:45,665] Trial 6 finished with value: 0.7338763245474608 and parameters: {'lambda_l1': 0.17125579491433612, 'lambda_l2': 2.1653691673864146, 'num_leaves': 77, 'max_depth': 8, 'feature_fraction': 0.8803961988565328, 'bagging_fraction': 0.5787871460237696, 'bagging_freq': 6, 'learning_rate': 7.348242854713017e-06, 'min_child_samples': 77}. Best is trial 1 with value: 0.7426374429649075.




[I 2024-10-17 10:48:47,852] Trial 7 finished with value: 0.7368665579756549 and parameters: {'lambda_l1': 0.38188570517660314, 'lambda_l2': 0.14115529741534827, 'num_leaves': 238, 'max_depth': 8, 'feature_fraction': 0.4828975942153819, 'bagging_fraction': 0.45594783755852575, 'bagging_freq': 1, 'learning_rate': 2.5305249853714686e-07, 'min_child_samples': 77}. Best is trial 1 with value: 0.7426374429649075.




[I 2024-10-17 10:48:48,818] Trial 8 finished with value: 0.723028049039885 and parameters: {'lambda_l1': 1.1741437999729545, 'lambda_l2': 8.360290381158808e-06, 'num_leaves': 277, 'max_depth': 3, 'feature_fraction': 0.40601828685535374, 'bagging_fraction': 0.8090996060592013, 'bagging_freq': 4, 'learning_rate': 2.471668627462089e-08, 'min_child_samples': 94}. Best is trial 1 with value: 0.7426374429649075.




[I 2024-10-17 10:48:50,159] Trial 9 finished with value: 0.7300459177399691 and parameters: {'lambda_l1': 9.142470491114499e-08, 'lambda_l2': 7.668162577828707e-07, 'num_leaves': 253, 'max_depth': 5, 'feature_fraction': 0.7098218855423679, 'bagging_fraction': 0.5596212205561003, 'bagging_freq': 5, 'learning_rate': 1.696574738126176e-08, 'min_child_samples': 54}. Best is trial 1 with value: 0.7426374429649075.


Best trial:
  Value: 0.7426374429649075
  Params: 
    lambda_l1: 5.830750934894436
    lambda_l2: 0.06662172185537019
    num_leaves: 149
    max_depth: 8
    feature_fraction: 0.9067699404823414
    bagging_fraction: 0.4305211624513066
    bagging_freq: 1
    learning_rate: 0.019887486799100823
    min_child_samples: 37


### **LGBM**

In [17]:
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           lambda_l1=9.076532104622193e-07,
                           lambda_l2=2.674112916972291e-08,
                           num_leaves=223,
                           max_depth=4, 
                           feature_fraction=0.8114544291399193,
                           bagging_fraction=0.980792477313469,
                           bagging_freq=2,
                           learning_rate= 0.027659573358461607,
                           min_child_samples=66,
                           verbose=-1)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]  

auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.74


In [18]:
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           lambda_l1=9.076532104622193e-07,
                           lambda_l2=2.674112916972291e-08,
                           num_leaves=223,
                           max_depth=4, 
                           feature_fraction=0.8114544291399193,
                           bagging_fraction=0.980792477313469,
                           bagging_freq=2,
                           learning_rate= 0.027659573358461607,
                           min_child_samples=66,
                           verbose=-1)

model.fit(X, y)

# **Create a Pickle File for Streamlit Deployment**

In [19]:
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [20]:
model.n_features_

48