In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import joblib
import shap
import warnings
warnings.filterwarnings("ignore")

pd.set_option("display.max_columns", 100)

In [2]:
df = pd.read_csv("../data/processed/loan_data_processed.csv")
df.head()

Unnamed: 0,age,years_employed,annual_income,credit_score,credit_history_years,savings_assets,current_debt,defaults_on_file,delinquencies_last_2yrs,derogatory_marks,loan_amount,interest_rate,debt_to_income_ratio,loan_to_income_ratio,payment_to_income_ratio,loan_status,occupation_status_Employed,occupation_status_Self-Employed,occupation_status_Student,product_type_Credit Card,product_type_Line of Credit,product_type_Personal Loan,loan_intent_Business,loan_intent_Debt Consolidation,loan_intent_Education,loan_intent_Home Improvement,loan_intent_Medical,loan_intent_Personal
0,0.461368,1.303728,-0.774402,0.760706,-0.398242,-0.291487,-0.267001,-0.237701,-0.68051,-0.364349,-1.219615,0.373982,0.869417,-1.437254,-1.439204,1,True,False,False,True,False,False,True,False,False,False,False,False
1,-0.174175,-0.016347,-0.213607,-0.261277,-0.648855,-0.388723,0.19507,-0.237701,0.573932,-0.364349,0.775693,-0.346758,0.623086,1.152292,1.150031,0,True,False,False,False,False,True,False,False,False,True,False,False
2,0.642952,-0.843061,-0.926197,0.713537,0.033369,-0.409081,-0.506342,-0.237701,-0.68051,-0.364349,-1.185147,0.697327,0.578872,-1.293031,-1.290694,1,False,False,True,True,False,False,False,True,False,False,False,False
3,1.641664,-0.923065,-0.660116,0.760706,0.22829,-0.213135,-0.203859,-0.237701,0.573932,-0.364349,-1.154509,0.798527,0.711512,-1.297336,-1.297151,1,False,False,True,True,False,False,True,False,False,False,False,False
4,-0.264967,0.677026,0.445267,-0.214109,-0.133707,-0.383366,-0.137653,-0.237701,-0.68051,-0.364349,2.548875,-0.391187,-0.570674,1.858337,1.860294,1,True,False,False,False,False,True,False,False,True,False,False,False


In [3]:
X=df.drop('loan_status',axis=1)
y=df['loan_status']

In [4]:
target = "loan_status"
X = df.drop(columns=[target])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_test.shape,y_train.shape,y_test.shape

((40000, 27), (10000, 27), (40000,), (10000,))

In [5]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(
        eval_metric="logloss", 
        n_estimators=300, 
        learning_rate=0.05, 
        max_depth=5
    )
}

In [6]:
results = {}

for name, model in models.items():
    print(f"\nüî∑ Training {name} ...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    acc = accuracy_score(y_test, preds)
    roc = roc_auc_score(y_test, preds)

    print("Accuracy:", acc)
    print("ROC-AUC:", roc)
    print(classification_report(y_test, preds))

    results[name] = {
        "model": model,
        "accuracy": acc,
        "roc_auc": roc
    }


üî∑ Training Logistic Regression ...
Accuracy: 0.8651
ROC-AUC: 0.8626586812069925
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      4495
           1       0.87      0.89      0.88      5505

    accuracy                           0.87     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.87      0.86     10000


üî∑ Training Random Forest ...
Accuracy: 0.9113
ROC-AUC: 0.9100082946133506
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      4495
           1       0.92      0.92      0.92      5505

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000


üî∑ Training XGBoost ...
Accuracy: 0.9272
ROC-AUC: 0.9251639979430166
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      4495
   

In [7]:
best_model_name = max(results, key=lambda x: results[x]["roc_auc"])
best_model = results[best_model_name]["model"]

print("üèÜ Best Model:", best_model_name)
print(results[best_model_name])

üèÜ Best Model: XGBoost
{'model': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=300, n_jobs=None,
              num_parallel_tree=None, ...), 'accuracy': 0.9272, 'roc_auc': 0.9251639979430166}


In [8]:
param_grid = {
        "learning_rate": [0.05, 0.1],
        "max_depth": [4, 5, 6],
        "n_estimators": [200, 300, 400]
    }

grid = GridSearchCV(
    estimator=best_model,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1
)

print("\nüîç Running Grid Search‚Ä¶")
grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)

best_model = grid.best_estimator_


üîç Running Grid Search‚Ä¶
Best Params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300}
Best Score: 0.9843676533080806


In [9]:
preds = best_model.predict(X_test)

print("\nüìå Final Model Evaluation")
print("Accuracy:", accuracy_score(y_test, preds))
print("ROC-AUC:", roc_auc_score(y_test, preds))
print(confusion_matrix(y_test, preds))
print(classification_report(y_test, preds))


üìå Final Model Evaluation
Accuracy: 0.9273
ROC-AUC: 0.9254384981193151
[[4077  418]
 [ 309 5196]]
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      4495
           1       0.93      0.94      0.93      5505

    accuracy                           0.93     10000
   macro avg       0.93      0.93      0.93     10000
weighted avg       0.93      0.93      0.93     10000



In [10]:
joblib.dump(best_model, "../models/model.pkl")
print("‚úÖ Model saved to ../models/model.pkl")

‚úÖ Model saved to ../models/model.pkl
