### Loan Default Prediction
Aim is to predict that a borrower will default on a loan.

Constraints:
- Class imbalance (~17% default rate)
- False negatives (missed defaults) are more costly than false positives
- Probabilities must be calibrated for business use

Success Criteria:
- High recall on defaults (≥ 65%)
- Stable ROC-AUC
- Well-calibrated probabilities

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/processed/encoded_loan_data.csv")
print(df.shape)
print(df.head())

(15000, 33)
   loan_amnt  int_rate  annual_inc    dti  fico_range_low  revol_util  \
0     3600.0     13.99     55000.0   5.91           675.0        29.7   
1    24700.0     11.99     65000.0  16.06           715.0        19.2   
2    20000.0     10.78     63000.0  10.78           695.0        56.2   
3    35000.0     14.85    110000.0  17.06           785.0        11.6   
4    10400.0     22.45    104433.0  25.37           695.0        64.5   

   loan_to_income  grade_encoded  purpose_credit_card  \
0        0.065455              2                    0   
1        0.380000              2                    0   
2        0.317460              1                    0   
3        0.318182              2                    0   
4        0.099585              5                    0   

   purpose_debt_consolidation  ...  emp_length_2 years  emp_length_3 years  \
0                           1  ...                   0                   0   
1                           0  ...                

In [2]:
df.isna().sum().sort_values(ascending=False).head(10)


loan_amnt                 0
purpose_small_business    0
emp_length_< 1 year       0
emp_length_9 years        0
emp_length_8 years        0
emp_length_7 years        0
emp_length_6 years        0
emp_length_5 years        0
emp_length_4 years        0
emp_length_3 years        0
dtype: int64

In [3]:
X = df.drop(columns=["default"])
y = df["default"]


In [4]:
core_features = [
    "loan_amnt",
    "int_rate",
    "annual_inc",
    "dti",
    "fico_range_low",
    "revol_util",
    "loan_to_income",
    "grade_encoded"
]
X = X[core_features].copy()
X

Unnamed: 0,loan_amnt,int_rate,annual_inc,dti,fico_range_low,revol_util,loan_to_income,grade_encoded
0,3600.0,13.99,55000.0,5.91,675.0,29.7,0.065455,2
1,24700.0,11.99,65000.0,16.06,715.0,19.2,0.380000,2
2,20000.0,10.78,63000.0,10.78,695.0,56.2,0.317460,1
3,35000.0,14.85,110000.0,17.06,785.0,11.6,0.318182,2
4,10400.0,22.45,104433.0,25.37,695.0,64.5,0.099585,5
...,...,...,...,...,...,...,...,...
14995,8000.0,12.59,79875.0,7.59,705.0,27.3,0.100156,2
14996,35000.0,17.86,160000.0,12.56,680.0,85.0,0.218750,3
14997,7275.0,11.22,30000.0,26.68,695.0,56.3,0.242500,1
14998,27000.0,9.76,57000.0,25.90,700.0,44.4,0.473684,1


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(y_train.mean(), y_test.mean())


0.17391666666666666 0.174


In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

baseline_lr = LogisticRegression(max_iter=1000, random_state=42)
baseline_lr.fit(X_train_scaled, y_train)

y_proba = baseline_lr.predict_proba(X_test_scaled)[:, 1]
y_pred = (y_proba >= 0.5).astype(int)


print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.83      0.99      0.90      2478
           1       0.39      0.03      0.05       522

    accuracy                           0.82      3000
   macro avg       0.61      0.51      0.48      3000
weighted avg       0.75      0.82      0.75      3000

ROC AUC: 0.6695935728665128


In [8]:
lr = LogisticRegression(
    max_iter=1000,
    class_weight= {0: 1, 1: 5},
    random_state=42
)

lr.fit(X_train_scaled, y_train)

y_proba = lr.predict_proba(X_test_scaled)[:,1]

In [10]:
from sklearn.metrics import recall_score, precision_score

print("Recall:", recall_score(y_test, y_proba >= 0.5))
print("Precision:", precision_score(y_test, y_proba >= 0.5))
print("ROC AUC:", roc_auc_score(y_test, y_proba))


Recall: 0.632183908045977
Precision: 0.2542372881355932
ROC AUC: 0.6693206732657346


In [12]:
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

threshold_df = pd.DataFrame({
    "threshold": thresholds,
    "precision": precision[:-1],
    "recall": recall[:-1]
})

threshold_df.sort_values("recall", ascending=False).head(10)


Unnamed: 0,threshold,precision,recall
0,0.101255,0.174,1.0
78,0.185418,0.178645,1.0
91,0.194772,0.179443,1.0
90,0.192761,0.179381,1.0
89,0.192489,0.17932,1.0
88,0.192316,0.179258,1.0
87,0.192262,0.179197,1.0
86,0.192254,0.179135,1.0
85,0.191757,0.179074,1.0
84,0.1914,0.179012,1.0


In [13]:
BEST_THRESHOLD = 0.30


In [14]:
from sklearn.calibration import CalibratedClassifierCV

calibrated_lr = CalibratedClassifierCV(
    lr,
    method="isotonic",
    cv=5
)

calibrated_lr.fit(X_train_scaled, y_train)

y_proba_cal = calibrated_lr.predict_proba(X_test_scaled)[:,1]


In [15]:
print("ROC AUC:", roc_auc_score(y_test, y_proba_cal))


ROC AUC: 0.6692742880644693


In [16]:
LOAN_AMOUNT = 15000
RECOVERY_RATE = 0.4

expected_loss = LOAN_AMOUNT * y_proba_cal.mean() * (1 - RECOVERY_RATE)
expected_loss


1621.1373950175064

In [18]:
import pickle
import os

os.makedirs("../models", exist_ok=True)

with open("../models/logistic_regression.pkl", "wb") as f:
    pickle.dump(calibrated_lr, f)

with open("../models/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

with open("../models/feature_names.pkl", "wb") as f:
    pickle.dump(core_features, f)

with open("../models/threshold.pkl", "wb") as f:
    pickle.dump(BEST_THRESHOLD, f)


In [19]:
test_input = pd.DataFrame([{
    "loan_amnt": 15000,
    "int_rate": 12.0,
    "annual_inc": 55000,
    "dti": 18.0,
    "fico_range_low": 690,
    "revol_util": 50.0,
    "loan_to_income": 15000/55000,
    "grade_encoded": 3
}])

test_scaled = scaler.transform(test_input)
proba = calibrated_lr.predict_proba(test_scaled)[0,1]
proba


0.1745530407817169