# Model Development

In [5]:
# Import necessary libraries
import numpy as np
import pandas as pd

# Modeling
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, RocCurveDisplay, average_precision_score)
from sklearn.inspection import permutation_importance

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier

# Plots
import matplotlib.pyplot as plt
import seaborn as sns



## 2. Define features, target, train and test splits

In [6]:
# Target: map to binary (default=1, good=0)
df2=pd.read_csv(r"C:\Users\Bonareri\Machine_Learning_Powered_Credit_Risk_Scoring_System\data\clean_df")
df_model = df2.copy()
df_model["target"] = (df_model["loan_status"] == "Charged Off").astype(int)

# Feature lists (based on our EDA selection)
cat_cols = ["home_ownership","verification_status","purpose","addr_state","grade","sub_grade"]
num_cols = ["annual_inc","emp_length","loan_amnt","term","int_rate","installment",
            "dti","delinq_2yrs","inq_last_6mths","open_acc","pub_rec","revol_bal",
            "revol_util","total_acc"]

X = df_model[cat_cols + num_cols]
y = df_model["target"]


In [7]:
df_model.target.value_counts()/ len(df_model) * 100
# imbalanced dataset

target
0    85.641518
1    14.358482
Name: count, dtype: float64

# 3. Preprocessing pipeline

In [8]:
# Column transformer: OHE for cats, scale numerics
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", StandardScaler(), num_cols),
    ],
)


In [9]:
X = preprocessor.fit_transform(X)

In [10]:
# Split the data into training and testing sets
# Using stratified split to maintain the proportion of target classes
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

X_train.shape, X_test.shape, y_train.mean(), y_test.mean()

((29997, 128), (7500, 128), 0.14358102476914358, 0.1436)

# 4. Define models

In [9]:
models = {}

# Logistic Regression (interpretable baseline)
models["logreg"] = Pipeline(steps=[
    ("prep", preprocessor),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced", n_jobs=None))
])

# Random Forest (non-linear baseline)
models["rf"] = Pipeline(steps=[
    ("prep", preprocessor),
    ("clf", RandomForestClassifier(
        n_estimators=400, max_depth=None, min_samples_leaf=2,
        class_weight="balanced", random_state=42, n_jobs=-1
    ))
])

# HistGradientBoosting (fast, strong tabular baseline)
models["hgb"] = Pipeline(steps=[
    ("prep", preprocessor),
    ("clf", HistGradientBoostingClassifier(
        learning_rate=0.08, max_depth=None, max_leaf_nodes=31,
        random_state=42
    ))
])

# XGBoost 
models["xgb"] = Pipeline(steps=[
    ("prep", preprocessor),
    ("clf", XGBClassifier(
        n_estimators=500, max_depth=6, learning_rate=0.07,
        subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
        eval_metric="logloss", random_state=42, n_jobs=-1,
        scale_pos_weight=float((y_train==0).sum()/(y_train==1).sum())  # imbalance
    ))
])

list(models.keys())


['logreg', 'rf', 'hgb', 'xgb']

In [64]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

def evaluate_model(y_true, y_pred, y_proba=None):
    """
    Evaluate model performance.
    y_true  : true labels
    y_pred  : predicted labels (0/1)
    y_proba : predicted probabilities (for ROC-AUC, PR-AUC)
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    if y_proba is not None:  # use probabilities for ROC-AUC and PR-AUC
        roc_auc = roc_auc_score(y_true, y_proba)
        pr_auc = average_precision_score(y_true, y_proba)
    else:  # fallback to hard predictions
        roc_auc = roc_auc_score(y_true, y_pred)
        pr_auc = average_precision_score(y_true, y_pred)

    return accuracy, precision, recall, f1, roc_auc, pr_auc


# 5. Train, Evaluate, compare

In [None]:

models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, class_weight="balanced", n_jobs=None),
    "Hist Gradient Boosting Classifier": HistGradientBoostingClassifier(#learning_rate=0.08, max_depth=None, max_leaf_nodes=31,
        random_state=42),
    "Random Forest Classifier": RandomForestClassifier(n_estimators=400, max_depth=None, min_samples_leaf=2,
        class_weight="balanced", random_state=42, n_jobs=-1),
    "XGBClassifier": XGBClassifier(n_estimators=500, max_depth=6, learning_rate=0.07,
        subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
        eval_metric="logloss", random_state=42, n_jobs=-1,
        scale_pos_weight=float((y_train==0).sum()/(y_train==1).sum())) 
    
}
model_list = []
metrics_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_accuracy, model_train_precision, model_train_recall, model_train_f1, model_train_roc_auc, model_train_pr_auc = evaluate_model(y_train, y_train_pred)

    model_test_accuracy, model_test_precision, model_test_recall, model_test_f1, model_test_roc_auc, model_test_pr_auc = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy score: {:.4f}".format(model_train_accuracy))
    print("- Precision score: {:.4f}".format(model_train_precision))
    print("- Recall score: {:.4f}".format(model_train_recall))
    print("- F1 score: {:.4f}".format(model_train_f1))
    print("- Roc_auc score: {:.4f}".format(model_train_roc_auc))
    print("- Average precision score: {:.4f}".format(model_train_pr_auc))

    print('----------------------------------')
    
    print('Model performance for Testing set')
    print("- Accuracy score: {:.4f}".format(model_test_accuracy))
    print("- Precision score: {:.4f}".format(model_test_precision))
    print("- Recall score: {:.4f}".format(model_test_recall))
    print("- F1 score: {:.4f}".format(model_test_f1))
    print("- Roc_auc score: {:.4f}".format(model_test_roc_auc))
    print("- Average precision score: {:.4f}".format(model_test_pr_auc))
    metrics_list.append(model_test_accuracy)
    
    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
- Accuracy score: 0.6517
- Precision score: 0.2402
- Recall score: 0.6594
- F1 score: 0.3522
- Roc_auc score: 0.6549
- Average precision score: 0.2073
----------------------------------
Model performance for Training set
- Accuracy score: 0.6585
- Precision score: 0.2450
- Recall score: 0.6620
- F1 score: 0.3577
- Roc_auc score: 0.6600
- Average precision score: 0.2107


Hist Gradient Boosting Classifier
Model performance for Training set
- Accuracy score: 0.8638
- Precision score: 0.8746
- Recall score: 0.0599
- F1 score: 0.1121
- Roc_auc score: 0.5292
- Average precision score: 0.1874
----------------------------------
Model performance for Training set
- Accuracy score: 0.8571
- Precision score: 0.5472
- Recall score: 0.0269
- F1 score: 0.0513
- Roc_auc score: 0.5116
- Average precision score: 0.1545


Random Forest Classifier
Model performance for Training set
- Accuracy score: 0.9994
- Precision score: 0.9991
- Recall score: 0

### Interpretation
1. Logistic Regression
Train vs Test:
Train accuracy: 0.65 | Test accuracy: 0.66 - very similar No overfitting.
Precision is low (0.24) but recall is fairly high (0.66). The model catches many positives but with many false alarms.
F1 is low (0.35) because precision drags it down.
ROC-AUC 0.65 - slightly better than random guessing, but weak.

Interpretation: Logistic regression is providing balanced but weak baseline. It’s stable but doesn’t separate classes well.

2. HistGradientBoosting
Train vs Test:
Accuracy: 0.86 (train), 0.85 (test) - Generalizes well.
But recall is terrible (0.06 train, 0.02 test) - It’s predicting almost all negatives.
Precision is misleadingly high on train (0.87) but collapses on test (0.54).
ROC-AUC 0.52 (train/test) - basically random performance.

Interpretation: The model is too conservative, it avoids predicting positives, so recall suffers badly.

3. Random Forest
Train vs Test:
Train accuracy: 0.999 - Clear overfitting.
Test accuracy: 0.85, but recall only 0.07 - terrible recall.
ROC-AUC 0.52 on test - random.

Interpretation: Random forest memorized the training set (overfit) but fails to generalize.

4. XGBoost
Train vs Test:
Train accuracy: 0.90 | Test accuracy: 0.75 - Some overfitting.
Recall on train: 0.95 - excellent, but test recall: 0.41 - much lower.
Precision test 0.27 - a lot of false positives.
ROC-AUC test 0.61 - weak, but better than RF/HGB.

Interpretation: This is our best model so far because it at least captures recall, but it still overfits and loses power on test.

Key Takeaway: Logistic Regression is our best algorithm, we will pick it and optimize.

Since we are working on credit scoring, catching defaults (positives) is more important. optimize for recall, not accuracy.


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
import numpy as np

# ----------------------------
# 1. Baseline Logistic Regression
# ----------------------------
logreg = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=42)
logreg.fit(X_train, y_train)

y_pred_baseline = logreg.predict(X_test)
print("==== Baseline Model (Threshold=0.5) ====")
print(classification_report(y_test, y_pred_baseline))

# ----------------------------
# 2. Adjust Threshold
# ----------------------------
y_probs = logreg.predict_proba(X_test)[:, 1]
y_pred_thresh = (y_probs >= 0.3).astype(int)   # lower threshold

print("\n==== After Lowering Threshold to 0.3 ====")
print(classification_report(y_test, y_pred_thresh))

# ----------------------------
# 3. Handle Imbalance with SMOTE
# ----------------------------
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

logreg_smote = LogisticRegression(max_iter=2000, random_state=42)
logreg_smote.fit(X_resampled, y_resampled)

y_pred_smote = logreg_smote.predict(X_test)
print("\n==== After SMOTE Oversampling ====")
print(classification_report(y_test, y_pred_smote))

# ----------------------------
# 4. Hyperparameter Tuning for Recall
# ----------------------------
param_grid = {"C": [0.01, 0.1, 1, 10, 100]}

grid = GridSearchCV(
    LogisticRegression(max_iter=2000, class_weight="balanced", random_state=42),
    param_grid=param_grid,
    scoring="recall",
    cv=5
)

grid.fit(X_train, y_train)
best_logreg = grid.best_estimator_

y_pred_best = best_logreg.predict(X_test)
print("\n==== After Hyperparameter Tuning for Recall ====")
print("Best C:", grid.best_params_["C"])
print(classification_report(y_test, y_pred_best))


==== Baseline Model (Threshold=0.5) ====
              precision    recall  f1-score   support

           0       0.92      0.66      0.77      6423
           1       0.24      0.66      0.36      1077

    accuracy                           0.66      7500
   macro avg       0.58      0.66      0.56      7500
weighted avg       0.82      0.66      0.71      7500


==== After Lowering Threshold to 0.3 ====
              precision    recall  f1-score   support

           0       0.95      0.25      0.39      6423
           1       0.17      0.93      0.29      1077

    accuracy                           0.35      7500
   macro avg       0.56      0.59      0.34      7500
weighted avg       0.84      0.35      0.38      7500


==== After SMOTE Oversampling ====
              precision    recall  f1-score   support

           0       0.92      0.65      0.76      6423
           1       0.24      0.66      0.35      1077

    accuracy                           0.65      7500
   macro

## Interpretation
1. Baseline (Threshold = 0.5)
Precision (class 1): 0.24
Recall (class 1): 0.66
F1 (class 1): 0.36
Accuracy: 0.66

This is actually decent recall for the minority class (66%). Provides balance (not too many false alarms), but precision is low — lots of false positives.

2. Lowering Threshold to 0.3

Precision (class 1): 0.17 → dropped a lot

Recall (class 1): 0.93 → shot way up

F1 (class 1): 0.29 → dropped

By lowering the threshold, the model predicts “positive” much more often, so it catches nearly all true positives (93% recall), but at the cost of predicting many false positives (precision collapse).
This is great if recall is critical (e.g., catching almost all risky clients), but business costs of false positives matter. We would actually pick this

3. After SMOTE Oversampling
Precision (class 1): 0.24
Recall (class 1): 0.66
F1 (class 1): 0.35
This is basically the same as baseline, which tells us SMOTE didn’t add much value here.

4. After Hyperparameter Tuning (Best C = 0.01)
Precision (class 1): 0.25
Recall (class 1): 0.66
F1 (class 1): 0.36

Almost identical to baseline.
The best regularization (C=0.01) slightly stabilizes the model, but it doesn’t fundamentally change performance 

Key takeaway:
We want maximum recall (dont miss risky borrowers), but sacrifice precision.

