# Traditional Machine Learning Models

This notebook trains and evaluates traditional machine learning models for credit risk prediction.

**Purpose**: Compare different ML algorithms to find the best model for predicting loan defaults.

**Models Trained**:
1. **Logistic Regression**: Linear model, interpretable, good baseline
2. **Random Forest**: Ensemble of decision trees, handles non-linear patterns
3. **XGBoost**: Gradient boosting, often best performance

**Process**:
- Load preprocessed data from notebook 02
- Train each model with hyperparameter tuning (GridSearchCV)
- Evaluate using cross-validation to prevent overfitting
- Optimize classification threshold for best F1 score
- Generate performance metrics and visualizations
- Save trained models for later use

**Output**: Trained models, performance metrics, and evaluation visualizations saved to models/ and artifacts/03_traditional_ml_images/

## Setup and Imports


In [1]:
import os
import json
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score, 
    precision_score, recall_score, precision_recall_curve, 
    confusion_matrix, classification_report, roc_curve, auc
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

ROOT = os.path.abspath(os.getcwd())
PROJECT_ROOT = os.path.abspath(os.path.join(ROOT, '..'))

MODELS_DIR = os.path.join(PROJECT_ROOT, 'models')
DATASET_DIR = os.path.join(PROJECT_ROOT, 'dataset')
ARTIFACTS_DIR = os.path.join(PROJECT_ROOT, 'artifacts')

os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(DATASET_DIR, exist_ok=True)
os.makedirs(ARTIFACTS_DIR, exist_ok=True)
os.makedirs(os.path.join(ARTIFACTS_DIR, '03_traditional_ml_images'), exist_ok=True)


## Load Data


In [2]:
X_train_frame = pd.read_pickle(os.path.join(DATASET_DIR, 'X_train.pkl'))
y_train_series = pd.read_pickle(os.path.join(DATASET_DIR, 'y_train.pkl'))
X_test_frame = pd.read_pickle(os.path.join(DATASET_DIR, 'X_test.pkl'))
y_test_series = pd.read_pickle(os.path.join(DATASET_DIR, 'y_test.pkl'))

if isinstance(y_train_series, pd.DataFrame):
    y_train_series = y_train_series.iloc[:, 0]
if isinstance(y_test_series, pd.DataFrame):
    y_test_series = y_test_series.iloc[:, 0]

feature_names = list(X_train_frame.columns)

X_train = X_train_frame.to_numpy(copy=False)
X_test = X_test_frame.to_numpy(copy=False)
y_train = y_train_series.to_numpy(copy=False)
y_test = y_test_series.to_numpy(copy=False)

print(f'Training set: {X_train.shape}, Test set: {X_test.shape}')


Training set: (26064, 17), Test set: (6517, 17)


## Data Preparation


In [3]:
# Calculate class imbalance ratio for XGBoost
# XGBoost needs this to properly weight the minority class during training
# This helps the model learn from both good loans and bad loans equally
positive_class_ratio = (y_train == 1).mean()  # Proportion of bad loans
negative_class_ratio = 1 - positive_class_ratio  # Proportion of good loans
xgb_scale_pos_weight = negative_class_ratio / (positive_class_ratio + 1e-9)  # Weight for positive class

print('CLASS IMBALANCE ANALYSIS')
print(f'Positive class (bad loans) ratio: {positive_class_ratio:.4f} ({positive_class_ratio*100:.2f}%)')
print(f'Negative class (good loans) ratio: {negative_class_ratio:.4f} ({negative_class_ratio*100:.2f}%)')
print(f'XGBoost scale_pos_weight: {xgb_scale_pos_weight:.2f}')

# Set up cross-validation strategy
# StratifiedKFold: Maintains class distribution in each fold (important for imbalanced data)
# n_splits=5: Split data into 5 folds for cross-validation
# shuffle=True: Randomize data before splitting (better for model evaluation)
# random_state=42: Set seed for reproducibility
cross_validator = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print(f'\nCross-validation configured: 5-fold stratified (maintains class balance)')


CLASS IMBALANCE ANALYSIS
Positive class (bad loans) ratio: 0.2182 (21.82%)
Negative class (good loans) ratio: 0.7818 (78.18%)
XGBoost scale_pos_weight: 3.58

Cross-validation configured: 5-fold stratified (maintains class balance)


## Model 1: Logistic Regression


### Training


In [4]:
# Model 1: Logistic Regression
# Why Logistic Regression?
#   - Simple, interpretable linear model
#   - Good baseline for comparison
#   - Fast to train
#   - Provides feature importance through coefficients

# Configure model
# max_iter=4000: Allow more iterations for convergence
# class_weight='balanced': Automatically adjust weights for imbalanced classes
# solver='liblinear': Fast solver that supports both L1 and L2 regularization
logistic_model = LogisticRegression(max_iter=4000, class_weight='balanced', solver='liblinear')

# Hyperparameter grid for tuning
# C: Regularization strength (smaller = more regularization, prevents overfitting)
# penalty: L1 (sparse features) or L2 (smooth features)
logistic_params = {
    'C': [0.01, 0.1, 1.0, 5.0, 10.0],  # Try different regularization strengths
    'penalty': ['l1', 'l2']  # Try both L1 and L2 regularization
}

print('TRAINING LOGISTIC REGRESSION')
print('Method: GridSearchCV with 5-fold cross-validation')
print('Searching for best hyperparameters...')
print('  - C values: [0.01, 0.1, 1.0, 5.0, 10.0]')
print('  - Penalty: [L1, L2]')
print('  - Total combinations: 10')
print('\nTraining in progress (this may take a few minutes)...')

# GridSearchCV: Tries all combinations of hyperparameters
# scoring='roc_auc': Optimize for ROC-AUC (area under ROC curve)
# cv=cross_validator: Use our 5-fold stratified cross-validation
# n_jobs=-1: Use all CPU cores for faster training
logistic_grid = GridSearchCV(logistic_model, logistic_params, scoring='roc_auc', 
                            cv=cross_validator, n_jobs=-1, verbose=1)
logistic_grid.fit(X_train, y_train)

# Get best model and evaluate
logistic_best_model = logistic_grid.best_estimator_
logistic_cv_auc = logistic_grid.best_score_

# Generate out-of-fold predictions for threshold optimization
# cross_val_predict: Get predictions from each fold (used for training, not test)
logistic_oof_scores = cross_val_predict(logistic_best_model, X_train, y_train, 
                                       cv=cross_validator, method='predict_proba')[:, 1]
logistic_oof_auc = roc_auc_score(y_train, logistic_oof_scores)

print('LOGISTIC REGRESSION TRAINING COMPLETE')
print(f'Best hyperparameters: {logistic_grid.best_params_}')
print(f'Cross-validation AUC: {logistic_cv_auc:.4f}')
print(f'Out-of-fold AUC: {logistic_oof_auc:.4f}')
print('Model trained and ready for evaluation')

TRAINING LOGISTIC REGRESSION
Method: GridSearchCV with 5-fold cross-validation
Searching for best hyperparameters...
  - C values: [0.01, 0.1, 1.0, 5.0, 10.0]
  - Penalty: [L1, L2]
  - Total combinations: 10

Training in progress (this may take a few minutes)...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
LOGISTIC REGRESSION TRAINING COMPLETE
Best hyperparameters: {'C': 0.1, 'penalty': 'l1'}
Cross-validation AUC: 0.8636
Out-of-fold AUC: 0.8635
Model trained and ready for evaluation


### Threshold Optimization


In [5]:
# Find the best classification threshold by optimizing F1 score
# The default 0.5 threshold often doesn't work well for imbalanced data
# We test different thresholds and pick the one that gives highest F1 score

# Get precision and recall values for all possible thresholds
# precision_recall_curve returns arrays of precision, recall, and threshold values
prec, rec, thresholds = precision_recall_curve(y_train, logistic_oof_scores)

# Calculate F1 score for each threshold: F1 = 2 * (precision * recall) / (precision + recall)
# F1 balances precision and recall, which is important for imbalanced credit risk data
f1_scores = 2 * prec * rec / (prec + rec + 1e-9)

# Find the threshold that gives maximum F1 score
best_idx = np.nanargmax(f1_scores)
# Use previous threshold index (thresholds array is one shorter than prec/rec arrays)
log_optimal_threshold = thresholds[max(0, best_idx - 1)] if len(thresholds) > 0 else 0.5

print(f'Optimal threshold: {log_optimal_threshold:.4f}')


Optimal threshold: 0.6401


### Evaluation


In [6]:
# Convert probability scores to binary predictions using optimal threshold
# Scores above threshold = predicted bad loan (1), below = predicted good loan (0)
log_train_pred = (logistic_oof_scores >= log_optimal_threshold).astype(int)
log_test_scores = logistic_best_model.predict_proba(X_test)[:, 1]
log_test_pred = (log_test_scores >= log_optimal_threshold).astype(int)

# Calculate performance metrics for training set
# ROC-AUC: Overall model discrimination ability (higher is better, max 1.0)
# PR-AUC: Precision-recall area, better for imbalanced data than ROC-AUC
# F1: Harmonic mean of precision and recall, balances both concerns
# Precision: Of loans predicted as bad, how many are actually bad (reduces false alarms)
# Recall: Of actual bad loans, how many did we catch (reduces missed defaults)
log_train_metrics = {
    'roc_auc': roc_auc_score(y_train, logistic_oof_scores),
    'pr_auc': average_precision_score(y_train, logistic_oof_scores),
    'f1': f1_score(y_train, log_train_pred),
    'precision': precision_score(y_train, log_train_pred),
    'recall': recall_score(y_train, log_train_pred)
}

# Calculate same metrics for test set to check generalization
log_test_metrics = {
    'roc_auc': roc_auc_score(y_test, log_test_scores),
    'pr_auc': average_precision_score(y_test, log_test_scores),
    'f1': f1_score(y_test, log_test_pred),
    'precision': precision_score(y_test, log_test_pred),
    'recall': recall_score(y_test, log_test_pred),
    'threshold': log_optimal_threshold
}

# Build confusion matrix to calculate specificity
# Confusion matrix shows: True Negatives, False Positives, False Negatives, True Positives
log_train_cm = confusion_matrix(y_train, log_train_pred)
log_test_cm = confusion_matrix(y_test, log_test_pred)

# Specificity: Of actual good loans, how many did we correctly identify as good
# Important for credit risk - we want to avoid rejecting good customers
# Formula: True Negatives / (True Negatives + False Positives)
log_train_specificity = log_train_cm[0, 0] / (log_train_cm[0, 0] + log_train_cm[0, 1] + 1e-9)
log_test_specificity = log_test_cm[0, 0] / (log_test_cm[0, 0] + log_test_cm[0, 1] + 1e-9)

print('Training Metrics:')
print(f"  ROC-AUC: {log_train_metrics['roc_auc']:.4f}, PR-AUC: {log_train_metrics['pr_auc']:.4f}")
print(f"  F1: {log_train_metrics['f1']:.4f}, Precision: {log_train_metrics['precision']:.4f}, Recall: {log_train_metrics['recall']:.4f}")
print(f"  Specificity: {log_train_specificity:.4f}")

print('\nTest Metrics:')
print(f"  ROC-AUC: {log_test_metrics['roc_auc']:.4f}, PR-AUC: {log_test_metrics['pr_auc']:.4f}")
print(f"  F1: {log_test_metrics['f1']:.4f}, Precision: {log_test_metrics['precision']:.4f}, Recall: {log_test_metrics['recall']:.4f}")
print(f"  Specificity: {log_test_specificity:.4f}")


Training Metrics:
  ROC-AUC: 0.8635, PR-AUC: 0.6981
  F1: 0.6430, Precision: 0.6185, Recall: 0.6695
  Specificity: 0.8848

Test Metrics:
  ROC-AUC: 0.8663, PR-AUC: 0.6956
  F1: 0.6454, Precision: 0.6248, Recall: 0.6674
  Specificity: 0.8881


### Visualizations


In [7]:
images_dir = os.path.join(ARTIFACTS_DIR, '03_traditional_ml_images')
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 150

fpr_train, tpr_train, _ = roc_curve(y_train, logistic_oof_scores)
fpr_test, tpr_test, _ = roc_curve(y_test, log_test_scores)
auc_train = auc(fpr_train, tpr_train)
auc_test = auc(fpr_test, tpr_test)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_train, tpr_train, label=f'Train (AUC = {auc_train:.4f})', linewidth=2, linestyle='--')
ax.plot(fpr_test, tpr_test, label=f'Test (AUC = {auc_test:.4f})', linewidth=2)
ax.plot([0, 1], [0, 1], 'k--', linewidth=1, alpha=0.5, label='Random')
ax.set_xlabel('False Positive Rate', fontsize=11)
ax.set_ylabel('True Positive Rate', fontsize=11)
ax.set_title('Logistic Regression - ROC Curve', fontsize=12, fontweight='bold')
ax.legend(loc='lower right')
ax.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(images_dir, 'logistic_regression_roc.png'), dpi=150, bbox_inches='tight')
plt.close()
print('Saved: logistic_regression_roc.png')


Saved: logistic_regression_roc.png


In [8]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

sns.heatmap(log_train_cm, annot=True, fmt='d', cmap='Blues', ax=ax1, cbar=False)
ax1.set_title('Training Set', fontsize=11, fontweight='bold')
ax1.set_xlabel('Predicted')
ax1.set_ylabel('Actual')

sns.heatmap(log_test_cm, annot=True, fmt='d', cmap='Blues', ax=ax2, cbar=False)
ax2.set_title('Test Set', fontsize=11, fontweight='bold')
ax2.set_xlabel('Predicted')
ax2.set_ylabel('Actual')

plt.suptitle('Logistic Regression - Confusion Matrix', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig(os.path.join(images_dir, 'logistic_regression_cm.png'), dpi=150, bbox_inches='tight')
plt.close()
print('Saved: logistic_regression_cm.png')


Saved: logistic_regression_cm.png


In [9]:
prec_train, rec_train, _ = precision_recall_curve(y_train, logistic_oof_scores)
prec_test, rec_test, _ = precision_recall_curve(y_test, log_test_scores)
pr_auc_train = average_precision_score(y_train, logistic_oof_scores)
pr_auc_test = average_precision_score(y_test, log_test_scores)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(rec_train, prec_train, label=f'Train (AUC = {pr_auc_train:.4f})', linewidth=2, linestyle='--')
ax.plot(rec_test, prec_test, label=f'Test (AUC = {pr_auc_test:.4f})', linewidth=2)
ax.set_xlabel('Recall', fontsize=11)
ax.set_ylabel('Precision', fontsize=11)
ax.set_title('Logistic Regression - Precision-Recall Curve', fontsize=12, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(images_dir, 'logistic_regression_pr.png'), dpi=150, bbox_inches='tight')
plt.close()
print('Saved: logistic_regression_pr.png')


Saved: logistic_regression_pr.png


In [10]:
log_importance = np.abs(logistic_best_model.coef_[0])
log_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': log_importance
}).sort_values('importance', ascending=False).head(15)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=log_importance_df, y='feature', x='importance', ax=ax, palette='viridis')
ax.set_xlabel('Importance (|Coefficient|)', fontsize=11)
ax.set_ylabel('')
ax.set_title('Logistic Regression - Top 15 Feature Importance', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig(os.path.join(images_dir, 'logistic_regression_importance.png'), dpi=150, bbox_inches='tight')
plt.close()
print('Saved: logistic_regression_importance.png')


Saved: logistic_regression_importance.png


## Model 2: Random Forest


### Training


In [11]:
# Random Forest: Ensemble of decision trees that vote on predictions
# class_weight='balanced': Automatically adjusts weights to handle imbalanced classes
# random_state: Ensures reproducible results
# n_jobs=-1: Uses all CPU cores for faster training
rf_model = RandomForestClassifier(class_weight='balanced', random_state=48, n_jobs=-1)

# Hyperparameter grid for tuning
# max_depth: Maximum depth of trees (None = no limit, deeper = more complex but risk overfitting)
# min_samples_split: Minimum samples needed to split a node (higher = simpler trees, prevents overfitting)
# min_samples_leaf: Minimum samples in leaf nodes (higher = smoother predictions)
rf_params = {
    'max_depth': [None, 8, 12],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

# GridSearchCV: Tests all combinations of hyperparameters using cross-validation
# scoring='roc_auc': Optimizes for ROC-AUC score (good for imbalanced data)
# cv=cross_validator: Uses our 5-fold stratified cross-validation
rf_grid = GridSearchCV(rf_model, rf_params, scoring='roc_auc', cv=cross_validator, n_jobs=-1, verbose=1)
rf_grid.fit(X_train, y_train)

# Get the best model found by grid search
rf_best = rf_grid.best_estimator_
rf_cv_auc = rf_grid.best_score_

# Generate out-of-fold predictions: For each sample, get prediction from model trained on other folds
# This gives unbiased performance estimate without using test set
# method='predict_proba': Returns probability scores (not just 0/1 predictions)
rf_oof_scores = cross_val_predict(rf_best, X_train, y_train, cv=cross_validator, method='predict_proba')[:, 1]
rf_oof_auc = roc_auc_score(y_train, rf_oof_scores)

print(f'Best params: {rf_grid.best_params_}')
print(f'CV AUC: {rf_cv_auc:.4f}, OOF AUC: {rf_oof_auc:.4f}')


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
CV AUC: 0.9315, OOF AUC: 0.9315


### Threshold Optimization


In [12]:
# Find optimal threshold for Random Forest using F1 score optimization
# Same approach as Logistic Regression: test all thresholds and pick best F1
prec, rec, thresholds = precision_recall_curve(y_train, rf_oof_scores)
f1_scores = 2 * prec * rec / (prec + rec + 1e-9)
best_idx = np.nanargmax(f1_scores)
rf_optimal_threshold = thresholds[max(0, best_idx - 1)] if len(thresholds) > 0 else 0.5

print(f'Optimal threshold: {rf_optimal_threshold:.4f}')


Optimal threshold: 0.4400


### Evaluation


In [13]:
# Evaluate Random Forest using optimal threshold
# Convert probabilities to binary predictions
rf_train_pred = (rf_oof_scores >= rf_optimal_threshold).astype(int)
rf_test_scores = rf_best.predict_proba(X_test)[:, 1]
rf_test_pred = (rf_test_scores >= rf_optimal_threshold).astype(int)

# Calculate comprehensive metrics for training set
rf_train_metrics = {
    'roc_auc': roc_auc_score(y_train, rf_oof_scores),
    'pr_auc': average_precision_score(y_train, rf_oof_scores),
    'f1': f1_score(y_train, rf_train_pred),
    'precision': precision_score(y_train, rf_train_pred),
    'recall': recall_score(y_train, rf_train_pred)
}

# Calculate metrics for test set to assess generalization
rf_test_metrics = {
    'roc_auc': roc_auc_score(y_test, rf_test_scores),
    'pr_auc': average_precision_score(y_test, rf_test_scores),
    'f1': f1_score(y_test, rf_test_pred),
    'precision': precision_score(y_test, rf_test_pred),
    'recall': recall_score(y_test, rf_test_pred),
    'threshold': rf_optimal_threshold
}

# Calculate specificity from confusion matrix
rf_train_cm = confusion_matrix(y_train, rf_train_pred)
rf_test_cm = confusion_matrix(y_test, rf_test_pred)
rf_train_specificity = rf_train_cm[0, 0] / (rf_train_cm[0, 0] + rf_train_cm[0, 1] + 1e-9)
rf_test_specificity = rf_test_cm[0, 0] / (rf_test_cm[0, 0] + rf_test_cm[0, 1] + 1e-9)

print('Training Metrics:')
print(f"  ROC-AUC: {rf_train_metrics['roc_auc']:.4f}, PR-AUC: {rf_train_metrics['pr_auc']:.4f}")
print(f"  F1: {rf_train_metrics['f1']:.4f}, Precision: {rf_train_metrics['precision']:.4f}, Recall: {rf_train_metrics['recall']:.4f}")
print(f"  Specificity: {rf_train_specificity:.4f}")

print('\nTest Metrics:')
print(f"  ROC-AUC: {rf_test_metrics['roc_auc']:.4f}, PR-AUC: {rf_test_metrics['pr_auc']:.4f}")
print(f"  F1: {rf_test_metrics['f1']:.4f}, Precision: {rf_test_metrics['precision']:.4f}, Recall: {rf_test_metrics['recall']:.4f}")
print(f"  Specificity: {rf_test_specificity:.4f}")

Training Metrics:
  ROC-AUC: 0.9315, PR-AUC: 0.8815
  F1: 0.8226, Precision: 0.9506, Recall: 0.7249
  Specificity: 0.9895

Test Metrics:
  ROC-AUC: 0.9396, PR-AUC: 0.8948
  F1: 0.8356, Precision: 0.9599, Recall: 0.7398
  Specificity: 0.9914


### Visualizations


In [14]:
fpr_train, tpr_train, _ = roc_curve(y_train, rf_oof_scores)
fpr_test, tpr_test, _ = roc_curve(y_test, rf_test_scores)
auc_train = auc(fpr_train, tpr_train)
auc_test = auc(fpr_test, tpr_test)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_train, tpr_train, label=f'Train (AUC = {auc_train:.4f})', linewidth=2, linestyle='--')
ax.plot(fpr_test, tpr_test, label=f'Test (AUC = {auc_test:.4f})', linewidth=2)
ax.plot([0, 1], [0, 1], 'k--', linewidth=1, alpha=0.5, label='Random')
ax.set_xlabel('False Positive Rate', fontsize=11)
ax.set_ylabel('True Positive Rate', fontsize=11)
ax.set_title('Random Forest - ROC Curve', fontsize=12, fontweight='bold')
ax.legend(loc='lower right')
ax.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(images_dir, 'random_forest_roc.png'), dpi=150, bbox_inches='tight')
plt.close()
print('Saved: random_forest_roc.png')


Saved: random_forest_roc.png


In [15]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

sns.heatmap(rf_train_cm, annot=True, fmt='d', cmap='Blues', ax=ax1, cbar=False)
ax1.set_title('Training Set', fontsize=11, fontweight='bold')
ax1.set_xlabel('Predicted')
ax1.set_ylabel('Actual')

sns.heatmap(rf_test_cm, annot=True, fmt='d', cmap='Blues', ax=ax2, cbar=False)
ax2.set_title('Test Set', fontsize=11, fontweight='bold')
ax2.set_xlabel('Predicted')
ax2.set_ylabel('Actual')

plt.suptitle('Random Forest - Confusion Matrix', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig(os.path.join(images_dir, 'random_forest_cm.png'), dpi=150, bbox_inches='tight')
plt.close()
print('Saved: random_forest_cm.png')


Saved: random_forest_cm.png


In [16]:
prec_train, rec_train, _ = precision_recall_curve(y_train, rf_oof_scores)
prec_test, rec_test, _ = precision_recall_curve(y_test, rf_test_scores)
pr_auc_train = average_precision_score(y_train, rf_oof_scores)
pr_auc_test = average_precision_score(y_test, rf_test_scores)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(rec_train, prec_train, label=f'Train (AUC = {pr_auc_train:.4f})', linewidth=2, linestyle='--')
ax.plot(rec_test, prec_test, label=f'Test (AUC = {pr_auc_test:.4f})', linewidth=2)
ax.set_xlabel('Recall', fontsize=11)
ax.set_ylabel('Precision', fontsize=11)
ax.set_title('Random Forest - Precision-Recall Curve', fontsize=12, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(images_dir, 'random_forest_pr.png'), dpi=150, bbox_inches='tight')
plt.close()
print('Saved: random_forest_pr.png')


Saved: random_forest_pr.png


In [17]:
rf_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': rf_best.feature_importances_
}).sort_values('importance', ascending=False).head(15)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=rf_importance_df, y='feature', x='importance', ax=ax, palette='viridis')
ax.set_xlabel('Importance', fontsize=11)
ax.set_ylabel('')
ax.set_title('Random Forest - Top 15 Feature Importance', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig(os.path.join(images_dir, 'random_forest_importance.png'), dpi=150, bbox_inches='tight')
plt.close()
print('Saved: random_forest_importance.png')


Saved: random_forest_importance.png


## Model 3: XGBoost


### Training


In [18]:
# XGBoost: Gradient boosting algorithm that builds trees sequentially
# objective='binary:logistic': Binary classification with logistic output
# eval_metric='auc': Use AUC for evaluation during training
# tree_method='hist': Fast histogram-based tree construction
# learning_rate=0.02: Small learning rate for more stable training (needs more trees)
# scale_pos_weight: Weight for positive class (bad loans) - calculated from class imbalance
#   This makes the model focus more on correctly identifying bad loans
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='hist',
    learning_rate=0.02,
    random_state=48,
    scale_pos_weight=xgb_scale_pos_weight
)

# Hyperparameter grid for tuning
# max_depth: Tree depth (deeper = more complex, risk overfitting)
# min_child_weight: Minimum sum of instance weights in child nodes (higher = more regularization)
# gamma: Minimum loss reduction to split (higher = simpler trees, prevents overfitting)
# reg_lambda: L2 regularization on leaf weights (higher = smoother predictions)
xgb_params = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 5, 10],
    'gamma': [0, 0.1, 0.3],
    'reg_lambda': [1.0, 2.0, 5.0]
}

# Grid search to find best hyperparameters
xgb_grid = GridSearchCV(xgb_model, xgb_params, scoring='roc_auc', cv=cross_validator, n_jobs=-1, verbose=1)
xgb_grid.fit(X_train, y_train)

# Get best model and generate out-of-fold predictions
xgb_best = xgb_grid.best_estimator_
xgb_cv_auc = xgb_grid.best_score_
xgb_oof_scores = cross_val_predict(xgb_best, X_train, y_train, cv=cross_validator, method='predict_proba')[:, 1]
xgb_oof_auc = roc_auc_score(y_train, xgb_oof_scores)

print(f'Best params: {xgb_grid.best_params_}')
print(f'CV AUC: {xgb_cv_auc:.4f}, OOF AUC: {xgb_oof_auc:.4f}')


Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best params: {'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 5, 'reg_lambda': 1.0}
CV AUC: 0.9281, OOF AUC: 0.9279


### Threshold Optimization


In [19]:
# Find optimal threshold for XGBoost using F1 score optimization
prec, rec, thresholds = precision_recall_curve(y_train, xgb_oof_scores)
f1_scores = 2 * prec * rec / (prec + rec + 1e-9)
best_idx = np.nanargmax(f1_scores)
xgb_optimal_threshold = thresholds[max(0, best_idx - 1)] if len(thresholds) > 0 else 0.5

print(f'Optimal threshold: {xgb_optimal_threshold:.4f}')


Optimal threshold: 0.6793


### Evaluation


In [20]:
# Evaluate XGBoost using optimal threshold
xgb_train_pred = (xgb_oof_scores >= xgb_optimal_threshold).astype(int)
xgb_test_scores = xgb_best.predict_proba(X_test)[:, 1]
xgb_test_pred = (xgb_test_scores >= xgb_optimal_threshold).astype(int)

# Calculate performance metrics
xgb_train_metrics = {
    'roc_auc': roc_auc_score(y_train, xgb_oof_scores),
    'pr_auc': average_precision_score(y_train, xgb_oof_scores),
    'f1': f1_score(y_train, xgb_train_pred),
    'precision': precision_score(y_train, xgb_train_pred),
    'recall': recall_score(y_train, xgb_train_pred)
}

xgb_test_metrics = {
    'roc_auc': roc_auc_score(y_test, xgb_test_scores),
    'pr_auc': average_precision_score(y_test, xgb_test_scores),
    'f1': f1_score(y_test, xgb_test_pred),
    'precision': precision_score(y_test, xgb_test_pred),
    'recall': recall_score(y_test, xgb_test_pred),
    'threshold': xgb_optimal_threshold
}

# Calculate specificity
xgb_train_cm = confusion_matrix(y_train, xgb_train_pred)
xgb_test_cm = confusion_matrix(y_test, xgb_test_pred)
xgb_train_specificity = xgb_train_cm[0, 0] / (xgb_train_cm[0, 0] + xgb_train_cm[0, 1] + 1e-9)
xgb_test_specificity = xgb_test_cm[0, 0] / (xgb_test_cm[0, 0] + xgb_test_cm[0, 1] + 1e-9)

print('Training Metrics:')
print(f"  ROC-AUC: {xgb_train_metrics['roc_auc']:.4f}, PR-AUC: {xgb_train_metrics['pr_auc']:.4f}")
print(f"  F1: {xgb_train_metrics['f1']:.4f}, Precision: {xgb_train_metrics['precision']:.4f}, Recall: {xgb_train_metrics['recall']:.4f}")
print(f"  Specificity: {xgb_train_specificity:.4f}")

print('\nTest Metrics:')
print(f"  ROC-AUC: {xgb_test_metrics['roc_auc']:.4f}, PR-AUC: {xgb_test_metrics['pr_auc']:.4f}")
print(f"  F1: {xgb_test_metrics['f1']:.4f}, Precision: {xgb_test_metrics['precision']:.4f}, Recall: {xgb_test_metrics['recall']:.4f}")
print(f"  Specificity: {xgb_test_specificity:.4f}")

Training Metrics:
  ROC-AUC: 0.9279, PR-AUC: 0.8757
  F1: 0.8146, Precision: 0.9685, Recall: 0.7030
  Specificity: 0.9936

Test Metrics:
  ROC-AUC: 0.9358, PR-AUC: 0.8867
  F1: 0.8241, Precision: 0.9787, Recall: 0.7117
  Specificity: 0.9957


### Visualizations


In [21]:
fpr_train, tpr_train, _ = roc_curve(y_train, xgb_oof_scores)
fpr_test, tpr_test, _ = roc_curve(y_test, xgb_test_scores)
auc_train = auc(fpr_train, tpr_train)
auc_test = auc(fpr_test, tpr_test)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_train, tpr_train, label=f'Train (AUC = {auc_train:.4f})', linewidth=2, linestyle='--')
ax.plot(fpr_test, tpr_test, label=f'Test (AUC = {auc_test:.4f})', linewidth=2)
ax.plot([0, 1], [0, 1], 'k--', linewidth=1, alpha=0.5, label='Random')
ax.set_xlabel('False Positive Rate', fontsize=11)
ax.set_ylabel('True Positive Rate', fontsize=11)
ax.set_title('XGBoost - ROC Curve', fontsize=12, fontweight='bold')
ax.legend(loc='lower right')
ax.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(images_dir, 'xgboost_roc.png'), dpi=150, bbox_inches='tight')
plt.close()
print('Saved: xgboost_roc.png')


Saved: xgboost_roc.png


In [22]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

sns.heatmap(xgb_train_cm, annot=True, fmt='d', cmap='Blues', ax=ax1, cbar=False)
ax1.set_title('Training Set', fontsize=11, fontweight='bold')
ax1.set_xlabel('Predicted')
ax1.set_ylabel('Actual')

sns.heatmap(xgb_test_cm, annot=True, fmt='d', cmap='Blues', ax=ax2, cbar=False)
ax2.set_title('Test Set', fontsize=11, fontweight='bold')
ax2.set_xlabel('Predicted')
ax2.set_ylabel('Actual')

plt.suptitle('XGBoost - Confusion Matrix', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig(os.path.join(images_dir, 'xgboost_cm.png'), dpi=150, bbox_inches='tight')
plt.close()
print('Saved: xgboost_cm.png')


Saved: xgboost_cm.png


In [23]:
prec_train, rec_train, _ = precision_recall_curve(y_train, xgb_oof_scores)
prec_test, rec_test, _ = precision_recall_curve(y_test, xgb_test_scores)
pr_auc_train = average_precision_score(y_train, xgb_oof_scores)
pr_auc_test = average_precision_score(y_test, xgb_test_scores)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(rec_train, prec_train, label=f'Train (AUC = {pr_auc_train:.4f})', linewidth=2, linestyle='--')
ax.plot(rec_test, prec_test, label=f'Test (AUC = {pr_auc_test:.4f})', linewidth=2)
ax.set_xlabel('Recall', fontsize=11)
ax.set_ylabel('Precision', fontsize=11)
ax.set_title('XGBoost - Precision-Recall Curve', fontsize=12, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(images_dir, 'xgboost_pr.png'), dpi=150, bbox_inches='tight')
plt.close()
print('Saved: xgboost_pr.png')


Saved: xgboost_pr.png


In [24]:
xgb_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': xgb_best.feature_importances_
}).sort_values('importance', ascending=False).head(15)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=xgb_importance_df, y='feature', x='importance', ax=ax, palette='viridis')
ax.set_xlabel('Importance', fontsize=11)
ax.set_ylabel('')
ax.set_title('XGBoost - Top 15 Feature Importance', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig(os.path.join(images_dir, 'xgboost_importance.png'), dpi=150, bbox_inches='tight')
plt.close()
print('Saved: xgboost_importance.png')


Saved: xgboost_importance.png


## Model Comparison


### Performance Ranking


In [25]:
# Compare all three models across multiple metrics
# OOF AUC: Out-of-fold AUC from cross-validation (training performance estimate)
# Test metrics: Performance on held-out test set (true generalization)
comparison_data = {
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost'],
    'OOF AUC': [logistic_oof_auc, rf_oof_auc, xgb_oof_auc],
    'Test ROC-AUC': [log_test_metrics['roc_auc'], rf_test_metrics['roc_auc'], xgb_test_metrics['roc_auc']],
    'Test PR-AUC': [log_test_metrics['pr_auc'], rf_test_metrics['pr_auc'], xgb_test_metrics['pr_auc']],
    'Test F1': [log_test_metrics['f1'], rf_test_metrics['f1'], xgb_test_metrics['f1']],
    'Test Precision': [log_test_metrics['precision'], rf_test_metrics['precision'], xgb_test_metrics['precision']],
    'Test Recall': [log_test_metrics['recall'], rf_test_metrics['recall'], xgb_test_metrics['recall']],
    'Optimal Threshold': [log_optimal_threshold, rf_optimal_threshold, xgb_optimal_threshold]
}

comparison_df = pd.DataFrame(comparison_data)
# Sort by Test ROC-AUC (descending) to rank models by overall discrimination ability
# ROC-AUC is a good overall metric that works well even with imbalanced data
comparison_df = comparison_df.sort_values('Test ROC-AUC', ascending=False)
comparison_df.reset_index(drop=True, inplace=True)

print(comparison_df.to_string(index=False))


              Model  OOF AUC  Test ROC-AUC  Test PR-AUC  Test F1  Test Precision  Test Recall  Optimal Threshold
      Random Forest 0.931471      0.939644     0.894829 0.835584        0.959854     0.739803           0.440000
            XGBoost 0.927851      0.935770     0.886745 0.824104        0.978723     0.711674           0.679326
Logistic Regression 0.863523      0.866258     0.695580 0.645359        0.624753     0.667370           0.640149


### ROC Curves Comparison


In [26]:
log_fpr_test, log_tpr_test, _ = roc_curve(y_test, log_test_scores)
rf_fpr_test, rf_tpr_test, _ = roc_curve(y_test, rf_test_scores)
xgb_fpr_test, xgb_tpr_test, _ = roc_curve(y_test, xgb_test_scores)

fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(log_fpr_test, log_tpr_test, label=f'Logistic Regression (AUC = {log_test_metrics["roc_auc"]:.4f})', linewidth=2.5, color='#2E86AB')
ax.plot(rf_fpr_test, rf_tpr_test, label=f'Random Forest (AUC = {rf_test_metrics["roc_auc"]:.4f})', linewidth=2.5, color='#A23B72')
ax.plot(xgb_fpr_test, xgb_tpr_test, label=f'XGBoost (AUC = {xgb_test_metrics["roc_auc"]:.4f})', linewidth=2.5, color='#F18F01')
ax.plot([0, 1], [0, 1], 'k--', linewidth=1, alpha=0.5, label='Random Classifier')
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curves Comparison - Test Set', fontsize=14, fontweight='bold')
ax.legend(loc='lower right', fontsize=11)
ax.grid(alpha=0.3)
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
plt.tight_layout()
plt.savefig(os.path.join(images_dir, 'roc_curves_comparison.png'), dpi=150, bbox_inches='tight')
plt.close()
print('Saved: roc_curves_comparison.png')


Saved: roc_curves_comparison.png


## Save Results


In [27]:
# Prepare results dictionary with all model information
# Store models, hyperparameters, metrics, confusion matrices, and thresholds
all_results = {
    'logistic_regression': {
        'model': logistic_best_model,
        'best_params': logistic_grid.best_params_,
        'cv_auc': float(logistic_cv_auc),
        'oof_auc': float(logistic_oof_auc),
        'train_metrics': {k: float(v) for k, v in log_train_metrics.items()},
        'test_metrics': {k: float(v) for k, v in log_test_metrics.items()},
        'train_confusion': log_train_cm.tolist(),
        'test_confusion': log_test_cm.tolist(),
        'train_specificity': float(log_train_specificity),
        'test_specificity': float(log_test_specificity),
        'optimal_threshold': float(log_optimal_threshold)
    },
    'random_forest': {
        'model': rf_best,
        'best_params': rf_grid.best_params_,
        'cv_auc': float(rf_cv_auc),
        'oof_auc': float(rf_oof_auc),
        'train_metrics': {k: float(v) for k, v in rf_train_metrics.items()},
        'test_metrics': {k: float(v) for k, v in rf_test_metrics.items()},
        'train_confusion': rf_train_cm.tolist(),
        'test_confusion': rf_test_cm.tolist(),
        'train_specificity': float(rf_train_specificity),
        'test_specificity': float(rf_test_specificity),
        'optimal_threshold': float(rf_optimal_threshold)
    },
    'xgboost': {
        'model': xgb_best,
        'best_params': xgb_grid.best_params_,
        'cv_auc': float(xgb_cv_auc),
        'oof_auc': float(xgb_oof_auc),
        'train_metrics': {k: float(v) for k, v in xgb_train_metrics.items()},
        'test_metrics': {k: float(v) for k, v in xgb_test_metrics.items()},
        'train_confusion': xgb_train_cm.tolist(),
        'test_confusion': xgb_test_cm.tolist(),
        'train_specificity': float(xgb_train_specificity),
        'test_specificity': float(xgb_test_specificity),
        'optimal_threshold': float(xgb_optimal_threshold)
    },
    'comparison': json.loads(comparison_df.to_json(orient='records'))
}

# Create JSON version without model objects (models can't be serialized to JSON)
# This JSON file stores all metrics and hyperparameters for analysis
results_json = {}
for k, v in all_results.items():
    if k == 'comparison':
        results_json[k] = v
    else:
        results_json[k] = {kk: vv for kk, vv in v.items() if kk != 'model'}

with open(os.path.join(MODELS_DIR, 'traditional_ml_results.json'), 'w') as f:
    json.dump(results_json, f, indent=2)

# Save trained models using joblib (handles scikit-learn models well)
# Each model is saved with its feature names and optimal threshold for later use
for model_name, model_obj in [('logistic_regression', logistic_best_model), ('random_forest', rf_best), ('xgboost', xgb_best)]:
    model_path = os.path.join(MODELS_DIR, f'{model_name}_traditional.joblib')
    payload = {'model': model_obj, 'feature_names': feature_names, 'threshold': all_results[model_name]['optimal_threshold']}
    joblib.dump(payload, model_path)

print('Saved all model results and artifacts')


Saved all model results and artifacts
