Import Libraries

In [None]:
# Cell 1: Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# For model explainability
import shap

# Ensure plots are displayed inline
%matplotlib inline

# Seed constant used for reproducibility. It is passed explicitly as
# `random_state` where needed (e.g. to RandomForestClassifier); assigning it
# here does not by itself seed NumPy's or Python's global random generators.
RANDOM_STATE = 42


Load and Inspect the Dataset

In [None]:
# Cell 2: Load and Inspect the Dataset

# Update the file path as necessary. For example, if your CSV is named 'loan_data.csv'
data = pd.read_csv('loan_data.csv')

# Inspect the first few rows and data summary
print("Data Head:")
print(data.head())
print("\nData Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()


Train-Test Split

In [None]:
# Cell 3: Data Preprocessing and Train-Test Split

# -- Handle missing values
# For production, consider more sophisticated imputation instead of dropping rows
data = data.dropna()

# -- Feature selection: Define features (X) and target (y)
# Assuming 'Loan_Status' is the target variable; modify if necessary
X = data.drop('Loan_Status', axis=1)
y = data['Loan_Status']

# -- Convert categorical variables into dummy/indicator variables
# This will one-hot encode all categorical features
# NOTE(review): an identifier-like text column (if the CSV has one) would also
# be expanded into one indicator per value -- drop it beforehand if present.
X = pd.get_dummies(X, drop_first=True)

# -- Train-test split
# The modelling cells fit on X_train / y_train and evaluate on X_test / y_test,
# so the hold-out set is created here. The split is stratified to keep the
# class balance of the target in both parts and seeded for reproducibility.
# The 80/20 ratio is a conventional default; adjust as needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)


Define Model Evaluation Function

In [None]:
# Cell 5: Model Evaluation Function

def evaluate_model(model, X_test, y_test):
    """Print and plot the hold-out performance of a fitted binary classifier.

    Prints the classification report and the ROC AUC score, then shows the
    confusion matrix as a heatmap and the ROC curve.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix of the evaluation set.
        y_test: True labels of the evaluation set.

    Returns:
        None. All results are printed or displayed as matplotlib figures.
    """
    # Predict class labels
    y_pred = model.predict(X_test)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Column 1 of predict_proba is the probability of the second class
    y_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    print(f"ROC AUC Score: {auc:.4f}")

    # Plot the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

    # roc_curve can only infer the positive label for {0, 1} / {-1, 1} targets
    # and raises otherwise (e.g. for 'Y'/'N' strings). Name it explicitly as the
    # class that column 1 of predict_proba refers to; for 0/1 targets this is
    # the same label roc_curve would have picked on its own.
    classes = getattr(model, "classes_", None)
    pos_label = classes[1] if classes is not None and len(classes) == 2 else None

    # Plot the ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_proba, pos_label=pos_label)
    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {auc:.4f})')
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend(loc='lower right')
    plt.show()


Build and Evaluate the Random Forest Classifier

In [None]:
# Cell 6: Random Forest Classifier

# Search space explored for the forest: number of trees, tree depth and the
# minimum number of samples required to split a node
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# Base estimator, seeded so repeated runs give the same forest
rf = RandomForestClassifier(random_state=RANDOM_STATE)

# 5-fold cross-validated grid search scored by ROC AUC, using all CPU cores
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)

# Keep the refitted winner and report which parameter combination it uses
rf_best = rf_grid.best_estimator_
print("Best parameters for RandomForest:", rf_grid.best_params_)

# Report hold-out performance of the tuned forest
print("\n--- RandomForest Evaluation ---")
evaluate_model(rf_best, X_test, y_test)


Compare Models with ROC Curve

In [None]:
# Cell 8: Compare Models with ROC Curve

# NOTE(review): xgb_best is expected to be the tuned XGBoost model produced by
# an XGBoost training cell that is not part of this section -- confirm it has
# been run before executing this cell.

# Positive-class probabilities (column 1 of predict_proba) for each model
rf_probs = rf_best.predict_proba(X_test)[:, 1]
xgb_probs = xgb_best.predict_proba(X_test)[:, 1]

# ROC points per model; the thresholds are not needed for plotting
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_probs)
xgb_fpr, xgb_tpr, _ = roc_curve(y_test, xgb_probs)

# Overlay both curves on one figure, with the AUC shown in each legend entry
plt.figure(figsize=(8, 6))
plt.plot(
    rf_fpr,
    rf_tpr,
    label=f'RandomForest (AUC = {roc_auc_score(y_test, rf_probs):.4f})',
)
plt.plot(
    xgb_fpr,
    xgb_tpr,
    label=f'XGBoost (AUC = {roc_auc_score(y_test, xgb_probs):.4f})',
)
# Dashed black diagonal marks the chance-level classifier
plt.plot([0, 1], [0, 1], color='k', linestyle='--')
plt.title("ROC Curve Comparison")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()


SHAP Analysis for Model Explainability

In [None]:
# Cell 9: SHAP Analysis for Model Explainability

def _positive_class_shap_values(shap_values):
    """Return the SHAP values of class index 1 as a 2-D (samples, features) array.

    TreeExplainer.shap_values() has used two layouts for classifiers with one
    output per class: older SHAP releases return a list holding one 2-D array
    per class, newer ones return a single 3-D array whose last axis indexes the
    classes. Indexing the 3-D array with ``[1]`` would select the second
    *sample* instead of the positive class, so both layouts are handled here.
    A 2-D input (single-output models) is returned unchanged.
    """
    if isinstance(shap_values, list):
        return shap_values[1]
    shap_values = np.asarray(shap_values)
    if shap_values.ndim == 3:
        return shap_values[:, :, 1]
    return shap_values


# --- SHAP for RandomForest ---
explainer_rf = shap.TreeExplainer(rf_best)
shap_values_rf = explainer_rf.shap_values(X_test)

# For binary classification, use the explanation for the positive class (index 1)
shap_values_rf_positive = _positive_class_shap_values(shap_values_rf)

# show=False keeps the figure open so a title can be added before displaying it
plt.figure()
shap.summary_plot(shap_values_rf_positive, X_test, plot_type="bar", show=False)
plt.title("SHAP Feature Importance (RandomForest)")
plt.show()

# --- SHAP for XGBoost ---
# NOTE(review): xgb_best is expected to come from an XGBoost training cell that
# is not part of this section -- confirm it has been run first.
explainer_xgb = shap.TreeExplainer(xgb_best)
shap_values_xgb = explainer_xgb.shap_values(X_test)

plt.figure()
shap.summary_plot(shap_values_xgb, X_test, plot_type="bar", show=False)
plt.title("SHAP Feature Importance (XGBoost)")
plt.show()
