# Loan Default Risk Predictor (Beginner ML Project)

This notebook builds a simple loan default risk model on the OpenML `credit-g` dataset and adds explainability with SHAP.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report

import shap

## 1. Load data

In [None]:
# Download the "credit-g" dataset (version 1) from OpenML with pandas-backed
# containers, so the full table is available as `data.frame`.
data = fetch_openml(
    name="credit-g",
    version=1,
    as_frame=True,
)

# Work on a copy so later edits never touch the object returned by the fetch.
df = data.frame.copy()

# Preview the first rows (rendered as the cell output).
df.head()

## 2. Basic EDA

In [None]:
# Table dimensions as a (rows, columns) tuple.
print(df.shape)

# Relative frequency of each target label; shows how imbalanced "class" is.
df.loc[:, "class"].value_counts(normalize=True)

## 3. Train/test split

In [None]:
# Target: encode the label so that "bad" is the positive class (1) and
# "good" is the negative class (0).
y = df["class"].map({"good": 0, "bad": 1})

# Features: every column except the target.
X = df.drop(columns=["class"])

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both splits; the fixed seed makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

## 4. Preprocessing + model

In [None]:
# Split the columns by dtype so each group gets its own preprocessing branch.
categorical_features = X.select_dtypes(include=["object", "category"]).columns
numeric_features = X.select_dtypes(include=["number"]).columns

# Numeric branch: fill missing values with the column median.
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode. handle_unknown="ignore" keeps transform from raising on
# categories that were not seen during fit.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its branch; numeric output columns come first,
# followed by the one-hot encoded categorical columns.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Random forest with class weights balanced against the label frequencies and
# a fixed seed for reproducibility.
model = RandomForestClassifier(
    class_weight="balanced",
    n_estimators=200,
    random_state=42,
)

# Chain preprocessing and the model so both are fitted on the training data
# only and applied consistently at prediction time.
clf = Pipeline([("preprocessor", preprocessor), ("model", model)])

clf.fit(X_train, y_train)

## 5. Evaluation

In [None]:
# Hard class predictions for the held-out rows.
y_pred = clf.predict(X_test)
# Second probability column, i.e. the predicted probability of label 1 ("bad").
y_proba = clf.predict_proba(X_test)[:, 1]

# Threshold-based metrics use the hard predictions; ROC-AUC needs the scores.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

# The newlines must be written as \n escapes inside a single-line literal;
# a quoted string broken across physical lines is a SyntaxError.
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

## 6. Explainability with SHAP

In [None]:
# Explain at most 200 test rows to keep SHAP fast. The cap at len(X_test)
# prevents DataFrame.sample from raising when the test set is smaller.
X_sample = X_test.sample(min(200, len(X_test)), random_state=42)

# The tree explainer must see the same preprocessed matrix the model was
# trained on, not the raw DataFrame.
X_transformed = clf.named_steps["preprocessor"].transform(X_sample)
# The preprocessor can return a SciPy sparse matrix when the one-hot output is
# sparse enough; densify so the SHAP plot receives a plain array.
if hasattr(X_transformed, "toarray"):
    X_transformed = X_transformed.toarray()

explainer = shap.TreeExplainer(clf.named_steps["model"])
shap_values = explainer.shap_values(X_transformed)

# Pick the SHAP values for class 1 ("bad"). Older SHAP releases return a list
# with one (n_samples, n_features) array per class, while newer ones return a
# single (n_samples, n_features, n_classes) array; indexing the latter with
# [1] would select the second *sample* instead of the second class.
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
elif getattr(shap_values, "ndim", 2) == 3:
    positive_class_shap = shap_values[:, :, 1]
else:
    positive_class_shap = shap_values

# Rebuild column names in the preprocessor's output order: numeric columns
# first, then the one-hot encoded categorical columns.
ohe = clf.named_steps["preprocessor"].named_transformers_["cat"].named_steps["onehot"]
ohe_feature_names = ohe.get_feature_names_out(categorical_features)
feature_names = np.concatenate([numeric_features, ohe_feature_names])

shap.summary_plot(positive_class_shap, X_transformed, feature_names=feature_names)
