# Credit Risk Prediction with SHAP

In [15]:
import warnings
warnings.filterwarnings("ignore")

In [16]:
# Install missing packages if needed (run once, in its own cell, so the install targets this kernel's environment):
# %pip install shap xgboost lightgbm

import json
import os
import shutil
from pathlib import Path

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

%matplotlib inline
plt.rcParams['figure.dpi'] = 120

In [17]:
#  USER: set your dataset path here, or set the CREDIT_RISK_DATA environment
#  variable before starting the kernel to override it without editing the notebook.
#  An unset or empty variable falls back to the default path below.
DATA_PATH = Path(
    os.environ.get("CREDIT_RISK_DATA")
    or r"F:\Datas\AI_Course\Soft_Skill\4 Module\Dataset\archive\credit_risk_dataset.csv"
)

# All artefacts (models, figures, metrics, report) are written beneath OUT_DIR,
# which is resolved relative to the current working directory.
OUT_DIR = Path("project_output")
MODELS_DIR = OUT_DIR / "models"
FIG_DIR = OUT_DIR / "figures"

# Create the output tree up front; exist_ok makes the cell safe to re-run.
for p in [OUT_DIR, MODELS_DIR, FIG_DIR]:
    p.mkdir(parents=True, exist_ok=True)

print("Output directory:", OUT_DIR.resolve())

Output directory: F:\Datas\AI_Course\Soft_Skill\4 Module\V02\New folder\project_output


In [18]:
# Load dataset: fail fast with a clear message when the CSV is missing,
# otherwise read it into `df` and preview the first rows.
if DATA_PATH.exists():
    df = pd.read_csv(DATA_PATH)
else:
    raise FileNotFoundError(f"Dataset not found at: {DATA_PATH}")
print("Loaded dataset shape:", df.shape)
df.head()

Loaded dataset shape: (32581, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [19]:
# Detect target column: the first column (in file order) whose lower-cased name
# is a known target name wins; otherwise the last column is used.
possible_targets = ["default","is_default","loan_status","target","status","label","default_payment_next_month"]
target_col = next(
    (col for col in df.columns if col.lower() in possible_targets),
    None,
)
if target_col is None:
    target_col = df.columns[-1]

print("Using target column:", target_col)

# Encode the target as integer category codes and separate it from the features.
y = df[target_col].astype("category").cat.codes
X = df.drop(columns=[target_col])

# Partition the feature columns by dtype: numeric vs. everything else.
numeric_frame = X.select_dtypes(include=[np.number])
non_numeric_frame = X.select_dtypes(exclude=[np.number])
num_cols = numeric_frame.columns.tolist()
cat_cols = non_numeric_frame.columns.tolist()

print("Numeric cols:", len(num_cols), "Categorical cols:", len(cat_cols))

Using target column: loan_status
Numeric cols: 7 Categorical cols: 4


In [20]:
# Build preprocessing pipeline
# Numeric branch: median imputation followed by standardisation.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
# Categorical branch: fill gaps with the literal "missing", then ordinal-encode;
# categories unseen during fit are mapped to -1.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
]
num_pipe = Pipeline(steps=numeric_steps)
cat_pipe = Pipeline(steps=categorical_steps)

# Route each column group to its branch; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop",
)

In [21]:
# Train/test split: hold out 20% of the rows, stratified on the target,
# with a fixed seed so the split is reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (26064, 11) Test shape: (6517, 11)


In [22]:
# Train models
# Each pipeline receives its own clone of the (unfitted) preprocessor. A Pipeline
# fits the step objects it was given in place, so handing the same
# ColumnTransformer instance to several pipelines would make every fit overwrite
# the state the other pipelines rely on.
models = {}

# RandomForest baseline
models["random_forest"] = Pipeline([
    ("preproc", clone(preprocessor)),
    ("model", RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42))
])

# XGBoost
try:
    xgb_clf = xgb.XGBClassifier(
        eval_metric="logloss",
        n_estimators=200,
        max_depth=6,
        learning_rate=0.05,
        random_state=42,
        use_label_encoder=False
    )
    models["xgboost"] = Pipeline([
        ("preproc", clone(preprocessor)),
        ("model", xgb_clf)
    ])
except Exception as e:
    # Best-effort: a failure to construct the estimator skips this model only.
    print("XGBoost not available:", e)

# LightGBM
try:
    lgb_clf = lgb.LGBMClassifier(
        n_estimators=200,
        learning_rate=0.05,
        random_state=42
    )
    models["lightgbm"] = Pipeline([
        ("preproc", clone(preprocessor)),
        ("model", lgb_clf)
    ])
except Exception as e:
    # Best-effort: a failure to construct the estimator skips this model only.
    print("LightGBM not available:", e)

# Fit every registered pipeline on the training split and persist it
# (preprocessing + estimator together) as a joblib file under MODELS_DIR.
for name, pipe in models.items():
    print("Training:", name)
    pipe.fit(X_train, y_train)
    outp = MODELS_DIR / f"{name}.joblib"
    joblib.dump(pipe, outp)
    print("Saved:", outp)

Training: random_forest
Saved: project_output\models\random_forest.joblib
Training: xgboost
Saved: project_output\models\xgboost.joblib
Training: lightgbm
[LightGBM] [Info] Number of positive: 5686, number of negative: 20378
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001844 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 949
[LightGBM] [Info] Number of data points in the train set: 26064, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.218155 -> initscore=-1.276449
[LightGBM] [Info] Start training from score -1.276449
Saved: project_output\models\lightgbm.joblib


In [23]:
# Evaluate and save metrics
# For every trained pipeline: predict on the held-out split, compute the
# classification metrics, write them to <name>_metrics.json and remember them
# in `results` so the best model can be selected by ROC AUC.
results = {}
for name, pipe in models.items():
    y_pred = pipe.predict(X_test)
    try:
        # Column 1 of predict_proba is used as the positive-class score.
        y_proba = pipe.predict_proba(X_test)[:, 1]
    except Exception as exc:
        # Best-effort fallback, but no longer silent: a ROC AUC computed from
        # hard labels is not comparable with one computed from probabilities.
        print(f"Warning: predict_proba failed for {name} ({exc}); ROC AUC is computed from hard labels")
        y_proba = y_pred
    metrics = {
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "precision": float(precision_score(y_test, y_pred, zero_division=0)),
        "recall": float(recall_score(y_test, y_pred, zero_division=0)),
        "f1": float(f1_score(y_test, y_pred, zero_division=0)),
        "roc_auc": float(roc_auc_score(y_test, y_proba))
    }
    results[name] = metrics
    mfile = OUT_DIR / f"{name}_metrics.json"
    # Mode "w" truncates an existing file, so no explicit delete is needed.
    with open(mfile, "w") as f:
        json.dump(metrics, f, indent=2)
    print("Saved metrics:", mfile)

# Highest ROC AUC wins; on ties the model registered first is kept.
best_name = max(results, key=lambda nm: results[nm]["roc_auc"])
best_model = models[best_name]
print("Best model:", best_name)

Saved metrics: project_output\random_forest_metrics.json
Saved metrics: project_output\xgboost_metrics.json
Saved metrics: project_output\lightgbm_metrics.json
Best model: lightgbm


In [24]:
# SHAP global analysis
# Explain the best pipeline's estimator on the preprocessed test split, save a
# summary plot and a mean(|SHAP|) feature ranking.
preproc = best_model.named_steps["preproc"]
model_obj = best_model.named_steps["model"]
X_test_pre = preproc.transform(X_test)

# Feature names for transformed data; fall back to names derived from the raw
# column lists when the preprocessor cannot report its output names.
try:
    feat_names = [str(f) for f in preproc.get_feature_names_out()]
except Exception:
    feat_names = num_cols + [f"{c}_encoded" for c in cat_cols]

# SHAP DataFrame; if the names do not fit the matrix, use positional column names.
try:
    X_test_pre_df = pd.DataFrame(X_test_pre, columns=feat_names)
except Exception:
    X_test_pre_df = pd.DataFrame(X_test_pre)
    feat_names = X_test_pre_df.columns.astype(str).tolist()

# Explain
explainer = shap.TreeExplainer(model_obj)
sv_raw = explainer.shap_values(X_test_pre)

# Pick the positive class. Depending on the SHAP version and the model,
# shap_values() yields one of:
#   * a list with one (samples, features) array per class,
#   * a single 3-D array (samples, features, classes),
#   * a plain 2-D (samples, features) array.
# Everything downstream expects the 2-D form.
if isinstance(sv_raw, list):
    shap_values = np.array(sv_raw[1]) if len(sv_raw) > 1 else np.array(sv_raw[0])
else:
    shap_values = np.array(sv_raw)
    if shap_values.ndim == 3:
        positive_class = 1 if shap_values.shape[2] > 1 else 0
        shap_values = shap_values[:, :, positive_class]

# Summary plot (show=False keeps the figure open so it can be saved to disk).
plt.figure(figsize=(8,6))
shap.summary_plot(shap_values, X_test_pre_df, feature_names=feat_names, show=False)
summary_path = FIG_DIR / "shap_summary.png"
plt.savefig(str(summary_path), bbox_inches='tight', dpi=200)
plt.close()

# Feature importance: mean absolute SHAP value per feature, largest first.
fi_path = OUT_DIR / "shap_feature_importance.csv"
mean_abs = np.mean(np.abs(shap_values), axis=0)
if len(mean_abs) == len(feat_names):
    feat_imp = pd.Series(mean_abs, index=feat_names).sort_values(ascending=False)
else:
    # Name/width mismatch: keep the ranking but index it positionally.
    feat_imp = pd.Series(mean_abs).sort_values(ascending=False)
feat_imp.to_csv(fi_path)
print("Saved SHAP summary + feature importance")

Saved SHAP summary + feature importance


In [25]:
# Dependence plots for top 3 features
# Take the three highest-ranked features; if the ranking is unavailable, fall
# back to the first (up to) three column positions.
try:
    top_feats = feat_imp.index[:3].tolist()
except Exception:
    top_feats = list(range(min(3, X_test_pre.shape[1])))

for feat in top_feats:
    # Draw onto an explicitly created Axes. Without `ax`, shap opens a figure of
    # its own, leaving a pre-created figure empty and never closed.
    fig, ax = plt.subplots(figsize=(6,4))
    shap.dependence_plot(feat, shap_values, X_test_pre_df, feature_names=feat_names, show=False, ax=ax)
    dep_path = FIG_DIR / f"dependence_{str(feat).replace('/','_')}.png"
    fig.savefig(str(dep_path), bbox_inches='tight', dpi=200)
    # Close this exact figure so figures do not accumulate across iterations.
    plt.close(fig)
print("Saved dependence plots for:", top_feats)

Saved dependence plots for: ['num__loan_percent_income', 'cat__loan_grade', 'num__person_income']


In [26]:
# Local SHAP explanations: locate test rows for each outcome category
# (true positive, true negative, false positive) of the best model.

y_test_arr = np.array(y_test)
y_pred_test = best_model.predict(X_test)

# Boolean masks over the test rows, one per actual/predicted label value.
actual_pos = y_test_arr == 1
actual_neg = y_test_arr == 0
pred_pos = y_pred_test == 1
pred_neg = y_pred_test == 0

# Positional indices of the rows falling in each category.
idx_tp_arr = np.flatnonzero(actual_pos & pred_pos)
idx_tn_arr = np.flatnonzero(actual_neg & pred_neg)
idx_fp_arr = np.flatnonzero(actual_neg & pred_pos)

def pick_index(arr, fallback):
    """Return the first element of `arr` as an int, or `fallback` as an int when `arr` is empty."""
    if len(arr) > 0:
        return int(arr[0])
    return int(fallback)

fallback_idx = 0  # always valid as long as test set non-empty

# Candidate row positions per outcome category; the first hit is explained.
category_hits = {
    "true_positive": idx_tp_arr,
    "true_negative": idx_tn_arr,
    "false_positive": idx_fp_arr
}
indices = {label: pick_index(hits, fallback_idx) for label, hits in category_hits.items()}

for label, idx in indices.items():
    outp = FIG_DIR / f"{label}.png"
    title = f"Local SHAP explanation: {label}"
    # A PNG is still produced when a category has no examples, but the
    # substitution is reported so the plot is not read as a real example.
    if len(category_hits[label]) == 0:
        print(f"Warning: no {label} example in the test set; plotting row {fallback_idx} instead")
        title += f" (none found; showing row {fallback_idx})"
    # Make sure idx is in range
    if idx < 0 or idx >= shap_values.shape[0]:
        idx = 0
    # Simple bar chart explanation (robust)
    row_values = shap_values[idx]
    contrib = pd.Series(
        row_values,
        index=feat_names if len(row_values) == len(feat_names) else range(len(row_values))
    )
    # Rank by magnitude but plot the signed values: the sign tells whether a
    # feature pushed this prediction up or down, which abs() would discard.
    top_order = contrib.abs().sort_values(ascending=False).head(10).index
    top = contrib.loc[top_order]
    bar_colors = ["tab:red" if value > 0 else "tab:blue" for value in top]
    fig, ax = plt.subplots(figsize=(6,4))
    top.plot.bar(ax=ax, color=bar_colors)
    ax.axhline(0, color="black", linewidth=0.8)
    ax.set_ylabel("SHAP value")
    ax.set_title(title)
    fig.savefig(str(outp), bbox_inches='tight', dpi=200)
    plt.close(fig)
    print("Saved local explanation:", outp)

Saved local explanation: project_output\figures\true_positive.png
Saved local explanation: project_output\figures\true_negative.png
Saved local explanation: project_output\figures\false_positive.png


In [27]:
# Write full report
# Opening with mode "w" truncates any previous report, so no explicit delete is
# needed. The file is written as UTF-8 because the title contains an em dash.
report_path = OUT_DIR / "report.md"
with open(report_path, "w", encoding="utf-8") as f:
    f.write("# Credit Risk Prediction — Final Report\n\n")
    f.write("## Dataset\n")
    f.write(f"- Source file: {DATA_PATH}\n")
    f.write(f"- Shape: {df.shape}\n\n")
    f.write("## Preprocessing\n")
    f.write("- Numeric imputation (median) and scaling.\n")
    f.write("- Categorical imputation and ordinal encoding.\n\n")
    f.write("## Models Trained\n")
    # One bullet per evaluated model with its headline metrics.
    for nm in results:
        f.write(f"- {nm}: ROC_AUC={results[nm]['roc_auc']:.4f}, F1={results[nm]['f1']:.4f}\n")
    f.write(f"\n**Selected best model:** {best_name}\n\n")
    f.write("## SHAP\n")
    f.write("- Global SHAP summary plot included.\n")
    f.write("- SHAP dependence plots for top 3 features included.\n")
    f.write("- Local explanations (true_positive, true_negative, false_positive) included.\n\n")
    f.write("## Interpretation\n")
    f.write("SHAP analysis provides both global and local interpretability to support lending decisions.\n\n")
    f.write("## Conclusion and Recommendations\n")
    f.write("Use model probabilities and SHAP explanations to create explainable decision workflows for loan approvals.\n")
print("Saved report:", report_path)

Saved report: project_output\report.md


In [28]:
# Create ZIP of the whole output directory in the current working directory,
# removing any archive left over from a previous run first.
archive_base = "project_output"
zip_path = Path(f"{archive_base}.zip")
if zip_path.exists():
    zip_path.unlink()
shutil.make_archive(archive_base, format="zip", root_dir=OUT_DIR)
print("Created ZIP:", zip_path.resolve())

Created ZIP: F:\Datas\AI_Course\Soft_Skill\4 Module\V02\New folder\project_output.zip
