In [1]:
from pathlib import Path
import os, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap
import joblib

REPO_ROOT = Path("/Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection").resolve()
os.chdir(REPO_ROOT)
print("CWD:", Path.cwd())

MODEL_PATH = REPO_ROOT / "artifacts/xgb_model.pkl"
TEST_PATH  = REPO_ROOT / "data/processed/test.csv"

FIG_DIR = REPO_ROOT / "docs/figures/explainability/07_shap"
FIG_DIR.mkdir(parents=True, exist_ok=True)

print("MODEL exists:", MODEL_PATH.exists())
print("TEST exists :", TEST_PATH.exists())
print("FIG_DIR     :", FIG_DIR)


CWD: /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection
MODEL exists: True
TEST exists : True
FIG_DIR     : /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/docs/figures/explainability/07_shap


In [2]:
model = joblib.load(MODEL_PATH)

df = pd.read_csv(TEST_PATH)
y = df["Class"].astype(int).values
X = df.drop(columns=["Class"])
feature_names = X.columns.astype(str)

print("X shape:", X.shape, "| fraud count:", int(y.sum()))


X shape: (42722, 102) | fraud count: 52


In [3]:
rng = np.random.default_rng(42)

# Small background for speed + stability
bg_size = min(500, len(X))
bg_idx = rng.choice(len(X), size=bg_size, replace=False)
X_bg = X.iloc[bg_idx]

# pick high-risk rows for local explanations (top predicted prob)
proba = model.predict_proba(X)[:, 1]
top_idx = np.argsort(proba)[::-1][:3]  # 3 examples
X_explain = X.iloc[top_idx]
y_explain = y[top_idx]
p_explain = proba[top_idx]

list(zip(top_idx.tolist(), y_explain.tolist(), p_explain.tolist()))


[(1763, 1, 0.9999697208404541),
 (1664, 1, 0.9999592304229736),
 (19840, 1, 0.9999184608459473)]

In [4]:
# This avoids TreeExplainer parsing the booster (your base_score error path).
explainer = shap.Explainer(model.predict_proba, X_bg)

# Explain only class-1 probability => take [:, 1] later
shap_values = explainer(X_explain)
shap_values


.values =
array([[[-9.23480377e-03,  9.23480344e-03],
        [-1.15971747e-02,  1.15971752e-02],
        [-1.11592969e-02,  1.11592964e-02],
        [-7.79717983e-02,  7.79717980e-02],
        [-9.39282626e-04,  9.39282728e-04],
        [ 4.67452720e-03, -4.67452683e-03],
        [-1.14629315e-02,  1.14629311e-02],
        [ 1.57286838e-03, -1.57286828e-03],
        [-7.78995827e-03,  7.78995913e-03],
        [-8.05380136e-02,  8.05380125e-02],
        [-4.57580447e-02,  4.57580445e-02],
        [-3.75859052e-03,  3.75859096e-03],
        [-3.86220157e-03,  3.86220059e-03],
        [-9.27951258e-02,  9.27951270e-02],
        [ 4.40527350e-04, -4.40526511e-04],
        [-1.62947829e-02,  1.62947817e-02],
        [-8.23902132e-02,  8.23902116e-02],
        [-3.27881426e-04,  3.27881214e-04],
        [-1.22324824e-03,  1.22324871e-03],
        [ 1.05785534e-03, -1.05785432e-03],
        [-3.93033117e-03,  3.93032966e-03],
        [-2.61209816e-03,  2.61209771e-03],
        [ 3.20507884e-

In [5]:
sample_size = min(2000, len(X))
sample_idx = rng.choice(len(X), size=sample_size, replace=False)
X_sample = X.iloc[sample_idx]

shap_global = explainer(X_sample)

# For binary classifier predict_proba => shap has 2 outputs. We use class 1.
# shap.Explanation may store values as (n, d, 2) or list-like depending on version.
vals = shap_global.values
if vals.ndim == 3:
    vals_pos = vals[:, :, 1]
else:
    # if already (n, d) assume it is positive-class
    vals_pos = vals

# summary beeswarm
plt.figure()
shap.summary_plot(vals_pos, X_sample, feature_names=feature_names, show=False, max_display=30)
out1 = FIG_DIR / "shap_summary.png"
plt.savefig(out1, dpi=200, bbox_inches="tight")
plt.close()

# bar plot
plt.figure()
shap.summary_plot(vals_pos, X_sample, feature_names=feature_names, show=False, plot_type="bar", max_display=30)
out2 = FIG_DIR / "shap_bar.png"
plt.savefig(out2, dpi=200, bbox_inches="tight")
plt.close()

print("Saved:", out1)
print("Saved:", out2)


PermutationExplainer explainer: 2001it [00:58, 28.40it/s]                          
  shap.summary_plot(vals_pos, X_sample, feature_names=feature_names, show=False, max_display=30)


Saved: /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/docs/figures/explainability/07_shap/shap_summary.png
Saved: /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/docs/figures/explainability/07_shap/shap_bar.png


  shap.summary_plot(vals_pos, X_sample, feature_names=feature_names, show=False, plot_type="bar", max_display=30)


In [6]:
# local SHAP values for the explained rows
vals_local = shap_values.values
base_local = shap_values.base_values

# Handle class dimension if present
if vals_local.ndim == 3:
    vals_local_pos = vals_local[:, :, 1]
    base_local_pos = base_local[:, 1] if base_local.ndim == 2 else base_local
else:
    vals_local_pos = vals_local
    base_local_pos = base_local

for i in range(len(X_explain)):
    exp = shap.Explanation(
        values=vals_local_pos[i],
        base_values=base_local_pos[i],
        data=X_explain.iloc[i].values,
        feature_names=feature_names.tolist(),
    )

    plt.figure()
    shap.plots.waterfall(exp, max_display=15, show=False)
    out = FIG_DIR / f"shap_local_{i}.png"
    plt.savefig(out, dpi=200, bbox_inches="tight")
    plt.close()
    print("Saved:", out)


Saved: /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/docs/figures/explainability/07_shap/shap_local_0.png
Saved: /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/docs/figures/explainability/07_shap/shap_local_1.png
Saved: /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/docs/figures/explainability/07_shap/shap_local_2.png


In [7]:
meta = {
    "model_path": str(MODEL_PATH),
    "test_path": str(TEST_PATH),
    "global_sample_size": int(sample_size),
    "background_size": int(bg_size),
    "local_examples": [
        {"row_index": int(idx), "y_true": int(y[idx]), "p_xgb": float(proba[idx])}
        for idx in top_idx
    ],
    "figures": {
        "summary": str((FIG_DIR / "shap_summary.png").relative_to(REPO_ROOT)),
        "bar": str((FIG_DIR / "shap_bar.png").relative_to(REPO_ROOT)),
        "locals": [
            str((FIG_DIR / f"shap_local_{i}.png").relative_to(REPO_ROOT))
            for i in range(len(X_explain))
        ],
    },
}

meta_path = FIG_DIR / "shap_run_meta.json"
meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")
print("Saved:", meta_path)
meta


Saved: /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/docs/figures/explainability/07_shap/shap_run_meta.json


{'model_path': '/Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/artifacts/xgb_model.pkl',
 'test_path': '/Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/data/processed/test.csv',
 'global_sample_size': 2000,
 'background_size': 500,
 'local_examples': [{'row_index': 1763,
   'y_true': 1,
   'p_xgb': 0.9999697208404541},
  {'row_index': 1664, 'y_true': 1, 'p_xgb': 0.9999592304229736},
  {'row_index': 19840, 'y_true': 1, 'p_xgb': 0.9999184608459473}],
 'figures': {'summary': 'docs/figures/explainability/07_shap/shap_summary.png',
  'bar': 'docs/figures/explainability/07_shap/shap_bar.png',
  'locals': ['docs/figures/explainability/07_shap/shap_local_0.png',
   'docs/figures/explainability/07_shap/shap_local_1.png',
   'docs/figures/explainability/07_shap/shap_local_2.png']}}