In [None]:
# ===============================
# SHAP Explainability
# ===============================

import shap
import matplotlib.pyplot as plt
import pandas as pd
import joblib
import os

# -------------------------------
# Load processed data
# -------------------------------
parquet_file = "../data/processed/fraud_processed.parquet"
fraud = pd.read_parquet(parquet_file)

# Target: the 'class' column.
y = fraud['class']

# Features: every column except the ones dropped below, one-hot encoded
# (drop_first=True removes the first level of each encoded column).
X = pd.get_dummies(
    fraud.drop(
        [
            'class',          # target, kept separately as y
            'signup_time',    # presumably raw timestamps -- TODO confirm
            'purchase_time',
            'ip_address',     # presumably per-record identifiers -- TODO confirm
            'device_id',
            'user_id',
        ],
        axis=1,
    ),
    drop_first=True,
)

# -------------------------------
# Load trained Random Forest model from project-root models/
# -------------------------------
model_path = "../models/random_forest_model.pkl"  # root models folder

# Load the persisted model if it is there; otherwise stop with a message that
# tells the reader which notebook produces the file.
if os.path.exists(model_path):
    rf = joblib.load(model_path)
else:
    raise FileNotFoundError(f"{model_path} not found. Please run the modeling notebook first.")

print("Random Forest model loaded successfully from root models/ folder.")

# -------------------------------
# SHAP explainer
# -------------------------------
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X)

# The return layout of shap_values() for a multi-output model depends on the
# installed SHAP release: older releases give a list with one
# (n_samples, n_features) array per class, newer ones give a single 3-D array
# with the class on the last axis. Indexing the 3-D form with [1] would pick
# the second *sample*, not class 1, so normalise to the per-class list here.
# After this, shap_values[1] always means "contributions towards class 1".
if not isinstance(shap_values, list) and getattr(shap_values, "ndim", 0) == 3:
    shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]

# -------------------------------
# Global feature importance
# -------------------------------
# Summary plot of the class-1 SHAP values across all rows of X.
shap.summary_plot(shap_values[1], X)

# Compare with the model's own feature_importances_ attribute:
# horizontal bar chart of the ten largest values, labelled by feature column.
importance = pd.Series(rf.feature_importances_, index=X.columns)
importance.sort_values(ascending=False).iloc[:10].plot.barh()
plt.title("Top 10 Feature Importances")
plt.show()

# -------------------------------
# Local explanations (examples)
# -------------------------------
# Predict once and compare as plain arrays so every row is identified by its
# *position*. The SHAP arrays and X.iloc are positional, whereas idxmax() on a
# Series yields an index *label*, which only coincides with the position when
# the frame happens to carry a default 0..n-1 index.
y_pred = rf.predict(X)
y_true = y.to_numpy()

# Class-1 SHAP values, accepting both return layouts of shap_values():
# a per-class list, or a single (n_samples, n_features, n_classes) array.
if not isinstance(shap_values, list) and getattr(shap_values, "ndim", 0) == 3:
    class1_shap = shap_values[:, :, 1]
else:
    class1_shap = shap_values[1]


def _first_position(mask):
    """Return the position of the first True entry in *mask*, or None if none."""
    hits = mask.nonzero()[0]
    return int(hits[0]) if hits.size else None


def _explain_row(label, position):
    """Draw a class-1 force plot for the row at *position*.

    When *position* is None (no row of that kind exists) a note is printed
    instead, rather than silently explaining an unrelated row.
    """
    if position is None:
        print(f"No {label} example found; skipping force plot.")
        return
    shap.force_plot(
        explainer.expected_value[1],
        class1_shap[position],
        X.iloc[position],
        matplotlib=True
    )


# True Positive: predicted 1, actual 1
tp_index = _first_position((y_pred == 1) & (y_true == 1))
_explain_row("True Positive", tp_index)

# False Positive: predicted 1, actual 0
fp_index = _first_position((y_pred == 1) & (y_true == 0))
_explain_row("False Positive", fp_index)

# False Negative: predicted 0, actual 1
fn_index = _first_position((y_pred == 0) & (y_true == 1))
_explain_row("False Negative", fn_index)


Random Forest model loaded successfully from root models/ folder.
