# Model Explainability for Fraud Detection

This notebook uses SHAP (SHapley Additive exPlanations) to explain the predictions of the best-performing fraud-detection models: one trained on the Fraud_Data dataset and one trained on the Creditcard dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import joblib
import shap

# Add the src directory to the path
sys.path.append(os.path.abspath('../src'))

# Import custom modules
from model import (
    load_model, explain_model_with_shap,
    plot_shap_summary, plot_shap_dependence
)

# Notebook-wide display settings.

# pandas: no cap on the number of columns shown when a frame is displayed
pd.set_option('display.max_columns', None)

# Plotting: matplotlib's seaborn-style whitegrid sheet, with viridis as
# seaborn's default colour palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')

## 1. Load the Models and Test Data

In [None]:
# Best model for each dataset, fetched by name through the project's
# load_model helper (Fraud_Data first, then Creditcard).
xgb_fraud, xgb_cc = (
    load_model(model_name) for model_name in ('xgb_fraud', 'xgb_cc')
)

# Held-out test features (X) and labels (y) for both datasets, read from the
# processed-data pickles in the same order as the names on the left.
# NOTE(review): joblib.load unpickles its input, so these files must come from
# this project's own preprocessing step, never from an untrusted source.
X_test_fraud, y_test_fraud, X_test_cc, y_test_cc = (
    joblib.load(pickle_path)
    for pickle_path in (
        '../data/processed/X_test_fraud.pkl',
        '../data/processed/y_test_fraud.pkl',
        '../data/processed/X_test_cc.pkl',
        '../data/processed/y_test_cc.pkl',
    )
)

## 2. Explain the Fraud_Data Model

In [None]:
# SHAP attributions for the Fraud_Data model on its test set. The helper
# returns a (values, explainer) pair; the explainer is needed again later for
# the per-case force plots.
fraud_shap_result = explain_model_with_shap(xgb_fraud, X_test_fraud)
shap_values_fraud, explainer_fraud = fraud_shap_result

# Summary plot of those SHAP values against the test features
plot_shap_summary(shap_values_fraud, X_test_fraud)

In [None]:
# Rank the Fraud_Data features by mean absolute SHAP value and keep the top 5.
# assumes shap_values_fraud is a 2-D (rows x features) array whose columns
# line up with X_test_fraud.columns — TODO confirm against explain_model_with_shap
feature_names_fraud = X_test_fraud.columns.to_list()
mean_shap_values_fraud = np.mean(np.abs(shap_values_fraud), axis=0)

top_features_fraud = (
    pd.DataFrame({
        'Feature': feature_names_fraud,
        'Mean |SHAP|': mean_shap_values_fraud,
    })
    .sort_values(by='Mean |SHAP|', ascending=False)
    .iloc[:5]
)

top_features_fraud

In [None]:
# One dependence plot for each of the three highest-ranked Fraud_Data features
# (top_features_fraud is already sorted by mean |SHAP|, largest first).
for feature in top_features_fraud['Feature'].iloc[:3].tolist():
    plot_shap_dependence(shap_values_fraud, X_test_fraud, feature)

In [None]:
# Plot a SHAP force plot for each of the first (up to) 5 fraud cases in the
# Fraud_Data test set.
#
# shap.force_plot(..., matplotlib=True) creates a brand-new figure on every
# call, so it cannot draw into a plt.subplot grid: a pre-built grid stays
# empty and later plt.subplot calls add stray axes to shap's own figures.
# Each case is therefore rendered, titled and shown as its own figure, sized
# through force_plot's `figsize` argument.
fraud_indices = np.where(y_test_fraud == 1)[0][:5]  # positional indices of the first 5 fraud cases

# Looping over an empty index array is a no-op, so no explicit emptiness guard
# is needed.
for i, idx in enumerate(fraud_indices):
    shap.force_plot(
        explainer_fraud.expected_value,
        shap_values_fraud[idx],
        X_test_fraud.iloc[idx],
        feature_names=feature_names_fraud,
        matplotlib=True,
        show=False,  # keep the figure open so the title can be added
        figsize=(20, 3),
    )
    # y > 1 lifts the title above the annotations shap draws over the axes;
    # tune the value if it still overlaps — TODO confirm visually
    plt.title(f"Fraud Case {i+1}", y=1.75)
    plt.tight_layout()
    plt.show()

## 3. Explain the Creditcard Model

In [None]:
# SHAP attributions for the Creditcard model on its test set. The helper
# returns a (values, explainer) pair; the explainer is needed again later for
# the per-case force plots.
cc_shap_result = explain_model_with_shap(xgb_cc, X_test_cc)
shap_values_cc, explainer_cc = cc_shap_result

# Summary plot of those SHAP values against the test features
plot_shap_summary(shap_values_cc, X_test_cc)

In [None]:
# Rank the Creditcard features by mean absolute SHAP value and keep the top 5.
# assumes shap_values_cc is a 2-D (rows x features) array whose columns line
# up with X_test_cc.columns — TODO confirm against explain_model_with_shap
feature_names_cc = X_test_cc.columns.to_list()
mean_shap_values_cc = np.mean(np.abs(shap_values_cc), axis=0)

top_features_cc = (
    pd.DataFrame({
        'Feature': feature_names_cc,
        'Mean |SHAP|': mean_shap_values_cc,
    })
    .sort_values(by='Mean |SHAP|', ascending=False)
    .iloc[:5]
)

top_features_cc

In [None]:
# One dependence plot for each of the three highest-ranked Creditcard features
# (top_features_cc is already sorted by mean |SHAP|, largest first).
for feature in top_features_cc['Feature'].iloc[:3].tolist():
    plot_shap_dependence(shap_values_cc, X_test_cc, feature)

In [None]:
# Plot a SHAP force plot for each of the first (up to) 5 fraud cases in the
# Creditcard test set.
#
# shap.force_plot(..., matplotlib=True) creates a brand-new figure on every
# call, so it cannot draw into a plt.subplot grid: a pre-built grid stays
# empty and later plt.subplot calls add stray axes to shap's own figures.
# Each case is therefore rendered, titled and shown as its own figure, sized
# through force_plot's `figsize` argument.
fraud_indices_cc = np.where(y_test_cc == 1)[0][:5]  # positional indices of the first 5 fraud cases

# Looping over an empty index array is a no-op, so no explicit emptiness guard
# is needed.
for i, idx in enumerate(fraud_indices_cc):
    shap.force_plot(
        explainer_cc.expected_value,
        shap_values_cc[idx],
        X_test_cc.iloc[idx],
        feature_names=feature_names_cc,
        matplotlib=True,
        show=False,  # keep the figure open so the title can be added
        figsize=(20, 3),
    )
    # y > 1 lifts the title above the annotations shap draws over the axes;
    # tune the value if it still overlaps — TODO confirm visually
    plt.title(f"Fraud Case {i+1}", y=1.75)
    plt.tight_layout()
    plt.show()

## 4. Interpretation of SHAP Results

### Fraud_Data Model

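**TODO:** After running the notebook, interpret the SHAP results for the Fraud_Data model here: which features rank highest by mean |SHAP|, what the dependence plots show for the top three, and what drives the individual fraud cases in the force plots.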

### Creditcard Model

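**TODO:** After running the notebook, interpret the SHAP results for the Creditcard model here: which features rank highest by mean |SHAP|, what the dependence plots show for the top three, and what drives the individual fraud cases in the force plots.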

## 5. Business Insights and Recommendations

**TODO:** After running the notebook, summarize the business insights and recommendations that follow from the model explanations above.