### Load Necessary Libraries

In [3]:
import shap
import lime
import lime.lime_tabular
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # Example model

### Load and Prepare Data

In [8]:
# Define features and target
X = fraud_data_df.drop(columns=["class"])  # Drop the target column
y = fraud_data_df["class"]  # Use "class" as the target variable

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Train a Machine Learning Model

In [12]:
import joblib

# Load the trained model
model = joblib.load("random_forest_fraud.pkl")

### Explain the Model with SHAP

#### Initialize SHAP Explainer

In [9]:
import shap
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import pickle

# Load the dataset
fraud_data_df = pd.read_csv('../data/Fraud_Data.csv')

# Convert datetime columns to numeric (if any)
if 'datetime_column' in fraud_data_df.columns:  # Replace with the actual column name
    fraud_data_df['datetime_column'] = pd.to_datetime(fraud_data_df['datetime_column'])
    fraud_data_df['year'] = fraud_data_df['datetime_column'].dt.year
    fraud_data_df['month'] = fraud_data_df['datetime_column'].dt.month
    fraud_data_df['day'] = fraud_data_df['datetime_column'].dt.day
    fraud_data_df['hour'] = fraud_data_df['datetime_column'].dt.hour
    fraud_data_df['minute'] = fraud_data_df['datetime_column'].dt.minute
    fraud_data_df.drop(columns=['datetime_column'], inplace=True)  # Remove original column

# Handle categorical columns by encoding them (if any)
categorical_columns = fraud_data_df.select_dtypes(include=['object']).columns
encoder = LabelEncoder()
for col in categorical_columns:
    fraud_data_df[col] = encoder.fit_transform(fraud_data_df[col])

# Split into features and target
X = fraud_data_df.drop(columns=["class"])  # Replace "class" with the actual target column name
y = fraud_data_df["class"]  # Replace "class" with the actual target column name

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the RandomForest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# For testing, you can sample a smaller portion of your dataset
X_test_small = X_test.sample(n=1000, random_state=42)

# Create SHAP explainer
explainer = shap.TreeExplainer(model)

# Compute SHAP values for the test set
shap_values_subset = explainer.shap_values(X_test_small)

# Check the shape of SHAP values to ensure the index corresponds to the right class
print(f"SHAP values shape: {len(shap_values_subset)}")

# Check the shape of X_test_small for consistency
print(f"Shape of X_test_small: {X_test_small.shape}")

# Print the first few entries of shap_values_subset to inspect them
print(f"First few shap_values_subset[1]: {shap_values_subset[1][:5]}")

# Plot the SHAP summary for class 1 (assuming fraud is class 1)
# Ensure that shap_values_subset[1] aligns with X_test_small
shap.summary_plot(shap_values_subset[1], X_test_small)

# Alternatively, if there is an additional column (base value), remove it
# shap.summary_plot(shap_values_subset[1][:, :-1], X_test_small)

# Save SHAP values
with open("shap_values.pkl", "wb") as f:
    pickle.dump(shap_values_subset, f)

print("SHAP values and plot created successfully!")


SHAP values shape: 1000
Shape of X_test_small: (1000, 10)
First few shap_values_subset[1]: [[ 0.0023271  -0.0023271 ]
 [ 0.01557039 -0.01557039]
 [ 0.02070202 -0.02070202]
 [ 0.00457055 -0.00457055]
 [ 0.00364243 -0.00364243]]


AssertionError: The shape of the shap_values matrix does not match the shape of the provided data matrix.

#### Generate SHAP Plots

##### Summary Plot (Overall Feature Importance)

In [None]:
shap.summary_plot(shap_values[1], X_test)  # Assuming fraud is class 1

##### Force Plot (Single Prediction Explanation)

In [None]:
# Select a single instance
i = 10  # Change the index to analyze different instances
shap.force_plot(explainer.expected_value[1], shap_values[1][i], X_test.iloc[i], matplotlib=True)

##### Dependence Plot (Feature-Target Relationship)

In [None]:
shap.dependence_plot("amount", shap_values[1], X_test)  # Replace "amount" with a key feature

### Explain the Model with LIME

#### Initialize LIME Explainer

In [None]:
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.values, feature_names=X.columns, class_names=["Not Fraud", "Fraud"], mode="classification"
)

#### Generate LIME Explanation for a Single Prediction

In [None]:
i = 10  # Change index to explain different predictions
exp = explainer.explain_instance(X_test.iloc[i].values, model.predict_proba)

# Show LIME explanation
exp.show_in_notebook()