In [2]:
import shap
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import os

# Read the pipe-delimited source file and derive the helper columns.
df = pd.read_csv(
    "../data/MachineLearningRating_v3.txt",
    sep="|",
    low_memory=False,
)
has_claim = df["TotalClaims"] > 0  # True for rows with a positive claim amount
df["ClaimOccurred"] = has_claim.astype(int)
df["Margin"] = df["TotalPremium"] - df["TotalClaims"]

# Model inputs: a mix of text and numeric columns, split by dtype further down.
features = [
    "Province",
    "VehicleType",
    "RegistrationYear",
    "make",
    "SumInsured",
    "CalculatedPremiumPerTerm",
]

# Severity subset: only rows with a claim, and no missing value in any
# feature column or in the target.
severity_df = df.loc[has_claim].dropna(subset=[*features, "TotalClaims"])
X, y = severity_df[features], severity_df["TotalClaims"]

# Split the feature columns by dtype: object columns are one-hot encoded,
# everything else is passed through unchanged.
categorical_cols = X.select_dtypes(include="object").columns
numerical_cols = X.select_dtypes(exclude="object").columns

# NOTE(review): the encoder is fitted here on this data, not loaded from the
# training run. The resulting column layout presumably has to match what the
# saved model was trained on -- confirm, or persist and reload the encoder.
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
X_cat = encoder.fit_transform(X[categorical_cols])
X_num = X[numerical_cols].to_numpy()

# Final design matrix: one-hot columns first, numeric columns after.
X_encoded = np.concatenate([X_cat, X_num], axis=1)

# Restore the previously saved Random Forest model from disk.
model_path = "outputs/task4/random_forest_model.joblib"
model = joblib.load(model_path)

# TreeExplainer is SHAP's explainer for tree-based models. Only the first
# 100 encoded rows are explained.
explainer = shap.TreeExplainer(model)
explained_rows = X_encoded[:100]
shap_values = explainer.shap_values(explained_rows)

# Create output directory
output_dir = "outputs/task4/shap"
os.makedirs(output_dir, exist_ok=True)

# Rows shown in the plots; this must be the same slice the SHAP values were
# computed on so that values and feature matrix line up row for row.
X_explained = X_encoded[:100]

# Human-readable column labels in the same order as X_encoded's columns
# (one-hot columns first, then the numeric columns). Without these, SHAP
# labels every axis entry with a generic "Feature N" placeholder because
# X_encoded is a bare ndarray.
feature_names = list(encoder.get_feature_names_out()) + list(numerical_cols)

# SHAP Beeswarm Plot
plt.figure()
shap.summary_plot(
    shap_values,
    X_explained,
    feature_names=feature_names,
    show=False,
)
# bbox_inches="tight" keeps long one-hot labels from being clipped.
plt.savefig(f"{output_dir}/shap_beeswarm.png", bbox_inches="tight")
plt.close()

# SHAP Bar Plot (mean absolute SHAP value per feature)
plt.figure()
shap.summary_plot(
    shap_values,
    X_explained,
    feature_names=feature_names,
    plot_type="bar",
    show=False,
)
plt.savefig(f"{output_dir}/shap_feature_importance.png", bbox_inches="tight")
plt.close()

# Text summary file
summary_text = f"""
✅ SHAP Interpretability Summary
===============================
• Model: Random Forest
• Dataset: First 100 rows
• Features: {features}

Files saved:
- SHAP Beeswarm: shap_beeswarm.png
- SHAP Feature Importance: shap_feature_importance.png
"""

# The summary contains non-ASCII characters ("✅", "•"). Write it explicitly
# as UTF-8 so the call does not fail with UnicodeEncodeError on platforms
# whose default text encoding cannot represent them (e.g. cp1252).
with open(f"{output_dir}/shap_summary.txt", "w", encoding="utf-8") as f:
    f.write(summary_text)

# Print summary in notebook
print(summary_text)



✅ SHAP Interpretability Summary
• Model: Random Forest
• Dataset: First 100 rows
• Features: ['Province', 'VehicleType', 'RegistrationYear', 'make', 'SumInsured', 'CalculatedPremiumPerTerm']

Files saved:
- SHAP Beeswarm: shap_beeswarm.png
- SHAP Feature Importance: shap_feature_importance.png

