In [None]:
# ===============================================================
#   Model Explainability (SHAP)
# ===============================================================

# 1️⃣ Install dependencies
!pip install shap matplotlib seaborn joblib --quiet

# 2️⃣ Import libraries
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.ensemble import RandomForestClassifier

# 3️⃣ Load models (fraud model first, then credit-card model)
rf_fraud, rf_credit = (
    joblib.load(model_path)
    for model_path in (
        'models/random_forest_fraud.pkl',
        'models/random_forest_credit.pkl',
    )
)

# 4️⃣ Load datasets (same order as the models above)
fraud_df, credit_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/Fraud_Data_Processed.csv',
        'data/CreditCard_Processed.csv',
    )
)

# 5️⃣ Define helper function for explainability
def _first_match(mask):
    """Return the position of the first True entry in *mask*, or None if there is none."""
    hits = np.flatnonzero(np.asarray(mask))
    return int(hits[0]) if hits.size else None


def _select_class_shap(shap_values, expected_value, class_index=1):
    """Return ``(values, base_value)`` for one class from a SHAP result.

    Accepts both layouts ``explainer.shap_values`` can produce for a
    classifier: a list holding one ``(n_samples, n_features)`` array per class,
    or a single array of shape ``(n_samples, n_features, n_classes)``. A plain
    2-D array (single output) is returned unchanged.
    """
    if isinstance(shap_values, list):
        values = np.asarray(shap_values[class_index])
    else:
        values = np.asarray(shap_values)
        if values.ndim == 3:
            values = values[:, :, class_index]

    # expected_value may be a scalar (single output) or one entry per class.
    base = np.ravel(expected_value)
    base_value = base[class_index] if base.size > class_index else base[0]
    return values, base_value


def explain_model(model, df, target_col, dataset_name, recommendations=None):
    """Print and plot built-in and SHAP explanations for a fitted pipeline.

    Shows the ten largest ``feature_importances_`` of the final estimator, a
    SHAP summary plot for class 1, and one SHAP force plot each for the first
    true positive, false positive and false negative found in ``df``. It then
    prints the five most important features and a list of recommendations.

    Args:
        model: Fitted pipeline exposing ``named_steps['preprocessor']``
            (must provide ``transform`` and ``get_feature_names_out``) and
            ``named_steps['model']`` (a tree model with
            ``feature_importances_``).
        df: DataFrame holding the feature columns plus ``target_col``.
        target_col: Name of the label column; the value 1 is treated as the
            positive class.
        dataset_name: Label used in the printed header and plot title.
        recommendations: Optional iterable of recommendation strings to print
            (numbered automatically). Defaults to the original fraud-oriented
            recommendations.

    Returns:
        None. All output is printed or plotted.
    """
    print("\n==============================")
    print(f"üîç Model Explainability for {dataset_name}")
    print("==============================")

    # Separate features and target
    X = df.drop(columns=[target_col])
    y_true = df[target_col].to_numpy()

    preprocessor = model.named_steps['preprocessor']
    rf_model = model.named_steps['model']
    feature_names = preprocessor.get_feature_names_out()

    # -------------------------
    # Feature Importance (built-in)
    # -------------------------
    fi_df = (
        pd.DataFrame({
            'feature': feature_names,
            'importance': rf_model.feature_importances_,
        })
        .sort_values('importance', ascending=False)
        .head(10)
    )

    print("\nTop 10 Features (Built-in Importance):")
    print(fi_df)
    sns.barplot(x='importance', y='feature', data=fi_df, palette='viridis')
    plt.title(f"{dataset_name} - Top 10 Features (RF)")
    plt.show()

    # -------------------------
    # SHAP Analysis
    # -------------------------
    # Transform once and reuse the result for both the explainer and the plot.
    X_transformed = preprocessor.transform(X)
    if hasattr(X_transformed, "toarray"):
        # Some transformers emit scipy sparse matrices; the plots below index
        # the feature matrix as a dense array.
        X_transformed = X_transformed.toarray()

    # NOTE: SHAP values are computed for every row of df, which can be slow on
    # large frames.
    explainer = shap.TreeExplainer(rf_model)
    class_values, base_value = _select_class_shap(
        explainer.shap_values(X_transformed), explainer.expected_value
    )

    # SHAP summary plot
    shap.summary_plot(class_values, X_transformed, feature_names=feature_names)

    # Identify TP, FP, FN (first occurrence of each, by row position)
    y_pred = np.asarray(model.predict(X))
    cases = (
        ("True Positive", (y_true == 1) & (y_pred == 1)),
        ("False Positive", (y_true == 0) & (y_pred == 1)),
        ("False Negative", (y_true == 1) & (y_pred == 0)),
    )
    for label, mask in cases:
        print(f"\nüîπ SHAP Force Plot - {label}")
        row = _first_match(mask)
        if row is None:
            # A model may produce no example of a given outcome; skip the plot
            # instead of failing on an empty index array.
            print(f"No {label} example found; skipping force plot.")
            continue
        shap.force_plot(base_value, class_values[row, :], feature_names=feature_names, matplotlib=True)
        plt.show()

    # -------------------------
    # Top 5 Drivers
    # -------------------------
    top5_features = fi_df['feature'].head(5).tolist()
    print("\nTop 5 Feature Drivers:", top5_features)

    # -------------------------
    # Business Recommendations
    # -------------------------
    if recommendations is None:
        # These defaults name fraud-dataset columns; pass `recommendations`
        # for datasets with different features.
        recommendations = (
            "Transactions with unusually high 'purchase_value' should trigger additional verification.",
            "Transactions occurring in unusual 'hour_of_day' ranges may require monitoring.",
            "Users from new countries or 'Unknown' locations should be flagged for manual review.",
        )
    print("\nüí° Business Recommendations:")
    for number, text in enumerate(recommendations, start=1):
        print(f"{number}. {text}")

# 6️⃣ / 7️⃣ Run the explainability report for Fraud_Data_Processed, then for
# CreditCard_Processed (note the differently-cased target column names).
for pipeline, frame, label_column, title in (
    (rf_fraud, fraud_df, 'class', 'Fraud_Data_Processed'),
    (rf_credit, credit_df, 'Class', 'CreditCard_Processed'),
):
    explain_model(pipeline, frame, target_col=label_column, dataset_name=title)


