# **Week 14 – Ethics & Explainability (SHAP)**

**Class Task: Explain Model Predictions using SHAP**

- Objective

To understand why the model predicts a certain house price, not just what it predicts.

Step 1: Import Libraries

In [1]:
import shap
import pandas as pd
import joblib
import numpy as np
import warnings
warnings.filterwarnings('ignore')

print("SHAP version:", shap.__version__)

  from .autonotebook import tqdm as notebook_tqdm


SHAP version: 0.50.0


Step 2: Load Model & Data

In [2]:
print("\n[1/7] Loading model and data...")
model = joblib.load("house_price_model.pkl")
df = pd.read_csv("train_cleaned.csv")

# Select features
X = df[['GrLivArea', 'OverallQual', 'GarageCars']]
print(f"✓ Data loaded: {X.shape[0]} samples, {X.shape[1]} features")


[1/7] Loading model and data...
✓ Data loaded: 1458 samples, 3 features


Step 3: Use SMALLER sample to prevent crash

In [3]:
print("\n[2/7] Preparing sample data...")
X_sample = X.sample(n=50, random_state=42)  # Reduced to 50
print(f"✓ Using {X_sample.shape[0]} samples for SHAP analysis")


[2/7] Preparing sample data...
✓ Using 50 samples for SHAP analysis


Step 4: Create SHAP Explainer

In [4]:
print("\n[3/7] Creating SHAP explainer...")
try:
    # Try TreeExplainer first (for RandomForest, XGBoost, etc.)
    explainer = shap.TreeExplainer(model)
    print("✓ Using TreeExplainer")
except:
    # Fallback to KernelExplainer if TreeExplainer fails
    print("⚠ TreeExplainer failed, using KernelExplainer...")
    background = shap.sample(X_sample, 10)
    explainer = shap.KernelExplainer(model.predict, background)


[3/7] Creating SHAP explainer...
✓ Using TreeExplainer


Step 5: Calculate SHAP values (SAFER METHOD)

In [5]:
print("\n[4/7] Calculating SHAP values...")
try:
    shap_values = explainer.shap_values(X_sample, check_additivity=False)
    print("✓ SHAP values calculated successfully")
except Exception as e:
    print(f"⚠ Error: {e}")
    print("Trying alternative method...")
    # Alternative: compute one at a time
    shap_values = np.array([explainer.shap_values(X_sample.iloc[[i]], check_additivity=False)[0] 
                            for i in range(len(X_sample))])


[4/7] Calculating SHAP values...
✓ SHAP values calculated successfully


Step 6: Simple Text-Based Feature Importance (NO PLOTTING)

In [6]:
print("\n[5/7] Computing feature importance...")
print("\n" + "="*60)
print("FEATURE IMPORTANCE SUMMARY (SHAP Values)")
print("="*60)

mean_abs_shap = np.abs(shap_values).mean(axis=0)
feature_importance = pd.DataFrame({
    'Feature': X_sample.columns,
    'Mean |SHAP|': mean_abs_shap,
    'Rank': range(1, len(X_sample.columns) + 1)
}).sort_values('Mean |SHAP|', ascending=False).reset_index(drop=True)

feature_importance['Rank'] = range(1, len(feature_importance) + 1)

for idx, row in feature_importance.iterrows():
    bar = "█" * int(row['Mean |SHAP|'] / mean_abs_shap.max() * 40)
    print(f"{row['Rank']}. {row['Feature']:15s}: {row['Mean |SHAP|']:8.2f} {bar}")

print("="*60)


[5/7] Computing feature importance...

FEATURE IMPORTANCE SUMMARY (SHAP Values)
1. GarageCars     : 365493.82 ████████████████████████████████████████
2. GrLivArea      : 61990.64 ██████
3. OverallQual    : 48162.87 █████


Step 7: Local Explanation for ONE sample (Text-based)

In [10]:
print("\n[6/7] Local explanation for first sample:")
print("-" * 60)
print("Sample Input:")
for col in X_sample.columns:
    print(f"  {col:15s}: {X_sample.iloc[0][col]}")

print("\nSHAP Contributions:")
base_value = explainer.expected_value if hasattr(explainer, 'expected_value') else X['SalePrice'].mean() if 'SalePrice' in df else 0

for i, col in enumerate(X_sample.columns):
    contribution = shap_values[0][i]
    direction = "↑" if contribution > 0 else "↓"
    print(f"  {col:15s}: {contribution:+8.2f} {direction}")

print(f"\nBase prediction: {base_value:.2f}")
print(f"Final adjustment: {shap_values[0].sum():+.2f}")
print("-" * 60)


[6/7] Local explanation for first sample:
------------------------------------------------------------
Sample Input:
  GrLivArea      : 1923
  OverallQual    : 7
  GarageCars     : 2

SHAP Contributions:
  GrLivArea      : +18631.82 ↑
  OverallQual    : +27784.55 ↑
  GarageCars     : +436135.63 ↑


TypeError: unsupported format string passed to numpy.ndarray.__format__

Step 8: Save SHAP values to CSV for later analysis

In [11]:
print("\n[7/7] Saving results...")
shap_df = pd.DataFrame(shap_values, columns=X_sample.columns)
shap_df.to_csv("shap_values.csv", index=False)
print("✓ SHAP values saved to 'shap_values.csv'")

X_sample.to_csv("shap_sample_data.csv", index=False)
print("✓ Sample data saved to 'shap_sample_data.csv'")

feature_importance.to_csv("feature_importance.csv", index=False)
print("✓ Feature importance saved to 'feature_importance.csv'")

print("\n" + "="*60)
print("✅ SHAP ANALYSIS COMPLETE - NO CRASH!")
print("="*60)
print("\nGenerated files:")
print("  1. shap_values.csv - Raw SHAP values")
print("  2. shap_sample_data.csv - Sample data used")
print("  3. feature_importance.csv - Feature rankings")
print("\nTo create plots safely, run the plotting script separately.")


[7/7] Saving results...
✓ SHAP values saved to 'shap_values.csv'
✓ Sample data saved to 'shap_sample_data.csv'
✓ Feature importance saved to 'feature_importance.csv'

✅ SHAP ANALYSIS COMPLETE - NO CRASH!

Generated files:
  1. shap_values.csv - Raw SHAP values
  2. shap_sample_data.csv - Sample data used
  3. feature_importance.csv - Feature rankings

To create plots safely, run the plotting script separately.
