# Phase 5: Comprehensive Evaluation & Interpretability

**Objective:** Complete system evaluation with interpretability analysis

**Goals:**
- Evaluate complete end-to-end system
- Analyze feature importance with SHAP
- Perform ablation studies
- Generate comprehensive visualizations
- Document key insights and recommendations

## 1. Setup and Imports

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
import sys
import shap
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../src')

# ML libraries
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)

# Display settings
# Show every DataFrame column when printing instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# IPython/Jupyter magic (not plain Python): render matplotlib figures inline.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots created below.
sns.set_style('whitegrid')

print("âœ… All libraries imported successfully")

## 2. Load All Components

In [None]:
# Restore the persisted artefacts this notebook works on: data splits,
# fitted components and the ensemble's stored uncertainty estimates.
# NOTE(review): joblib.load unpickles arbitrary objects -- only point these
# paths at files you trust.
print("Loading data and models...")

# Data splits (loaded in the order train features, test features, test labels)
X_train, X_test, y_test = map(joblib.load, (
    '../data/splits/X_train.pkl',
    '../data/splits/X_test.pkl',
    '../data/splits/y_test.pkl',
))

# Fitted components
bootstrap_ensemble, escalation_system, preprocessor = map(joblib.load, (
    '../results/models/bootstrap_ensemble.pkl',
    '../results/models/escalation_system.pkl',
    '../results/models/preprocessor.pkl',
))

# Stored estimates; only the 'test' entry is used here
uncertainty_data = joblib.load('../results/models/uncertainty_estimates.pkl')
test_estimates = uncertainty_data['test']
proba_test, uncertainty_test, y_pred_test = (
    test_estimates[field] for field in ('proba', 'uncertainty', 'y_pred')
)

n_test_samples = len(y_test)
print(f"Test samples: {n_test_samples}")
n_features = len(preprocessor.feature_names)
print(f"Features: {n_features}")
print("âœ… All components loaded")

## 3. Complete System Evaluation

In [None]:
# Evaluate complete system
# Splits the test set into automated and escalated decisions using the
# escalation system's mask, then reports metrics for each part.
print("Evaluating complete credit risk assessment system...\n")

# The mask is negated with ~ below, so it is presumably a boolean array
# where True means "escalate" -- TODO confirm against process_predictions.
escalate_mask = escalation_system.process_predictions(proba_test, uncertainty_test)
automated_mask = ~escalate_mask

n_automated = np.sum(automated_mask)
n_escalated = np.sum(escalate_mask)
n_total = len(y_test)
rule = "="*60

print(rule)
print("COMPLETE SYSTEM PERFORMANCE (TEST SET)")
print(rule)

print(f"\nDecision Distribution:")
print(f"  Total Samples:        {n_total}")
auto_pct = n_automated / n_total * 100
print(f"  Automated:            {n_automated} ({auto_pct:.1f}%)")
esc_pct = n_escalated / n_total * 100
print(f"  Escalated to Human:   {n_escalated} ({esc_pct:.1f}%)")

# Metrics on the automated subset (these names are reused by later cells)
if n_automated > 0:
    y_test_auto = y_test.values[automated_mask]
    y_pred_auto = y_pred_test[automated_mask]
    proba_auto = proba_test[automated_mask, 1]  # column 1 of the stored probabilities

    print(f"\nAutomated Decisions Performance:")
    auto_accuracy = accuracy_score(y_test_auto, y_pred_auto)
    print(f"  Accuracy:             {auto_accuracy:.4f}")
    auto_precision = precision_score(y_test_auto, y_pred_auto)
    print(f"  Precision:            {auto_precision:.4f}")
    auto_recall = recall_score(y_test_auto, y_pred_auto)
    print(f"  Recall:               {auto_recall:.4f}")
    auto_f1 = f1_score(y_test_auto, y_pred_auto)
    print(f"  F1-Score:             {auto_f1:.4f}")
    auto_auc = roc_auc_score(y_test_auto, proba_auto)
    print(f"  AUC-ROC:              {auto_auc:.4f}")

# Escalated subset: how the model would have done had these been automated
if n_escalated > 0:
    y_test_esc = y_test.values[escalate_mask]
    y_pred_esc = y_pred_test[escalate_mask]

    acc_if_automated = accuracy_score(y_test_esc, y_pred_esc)
    esc_default_rate = y_test_esc.mean()
    esc_mean_uncertainty = uncertainty_test[escalate_mask].mean()

    print(f"\nEscalated Cases Analysis:")
    print(f"  Accuracy if Automated: {acc_if_automated:.4f}")
    print(f"  Default Rate:          {esc_default_rate:.2%}")
    print(f"  Avg Uncertainty:       {esc_mean_uncertainty:.4f}")

print(f"\n" + "="*60)

## 4. Confusion Matrix Analysis

In [None]:
# Create comprehensive confusion matrix visualization
# Three panels: all test predictions, the automated subset, the escalated subset.
#
# The class order is pinned via `labels=` so every matrix is 2x2 and lines up
# with the 'Paid'/'Default' tick labels even when a subset contains only one
# class (without it, confusion_matrix would return a 1x1 matrix there).
# NOTE(review): assumes targets are encoded 0 = Paid, 1 = Default -- confirm.
CLASS_LABELS = [0, 1]
CLASS_NAMES = ['Paid', 'Default']


def _draw_confusion_panel(ax, y_true, y_pred, cmap, title, xlabel='Predicted Label'):
    """Draw one annotated confusion-matrix heatmap on *ax* and return the matrix."""
    cm = confusion_matrix(y_true, y_pred, labels=CLASS_LABELS)
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap, ax=ax,
                xticklabels=CLASS_NAMES,
                yticklabels=CLASS_NAMES,
                cbar_kws={'label': 'Count'})
    ax.set_title(title, fontweight='bold', fontsize=12)
    ax.set_ylabel('True Label', fontweight='bold')
    ax.set_xlabel(xlabel, fontweight='bold')
    return cm


def _draw_empty_panel(ax, title):
    """Show a titled placeholder instead of blank axes for an empty subset."""
    ax.axis('off')
    ax.set_title(title, fontweight='bold', fontsize=12)
    ax.text(0.5, 0.5, 'No samples', ha='center', va='center',
            transform=ax.transAxes)


fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Overall confusion matrix
cm_overall = _draw_confusion_panel(
    axes[0], y_test, y_pred_test, 'Blues',
    'Overall Predictions\n(Before Escalation)')

# Automated decisions only
if n_automated > 0:
    cm_auto = _draw_confusion_panel(
        axes[1], y_test_auto, y_pred_auto, 'Greens',
        f'Automated Decisions Only\n({n_automated} samples)')
else:
    _draw_empty_panel(axes[1], 'Automated Decisions Only\n(0 samples)')

# Escalated cases
if n_escalated > 0:
    cm_esc = _draw_confusion_panel(
        axes[2], y_test_esc, y_pred_esc, 'Oranges',
        f'Escalated to Human\n({n_escalated} samples)',
        xlabel='Predicted Label (if automated)')
else:
    _draw_empty_panel(axes[2], 'Escalated to Human\n(0 samples)')

plt.tight_layout()
plt.savefig('../results/figures/final_confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()
print("âœ… Confusion matrices saved")

## 5. SHAP Interpretability Analysis

In [None]:
# Select a base model from the ensemble for SHAP analysis
# Explains the first ensemble member on a random subsample of the test set.
print("Performing SHAP analysis...")
print("Note: Using first model from bootstrap ensemble for efficiency\n")

base_model = bootstrap_ensemble.models[0]

# Sample data for SHAP (to speed up computation): at most 1000 rows, drawn
# without replacement; the fixed seed keeps the subsample reproducible.
np.random.seed(42)
sample_indices = np.random.choice(len(X_test), min(1000, len(X_test)), replace=False)
X_sample = X_test.iloc[sample_indices]

# Create SHAP explainer
explainer = shap.TreeExplainer(base_model)
shap_values = explainer.shap_values(X_sample)

# Depending on the model type and SHAP version, a classifier may yield one
# array per class (a list) or a 3-D (samples, features, outputs) array rather
# than a 2-D (samples, features) array. The ranking and dependence plots in
# the following cells need the 2-D form, so keep a single output.
# NOTE(review): assumes the last output is the positive ("Default") class.
if isinstance(shap_values, list):
    shap_values = shap_values[-1]
shap_values = np.asarray(shap_values)
if shap_values.ndim == 3:
    shap_values = shap_values[:, :, -1]

print("âœ… SHAP values calculated")

In [None]:
# SHAP Summary Plot (Feature Importance)
# Bar chart of the 20 features with the largest mean |SHAP| value.
plt.figure(figsize=(10, 8))
# summary_plot defaults to plot_size="auto", which resizes the current figure
# and discards the figsize requested above; plot_size=None leaves it untouched.
shap.summary_plot(shap_values, X_sample, plot_type="bar", show=False,
                  max_display=20, plot_size=None)
plt.title('SHAP Feature Importance', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('../results/figures/shap_importance.png', dpi=300, bbox_inches='tight')
plt.show()
print("âœ… SHAP importance plot saved")

In [None]:
# SHAP Summary Plot (Impact Direction)
# Per-sample SHAP values for the top 20 features.
plt.figure(figsize=(10, 10))
# summary_plot defaults to plot_size="auto", which resizes the current figure
# and discards the figsize requested above; plot_size=None leaves it untouched.
shap.summary_plot(shap_values, X_sample, show=False, max_display=20,
                  plot_size=None)
plt.title('SHAP Feature Impact Analysis', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('../results/figures/shap_summary.png', dpi=300, bbox_inches='tight')
plt.show()
print("âœ… SHAP summary plot saved")

In [None]:
# SHAP Dependence Plots for top 3 features
# Features are ranked by mean absolute SHAP value over the sampled rows.
feature_importance = np.abs(shap_values).mean(axis=0)
# Descending importance order, first three entries kept.
top_features_idx = np.argsort(feature_importance)[::-1][:3]
top_features = [X_sample.columns[col] for col in top_features_idx]

fig, axes = plt.subplots(1, 3, figsize=(16, 5))
# One panel per top feature, left to right in decreasing importance.
for panel, feat_idx, feat_name in zip(axes, top_features_idx, top_features):
    shap.dependence_plot(feat_idx, shap_values, X_sample, ax=panel, show=False)
    panel.set_title(f'SHAP Dependence: {feat_name}', fontweight='bold', fontsize=11)

plt.tight_layout()
plt.savefig('../results/figures/shap_dependence.png', dpi=300, bbox_inches='tight')
plt.show()
print("âœ… SHAP dependence plots saved")

## 6. Ablation Study

In [None]:
# Ablation study: Compare different configurations
# Rows: single baseline model, bootstrap ensemble without escalation, and the
# complete system scored on automated decisions only.
print("Performing ablation study...\n")


def _ablation_row(configuration, y_true, y_pred, y_score, automation_rate, escalation_rate):
    """Return one row of the ablation table for the given predictions."""
    return {
        'Configuration': configuration,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred),
        'AUC-ROC': roc_auc_score(y_true, y_score),
        'Automation Rate': automation_rate,
        'Escalation Rate': escalation_rate
    }


# Baseline model for comparison
baseline_model = joblib.load('../results/models/xgboost_best.pkl')
y_pred_baseline = baseline_model.predict(X_test)
proba_baseline = baseline_model.predict_proba(X_test)[:, 1]

ablation_results = [
    # 1. Baseline (single model, no uncertainty, no escalation)
    _ablation_row('Baseline (Single Model)',
                  y_test, y_pred_baseline, proba_baseline, 1.0, 0.0),
    # 2. Bootstrap Ensemble (no escalation)
    _ablation_row('Bootstrap Ensemble Only',
                  y_test, y_pred_test, proba_test[:, 1], 1.0, 0.0),
]

# 3. Complete System (ensemble + escalation); only when something was automated
if n_automated > 0:
    ablation_results.append(
        _ablation_row('Complete System (Ensemble + Escalation)',
                      y_test_auto, y_pred_auto, proba_auto,
                      n_automated / len(y_test),
                      n_escalated / len(y_test))
    )

ablation_df = pd.DataFrame(ablation_results)

rule = "="*80
print(rule)
print("ABLATION STUDY RESULTS")
print(rule)
print(ablation_df.to_string(index=False))
print(rule)

In [None]:
# Visualize ablation study
# 2x2 grid of bar charts, one per metric, one bar per configuration.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
colors = ['#3498db', '#2ecc71', '#e74c3c']
positions = range(len(ablation_df))

# axes.flat walks the grid row by row, matching the metric order above.
for ax, metric in zip(axes.flat, metrics):
    values = ablation_df[metric].values
    bars = ax.bar(positions, values, color=colors)
    ax.set_xticks(positions)
    ax.set_xticklabels(ablation_df['Configuration'], rotation=15, ha='right')
    ax.set_ylabel(metric, fontsize=12, fontweight='bold')
    ax.set_title(f'{metric} Comparison', fontsize=14, fontweight='bold')
    # Start the y-axis just below the smallest bar so differences stay visible.
    ax.set_ylim([min(values) - 0.02, 1.0])
    ax.grid(axis='y', alpha=0.3)

    # Write each bar's value just above it
    for bar in bars:
        bar_top = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width()/2.
        ax.text(bar_centre, bar_top, f'{bar_top:.4f}',
                ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.savefig('../results/figures/ablation_study.png', dpi=300, bbox_inches='tight')
plt.show()
print("âœ… Ablation study visualization saved")

## 7. Final ROC Curve Comparison

In [None]:
# Plot ROC curves for all configurations
# Curves: baseline model, bootstrap ensemble, and the automated subset of the
# complete system, plus the chance diagonal.


def _roc_with_auc(y_true, scores):
    """Return (fpr, tpr, auc) for the given labels and scores."""
    fpr, tpr, _ = roc_curve(y_true, scores)
    return fpr, tpr, roc_auc_score(y_true, scores)


plt.figure(figsize=(10, 8))

# Baseline
fpr_base, tpr_base, auc_base = _roc_with_auc(y_test, proba_baseline)
plt.plot(fpr_base, tpr_base, linewidth=2, color='#3498db',
         label=f'Baseline Model (AUC={auc_base:.3f})')

# Bootstrap Ensemble
ensemble_scores = proba_test[:, 1]
fpr_ens, tpr_ens, auc_ens = _roc_with_auc(y_test, ensemble_scores)
plt.plot(fpr_ens, tpr_ens, linewidth=2, color='#2ecc71',
         label=f'Bootstrap Ensemble (AUC={auc_ens:.3f})')

# Complete System (automated only)
if n_automated > 0:
    fpr_auto, tpr_auto, auc_auto = _roc_with_auc(y_test_auto, proba_auto)
    plt.plot(fpr_auto, tpr_auto, linewidth=2, color='#e74c3c',
             label=f'Complete System - Automated ({n_automated} samples, AUC={auc_auto:.3f})')

# Random baseline (chance diagonal)
plt.plot([0, 1], [0, 1], 'k--', label='Random', linewidth=1)

plt.xlabel('False Positive Rate', fontsize=12, fontweight='bold')
plt.ylabel('True Positive Rate', fontsize=12, fontweight='bold')
plt.title('ROC Curves - System Comparison', fontsize=14, fontweight='bold')
plt.legend(loc='lower right', fontsize=10)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('../results/figures/final_roc_comparison.png', dpi=300, bbox_inches='tight')
plt.show()
print("âœ… Final ROC comparison saved")

## 8. Business Impact Analysis

In [None]:
# Calculate business impact
# Prints the cost figures returned by the escalation system together with
# automation counts and a rough time-saving estimate.
rule = "="*60
print("\nBusiness Impact Analysis:")
print(rule)

# Costs from escalation system
costs = escalation_system.calculate_costs(y_test.values, y_pred_test, escalate_mask)

print(f"\nCost Analysis (Test Set):")
print(f"  Baseline Total Cost:      ${costs['baseline_cost']:.2f}")
print(f"  System Total Cost:        ${costs['total_cost']:.2f}")
print(f"  Cost Savings:             ${costs['cost_savings']:.2f}")
savings_pct = costs['cost_savings']/costs['baseline_cost']*100
print(f"  Savings Rate:             {savings_pct:.1f}%")

print(f"\nCost Breakdown:")
print(f"  False Positive Cost:      ${costs['false_positive_cost']:.2f}")
print(f"  False Negative Cost:      ${costs['false_negative_cost']:.2f}")
print(f"  Human Review Cost:        ${costs['escalation_cost']:.2f}")

n_total = len(y_test)
automated_pct = n_automated/n_total*100
escalated_pct = n_escalated/n_total*100

print(f"\nOperational Metrics:")
print(f"  Applications Processed:   {n_total}")
print(f"  Automated Decisions:      {n_automated} ({automated_pct:.1f}%)")
print(f"  Human Reviews Required:   {n_escalated} ({escalated_pct:.1f}%)")

# Efficiency gain: every automated decision is counted as one manual review avoided
time_per_manual_review = 15  # minutes
time_saved = n_automated * time_per_manual_review / 60  # hours
print(f"\nTime Efficiency:")
print(f"  Time Saved (vs manual):   {time_saved:.1f} hours")
print(f"  Productivity Gain:        {automated_pct:.1f}%")

print(rule)

In [None]:
# Visualize business impact
# 2x2 dashboard: total cost, cost breakdown, decision split, accuracy comparison.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
ax_total, ax_breakdown = axes[0]
ax_decisions, ax_accuracy = axes[1]

# --- Cost comparison (top-left) ---
costs_data = ['Baseline', 'With System']
costs_values = [costs['baseline_cost'], costs['total_cost']]
colors_cost = ['#e74c3c', '#2ecc71']
bars = ax_total.bar(costs_data, costs_values, color=colors_cost)
ax_total.set_ylabel('Total Cost ($)', fontsize=12, fontweight='bold')
ax_total.set_title('Total Cost Comparison', fontsize=14, fontweight='bold')
ax_total.grid(axis='y', alpha=0.3)
for bar in bars:
    bar_top = bar.get_height()
    ax_total.text(bar.get_x() + bar.get_width()/2., bar_top,
                  f'${bar_top:.2f}', ha='center', va='bottom', fontsize=11)

# --- Cost breakdown (top-right) ---
cost_breakdown = {
    'False\nPositives': costs['false_positive_cost'],
    'False\nNegatives': costs['false_negative_cost'],
    'Human\nReview': costs['escalation_cost']
}
ax_breakdown.bar(cost_breakdown.keys(), cost_breakdown.values(),
                 color=['#e74c3c', '#f39c12', '#3498db'])
ax_breakdown.set_ylabel('Cost ($)', fontsize=12, fontweight='bold')
ax_breakdown.set_title('Cost Breakdown', fontsize=14, fontweight='bold')
ax_breakdown.grid(axis='y', alpha=0.3)

# --- Decision distribution (bottom-left) ---
decision_data = ['Automated', 'Escalated']
decision_values = [n_automated, n_escalated]
explode = (0.05, 0)  # pull the 'Automated' wedge out slightly
ax_decisions.pie(decision_values, labels=decision_data, autopct='%1.1f%%',
                 startangle=90, colors=['#2ecc71', '#e74c3c'], explode=explode)
ax_decisions.set_title('Decision Distribution', fontsize=14, fontweight='bold')

# --- Accuracy by category (bottom-right) ---
# The third ablation row only exists when something was automated; fall back to 0.
system_accuracy = ablation_df.loc[2, 'Accuracy'] if len(ablation_df) > 2 else 0
acc_data = {
    'Baseline\nModel': ablation_df.loc[0, 'Accuracy'],
    'Bootstrap\nEnsemble': ablation_df.loc[1, 'Accuracy'],
    'System\n(Automated)': system_accuracy
}
bars = ax_accuracy.bar(acc_data.keys(), acc_data.values(),
                       color=['#3498db', '#2ecc71', '#e74c3c'])
ax_accuracy.set_ylabel('Accuracy', fontsize=12, fontweight='bold')
ax_accuracy.set_title('Accuracy Comparison', fontsize=14, fontweight='bold')
ax_accuracy.set_ylim([min(acc_data.values()) - 0.01, 1.0])
ax_accuracy.grid(axis='y', alpha=0.3)
for bar in bars:
    bar_top = bar.get_height()
    ax_accuracy.text(bar.get_x() + bar.get_width()/2., bar_top,
                     f'{bar_top:.4f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.savefig('../results/figures/business_impact.png', dpi=300, bbox_inches='tight')
plt.show()
print("âœ… Business impact visualization saved")

## 9. Save Final Results

In [None]:
# Save ablation study results
# Writes the ablation table and a six-row business-impact summary as CSV.
ablation_df.to_csv('../results/reports/ablation_study_results.csv', index=False)
print("âœ… Ablation study results saved")

# Save business impact summary: (metric name, value) pairs in report order
n_total = len(y_test)
summary_rows = [
    ('Total Cost Baseline', costs['baseline_cost']),
    ('Total Cost with System', costs['total_cost']),
    ('Cost Savings', costs['cost_savings']),
    ('Savings Rate (%)', costs['cost_savings']/costs['baseline_cost']*100),
    ('Automation Rate (%)', n_automated/n_total*100),
    ('Escalation Rate (%)', n_escalated/n_total*100),
]
business_impact = pd.DataFrame(
    [{'Metric': metric, 'Value': value} for metric, value in summary_rows]
)
business_impact.to_csv('../results/reports/business_impact_summary.csv', index=False)
print("âœ… Business impact summary saved")

## 10. Final Summary

In [None]:
# Final recap of Phase 5: achievements, headline metrics, success criteria
# and next steps.
#
# y_test_auto / y_pred_auto / proba_auto are only created when at least one
# decision was automated (they are assigned under `if n_automated > 0` in the
# evaluation cell), so the automated-subset metrics are computed once here
# behind the same guard instead of unconditionally. Criteria that are not met
# are reported explicitly rather than silently omitted.
has_automated = n_automated > 0
automation_rate = n_automated / len(y_test)
auto_accuracy = accuracy_score(y_test_auto, y_pred_auto) if has_automated else None
auto_auc = roc_auc_score(y_test_auto, proba_auto) if has_automated else None

cost_savings = costs['cost_savings']
baseline_cost = costs['baseline_cost']
# A zero baseline cost makes the savings rate undefined; report n/a instead of dividing.
savings_rate = cost_savings / baseline_cost * 100 if baseline_cost else None

print("\n" + "="*60)
print("PHASE 5 COMPLETE: COMPREHENSIVE EVALUATION")
print("="*60)

print("\nâœ… Achievements:")
print("   - Complete end-to-end system evaluated")
print("   - SHAP interpretability analysis completed")
print("   - Ablation study performed")
print("   - Business impact quantified")
print("   - All visualizations generated")

print("\nðŸ“Š Final System Performance:")
print(f"   Automation Rate:          {automation_rate*100:.1f}%")
if has_automated:
    print(f"   Automated Accuracy:       {auto_accuracy:.4f}")
    print(f"   Automated AUC-ROC:        {auto_auc:.4f}")
else:
    print("   Automated Accuracy:       n/a (no automated decisions)")
    print("   Automated AUC-ROC:        n/a (no automated decisions)")
print(f"   Cost Savings:             ${cost_savings:.2f}")
if savings_rate is not None:
    print(f"   Savings Rate:             {savings_rate:.1f}%")
else:
    print("   Savings Rate:             n/a (baseline cost is zero)")

print("\nðŸŽ¯ Project Success Criteria:")
if automation_rate >= 0.70:
    print(f"   âœ… Automation â‰¥70%: {automation_rate*100:.1f}%")
else:
    print(f"   [NOT MET] Automation >=70%: {automation_rate*100:.1f}%")
if has_automated and auto_accuracy >= 0.85:
    print(f"   âœ… Accuracy â‰¥85%: {auto_accuracy:.4f}")
elif has_automated:
    print(f"   [NOT MET] Accuracy >=85%: {auto_accuracy:.4f}")
else:
    print("   [NOT MET] Accuracy >=85%: n/a (no automated decisions)")
if cost_savings > 0:
    print(f"   âœ… Positive cost savings: ${cost_savings:.2f}")
else:
    print(f"   [NOT MET] Positive cost savings: ${cost_savings:.2f}")

print("\nðŸš€ Next: Phase 6 - Final Documentation")
print("   - Comprehensive project report")
print("   - Presentation slides")
print("   - Code documentation")
print("   - README and deployment guide")
print("="*60)