# 03_evaluation.ipynb


In [4]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (classification_report, confusion_matrix, 
                             roc_auc_score, roc_curve, precision_recall_curve,
                             average_precision_score, precision_score, recall_score, f1_score)
from sklearn.preprocessing import StandardScaler
import pickle
import warnings
warnings.filterwarnings('ignore')


In [5]:

print("=" * 80)
print("HEALTHCARE PROVIDER FRAUD DETECTION - EVALUATION & ERROR ANALYSIS")
print("=" * 80)


HEALTHCARE PROVIDER FRAUD DETECTION - EVALUATION & ERROR ANALYSIS




# 1. LOAD MODEL, SCALER, AND TEST DATA


In [6]:

print("\n1. LOADING MODEL, SCALER, AND TEST DATA...")

# Load saved model and scaler
with open('../models/best_model.pkl', 'rb') as f:
    model = pickle.load(f)

with open('../models/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

print(f"✓ Model loaded: {type(model).__name__}")
print(f"✓ Scaler loaded: StandardScaler")

# Load test data
test_data = pd.read_csv('../data/processed_test_provider_data.csv')
print(f"✓ Test data loaded: {test_data.shape}")

# Prepare test features and target
providers_test = test_data['Provider']
X_test = test_data.drop(['Provider', 'PotentialFraud'], axis=1)
y_test = test_data['PotentialFraud'].map({'Yes': 1, 'No': 0})

# Scale test features
X_test_scaled = scaler.transform(X_test)

print(f"✓ Test samples: {X_test.shape[0]}")
print(f"✓ Test features: {X_test.shape[1]}")
print(f"✓ Fraud cases in test: {y_test.sum()}/{len(y_test)} ({y_test.sum()/len(y_test)*100:.2f}%)")



1. LOADING MODEL, SCALER, AND TEST DATA...
✓ Model loaded: RandomForestClassifier
✓ Scaler loaded: StandardScaler
✓ Test data loaded: (1082, 53)
✓ Test samples: 1082
✓ Test features: 51
✓ Fraud cases in test: 101/1082 (9.33%)




# 2. MODEL PREDICTIONS ON TEST SET


In [7]:

print("\n2. GENERATING PREDICTIONS ON TEST SET...")

y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

print("✓ Predictions generated")
print(f"✓ Predicted fraud cases: {y_pred.sum()}/{len(y_pred)} ({y_pred.sum()/len(y_pred)*100:.2f}%)")



2. GENERATING PREDICTIONS ON TEST SET...
✓ Predictions generated
✓ Predicted fraud cases: 165/1082 (15.25%)




# 3. COMPREHENSIVE EVALUATION METRICS


In [8]:

print("\n3. COMPREHENSIVE EVALUATION METRICS...")

# Calculate all metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
pr_auc = average_precision_score(y_test, y_pred_proba)

print("\n" + "=" * 80)
print("FINAL TEST SET PERFORMANCE")
print("=" * 80)
print(f"Precision:    {precision:.4f}  (Of flagged providers, {precision*100:.1f}% are actually fraudulent)")
print(f"Recall:       {recall:.4f}  (Model catches {recall*100:.1f}% of all fraudulent providers)")
print(f"F1-Score:     {f1:.4f}  (Harmonic mean of precision and recall)")
print(f"ROC-AUC:      {roc_auc:.4f}  (Overall discriminative ability)")
print(f"PR-AUC:       {pr_auc:.4f}  (Area under precision-recall curve)")
print("=" * 80)

# Detailed classification report
print("\n" + "=" * 80)
print("DETAILED CLASSIFICATION REPORT")
print("=" * 80)
print(classification_report(y_test, y_pred, target_names=['Legitimate', 'Fraudulent'], digits=4))



3. COMPREHENSIVE EVALUATION METRICS...

FINAL TEST SET PERFORMANCE
Precision:    0.5394  (Of flagged providers, 53.9% are actually fraudulent)
Recall:       0.8812  (Model catches 88.1% of all fraudulent providers)
F1-Score:     0.6692  (Harmonic mean of precision and recall)
ROC-AUC:      0.9690  (Overall discriminative ability)
PR-AUC:       0.7530  (Area under precision-recall curve)

DETAILED CLASSIFICATION REPORT
              precision    recall  f1-score   support

  Legitimate     0.9869    0.9225    0.9536       981
  Fraudulent     0.5394    0.8812    0.6692       101

    accuracy                         0.9187      1082
   macro avg     0.7632    0.9019    0.8114      1082
weighted avg     0.9451    0.9187    0.9271      1082





# 4. CONFUSION MATRIX ANALYSIS


In [9]:

print("\n4. CONFUSION MATRIX ANALYSIS...")

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

print("\n" + "=" * 80)
print("CONFUSION MATRIX BREAKDOWN")
print("=" * 80)
print(f"True Negatives (TN):  {tn:4d}  - Legitimate providers correctly identified")
print(f"False Positives (FP): {fp:4d}  - Legitimate providers wrongly flagged (Type I Error)")
print(f"False Negatives (FN): {fn:4d}  - Fraudulent providers missed (Type II Error)")
print(f"True Positives (TP):  {tp:4d}  - Fraudulent providers correctly caught")
print("=" * 80)

print(f"\nError Rates:")
print(f"  False Positive Rate: {fp/(fp+tn)*100:.2f}% (of all legitimate)")
print(f"  False Negative Rate: {fn/(fn+tp)*100:.2f}% (of all fraudulent)")

# Visualize confusion matrix
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Raw counts
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0],
            xticklabels=['Legitimate', 'Fraudulent'],
            yticklabels=['Legitimate', 'Fraudulent'],
            annot_kws={'size': 14, 'weight': 'bold'})
axes[0].set_ylabel('True Label', fontweight='bold', fontsize=11)
axes[0].set_xlabel('Predicted Label', fontweight='bold', fontsize=11)
axes[0].set_title('Confusion Matrix (Counts)', fontweight='bold', fontsize=12)

# Add text annotations
for i in range(2):
    for j in range(2):
        if i == 0 and j == 0:
            label = f'TN\n{cm[i,j]}'
        elif i == 0 and j == 1:
            label = f'FP\n{cm[i,j]}'
        elif i == 1 and j == 0:
            label = f'FN\n{cm[i,j]}'
        else:
            label = f'TP\n{cm[i,j]}'

# Normalized
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Blues', ax=axes[1],
            xticklabels=['Legitimate', 'Fraudulent'],
            yticklabels=['Legitimate', 'Fraudulent'],
            annot_kws={'size': 14, 'weight': 'bold'})
axes[1].set_ylabel('True Label', fontweight='bold', fontsize=11)
axes[1].set_xlabel('Predicted Label', fontweight='bold', fontsize=11)
axes[1].set_title('Confusion Matrix (Percentages)', fontweight='bold', fontsize=12)

plt.tight_layout()
plt.savefig('../reports/figures/test_confusion_matrix.png', dpi=300, bbox_inches='tight')
print("\n✓ Saved: test_confusion_matrix.png")
plt.close()



4. CONFUSION MATRIX ANALYSIS...

CONFUSION MATRIX BREAKDOWN
True Negatives (TN):   905  - Legitimate providers correctly identified
False Positives (FP):   76  - Legitimate providers wrongly flagged (Type I Error)
False Negatives (FN):   12  - Fraudulent providers missed (Type II Error)
True Positives (TP):    89  - Fraudulent providers correctly caught

Error Rates:
  False Positive Rate: 7.75% (of all legitimate)
  False Negative Rate: 11.88% (of all fraudulent)

✓ Saved: test_confusion_matrix.png

✓ Saved: test_confusion_matrix.png




# 5. ROC AND PRECISION-RECALL CURVES ON TEST SET


In [10]:

print("\n5. GENERATING ROC AND PR CURVES FOR TEST SET...")

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# ROC Curve
fpr, tpr, thresholds_roc = roc_curve(y_test, y_pred_proba)
axes[0].plot(fpr, tpr, linewidth=3, label=f'Model (AUC = {roc_auc:.3f})', color='#2ca02c')
axes[0].plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random (AUC = 0.500)')
axes[0].set_xlabel('False Positive Rate', fontweight='bold', fontsize=11)
axes[0].set_ylabel('True Positive Rate', fontweight='bold', fontsize=11)
axes[0].set_title('ROC Curve - Test Set', fontweight='bold', fontsize=12)
axes[0].legend(loc='lower right', fontsize=10)
axes[0].grid(True, alpha=0.3)

# Precision-Recall Curve
precision_curve, recall_curve, thresholds_pr = precision_recall_curve(y_test, y_pred_proba)
baseline = y_test.sum() / len(y_test)
axes[1].plot(recall_curve, precision_curve, linewidth=3, 
            label=f'Model (AP = {pr_auc:.3f})', color='#2ca02c')
axes[1].axhline(y=baseline, color='k', linestyle='--', linewidth=2,
               label=f'Random (AP = {baseline:.3f})')
axes[1].set_xlabel('Recall', fontweight='bold', fontsize=11)
axes[1].set_ylabel('Precision', fontweight='bold', fontsize=11)
axes[1].set_title('Precision-Recall Curve - Test Set', fontweight='bold', fontsize=12)
axes[1].legend(loc='upper right', fontsize=10)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../reports/figures/test_roc_pr_curves.png', dpi=300, bbox_inches='tight')
print("✓ Saved: test_roc_pr_curves.png")
plt.close()



5. GENERATING ROC AND PR CURVES FOR TEST SET...
✓ Saved: test_roc_pr_curves.png
✓ Saved: test_roc_pr_curves.png




# 6. COST-BENEFIT ANALYSIS


In [11]:

print("\n6. COST-BENEFIT ANALYSIS...")

# Define costs (realistic estimates)
cost_investigation = 5000        # Cost to investigate one provider
cost_missed_fraud = 100000       # Average loss from one missed fraud case
benefit_caught_fraud = 100000    # Benefit/savings from catching one fraud
cost_damaged_reputation = 2000   # Additional cost from false accusations

# Calculate costs and benefits
cost_fp = fp * (cost_investigation + cost_damaged_reputation)
cost_fn = fn * cost_missed_fraud
cost_tp = tp * cost_investigation
benefit_tp = tp * benefit_caught_fraud

total_cost = cost_fp + cost_fn + cost_tp
total_benefit = benefit_tp
net_benefit = total_benefit - total_cost

print("\n" + "=" * 80)
print("COST-BENEFIT ANALYSIS")
print("=" * 80)
print(f"\nCosts:")
print(f"  False Positives (Investigation + Reputation): ${cost_fp:>12,.0f}")
print(f"    ({fp} cases × ${cost_investigation + cost_damaged_reputation:,})")
print(f"  False Negatives (Missed Fraud):               ${cost_fn:>12,.0f}")
print(f"    ({fn} cases × ${cost_missed_fraud:,})")
print(f"  True Positives (Investigation Cost):          ${cost_tp:>12,.0f}")
print(f"    ({tp} cases × ${cost_investigation:,})")
print(f"  {'-' * 60}")
print(f"  Total Costs:                                  ${total_cost:>12,.0f}")

print(f"\nBenefits:")
print(f"  Fraud Prevented (True Positives):             ${benefit_tp:>12,.0f}")
print(f"    ({tp} cases × ${benefit_caught_fraud:,})")

print(f"\n{'=' * 80}")
print(f"  NET BENEFIT:                                  ${net_benefit:>12,.0f}")
print(f"{'=' * 80}")

# Compare with baselines
print(f"\nComparison with Baseline Strategies:")

# Baseline 1: Random 10% investigation
random_inv_rate = 0.1
baseline_tp_random = int(y_test.sum() * random_inv_rate)
baseline_fp_random = int((len(y_test) - y_test.sum()) * random_inv_rate)
baseline_fn_random = y_test.sum() - baseline_tp_random
baseline_benefit_random = (baseline_tp_random * benefit_caught_fraud) - \
                          ((baseline_tp_random + baseline_fp_random) * cost_investigation) - \
                          (baseline_fn_random * cost_missed_fraud)

print(f"  1. Random 10% Investigation:   ${baseline_benefit_random:>12,.0f}")

# Baseline 2: Investigate all (100%)
baseline_all_cost = len(y_test) * cost_investigation
baseline_all_benefit = y_test.sum() * benefit_caught_fraud
baseline_all_net = baseline_all_benefit - baseline_all_cost

print(f"  2. Investigate All Providers:  ${baseline_all_net:>12,.0f}")

# Baseline 3: Investigate none (do nothing)
baseline_none = -(y_test.sum() * cost_missed_fraud)
print(f"  3. No Investigation (Do Nothing): ${baseline_none:>12,.0f}")

print(f"\n  Our Model Improvement:")
print(f"    vs Random:          ${net_benefit - baseline_benefit_random:>12,.0f}")
print(f"    vs Investigate All: ${net_benefit - baseline_all_net:>12,.0f}")
print(f"    vs Do Nothing:      ${net_benefit - baseline_none:>12,.0f}")

# ROI calculation
total_investigation_cost = (tp + fp) * cost_investigation
roi = (net_benefit / total_investigation_cost) * 100 if total_investigation_cost > 0 else 0

print(f"\n  Return on Investment (ROI):    {roi:>11,.1f}%")



6. COST-BENEFIT ANALYSIS...

COST-BENEFIT ANALYSIS

Costs:
  False Positives (Investigation + Reputation): $     532,000
    (76 cases × $7,000)
  False Negatives (Missed Fraud):               $   1,200,000
    (12 cases × $100,000)
  True Positives (Investigation Cost):          $     445,000
    (89 cases × $5,000)
  ------------------------------------------------------------
  Total Costs:                                  $   2,177,000

Benefits:
  Fraud Prevented (True Positives):             $   8,900,000
    (89 cases × $100,000)

  NET BENEFIT:                                  $   6,723,000

Comparison with Baseline Strategies:
  1. Random 10% Investigation:   $  -8,640,000
  2. Investigate All Providers:  $   4,690,000
  3. No Investigation (Do Nothing): $ -10,100,000

  Our Model Improvement:
    vs Random:          $  15,363,000
    vs Investigate All: $   2,033,000
    vs Do Nothing:      $  16,823,000

  Return on Investment (ROI):          814.9%




# 7. ERROR ANALYSIS - FALSE POSITIVES


In [12]:

print("\n" + "=" * 80)
print("7. ERROR ANALYSIS - FALSE POSITIVES")
print("=" * 80)

# Identify false positives
fp_mask = (y_test == 0) & (y_pred == 1)
fp_indices = np.where(fp_mask)[0]
fp_providers = providers_test.iloc[fp_indices].values
fp_features = X_test.iloc[fp_indices]
fp_probas = y_pred_proba[fp_mask]

print(f"\nTotal False Positives: {len(fp_indices)}")
print(f"False Positive Rate: {len(fp_indices)/(len(y_test)-y_test.sum())*100:.2f}%")

if len(fp_indices) > 0:
    # Select 3 cases with highest fraud probabilities (most confident errors)
    num_cases = min(3, len(fp_indices))
    top_fp_indices = np.argsort(fp_probas)[-num_cases:]
    
    # Load feature importance for analysis
    feature_importance = pd.read_csv('../reports/feature_importance.csv')
    top_features = feature_importance.head(10)['Feature'].tolist()
    
    # Get fraud and legitimate averages from training data
    train_data = pd.read_csv('../data/processed_train_provider_data.csv')
    X_train_all = train_data.drop(['Provider', 'PotentialFraud'], axis=1)
    y_train_all = train_data['PotentialFraud'].map({'Yes': 1, 'No': 0})
    
    for i, idx in enumerate(top_fp_indices, 1):
        print(f"\n{'=' * 80}")
        print(f"FALSE POSITIVE CASE STUDY #{i}")
        print(f"{'=' * 80}")
        print(f"Provider ID: {fp_providers[idx]}")
        print(f"Fraud Probability: {fp_probas[idx]:.4f} (Model was {fp_probas[idx]*100:.1f}% confident)")
        print(f"True Label: Legitimate (0)")
        print(f"Predicted Label: Fraudulent (1)")
        
        print(f"\n--- Key Characteristics ---")
        provider_features = fp_features.iloc[idx]
        
        for feature in top_features:
            if feature in provider_features.index:
                value = provider_features[feature]
                fraud_avg = X_train_all[y_train_all == 1][feature].mean()
                legit_avg = X_train_all[y_train_all == 0][feature].mean()
                
                # Determine which average it's closer to
                closer_to = "FRAUD" if abs(value - fraud_avg) < abs(value - legit_avg) else "Legitimate"
                
                print(f"\n  {feature}:")
                print(f"    This Provider: {value:>10.2f}")
                print(f"    Fraud Avg:     {fraud_avg:>10.2f}")
                print(f"    Legit Avg:     {legit_avg:>10.2f}")
                print(f"    → Closer to: {closer_to}")
        
        print(f"\n--- Analysis ---")
        print(f"  This legitimate provider exhibits patterns that resemble fraudulent")
        print(f"  behavior. Possible explanations:")
        print(f"    • High-volume specialty practice (e.g., cardiology, oncology)")
        print(f"    • Teaching hospital with complex cases")
        print(f"    • Serves high-risk patient population")
        print(f"    • Legitimate practice in high-cost region")
        print(f"\n  Recommendation:")
        print(f"    • Review provider credentials and specialty")
        print(f"    • Quick verification before full investigation")
        print(f"    • Consider adding specialty/region features to model")



7. ERROR ANALYSIS - FALSE POSITIVES

Total False Positives: 76
False Positive Rate: 7.75%

FALSE POSITIVE CASE STUDY #1
Provider ID: PRV51456
Fraud Probability: 0.9744 (Model was 97.4% confident)
True Label: Legitimate (0)
Predicted Label: Fraudulent (1)

--- Key Characteristics ---

  Inpatient_MaxLOS:
    This Provider:      35.00
    Fraud Avg:          23.64
    Legit Avg:           4.53
    → Closer to: FRAUD

  Inpatient_TotalLOS:
    This Provider:     657.00
    Fraud Avg:         252.82
    Legit Avg:          19.47
    → Closer to: FRAUD

  Inpatient_AvgClaimsPerBeneficiary:
    This Provider:       1.09
    Fraud Avg:           0.98
    Legit Avg:           0.35
    → Closer to: FRAUD

  Inpatient_NumBeneficiaries:
    This Provider:     103.00
    Fraud Avg:          38.78
    Legit Avg:           3.28
    → Closer to: FRAUD

  Inpatient_NumClaims:
    This Provider:     112.00
    Fraud Avg:          43.92
    Legit Avg:           3.49
    → Closer to: FRAUD

  Inpatient_



# 8. ERROR ANALYSIS - FALSE NEGATIVES


In [13]:

print("\n" + "=" * 80)
print("8. ERROR ANALYSIS - FALSE NEGATIVES")
print("=" * 80)

# Identify false negatives
fn_mask = (y_test == 1) & (y_pred == 0)
fn_indices = np.where(fn_mask)[0]
fn_providers = providers_test.iloc[fn_indices].values
fn_features = X_test.iloc[fn_indices]
fn_probas = y_pred_proba[fn_mask]

print(f"\nTotal False Negatives: {len(fn_indices)}")
print(f"False Negative Rate: {len(fn_indices)/y_test.sum()*100:.2f}%")
print(f"These are fraudulent providers that evaded detection!")

if len(fn_indices) > 0:
    # Select 3 cases with lowest fraud probabilities (most missed)
    num_cases = min(3, len(fn_indices))
    top_fn_indices = np.argsort(fn_probas)[:num_cases]
    
    for i, idx in enumerate(top_fn_indices, 1):
        print(f"\n{'=' * 80}")
        print(f"FALSE NEGATIVE CASE STUDY #{i}")
        print(f"{'=' * 80}")
        print(f"Provider ID: {fn_providers[idx]}")
        print(f"Fraud Probability: {fn_probas[idx]:.4f} (Model was only {fn_probas[idx]*100:.1f}% confident)")
        print(f"True Label: Fraudulent (1)")
        print(f"Predicted Label: Legitimate (0)")
        
        print(f"\n--- Key Characteristics ---")
        provider_features = fn_features.iloc[idx]
        
        for feature in top_features:
            if feature in provider_features.index:
                value = provider_features[feature]
                fraud_avg = X_train_all[y_train_all == 1][feature].mean()
                legit_avg = X_train_all[y_train_all == 0][feature].mean()
                
                closer_to = "Fraud" if abs(value - fraud_avg) < abs(value - legit_avg) else "LEGITIMATE"
                
                print(f"\n  {feature}:")
                print(f"    This Provider: {value:>10.2f}")
                print(f"    Fraud Avg:     {fraud_avg:>10.2f}")
                print(f"    Legit Avg:     {legit_avg:>10.2f}")
                print(f"    → Closer to: {closer_to}")
        
        print(f"\n--- Analysis ---")
        print(f"  This fraudulent provider successfully mimics legitimate behavior.")
        print(f"  Sophisticated fraud characteristics:")
        print(f"    • Operating 'under the radar' with moderate volumes")
        print(f"    • Subtle billing manipulation")
        print(f"    • Diversified fraud across multiple small claims")
        print(f"    • May be using advanced schemes (kickbacks, unbundling)")
        print(f"\n  Recommendation:")
        print(f"    • Add network analysis (provider relationships)")
        print(f"    • Include temporal patterns (billing frequency spikes)")
        print(f"    • Analyze procedure code combinations for unbundling")
        print(f"    • Consider anomaly detection for outliers")



8. ERROR ANALYSIS - FALSE NEGATIVES

Total False Negatives: 12
False Negative Rate: 11.88%
These are fraudulent providers that evaded detection!

FALSE NEGATIVE CASE STUDY #1
Provider ID: PRV55010
Fraud Probability: 0.1948 (Model was only 19.5% confident)
True Label: Fraudulent (1)
Predicted Label: Legitimate (0)

--- Key Characteristics ---

  Inpatient_MaxLOS:
    This Provider:       0.00
    Fraud Avg:          23.64
    Legit Avg:           4.53
    → Closer to: LEGITIMATE

  Inpatient_TotalLOS:
    This Provider:       0.00
    Fraud Avg:         252.82
    Legit Avg:          19.47
    → Closer to: LEGITIMATE

  Inpatient_AvgClaimsPerBeneficiary:
    This Provider:       0.00
    Fraud Avg:           0.98
    Legit Avg:           0.35
    → Closer to: LEGITIMATE

  Inpatient_NumBeneficiaries:
    This Provider:       0.00
    Fraud Avg:          38.78
    Legit Avg:           3.28
    → Closer to: LEGITIMATE

  Inpatient_NumClaims:
    This Provider:       0.00
    Fraud Avg:  



# 9. THRESHOLD ANALYSIS


In [14]:

print("\n" + "=" * 80)
print("9. THRESHOLD ANALYSIS")
print("=" * 80)

print("\nAnalyzing different classification thresholds...")

thresholds_to_test = [0.3, 0.4, 0.5, 0.6, 0.7]
threshold_results = []

for threshold in thresholds_to_test:
    y_pred_threshold = (y_pred_proba >= threshold).astype(int)
    
    prec = precision_score(y_test, y_pred_threshold)
    rec = recall_score(y_test, y_pred_threshold)
    f1_thresh = f1_score(y_test, y_pred_threshold)
    
    cm_thresh = confusion_matrix(y_test, y_pred_threshold)
    tn_t, fp_t, fn_t, tp_t = cm_thresh.ravel()
    
    # Calculate net benefit for this threshold
    cost_fp_t = fp_t * (cost_investigation + cost_damaged_reputation)
    cost_fn_t = fn_t * cost_missed_fraud
    cost_tp_t = tp_t * cost_investigation
    benefit_tp_t = tp_t * benefit_caught_fraud
    net_benefit_t = benefit_tp_t - (cost_fp_t + cost_fn_t + cost_tp_t)
    
    threshold_results.append({
        'Threshold': threshold,
        'Precision': prec,
        'Recall': rec,
        'F1-Score': f1_thresh,
        'TP': tp_t,
        'FP': fp_t,
        'FN': fn_t,
        'Net Benefit ($)': net_benefit_t
    })

threshold_df = pd.DataFrame(threshold_results)
print("\n" + threshold_df.to_string(index=False))

print(f"\nCurrent model uses threshold: 0.5")
print(f"Consider adjusting threshold based on business priorities:")
print(f"  • Lower threshold (0.3-0.4): Higher recall, catch more fraud")
print(f"  • Higher threshold (0.6-0.7): Higher precision, fewer false alarms")



9. THRESHOLD ANALYSIS

Analyzing different classification thresholds...

 Threshold  Precision   Recall  F1-Score  TP  FP  FN  Net Benefit ($)
       0.3   0.420601 0.970297  0.586826  98 135   3          8065000
       0.4   0.477157 0.930693  0.630872  94 103   7          7509000
       0.5   0.539394 0.881188  0.669173  89  76  12          6723000
       0.6   0.602837 0.841584  0.702479  85  56  16          6083000
       0.7   0.655172 0.752475  0.700461  76  40  25          4440000

Current model uses threshold: 0.5
Consider adjusting threshold based on business priorities:
  • Lower threshold (0.3-0.4): Higher recall, catch more fraud
  • Higher threshold (0.6-0.7): Higher precision, fewer false alarms



# 10. SAVE COMPREHENSIVE RESULTS


In [15]:

print("\n10. SAVING COMPREHENSIVE RESULTS...")

# Save final report
report = {
    'Model': type(model).__name__,
    'Test_Samples': len(y_test),
    'Fraud_Cases': int(y_test.sum()),
    'Legitimate_Cases': int(len(y_test) - y_test.sum()),
    'Precision': precision,
    'Recall': recall,
    'F1_Score': f1,
    'ROC_AUC': roc_auc,
    'PR_AUC': pr_auc,
    'True_Positives': int(tp),
    'False_Positives': int(fp),
    'True_Negatives': int(tn),
    'False_Negatives': int(fn),
    'Net_Benefit_USD': int(net_benefit),
    'Investigation_Cost_USD': int(total_investigation_cost),
    'ROI_Percent': roi
}

report_df = pd.DataFrame([report])
report_df.to_csv('../reports/final_test_evaluation.csv', index=False)

# Save detailed predictions
predictions_df = pd.DataFrame({
    'Provider': providers_test.values,
    'True_Label': y_test.values,
    'Predicted_Label': y_pred,
    'Fraud_Probability': y_pred_proba,
    'Correct_Prediction': (y_test == y_pred).astype(int),
    'Error_Type': ['Correct' if y_test.iloc[i] == y_pred[i] 
                   else ('False Positive' if y_pred[i] == 1 
                   else 'False Negative') 
                   for i in range(len(y_test))]
})
predictions_df.to_csv('../reports/test_predictions_detailed.csv', index=False)

# Save threshold analysis
threshold_df.to_csv('../reports/threshold_analysis.csv', index=False)

print("✓ Saved: final_test_evaluation.csv")
print("✓ Saved: test_predictions_detailed.csv")
print("✓ Saved: threshold_analysis.csv")



10. SAVING COMPREHENSIVE RESULTS...
✓ Saved: final_test_evaluation.csv
✓ Saved: test_predictions_detailed.csv
✓ Saved: threshold_analysis.csv




# 11. RECOMMENDATIONS FOR IMPROVEMENT


In [16]:

print("\n" + "=" * 80)
print("11. RECOMMENDATIONS FOR MODEL IMPROVEMENT")
print("=" * 80)

print("\n1. FEATURE ENGINEERING:")
print("   • Network analysis: Provider-provider referral networks")
print("   • Temporal patterns: Sudden changes in billing behavior")
print("   • Geographic features: Regional cost-of-living adjustments")
print("   • Procedure combinations: Detect unbundling patterns")
print("   • Patient trajectory analysis: Track patient movement between providers")

print("\n2. ADVANCED MODELING:")
print("   • Hyperparameter optimization (GridSearchCV, Bayesian)")
print("   • Ensemble multiple models (stacking, voting)")
print("   • Anomaly detection for sophisticated cases")
print("   • Deep learning for complex pattern recognition")
print("   • Time-series models for temporal fraud detection")

print("\n3. DATA ENHANCEMENT:")
print("   • Historical fraud investigation outcomes")
print("   • External data: Provider licenses, complaint history")
print("   • More granular procedure/diagnosis codes")
print("   • Patient satisfaction scores")
print("   • Peer comparison within specialties")

print("\n4. OPERATIONAL IMPROVEMENTS:")
print("   • Implement threshold optimization based on investigation capacity")
print("   • Two-stage model: Quick triage then deep investigation")
print("   • Active learning: Retrain with investigation outcomes")
print("   • Explainable AI: Provide reasons for each flagged case")
print("   • Regular model monitoring and retraining (quarterly)")

print("\n5. BUSINESS INTEGRATION:")
print("   • Risk scoring system (not just binary classification)")
print("   • Prioritization dashboard for investigators")
print("   • Feedback loop from investigation results")
print("   • Integration with existing fraud detection systems")
print("   • A/B testing of different model versions")

print("\n" + "=" * 80)
print("EVALUATION COMPLETE!")
print("=" * 80)

print(f"\n{'=' * 80}")
print("FINAL SUMMARY")
print(f"{'=' * 80}")
print(f"\nModel Performance:")
print(f"  • Precision: {precision:.1%} of flagged providers are actually fraudulent")
print(f"  • Recall: {recall:.1%} of fraudulent providers are successfully detected")
print(f"  • F1-Score: {f1:.4f} (balanced performance metric)")
print(f"\nBusiness Impact:")
print(f"  • Net Benefit: ${net_benefit:,.0f}")
print(f"  • ROI: {roi:.1f}%")
print(f"  • Fraud Cases Caught: {tp}/{y_test.sum()} ({tp/y_test.sum()*100:.1f}%)")
print(f"\nAreas for Improvement:")
print(f"  • {fp} false positives → Need better specificity")
print(f"  • {fn} false negatives → Sophisticated fraud still missed")
print(f"\nConclusion:")
print(f"  The model provides significant value over baseline approaches and")
print(f"  successfully identifies the majority of fraudulent providers. However,")
print(f"  continuous improvement and human oversight remain essential for")
print(f"  optimal fraud detection and prevention.")
print(f"{'=' * 80}")


11. RECOMMENDATIONS FOR MODEL IMPROVEMENT

1. FEATURE ENGINEERING:
   • Network analysis: Provider-provider referral networks
   • Temporal patterns: Sudden changes in billing behavior
   • Geographic features: Regional cost-of-living adjustments
   • Procedure combinations: Detect unbundling patterns
   • Patient trajectory analysis: Track patient movement between providers

2. ADVANCED MODELING:
   • Hyperparameter optimization (GridSearchCV, Bayesian)
   • Ensemble multiple models (stacking, voting)
   • Anomaly detection for sophisticated cases
   • Deep learning for complex pattern recognition
   • Time-series models for temporal fraud detection

3. DATA ENHANCEMENT:
   • Historical fraud investigation outcomes
   • External data: Provider licenses, complaint history
   • More granular procedure/diagnosis codes
   • Patient satisfaction scores
   • Peer comparison within specialties

4. OPERATIONAL IMPROVEMENTS:
   • Implement threshold optimization based on investigation capacit