# 02_modeling.ipynb


In [18]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (classification_report, confusion_matrix, 
                             roc_auc_score, roc_curve, precision_recall_curve,
                             average_precision_score, f1_score, precision_score, recall_score)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
import pickle
import warnings
warnings.filterwarnings('ignore')


In [19]:


print("=" * 80)
print("HEALTHCARE PROVIDER FRAUD DETECTION - MODELING")
print("=" * 80)


HEALTHCARE PROVIDER FRAUD DETECTION - MODELING


# 1. LOAD PROCESSED DATA



In [20]:


print("\n1. LOADING PROCESSED DATA...")

train_data = pd.read_csv('../data/processed_train_provider_data.csv')
print(f"✓ Training data loaded: {train_data.shape}")

# Prepare features and target
X_train_full = train_data.drop(['Provider', 'PotentialFraud'], axis=1)
y_train_full = train_data['PotentialFraud'].map({'Yes': 1, 'No': 0})

print(f"✓ Features: {X_train_full.shape[1]}")
print(f"✓ Training samples: {X_train_full.shape[0]}")
print(f"✓ Class distribution:\n{y_train_full.value_counts()}")
print(f"✓ Fraud percentage: {y_train_full.sum()/len(y_train_full)*100:.2f}%")



1. LOADING PROCESSED DATA...
✓ Training data loaded: (4328, 53)
✓ Features: 51
✓ Training samples: 4328
✓ Class distribution:
PotentialFraud
0    3923
1     405
Name: count, dtype: int64
✓ Fraud percentage: 9.36%




# 2. TRAIN-VALIDATION SPLIT


In [21]:

print("\n2. SPLITTING DATA INTO TRAIN AND VALIDATION...")

# Split training data into train and validation (80/20)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full
)

print(f"✓ Training set: {X_train.shape} - Fraud: {y_train.sum()}/{len(y_train)} ({y_train.sum()/len(y_train)*100:.2f}%)")
print(f"✓ Validation set: {X_val.shape} - Fraud: {y_val.sum()}/{len(y_val)} ({y_val.sum()/len(y_val)*100:.2f}%)")



2. SPLITTING DATA INTO TRAIN AND VALIDATION...
✓ Training set: (3462, 51) - Fraud: 324/3462 (9.36%)
✓ Validation set: (866, 51) - Fraud: 81/866 (9.35%)




# 3. FEATURE SCALING


In [22]:

print("\n3. FEATURE SCALING...")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

print("✓ Features scaled using StandardScaler")



3. FEATURE SCALING...
✓ Features scaled using StandardScaler




# 4. HANDLING CLASS IMBALANCE


In [23]:

print("\n4. HANDLING CLASS IMBALANCE...")

print(f"\nOriginal distribution: {y_train.value_counts().to_dict()}")

print("\n--- Testing Multiple Strategies ---")

# Strategy 1: SMOTE
print("\n1. SMOTE (Synthetic Minority Oversampling):")
smote = SMOTE(random_state=42, k_neighbors=5)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
print(f"   After SMOTE: {y_train_smote.value_counts().to_dict()}")

# Strategy 2: Random Undersampling
print("\n2. Random Undersampling:")
undersample = RandomUnderSampler(random_state=42, sampling_strategy=0.5)
X_train_under, y_train_under = undersample.fit_resample(X_train_scaled, y_train)
print(f"   After Undersampling: {y_train_under.value_counts().to_dict()}")

# Strategy 3: SMOTETomek
print("\n3. SMOTETomek (Combined):")
smotetomek = SMOTETomek(random_state=42)
X_train_combined, y_train_combined = smotetomek.fit_resample(X_train_scaled, y_train)
print(f"   After SMOTETomek: {y_train_combined.value_counts().to_dict()}")

# We'll use SMOTE as our primary strategy
X_train_resampled = X_train_smote
y_train_resampled = y_train_smote

print(f"\n✓ Using SMOTE for training")



4. HANDLING CLASS IMBALANCE...

Original distribution: {0: 3138, 1: 324}

--- Testing Multiple Strategies ---

1. SMOTE (Synthetic Minority Oversampling):
   After SMOTE: {0: 3138, 1: 3138}

2. Random Undersampling:
   After Undersampling: {0: 648, 1: 324}

3. SMOTETomek (Combined):
   After SMOTETomek: {0: 3134, 1: 3134}

✓ Using SMOTE for training
   After SMOTETomek: {0: 3134, 1: 3134}

✓ Using SMOTE for training




# 5. MODEL TRAINING


In [24]:

print("\n5. TRAINING MODELS...")

models = {
    'Logistic Regression': LogisticRegression(
        random_state=42, 
        max_iter=1000,
        class_weight='balanced',
        C=0.1
    ),
    'Decision Tree': DecisionTreeClassifier(
        max_depth=10,
        min_samples_split=20,
        min_samples_leaf=10,
        random_state=42,
        class_weight='balanced'
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=20,
        min_samples_leaf=10,
        random_state=42,
        n_jobs=-1,
        class_weight='balanced'
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        min_samples_split=20,
        min_samples_leaf=10,
        random_state=42,
        subsample=0.8
    )
}

trained_models = {}
results = []

for name, model in models.items():
    print(f"\n--- Training {name} ---")
    
    # Train on resampled data
    model.fit(X_train_resampled, y_train_resampled)
    trained_models[name] = model
    
    # Predictions on validation set
    y_val_pred = model.predict(X_val_scaled)
    y_val_proba = model.predict_proba(X_val_scaled)[:, 1]
    
    # Calculate metrics
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred)
    roc_auc = roc_auc_score(y_val, y_val_proba)
    pr_auc = average_precision_score(y_val, y_val_proba)
    
    results.append({
        'Model': name,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc,
        'PR-AUC': pr_auc
    })
    
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print(f"PR-AUC: {pr_auc:.4f}")



5. TRAINING MODELS...

--- Training Logistic Regression ---
Precision: 0.4422
Recall: 0.8025
F1-Score: 0.5702
ROC-AUC: 0.9261
PR-AUC: 0.6567

--- Training Decision Tree ---


Precision: 0.4621
Recall: 0.7531
F1-Score: 0.5728
ROC-AUC: 0.8641
PR-AUC: 0.4315

--- Training Random Forest ---
Precision: 0.4552
Recall: 0.8148
F1-Score: 0.5841
ROC-AUC: 0.9237
PR-AUC: 0.6090

--- Training Gradient Boosting ---
Precision: 0.4552
Recall: 0.8148
F1-Score: 0.5841
ROC-AUC: 0.9237
PR-AUC: 0.6090

--- Training Gradient Boosting ---
Precision: 0.5000
Recall: 0.6914
F1-Score: 0.5803
ROC-AUC: 0.9304
PR-AUC: 0.6553
Precision: 0.5000
Recall: 0.6914
F1-Score: 0.5803
ROC-AUC: 0.9304
PR-AUC: 0.6553




# 6. MODEL COMPARISON


In [25]:

print("\n6. MODEL COMPARISON...")

results_df = pd.DataFrame(results)
print("\n" + "=" * 80)
print(results_df.to_string(index=False))
print("=" * 80)

# Visualize comparison
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot of metrics
metrics = ['Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'PR-AUC']
x = np.arange(len(results_df))
width = 0.15

colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
for i, metric in enumerate(metrics):
    axes[0].bar(x + i*width, results_df[metric], width, label=metric, color=colors[i])

axes[0].set_xlabel('Model', fontweight='bold')
axes[0].set_ylabel('Score', fontweight='bold')
axes[0].set_title('Model Performance Comparison', fontweight='bold', fontsize=12)
axes[0].set_xticks(x + width * 2)
axes[0].set_xticklabels(results_df['Model'], rotation=45, ha='right')
axes[0].legend(loc='lower right')
axes[0].grid(True, alpha=0.3, axis='y')
axes[0].set_ylim(0, 1)

# Heatmap
metrics_data = results_df[['Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'PR-AUC']].values
sns.heatmap(metrics_data.T, annot=True, fmt='.3f', cmap='YlGnBu',
            xticklabels=results_df['Model'], yticklabels=metrics,
            ax=axes[1], cbar_kws={'label': 'Score'}, vmin=0, vmax=1)
axes[1].set_title('Performance Heatmap', fontweight='bold', fontsize=12)

plt.tight_layout()
plt.savefig('../reports/figures/model_comparison.png', dpi=300, bbox_inches='tight')
print("\n✓ Saved: model_comparison.png")
plt.close()



6. MODEL COMPARISON...

              Model  Precision   Recall  F1-Score  ROC-AUC   PR-AUC
Logistic Regression   0.442177 0.802469  0.570175 0.926052 0.656701
      Decision Tree   0.462121 0.753086  0.572770 0.864064 0.431520
      Random Forest   0.455172 0.814815  0.584071 0.923740 0.608976
  Gradient Boosting   0.500000 0.691358  0.580311 0.930377 0.655336



✓ Saved: model_comparison.png




# 7. ROC AND PRECISION-RECALL CURVES


In [26]:

print("\n7. GENERATING ROC AND PR CURVES...")

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

# ROC Curves
for idx, (name, model) in enumerate(trained_models.items()):
    y_val_proba = model.predict_proba(X_val_scaled)[:, 1]
    fpr, tpr, _ = roc_curve(y_val, y_val_proba)
    auc = roc_auc_score(y_val, y_val_proba)
    axes[0].plot(fpr, tpr, label=f'{name} (AUC = {auc:.3f})', 
                linewidth=2.5, color=colors[idx])

axes[0].plot([0, 1], [0, 1], 'k--', label='Random (AUC = 0.500)', linewidth=1.5)
axes[0].set_xlabel('False Positive Rate', fontweight='bold', fontsize=11)
axes[0].set_ylabel('True Positive Rate', fontweight='bold', fontsize=11)
axes[0].set_title('ROC Curves - Validation Set', fontweight='bold', fontsize=12)
axes[0].legend(loc='lower right', fontsize=9)
axes[0].grid(True, alpha=0.3)

# Precision-Recall Curves
for idx, (name, model) in enumerate(trained_models.items()):
    y_val_proba = model.predict_proba(X_val_scaled)[:, 1]
    precision_curve, recall_curve, _ = precision_recall_curve(y_val, y_val_proba)
    pr_auc = average_precision_score(y_val, y_val_proba)
    axes[1].plot(recall_curve, precision_curve, 
                label=f'{name} (AP = {pr_auc:.3f})', 
                linewidth=2.5, color=colors[idx])

# Baseline (random classifier)
baseline = y_val.sum() / len(y_val)
axes[1].axhline(y=baseline, color='k', linestyle='--', 
               label=f'Random (AP = {baseline:.3f})', linewidth=1.5)

axes[1].set_xlabel('Recall', fontweight='bold', fontsize=11)
axes[1].set_ylabel('Precision', fontweight='bold', fontsize=11)
axes[1].set_title('Precision-Recall Curves - Validation Set', fontweight='bold', fontsize=12)
axes[1].legend(loc='upper right', fontsize=9)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../reports/figures/roc_pr_curves.png', dpi=300, bbox_inches='tight')
print("✓ Saved: roc_pr_curves.png")
plt.close()



7. GENERATING ROC AND PR CURVES...
✓ Saved: roc_pr_curves.png
✓ Saved: roc_pr_curves.png




# 8. CONFUSION MATRICES FOR ALL MODELS


In [27]:

print("\n8. GENERATING CONFUSION MATRICES...")

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

for idx, (name, model) in enumerate(trained_models.items()):
    y_val_pred = model.predict(X_val_scaled)
    cm = confusion_matrix(y_val, y_val_pred)
    
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[idx],
                xticklabels=['Legitimate', 'Fraudulent'],
                yticklabels=['Legitimate', 'Fraudulent'])
    axes[idx].set_ylabel('True Label', fontweight='bold')
    axes[idx].set_xlabel('Predicted Label', fontweight='bold')
    axes[idx].set_title(f'{name}', fontweight='bold', fontsize=11)

plt.tight_layout()
plt.savefig('../reports/figures/confusion_matrices_all.png', dpi=300, bbox_inches='tight')
print("✓ Saved: confusion_matrices_all.png")
plt.close()



8. GENERATING CONFUSION MATRICES...
✓ Saved: confusion_matrices_all.png
✓ Saved: confusion_matrices_all.png




# 9. FEATURE IMPORTANCE ANALYSIS


In [28]:

print("\n9. ANALYZING FEATURE IMPORTANCE...")

# Use Random Forest for feature importance
rf_model = trained_models['Random Forest']
feature_names = X_train.columns

feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\n" + "=" * 80)
print("TOP 20 MOST IMPORTANT FEATURES (Random Forest)")
print("=" * 80)
print(feature_importance.head(20).to_string(index=False))

# Visualize top 15
plt.figure(figsize=(10, 8))
top_features = feature_importance.head(15)
colors_bars = plt.cm.viridis(np.linspace(0.3, 0.9, len(top_features)))
plt.barh(range(len(top_features)), top_features['Importance'], color=colors_bars)
plt.yticks(range(len(top_features)), top_features['Feature'])
plt.xlabel('Importance', fontweight='bold', fontsize=11)
plt.ylabel('Feature', fontweight='bold', fontsize=11)
plt.title('Top 15 Feature Importances (Random Forest)', fontweight='bold', fontsize=12)
plt.gca().invert_yaxis()
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig('../reports/figures/feature_importance.png', dpi=300, bbox_inches='tight')
print("\n✓ Saved: feature_importance.png")
plt.close()



9. ANALYZING FEATURE IMPORTANCE...

TOP 20 MOST IMPORTANT FEATURES (Random Forest)
                          Feature  Importance
                 Inpatient_MaxLOS    0.122580
               Inpatient_TotalLOS    0.088234
Inpatient_AvgClaimsPerBeneficiary    0.058011
       Inpatient_NumBeneficiaries    0.057489
              Inpatient_NumClaims    0.048593
       Inpatient_UniqueProcedures    0.044748
        Inpatient_TotalReimbursed    0.042849
          Inpatient_MaxReimbursed    0.040878
        Inpatient_TotalDeductible    0.032402
   ChronicConditionsCount_sum_Out    0.031007
          HasRenalDisease_sum_Out    0.030123
   ChronicConditionsCount_sum_Inp    0.028740
       Outpatient_UniqueDiagnoses    0.027094
       Inpatient_AvgClaimDuration    0.026930
   ChronicConditionsCount_max_Inp    0.026519
       Outpatient_TotalDeductible    0.023699
          Inpatient_StdReimbursed    0.020345
       Outpatient_TotalReimbursed    0.019091
          HasRenalDisease_sum_Inp    0.018



# 10. SELECT AND SAVE BEST MODEL


In [29]:

print("\n10. SELECTING BEST MODEL...")

# Select model with best F1-score (balanced between precision and recall)
best_idx = results_df['F1-Score'].idxmax()
best_model_name = results_df.loc[best_idx, 'Model']
best_model = trained_models[best_model_name]

print(f"\n{'=' * 80}")
print(f"BEST MODEL: {best_model_name}")
print(f"{'=' * 80}")
print(f"Validation Performance:")
for metric in ['Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'PR-AUC']:
    print(f"  {metric}: {results_df.loc[best_idx, metric]:.4f}")

# Save model and scaler
with open('../models/best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

with open('../models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("\n✓ Best model saved: models/best_model.pkl")
print("✓ Scaler saved: models/scaler.pkl")

# Save all trained models for comparison
with open('../models/all_models.pkl', 'wb') as f:
    pickle.dump(trained_models, f)
print("✓ All models saved: models/all_models.pkl")

# Save results
results_df.to_csv('../reports/model_comparison_results.csv', index=False)
feature_importance.to_csv('../reports/feature_importance.csv', index=False)

print("✓ Saved: model_comparison_results.csv")
print("✓ Saved: feature_importance.csv")



10. SELECTING BEST MODEL...

BEST MODEL: Random Forest
Validation Performance:
  Precision: 0.4552
  Recall: 0.8148
  F1-Score: 0.5841
  ROC-AUC: 0.9237
  PR-AUC: 0.6090

✓ Best model saved: models/best_model.pkl
✓ Scaler saved: models/scaler.pkl
✓ All models saved: models/all_models.pkl
✓ Saved: model_comparison_results.csv
✓ Saved: feature_importance.csv




# 11. CROSS-VALIDATION FOR BEST MODEL


In [30]:

print("\n11. CROSS-VALIDATION ANALYSIS...")

print(f"\nPerforming 5-fold stratified cross-validation on {best_model_name}...")

# Create a new instance of the best model
from sklearn.base import clone
cv_model = clone(best_model)

# Use cross-validation on the resampled training data
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(cv_model, X_train_resampled, y_train_resampled, 
                            cv=skf, scoring='f1', n_jobs=-1)

print(f"\nCross-Validation F1-Scores: {cv_scores}")
print(f"Mean F1-Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

print("\n" + "=" * 80)
print("MODELING COMPLETE!")
print("=" * 80)
print(f"\nBest Model: {best_model_name}")
print(f"Validation F1-Score: {results_df.loc[best_idx, 'F1-Score']:.4f}")
print(f"Cross-Validation F1-Score: {cv_scores.mean():.4f}")
print(f"\nNext Step: Run 03_evaluation.ipynb for detailed evaluation on test set")


11. CROSS-VALIDATION ANALYSIS...

Performing 5-fold stratified cross-validation on Random Forest...

Cross-Validation F1-Scores: [0.93998441 0.93647059 0.94238358 0.94301327 0.93343774]
Mean F1-Score: 0.9391 (+/- 0.0073)

MODELING COMPLETE!

Best Model: Random Forest
Validation F1-Score: 0.5841
Cross-Validation F1-Score: 0.9391

Next Step: Run 03_evaluation.ipynb for detailed evaluation on test set

Cross-Validation F1-Scores: [0.93998441 0.93647059 0.94238358 0.94301327 0.93343774]
Mean F1-Score: 0.9391 (+/- 0.0073)

MODELING COMPLETE!

Best Model: Random Forest
Validation F1-Score: 0.5841
Cross-Validation F1-Score: 0.9391

Next Step: Run 03_evaluation.ipynb for detailed evaluation on test set
