cell 1

In [22]:
#Imports & load processed data

import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    average_precision_score
)

DATA_DIR = "../data"
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")

train_full = pd.read_csv(os.path.join(PROCESSED_DIR, "train_provider_features.csv"))
test_full  = pd.read_csv(os.path.join(PROCESSED_DIR, "test_provider_features.csv"))

train_full.head()

Unnamed: 0,Provider,inp_claims,inp_total_reimbursed,inp_mean_reimbursed,inp_max_reimbursed,inp_mean_deductible,inp_mean_los,inp_unique_bene,outp_claims,outp_total_reimbursed,...,mean_ChronicCond_Cancer,mean_ChronicCond_ObstrPulmonary,mean_ChronicCond_Depression,mean_ChronicCond_Diabetes,mean_ChronicCond_IschemicHeart,mean_ChronicCond_Osteoporasis,mean_ChronicCond_rheumatoidarthritis,mean_ChronicCond_stroke,PotentialFraud,FraudLabel
0,PRV51001,5.0,97000.0,19400.0,42000.0,1068.0,5.0,5.0,20.0,7640.0,...,1.8,1.6,1.2,1.2,1.2,2.0,1.4,1.6,No,0
1,PRV51003,62.0,573000.0,9241.935484,57000.0,1068.0,5.16129,53.0,70.0,32670.0,...,1.887097,1.629032,1.596774,1.209677,1.112903,1.790323,1.693548,1.887097,Yes,1
2,PRV51004,,,,,,,,149.0,52170.0,...,,,,,,,,,No,0
3,PRV51005,,,,,,,,1165.0,280910.0,...,,,,,,,,,Yes,1
4,PRV51007,3.0,19000.0,6333.333333,10000.0,1068.0,5.333333,3.0,69.0,14710.0,...,2.0,2.0,1.333333,1.0,1.0,2.0,1.666667,1.333333,No,0


## Notebook 02 Goals
- Train multiple models
- Compare Logistic Regression, Random Forest, Gradient Boosting
- Evaluate with PR-AUC and ROC-AUC
- Select best model


cell 2

In [23]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# 1) Choose feature columns (everything except ID + labels)
feature_cols = [c for c in train_full.columns 
                if c not in ["Provider", "PotentialFraud", "FraudLabel"]]

X_train_full = train_full[feature_cols]
y = train_full["FraudLabel"]

X_test_full = test_full[feature_cols].copy()

# 2) Drop columns that are all-NaN in TRAIN or TEST
cols_all_nan_train = X_train_full.columns[X_train_full.isna().all()]
cols_all_nan_test  = X_test_full.columns[X_test_full.isna().all()]

drop_cols = set(cols_all_nan_train) | set(cols_all_nan_test)

print("Dropping all-NaN columns:", drop_cols)

feature_cols = [c for c in feature_cols if c not in drop_cols]
X_train_full = X_train_full[feature_cols]
X_test_full  = X_test_full[feature_cols]

# 3) Impute with medians from TRAIN ONLY
imputer = SimpleImputer(strategy="median")
X_train_imputed = imputer.fit_transform(X_train_full)
X_test_imputed  = imputer.transform(X_test_full)

# 4) Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train_imputed)

# 5) Train/validation split on SCALED data
indices = np.arange(len(y))

X_train, X_val, y_train, y_val, idx_train, idx_val = train_test_split(
    X_scaled,
    y,
    indices,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Save these for Notebook 3
np.save(os.path.join(PROCESSED_DIR, "X_val.npy"), X_val)
np.save(os.path.join(PROCESSED_DIR, "y_val.npy"), y_val)
np.save(os.path.join(PROCESSED_DIR, "val_indices.npy"), idx_val)


Dropping all-NaN columns: set()


['../data/processed/best_model.pkl']

cell 3

In [24]:
#Model training and evaluation
models = {
    "logreg": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "rf": RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        class_weight="balanced"
    ),
    "gb": GradientBoostingClassifier(random_state=42),
}

results = {}

for name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    y_proba = clf.predict_proba(X_val)[:, 1]

    roc = roc_auc_score(y_val, y_proba)
    pr  = average_precision_score(y_val, y_proba)

    print(f"\n=== {name} ===")
    print("ROC-AUC:", roc)
    print("PR-AUC :", pr)
    print(classification_report(y_val, y_pred))
    print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))

    results[name] = {"model": clf, "roc_auc": roc, "pr_auc": pr}

#Select best model and save
best_name = max(results.keys(), key=lambda k: results[k]["pr_auc"])
best_model = results[best_name]["model"]
best_name, results[best_name]["roc_auc"], results[best_name]["pr_auc"]


=== logreg ===
ROC-AUC: 0.9494353105035275
PR-AUC : 0.7437851118802034
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       981
           1       0.47      0.82      0.60       101

    accuracy                           0.90      1082
   macro avg       0.73      0.86      0.77      1082
weighted avg       0.93      0.90      0.91      1082

Confusion matrix:
 [[889  92]
 [ 18  83]]

=== rf ===
ROC-AUC: 0.95795359352449
PR-AUC : 0.7497441240281424
              precision    recall  f1-score   support

           0       0.95      0.98      0.97       981
           1       0.73      0.53      0.62       101

    accuracy                           0.94      1082
   macro avg       0.84      0.76      0.79      1082
weighted avg       0.93      0.94      0.93      1082

Confusion matrix:
 [[961  20]
 [ 47  54]]

=== gb ===
ROC-AUC: 0.9642867956520422
PR-AUC : 0.7789618517950189
              precision    recall  f1-score   support

 

('gb', np.float64(0.9642867956520422), np.float64(0.7789618517950189))

In [None]:
# Model baseline comparison table
roc_scores = {k: v['roc_auc'] for k, v in results.items()}
pr_scores = {k: v['pr_auc'] for k, v in results.items()}

comparison = pd.DataFrame({
    'Model': list(roc_scores.keys()),
    'ROC-AUC': list(roc_scores.values()),
    'PR-AUC': list(pr_scores.values())
})
comparison


In [None]:
# Plot ROC and Precision-Recall curves for the selected best model
from sklearn.metrics import roc_curve, precision_recall_curve
import matplotlib.pyplot as plt

best_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, _ = roc_curve(y_val, best_proba)
prec, rec, _ = precision_recall_curve(y_val, best_proba)

plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, label=f"ROC (AUC = {results[best_name]['roc_auc']:.3f})")
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Validation')
plt.legend()
plt.grid(alpha=0.3)
plt.show()

plt.figure(figsize=(6, 5))
plt.plot(rec, prec, label=f"PR (AUC = {results[best_name]['pr_auc']:.3f})")
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve - Validation')
plt.legend()
plt.grid(alpha=0.3)
plt.show()


### Metric Justification
- Fraud data are highly imbalanced, so precision and recall matter more than accuracy.
- PR-AUC emphasizes performance on the positive (fraud) class, making it more informative here than ROC-AUC.
- We therefore prioritize the model with the highest PR-AUC when selecting the winner.


cell 4

In [None]:
# 1) Retrain best model on ALL training data (using the same imputed & scaled features)

# Recompute scaled FULL train (we already have X_train_imputed & medians & scaler)
X_full_imputed = X_train_imputed  # all providers in train_full
X_full_scaled  = scaler.fit_transform(X_full_imputed)  # fit scaler on full train

best_model.fit(X_full_scaled, y)

# 2) Prepare TEST set: impute with same medians, scale with same scaler
X_test_imputed = X_test_imputed  # already filled with medians earlier
X_test_scaled  = scaler.transform(X_test_imputed)

import numpy as np
print("Any NaN in X_test_scaled?", np.isnan(X_test_scaled).any())

# 3) Predict probabilities
test_proba = best_model.predict_proba(X_test_scaled)[:, 1]

submission = pd.DataFrame({
    "Provider": test_full["Provider"],
    "FraudProbability": test_proba
})

submission.head()

cell 5

### Why Gradient Boosting Was Selected
- Delivers the best PR-AUC among the tested models.
- Captures nonlinear interactions across claim features.
- Sequential boosting reduces residual errors from prior learners.
- Outperforms Random Forest on this imbalanced fraud problem.
- More flexible than Logistic Regression for complex provider behavior.


In [26]:
import joblib
import numpy as np
import os

# Save model
joblib.dump(best_model, "../data/processed/best_model.pkl")

# Save validation data used for evaluation
np.save("../data/processed/X_val.npy", X_val)
np.save("../data/processed/y_val.npy", y_val)

# Save scaler + imputer too if needed
joblib.dump(scaler, "../data/processed/scaler.pkl")
joblib.dump(imputer, "../data/processed/imputer.pkl")

['../data/processed/imputer.pkl']

### Export Artifacts for Notebook 03
Saving the trained model, scaler, and imputer so Notebook 03 can load and evaluate them consistently.


cell 6

In [27]:
# Rebuild feature column list (same as training)
feature_cols = [
    c for c in test_full.columns
    if c not in ["Provider"]     # keep provider ID out of X
]

# Prepare X_test
X_test = test_full[feature_cols].copy()

# Impute missing values (same imputer used on train)
X_test_imputed = imputer.transform(X_test)

# Scale features (same scaler used on train)
X_test_scaled = scaler.transform(X_test_imputed)

# Predict fraud probability
test_proba = best_model.predict_proba(X_test_scaled)[:, 1]

# Create final dataframe
submission = pd.DataFrame({
    "Provider": test_full["Provider"],
    "FraudProbability": test_proba
})

# Save to CSV
output_path = os.path.join(PROCESSED_DIR, "provider_fraud_predictions.csv")
submission.to_csv(output_path, index=False)

print("Saved:", output_path)

Saved: ../data/processed/provider_fraud_predictions.csv




### Notebook 02 Summary
- Purpose: train and compare multiple models for provider fraud detection.
- Comparison table highlights ROC-AUC and PR-AUC across Logistic Regression, Random Forest, and Gradient Boosting.
- Metric choice: PR-AUC prioritized due to class imbalance.
- Gradient Boosting chosen based on superior PR-AUC and ability to model nonlinear patterns.
- Outputs exported (model + preprocessing) for downstream evaluation in Notebook 03.
