# Modeling

## 1. Import Libraries

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib

%matplotlib inline

## 2. Load Processed Data

In [9]:
df = pd.read_csv('/content/processed_provider_data (2).csv')
print("Data Shape:", df.shape)
df.head()

Data Shape: (4956, 25)


Unnamed: 0,Provider,TotalClaims,TotalInpatientClaims,TotalOutpatientClaims,TotalReimbursed,AvgReimbursed,TotalDeductible,AvgDeductible,UniqueBeneficiaries,UniqueAttendingPhysicians,...,Sum_ChronicCond_KidneyDisease,Sum_ChronicCond_Cancer,Sum_ChronicCond_ObstrPulmonary,Sum_ChronicCond_Depression,Sum_ChronicCond_Diabetes,Sum_ChronicCond_IschemicHeart,Sum_ChronicCond_Osteoporasis,Sum_ChronicCond_rheumatoidarthritis,Sum_ChronicCond_stroke,PotentialFraud
0,PRV51001,9,5,4,99120,11013.333333,5340.0,593.333333,9,8,...,7,3,3,5,8,8,1,4,3,0
1,PRV51003,81,62,19,584890,7220.864198,66286.0,818.345679,71,19,...,48,9,27,32,61,70,15,21,8,1
2,PRV51004,33,0,33,8230,249.393939,0.0,0.0,32,16,...,19,6,12,17,29,26,9,12,9,0
3,PRV51005,278,0,278,69880,251.366906,1210.0,4.352518,110,6,...,130,50,62,89,215,228,87,96,22,1
4,PRV51007,17,3,14,20870,1227.647059,3204.0,188.470588,14,6,...,2,4,5,6,10,13,2,8,5,0


## 3. Data Splitting
We will split the data into Training (70%), Validation (15%), and Test (15%) sets.
However, for cross-validation, we often just split into Train/Test and use CV on Train.

In [10]:
X = df.drop(['Provider', 'PotentialFraud'], axis=1)
y = df['PotentialFraud']

# Split into Train (80%) and Test (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

print("Train Shape:", X_train.shape)
print("Test Shape:", X_test.shape)
print("Fraud in Train:", y_train.sum())
print("Fraud in Test:", y_test.sum())

Train Shape: (3964, 23)
Test Shape: (992, 23)
Fraud in Train: 405
Fraud in Test: 101


## 4. Preprocessing Pipeline
Handle missing values and scaling.

In [11]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

## 5. Model Training & Selection
We will test three models:
1. **Logistic Regression** (Baseline)
2. **Random Forest** (Robust, interpretable)
3. **Gradient Boosting** (High performance)

We will use **SMOTE** to handle class imbalance within the pipeline.

In [None]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

results = {}

for name, model in models.items():
    print(f"Training {name}...")
    # Create pipeline with SMOTE
    pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])

    # Train
    pipeline.fit(X_train, y_train)

    # Predict
    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)[:, 1]

    # Evaluate
    roc = roc_auc_score(y_test, y_prob)
    pr_auc = average_precision_score(y_test, y_prob)

    results[name] = {
        'model': pipeline,
        'ROC_AUC': roc,
        'PR_AUC': pr_auc,
        'Report': classification_report(y_test, y_pred)
    }

    print(f"{name} Results:")
    print(f"ROC AUC: {roc:.4f}")
    print(f"PR AUC: {pr_auc:.4f}")
    print("-"*30)

Training Logistic Regression...
Logistic Regression Results:
ROC AUC: 0.9253
PR AUC: 0.6662
------------------------------
Training Random Forest...
Random Forest Results:
ROC AUC: 0.8987
PR AUC: 0.5943
------------------------------
Training Gradient Boosting...
Gradient Boosting Results:
ROC AUC: 0.9073
PR AUC: 0.6379
------------------------------


## 6. Hyperparameter Tuning (Random Forest)
We'll tune the Random Forest model as it's often a good balance of performance and interpretability.

In [None]:
rf_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
])

param_dist = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

search = RandomizedSearchCV(rf_pipeline, param_distributions=param_dist, n_iter=10, scoring='f1', cv=3, random_state=42, n_jobs=-1)
search.fit(X_train, y_train)

print("Best Params:", search.best_params_)
best_model = search.best_estimator_

Best Params: {'classifier__n_estimators': 300, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 2, 'classifier__max_depth': None}


## 7. Save Best Model and Predictions

In [None]:
import os

# Create directories if they don't exist
os.makedirs('../models', exist_ok=True)
os.makedirs('/content/Downloads', exist_ok=True)

joblib.dump(best_model, '../models/best_model.pkl')

# Save test predictions for evaluation notebook
test_preds = pd.DataFrame({
    'y_true': y_test,
    'y_pred': best_model.predict(X_test),
    'y_prob': best_model.predict_proba(X_test)[:, 1]
})
test_preds.to_csv('/content/Downloads/test_predictions.csv', index=False)
print("Saved model and predictions.")

Saved model and predictions.
