# Modeling

## 1. Import Libraries

In [2]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC

%matplotlib inline

## 2. Load Processed Data

In [6]:
# Location of the processed provider-level dataset.
# The default is the original author's absolute path; on any other machine set
# the PROCESSED_DATA_PATH environment variable to the CSV's location instead of
# editing this cell.
DATA_PATH = os.environ.get(
    'PROCESSED_DATA_PATH',
    'C:\\Users\\abdel\\OneDrive\\Desktop\\New folder\\HealthCare-Provider-Fraud-Detection-Project\\output\\processed_provider_data.csv',
)

df = pd.read_csv(DATA_PATH)
print("Data Shape:", df.shape)
df.head()

Data Shape: (5410, 25)


Unnamed: 0,Provider,TotalClaims,TotalInpatientClaims,TotalOutpatientClaims,TotalReimbursed,AvgReimbursed,TotalDeductible,AvgDeductible,UniqueBeneficiaries,UniqueAttendingPhysicians,...,Sum_ChronicCond_KidneyDisease,Sum_ChronicCond_Cancer,Sum_ChronicCond_ObstrPulmonary,Sum_ChronicCond_Depression,Sum_ChronicCond_Diabetes,Sum_ChronicCond_IschemicHeart,Sum_ChronicCond_Osteoporasis,Sum_ChronicCond_rheumatoidarthritis,Sum_ChronicCond_stroke,PotentialFraud
0,PRV51001,25,5,20,104640,4185.6,5340.0,213.6,24,14,...,17,5,10,9,21,23,6,8,6,0
1,PRV51003,132,62,70,605670,4588.409091,66286.0,502.166667,117,44,...,64,10,41,54,100,112,33,38,12,1
2,PRV51004,149,0,149,52170,350.134228,310.0,2.080537,138,38,...,50,16,41,63,105,108,49,46,17,0
3,PRV51005,1165,0,1165,280910,241.124464,3700.0,3.175966,495,6,...,507,165,295,485,799,895,344,331,124,1
4,PRV51007,72,3,69,33710,468.194444,3264.0,45.333333,58,10,...,22,12,16,29,49,51,21,22,12,0


## 3. Data Splitting
We hold out a stratified 20% of the data as the Test set and keep the remaining 80% for Training.
No separate Validation set is carved out; hyperparameter tuning uses cross-validation on the Training split instead.

In [7]:
# The target is the PotentialFraud column; every other column except the
# Provider identifier is kept as a model input.
y = df['PotentialFraud']
X = df.drop(columns=['Provider', 'PotentialFraud'])

# Stratified hold-out: 20% of the rows go to the test set, 80% stay for
# training, with the fraud ratio preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Quick sanity report: split sizes and the number of positive labels in each.
for label, value in (
    ("Train Shape:", X_train.shape),
    ("Test Shape:", X_test.shape),
    ("Fraud in Train:", y_train.sum()),
    ("Fraud in Test:", y_test.sum()),
):
    print(label, value)

Train Shape: (4328, 23)
Test Shape: (1082, 23)
Fraud in Train: 405
Fraud in Test: 101


## 4. Preprocessing Pipeline
Impute missing values, standardize the numeric features, and one-hot encode any categorical features.

In [8]:
# Column groups are derived from the dtypes of the feature frame. Columns whose
# dtype is neither int64/float64 nor object are not routed to either branch.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot
# encode. Categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

## 5. Model Training & Selection
We will test three models:
1. **Logistic Regression** (Baseline)
2. **Random Forest** (Robust, interpretable)
3. **Gradient Boosting** (High performance)

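We will use **SMOTE** inside the pipeline to handle class imbalance, so that oversampling is applied only while fitting on the training data and never to the data being scored.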

In [11]:
# Candidate classifiers, keyed by the display name used in the printed log and
# as the key of `results`.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# name -> {'model': fitted pipeline, 'ROC_AUC': float, 'PR_AUC': float,
#          'Report': text classification report}
results = {}

for name, model in models.items():
    print(f"Training {name}...")
    # Create pipeline with SMOTE.
    # The preprocessor is cloned so that every stored pipeline owns its own
    # fitted transformer. Pipeline.fit fits its steps in place, so passing the
    # single notebook-level `preprocessor` object to each pipeline would leave
    # all pipelines kept in `results` sharing one mutable instance.
    pipeline = ImbPipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])

    # Train
    pipeline.fit(X_train, y_train)

    # Predict: hard labels for the report, column 1 of predict_proba as the
    # score for the ranking metrics.
    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)[:, 1]

    # Evaluate
    # NOTE(review): these metrics are computed on the held-out test split.
    # Choosing a model based on them would make any later score reported on
    # the same split optimistic; cross-validation on the training split avoids that.
    roc = roc_auc_score(y_test, y_prob)
    pr_auc = average_precision_score(y_test, y_prob)

    results[name] = {
        'model': pipeline,
        'ROC_AUC': roc,
        'PR_AUC': pr_auc,
        'Report': classification_report(y_test, y_pred)
    }

    print(f"{name} Results:")
    print(f"ROC AUC: {roc:.4f}")
    print(f"PR AUC: {pr_auc:.4f}")
    print("-"*30)

Training Logistic Regression...
Logistic Regression Results:
ROC AUC: 0.9554
PR AUC: 0.7570
------------------------------
Training Random Forest...
Random Forest Results:
ROC AUC: 0.9429
PR AUC: 0.6962
------------------------------
Training Gradient Boosting...
Gradient Boosting Results:
ROC AUC: 0.9410
PR AUC: 0.7290
------------------------------


## 6. Hyperparameter Tuning (Random Forest)
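We'll tune the Random Forest model, as it often offers a good balance of performance and interpretability. Note that the untuned Logistic Regression baseline scored higher on both ROC AUC and PR AUC above, so the tuned forest should be compared against it before a final model is chosen.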

In [12]:
# Preprocessing and SMOTE front end, with a class-weighted Random Forest as the
# final step.
rf_pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42)),
])

# Candidate values for the forest's hyperparameters. The `classifier__` prefix
# addresses the pipeline step named 'classifier'.
param_dist = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[None, 10, 20, 30],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
)

# Randomized search: 10 sampled configurations, 3-fold CV on the training
# split, ranked by F1, using all available cores.
search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=param_dist,
    n_iter=10,
    scoring='f1',
    cv=3,
    random_state=42,
    n_jobs=-1,
)
search.fit(X_train, y_train)

# Keep the best pipeline found by the search for the steps that follow.
best_model = search.best_estimator_
print("Best Params:", search.best_params_)

Best Params: {'classifier__n_estimators': 300, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 2, 'classifier__max_depth': 30}


## 7. Save Best Model and Predictions

In [14]:
# `os` is imported in the notebook's import cell, so that every dependency is
# declared in one place at the top.

# Create the output directory if it doesn't exist
os.makedirs('output', exist_ok=True)

# Persist the tuned pipeline so it can be reloaded without retraining.
joblib.dump(best_model, 'output/best_model.pkl')

# Save test predictions for evaluation notebook:
#   y_true - labels of the held-out test split
#   y_pred - hard class predictions from the tuned pipeline
#   y_prob - column 1 of predict_proba (the positive class for 0/1 labels)
test_preds = pd.DataFrame({
    'y_true': y_test,
    'y_pred': best_model.predict(X_test),
    'y_prob': best_model.predict_proba(X_test)[:, 1]
})
# index=False: the row index carried over from y_test is not written to the CSV.
test_preds.to_csv('output/test_predictions.csv', index=False)
print("Saved model and predictions.")

Saved model and predictions.
