# In [None]:

# # Healthcare Fraud Detection - Modeling
# 
# ## 2.1 Data Preparation for Modeling

# %%
# Load the preprocessed data
# Assuming final_df is available from previous notebook
def prepare_modeling_data(df):
    """Build the feature matrix and binary target used for modeling.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Provider-level data with a ``PotentialFraud`` column holding the
        labels ``'Yes'`` / ``'No'``. When ``df`` is ``None`` or empty, a
        synthetic 1000-row demonstration frame is generated instead.

    Returns
    -------
    X : pandas.DataFrame
        Every column except ``Provider``, ``PotentialFraud`` and ``Fraud``.
    y : pandas.Series
        The encoded target: 1 for ``'Yes'``, 0 for ``'No'``.
    df : pandas.DataFrame
        A copy of the input (or the synthetic frame) with the ``Fraud``
        column added. The caller's frame is never modified.

    Raises
    ------
    ValueError
        If ``PotentialFraud`` contains anything other than ``'Yes'``/``'No'``
        (including missing values), since that would yield NaN targets.
    """
    if df is None or df.empty:
        # Create sample data for demonstration
        np.random.seed(42)
        n_samples = 1000
        df = pd.DataFrame({
            'Provider': [f'PRV{i:05d}' for i in range(n_samples)],
            'Total_Claims': np.random.poisson(100, n_samples),
            'Total_Amount_Reimbursed': np.random.exponential(50000, n_samples),
            'Avg_Inpatient_Claim_Amount': np.random.normal(1000, 300, n_samples),
            'Inpatient_Claim_Ratio': np.random.beta(2, 5, n_samples),
            'Unique_AttendingPhysicians_Inpatient': np.random.poisson(5, n_samples),
            'PotentialFraud': np.random.choice(['Yes', 'No'], n_samples, p=[0.1, 0.9])
        })
    else:
        # Work on a copy so adding the target column does not leak into the
        # frame owned by the caller.
        df = df.copy()

    # Encode target variable; unmapped labels come back as NaN from .map().
    fraud = df['PotentialFraud'].map({'Yes': 1, 'No': 0})
    unmapped = fraud.isna()
    if unmapped.any():
        bad_labels = sorted(df.loc[unmapped, 'PotentialFraud'].astype(str).unique())
        raise ValueError(
            f"PotentialFraud must be 'Yes' or 'No'; found unexpected values: {bad_labels}"
        )
    df['Fraud'] = fraud

    # Select features for modeling: everything that is not an identifier or
    # a form of the label.
    excluded = {'Provider', 'PotentialFraud', 'Fraud'}
    feature_columns = [col for col in df.columns if col not in excluded]
    X = df[feature_columns]
    y = df['Fraud']

    return X, y, df

# %%
# Build the feature matrix, the encoded target and the frame they came from.
# `final_df` is not defined in this section; it is expected to already exist.
X, y, modeling_df = prepare_modeling_data(final_df)

print("Features shape:", X.shape)
print("Target distribution:")
print(y.value_counts())
# y is encoded as 0/1, so its mean is the share of fraud-labelled rows.
print(f"Fraud rate: {y.mean():.3f}")

# %%
# Handle missing values
# Every missing feature value is replaced by the constant 0.
X = X.fillna(0)

# Split the data
# First hold out 20% of all rows as the test set, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Then carve 25% of the remaining 80% out as the validation set, which gives
# an overall 60/20/20 train/validation/test split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")


# ## 2.2 Class Imbalance Handling

# %%
def handle_class_imbalance(X_train, y_train, method='class_weight', random_state=42):
    """Handle class imbalance using one of several strategies.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    method : {'class_weight', 'smote', 'undersampling'}
        ``'class_weight'`` leaves the data untouched and returns per-class
        weights; ``'smote'`` oversamples with imblearn's ``SMOTE``;
        ``'undersampling'`` uses imblearn's ``RandomUnderSampler``.
    random_state : int, default 42
        Seed handed to the resampler (unused for ``'class_weight'``).

    Returns
    -------
    tuple
        ``(class_weight, X, y)``. ``class_weight`` is a ``{class: weight}``
        dict for ``'class_weight'`` and ``None`` for the resampling methods;
        ``X`` / ``y`` are the original or resampled training data.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported strategies. Without this
        check the function would fall through and return ``None``, which
        fails later when the caller unpacks the result.
    """
    supported = ('class_weight', 'smote', 'undersampling')
    if method not in supported:
        raise ValueError(
            f"Unknown imbalance handling method {method!r}; expected one of {supported}"
        )

    if method == 'class_weight':
        # Calculate class weights
        classes = np.unique(y_train)
        weights = compute_class_weight('balanced', classes=classes, y=y_train)
        class_weight = dict(zip(classes, weights))
        return class_weight, X_train, y_train

    # imblearn is imported lazily so it is only required when resampling
    # is actually requested.
    if method == 'smote':
        from imblearn.over_sampling import SMOTE
        sampler = SMOTE(random_state=random_state)
    else:
        from imblearn.under_sampling import RandomUnderSampler
        sampler = RandomUnderSampler(random_state=random_state)

    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    return None, X_resampled, y_resampled

# %%
# Apply class imbalance handling
# With method='class_weight' the helper returns X_train / y_train unchanged
# together with a {class: weight} dict, so the two distributions printed
# below are identical; only the resampling methods change the counts.
class_weight, X_train_balanced, y_train_balanced = handle_class_imbalance(X_train, y_train, method='class_weight')

# np.bincount reports the number of samples per integer class label.
print("Original training distribution:", np.bincount(y_train))
print("Balanced training distribution:", np.bincount(y_train_balanced))

# ## 2.3 Model Training

# %%
def train_models(X_train, y_train, class_weight=None):
    """Train multiple models for comparison.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    class_weight : dict, str or None, default None
        Class weighting passed to the estimators that accept it. Gradient
        Boosting has no ``class_weight`` parameter, so the same weighting is
        applied there through per-sample weights at fit time; otherwise it
        would be the only model trained on the unweighted data.

    Returns
    -------
    dict
        Fitted estimators keyed by ``'Logistic Regression'``,
        ``'Random Forest'`` and ``'Gradient Boosting'``.
    """
    models = {}

    # 1. Logistic Regression
    lr = LogisticRegression(
        class_weight=class_weight,
        random_state=42,
        max_iter=1000
    )
    lr.fit(X_train, y_train)
    models['Logistic Regression'] = lr

    # 2. Random Forest
    rf = RandomForestClassifier(
        n_estimators=100,
        class_weight=class_weight,
        random_state=42,
        max_depth=10
    )
    rf.fit(X_train, y_train)
    models['Random Forest'] = rf

    # 3. Gradient Boosting
    gb = GradientBoostingClassifier(
        n_estimators=100,
        random_state=42,
        max_depth=6
    )
    if class_weight is None:
        gb.fit(X_train, y_train)
    else:
        # Translate the class weighting into one weight per training row so
        # all three models see the same imbalance correction.
        from sklearn.utils.class_weight import compute_sample_weight
        sample_weight = compute_sample_weight(class_weight, y_train)
        gb.fit(X_train, y_train, sample_weight=sample_weight)
    models['Gradient Boosting'] = gb

    return models

# %%
# Train all models
# Fit every candidate on the (possibly resampled) training data; the result
# is a dict of fitted estimators keyed by display name.
models = train_models(X_train_balanced, y_train_balanced, class_weight)

print("Models trained successfully:")
# Only the names are listed here; the estimators themselves are used later.
for name, model in models.items():
    print(f"- {name}")


# ## 2.4 Model Validation

# %%
def evaluate_model(model, X, y, model_name):
    """Score a fitted classifier on ``(X, y)`` and print a short report.

    Hard predictions come from ``model.predict`` and the positive-class
    score from column 1 of ``model.predict_proba``. Precision, recall and F1
    are computed from the hard predictions; ROC-AUC and PR-AUC (average
    precision) from the scores.

    Returns a dict with the five metrics plus the raw predictions under
    ``'y_pred'`` and ``'y_pred_proba'``.
    """
    hard_labels = model.predict(X)
    positive_scores = model.predict_proba(X)[:, 1]

    from sklearn.metrics import (
        precision_score,
        recall_score,
        f1_score,
        roc_auc_score,
        average_precision_score,
    )

    # Threshold-based metrics use the hard labels ...
    precision, recall, f1 = (
        scorer(y, hard_labels)
        for scorer in (precision_score, recall_score, f1_score)
    )
    # ... while ranking metrics use the positive-class scores.
    roc_auc, pr_auc = (
        scorer(y, positive_scores)
        for scorer in (roc_auc_score, average_precision_score)
    )

    print(f"\n{model_name} Performance:")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print(f"PR-AUC: {pr_auc:.4f}")

    return dict(
        precision=precision,
        recall=recall,
        f1=f1,
        roc_auc=roc_auc,
        pr_auc=pr_auc,
        y_pred=hard_labels,
        y_pred_proba=positive_scores,
    )

# %%
# Evaluate on validation set
print("=== VALIDATION SET PERFORMANCE ===")
# One evaluation per fitted model, in the order the models were trained;
# each call prints its own report and returns that model's metrics dict.
validation_results = {
    model_label: evaluate_model(fitted, X_val, y_val, model_label)
    for model_label, fitted in models.items()
}


# ## 2.5 Model Comparison

# %%
def compare_models(results_dict):
    """Tabulate the scalar metrics of several models for comparison.

    Parameters
    ----------
    results_dict : dict
        Maps a model name to its result dict. Each result dict must hold the
        keys ``precision``, ``recall``, ``f1``, ``roc_auc`` and ``pr_auc``;
        any other entries (such as prediction arrays) are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per model and one float column per metric, in the order
        listed above.

    Notes
    -----
    The frame is built from the metric values only. Building it from the
    full result dicts mixes floats with arrays, which makes every column
    ``object`` dtype and breaks numeric operations such as plotting or
    ``idxmax`` on the result.
    """
    metric_names = ['precision', 'recall', 'f1', 'roc_auc', 'pr_auc']
    rows = [
        [result[metric] for metric in metric_names]
        for result in results_dict.values()
    ]
    comparison_df = pd.DataFrame(
        rows,
        index=list(results_dict),
        columns=metric_names,
        dtype=float,
    )
    return comparison_df

# %%
# Model comparison
model_comparison = compare_models(validation_results)
print("\n=== MODEL COMPARISON ===")
print(model_comparison)

# Visual comparison
# DataFrame.plot opens its own figure (sized by `figsize`) when no `ax` is
# passed, so no separate plt.figure() call is made here; one would only leave
# an extra blank figure behind.
metrics = ['precision', 'recall', 'f1', 'roc_auc', 'pr_auc']
model_comparison[metrics].plot(kind='bar', figsize=(12, 6))
plt.title('Model Performance Comparison')
plt.ylabel('Score')
plt.xticks(rotation=45)
# Anchor the legend outside the axes so it does not cover the bars.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# %%
# Select best model based on F1-score
# The winner is the row with the highest F1 in the comparison table, which
# was computed on the validation set.
best_model_name = model_comparison['f1'].idxmax()
best_model = models[best_model_name]
print(f"\nBest model: {best_model_name}")

# Save the best model
# The path is relative, so the file is written to the current working
# directory.
import joblib
joblib.dump(best_model, 'best_fraud_detection_model.pkl')