In [None]:
# ============================================================================
# HEALTHCARE FRAUD DETECTION - COMPLETE WORKING CODE
# Author: Mohammed Haqib (RA2512049015044)
# Institution: SRM Institute of Science and Technology
# COPY-PASTE THIS ENTIRE CODE AND RUN - GUARANTEED TO WORK
# ============================================================================

# Announce the run before any work starts.
print("="*80)
print("HEALTHCARE FRAUD DETECTION - FINAL WORKING VERSION")
print("="*80)
print("\nüöÄ Starting execution...\n")

# Install packages
# Best-effort: a failed install is reported but does not stop the script,
# because the package may already be importable in this environment.
import subprocess, sys
print("üì¶ Installing packages...")
for pkg in ['imbalanced-learn', 'scikit-learn', 'seaborn>=0.12.0']:
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])
    except (subprocess.CalledProcessError, OSError) as exc:
        # Only pip/process failures are tolerated; anything else (including
        # Ctrl-C) propagates instead of being silently swallowed.
        print(f"   WARNING: could not install {pkg}: {exc}")
print("‚úÖ Packages installed!\n")

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings, os, pickle
from datetime import datetime
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import *
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Select the matplotlib style sheet used by every figure created below.
plt.style.use('seaborn-v0_8-darkgrid')
# Make "husl" the default seaborn colour palette.
sns.set_palette("husl")

print("‚úÖ Libraries imported!\n")

# ============================================================================
# STEP 1: LOAD DATA
# ============================================================================

# Section banner.
print("=" * 80)
print("STEP 1: LOADING DATA")
print("=" * 80)

# Each split is four CSV files read in a fixed order: beneficiaries,
# inpatient claims, outpatient claims and the per-provider label file.
train_ben, train_inp, train_out, train_lab = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Train_Beneficiarydata-1542865627584.csv',
        'Train_Inpatientdata-1542865627584.csv',
        'Train_Outpatientdata-1542865627584.csv',
        'Train-1542865627584.csv',
    )
)

test_ben, test_inp, test_out, test_lab = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Test_Beneficiarydata-1542969243754.csv',
        'Test_Inpatientdata-1542969243754.csv',
        'Test_Outpatientdata-1542969243754.csv',
        'Test-1542969243754.csv',
    )
)

# Report how many beneficiary rows each split contributed.
n_train_ben = len(train_ben)
n_test_ben = len(test_ben)
print(f"‚úÖ Loaded {n_train_ben:,} train beneficiaries")
print(f"‚úÖ Loaded {n_test_ben:,} test beneficiaries\n")

# ============================================================================
# STEP 2: FIX LABELS
# ============================================================================

# Section banner.
print("=" * 80)
print("STEP 2: FIXING LABELS")
print("=" * 80)

# Show which columns each label file actually carries.
train_columns = list(train_lab.columns)
test_columns = list(test_lab.columns)
print(f"\nüîç Train labels: {train_columns}")
print(f"üîç Test labels: {test_columns}")

# Find fraud column
# The first column whose name is a known spelling wins; otherwise fall back
# to the second column, or to None when the file has fewer than two columns.
poss = ['PotentialFraud', 'Potential Fraud', 'potential_fraud', 'fraud', 'Fraud', 'is_fraud', 'IsFraud']
fraud_col = None
for column in train_lab.columns:
    if column in poss:
        fraud_col = column
        break
else:
    if len(train_lab.columns) >= 2:
        fraud_col = train_lab.columns[1]

# Normalise the training label column name to 'PotentialFraud'.
if fraud_col and fraud_col != 'PotentialFraud':
    print(f"üîß Renaming '{fraud_col}' ‚Üí 'PotentialFraud'")
    train_lab.rename(columns={fraud_col: 'PotentialFraud'}, inplace=True)

# Check test labels
# A test file with at least two columns is treated as labelled.
has_test_labels = len(test_lab.columns) >= 2
if not has_test_labels:
    print("‚ö†Ô∏è  Test labels missing - will use train/val split")
else:
    test_fraud_col = test_lab.columns[1]
    for column in test_lab.columns:
        if column in poss:
            test_fraud_col = column
            break
    if test_fraud_col != 'PotentialFraud':
        test_lab.rename(columns={test_fraud_col: 'PotentialFraud'}, inplace=True)
    print("‚úÖ Test has labels")

# Convert chronic Y/N to 1/0
# Only object-dtype flag columns are remapped; values missing from the
# mapping become 0 through fillna.
print("\nüîß Converting chronic conditions...")
yes_no_codes = {'Y': 1, 'y': 1, 'Yes': 1, 'N': 0, 'n': 0, 'No': 0, 1: 1, 0: 0, 2: 1}
for frame in (train_ben, test_ben):
    flag_cols = [c for c in frame.columns
                 if 'ChronicCond' in c or 'RenalDiseaseIndicator' in c]
    text_cols = [c for c in flag_cols if frame[c].dtype == 'object']
    for column in text_cols:
        frame[column] = frame[column].map(yes_no_codes).fillna(0)
print("‚úÖ Conversions done\n")

# ============================================================================
# STEP 3: DATA OVERVIEW
# ============================================================================

# Section banner.
print("=" * 80)
print("STEP 3: DATA OVERVIEW")
print("=" * 80)

# The dashboard image is written under ./analytics.
os.makedirs('analytics', exist_ok=True)

# Label counts and their share of all labelled providers.
fraud_cnt = train_lab['PotentialFraud'].value_counts()
n_labelled = len(train_lab)
print(f"\nüìä Fraud Distribution:")
for label, count in fraud_cnt.items():
    share = count / n_labelled * 100
    print(f"   {label}: {count:,} ({share:.1f}%)")


def _annotated_bars(ax, frame, x_col, y_col, palette, title, lift, **text_style):
    """Draw one bar panel and write each bar's value `lift` units above it."""
    ax.bar(frame[x_col], frame[y_col], color=palette)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_ylabel('Count', fontweight='bold')
    for position, height in enumerate(frame[y_col]):
        ax.text(position, height + lift, f'{height:,}', ha='center',
                fontweight='bold', **text_style)
    ax.grid(alpha=0.3, axis='y')


# Dashboard
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('Dataset Overview', fontsize=18, fontweight='bold')

# Top-left: label distribution as a pie chart.
pie_ax = axes[0, 0]
pie_ax.pie(fraud_cnt.values, labels=fraud_cnt.index, autopct='%1.1f%%',
           colors=['#2ecc71', '#e74c3c'], startangle=90)
pie_ax.set_title('Fraud Distribution', fontsize=14, fontweight='bold')

# Small summary tables behind the three bar panels.
claim_types = pd.DataFrame({'Type': ['Inpatient', 'Outpatient'],
                            'Count': [len(train_inp), len(train_out)]})
prov_stat = pd.DataFrame({'Dataset': ['Train', 'Test'],
                          'Providers': [train_lab['Provider'].nunique(),
                                        test_lab['Provider'].nunique()]})
data_sz = pd.DataFrame({'Component': ['Train\nBen', 'Test\nBen', 'Train\nClaims', 'Test\nClaims'],
                        'Count': [len(train_ben), len(test_ben),
                                  len(train_inp) + len(train_out),
                                  len(test_inp) + len(test_out)]})

_annotated_bars(axes[0, 1], claim_types, 'Type', 'Count',
                ['#3498db', '#9b59b6'], 'Claims by Type', 5000)
_annotated_bars(axes[1, 0], prov_stat, 'Dataset', 'Providers',
                ['#e67e22', '#16a085'], 'Provider Count', 50)
_annotated_bars(axes[1, 1], data_sz, 'Component', 'Count',
                ['#3498db', '#2ecc71', '#e74c3c', '#f39c12'], 'Dataset Size', 5000,
                fontsize=9)

# Save the figure, then close it.
plt.tight_layout()
plt.savefig('analytics/overview.png', dpi=300, bbox_inches='tight')
print("\n‚úÖ Overview saved")
plt.close()

# ============================================================================
# STEP 4: FEATURE ENGINEERING
# ============================================================================

# Section banner; the leading "\n" prints a blank line before the rule.
print("\n" + "="*80)
print("STEP 4: FEATURE ENGINEERING")
print("="*80)

def _flatten_agg_columns(columns):
    """Collapse two-level ``groupby().agg()`` column labels into flat ``name_stat`` strings."""
    return ['_'.join(c).strip('_') if c[1] else c[0] for c in columns.values]


def create_features(ben, inp, out, name):
    """Aggregate claims and beneficiary data into one feature row per provider.

    Parameters
    ----------
    ben : pandas.DataFrame
        Beneficiary table. Must contain ``BeneID``; every column whose name
        contains ``ChronicCond`` or ``RenalDiseaseIndicator`` is averaged
        over the distinct beneficiaries of each provider.
    inp, out : pandas.DataFrame
        Inpatient and outpatient claim tables. Only the columns present in
        both are used; they must include ``Provider``, ``BeneID`` and
        ``ClaimID``.
    name : str
        Label shown in the progress message.

    Returns
    -------
    pandas.DataFrame
        One row per ``Provider`` holding the financial, utilisation,
        physician, clinical and chronic-condition aggregates, in that order.
    """
    print(f"\nüîÑ {name}...")

    # Stack the two claim tables on their shared columns. The order follows
    # the inpatient table so it is the same on every run (a bare set
    # intersection is not), and the inputs are sliced rather than copied.
    out_columns = set(out.columns)
    common = [c for c in inp.columns if c in out_columns]
    claims = pd.concat([inp[common], out[common]], ignore_index=True)
    claims_ben = claims.merge(ben, on='BeneID', how='left')

    # Financial
    fin_cols = [c for c in ['InscClaimAmtReimbursed', 'DeductibleAmtPaid'] if c in claims_ben.columns]
    if fin_cols:
        agg = {c: ['sum', 'mean', 'std', 'max'] for c in fin_cols}
        features = claims_ben.groupby('Provider').agg(agg).reset_index()
        features.columns = _flatten_agg_columns(features.columns)
    else:
        # No amount columns available: start from the bare provider list.
        features = pd.DataFrame({'Provider': claims_ben['Provider'].unique()})

    # Utilization
    util = claims_ben.groupby('Provider').agg({'BeneID': 'nunique', 'ClaimID': 'count'}).reset_index()
    util.columns = ['Provider', 'UniqueBeneficiaries', 'TotalClaims']
    features = features.merge(util, on='Provider', how='outer')

    # Physician: number of distinct physicians per role.
    phys = [c for c in ['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician'] if c in claims_ben.columns]
    if phys:
        phys_feat = claims_ben.groupby('Provider')[phys].nunique().reset_index()
        phys_feat.columns = ['Provider'] + [f'Num{c}s' for c in phys]
        features = features.merge(phys_feat, on='Provider', how='outer')

    # Clinical: distinct non-null diagnosis / procedure codes on each claim
    # (nunique ignores NaN by default), then summed and averaged per provider.
    diag = [c for c in claims_ben.columns if 'ClmDiagnosisCode' in c]
    proc = [c for c in claims_ben.columns if 'ClmProcedureCode' in c]
    claims_ben['UniqueDiag'] = claims_ben[diag].nunique(axis=1) if diag else 0
    claims_ben['UniqueProc'] = claims_ben[proc].nunique(axis=1) if proc else 0

    clin = claims_ben.groupby('Provider')[['UniqueDiag', 'UniqueProc']].agg(['sum', 'mean']).reset_index()
    clin.columns = _flatten_agg_columns(clin.columns)
    features = features.merge(clin, on='Provider', how='outer')

    # Chronic: mean of each flag over the provider's distinct beneficiaries,
    # so a beneficiary with many claims is counted once.
    chronic = [c for c in ben.columns if 'ChronicCond' in c or 'RenalDiseaseIndicator' in c]
    if chronic:
        claims_chr = claims_ben[['Provider', 'BeneID']].drop_duplicates().merge(
            ben[['BeneID'] + chronic], on='BeneID', how='left')
        chr_feat = claims_chr.groupby('Provider')[chronic].mean().reset_index()
        features = features.merge(chr_feat, on='Provider', how='outer')

    print(f"   ‚úÖ {features.shape[1]-1} features for {len(features):,} providers")
    return features

# Provider-level features for the training split.
train_features = create_features(train_ben, train_inp, train_out, "TRAIN")

# Merge labels
train_labels = train_lab[['Provider', 'PotentialFraud']]
train_final = pd.merge(train_features, train_labels, on='Provider', how='left')

# Handle test data
if not has_test_labels:
    # Split training data
    # No labelled test file: hold out 20% of the training providers.
    train_prov = train_final[['Provider']].drop_duplicates()
    train_p, val_p = train_test_split(train_prov, test_size=0.2, random_state=42)
    in_validation = train_final['Provider'].isin(val_p['Provider'])
    in_training = train_final['Provider'].isin(train_p['Provider'])
    test_final = train_final[in_validation].copy()
    train_final = train_final[in_training].copy()
    print(f"\n   Split: {len(train_final):,} train, {len(test_final):,} validation")
else:
    # Labelled test file: build its features the same way and attach labels.
    test_features = create_features(test_ben, test_inp, test_out, "TEST")
    test_labels = test_lab[['Provider', 'PotentialFraud']]
    test_final = pd.merge(test_features, test_labels, on='Provider', how='left')

# ============================================================================
# STEP 5: PREPROCESSING
# ============================================================================

# Section banner.
print("\n" + "="*80)
print("STEP 5: PREPROCESSING")
print("="*80)

# Impute feature columns only. The target is excluded by name: if the label
# file already stores 0/1 it is numeric too, and would otherwise be
# median-imputed and become part of the pickled imputer's input schema.
numeric = train_final.select_dtypes(include=[np.number]).columns
numeric = [c for c in numeric if c not in ('Provider', 'PotentialFraud')]

# Medians are learned on the training frame and reused for the other frame.
imputer = SimpleImputer(strategy='median')
train_final[numeric] = imputer.fit_transform(train_final[numeric])
test_final[numeric] = imputer.transform(test_final[numeric])
print("‚úÖ Missing values handled")

# Accepted label spellings and their 0/1 code (shared by both frames).
label_codes = {
    'Yes': 1, 'yes': 1, 'YES': 1, 'Y': 1, 1: 1,
    'No': 0, 'no': 0, 'NO': 0, 'N': 0, 0: 0
}


def _encode_target(labels):
    """Return `labels` as an integer 0/1 Series.

    Raises ValueError when a value (including a missing one) is not a key of
    `label_codes`, so a bad label is reported here instead of surfacing later
    during resampling or model fitting.
    """
    encoded = labels.map(label_codes)
    unknown = encoded.isna()
    if unknown.any():
        bad = sorted(labels[unknown].astype(str).unique())
        raise ValueError(f"Unrecognised PotentialFraud label(s): {bad}")
    # Explicit cast: the dtype no longer depends on pandas' replace/downcast rules.
    return encoded.astype(int)


train_final['PotentialFraud'] = _encode_target(train_final['PotentialFraud'])
test_final['PotentialFraud'] = _encode_target(test_final['PotentialFraud'])
print("‚úÖ Target encoded")

# Persist both processed frames.
train_final.to_csv('processed_train.csv', index=False)
test_final.to_csv('processed_validation.csv', index=False)
print("üíæ Data saved\n")

# ============================================================================
# STEP 6: MODELING
# ============================================================================

# Section banner.
print("=" * 80)
print("STEP 6: MODELING")
print("=" * 80)

# Separate the feature matrix from the identifier and the target.
non_feature_cols = ['Provider', 'PotentialFraud']
X_train = train_final.drop(columns=non_feature_cols)
y_train = train_final['PotentialFraud']
X_test = test_final.drop(columns=non_feature_cols)
y_test = test_final['PotentialFraud']

# Positive-class counts and rates for both frames.
train_fraud = (y_train == 1).sum()
test_fraud = (y_test == 1).sum()
train_rate = train_fraud / len(y_train) * 100
test_rate = test_fraud / len(y_test) * 100
print(f"\nüìä Train: {X_train.shape}, Fraud: {train_fraud:,} ({train_rate:.1f}%)")
print(f"üìä Validation: {X_test.shape}, Fraud: {test_fraud:,} ({test_rate:.1f}%)")

# Standardise with statistics learned from the training features only.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)
X_train_sc = pd.DataFrame(train_scaled, columns=X_train.columns)
X_test_sc = pd.DataFrame(test_scaled, columns=X_test.columns)
print("\n‚úÖ Scaled")

# Oversample the training set only; the evaluation frame is left untouched.
smote = SMOTE(random_state=42, k_neighbors=5)
X_train_bal, y_train_bal = smote.fit_resample(X_train_sc, y_train)
balanced_fraud = (y_train_bal == 1).sum()
balanced_clean = (y_train_bal == 0).sum()
print(f"‚úÖ SMOTE: {balanced_fraud:,} fraud, {balanced_clean:,} non-fraud\n")

# ============================================================================
# STEP 7: TRAINING
# ============================================================================

# Section banner.
print("="*80)
print("STEP 7: TRAINING MODELS")
print("="*80)

# Candidate classifiers, keyed by the display name used in reports and plots.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42)
}

trained = {}   # display name -> fitted estimator
results = {}   # display name -> metrics plus raw predictions / probabilities

for idx, (name, model) in enumerate(models.items(), 1):
    print(f"\n{'='*80}")
    # The denominator comes from the dict so the progress line stays correct
    # when a model is added or removed.
    print(f"Model {idx}/{len(models)}: {name}")
    start = datetime.now()
    model.fit(X_train_bal, y_train_bal)
    t = (datetime.now() - start).total_seconds()

    trained[name] = model
    # Fit on the resampled training data, score on the untouched scaled frame.
    y_pred = model.predict(X_test_sc)
    y_proba = model.predict_proba(X_test_sc)[:, 1] if hasattr(model, 'predict_proba') else None

    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1': f1_score(y_test, y_pred, zero_division=0),
        # AUC needs probabilities; 0 is recorded when the model has none.
        'AUC': roc_auc_score(y_test, y_proba) if y_proba is not None else 0,
        'Pred': y_pred,
        'Proba': y_proba
    }

    print(f"‚úÖ {t:.2f}s | Acc: {results[name]['Accuracy']*100:.2f}% | F1: {results[name]['F1']:.4f}")

# ============================================================================
# STEP 8: RESULTS
# ============================================================================

# Section banner.
print("\n" + "="*80)
print("STEP 8: RESULTS")
print("="*80)

# One row per model with its headline metrics.
comp = pd.DataFrame({
    'Model': list(results.keys()),
    'Accuracy': [results[m]['Accuracy'] for m in results],
    'Precision': [results[m]['Precision'] for m in results],
    'Recall': [results[m]['Recall'] for m in results],
    'F1': [results[m]['F1'] for m in results],
    'AUC': [results[m]['AUC'] for m in results]
})

# Rank by F1; the first row is reported as the best model.
comp = comp.sort_values('F1', ascending=False).reset_index(drop=True)
print("\n" + comp.to_string(index=False))

best = comp.iloc[0]['Model']
print(f"\nüèÜ BEST MODEL: {best}")
print(f"   Accuracy:  {results[best]['Accuracy']*100:.2f}%")
print(f"   Precision: {results[best]['Precision']*100:.2f}%")
print(f"   Recall:    {results[best]['Recall']*100:.2f}%")
print(f"   F1-Score:  {results[best]['F1']:.4f}")
print(f"   ROC-AUC:   {results[best]['AUC']:.4f}")

# Feature importance
if 'Random Forest' in trained:
    feat_imp = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': trained['Random Forest'].feature_importances_
    }).sort_values('Importance', ascending=False)

    print(f"\nüìä Top 15 Features:")
    # Number the rows by their position in the sorted table. The frame's
    # index still holds each feature's original column position, so it
    # cannot be used as the rank.
    for rank, row in enumerate(feat_imp.head(15).itertuples(index=False), start=1):
        print(f"   {rank:2d}. {row.Feature:<40} {row.Importance:.6f}")

    feat_imp.to_csv('feature_importance.csv', index=False)

# Save models
# The best estimator is pickled together with the fitted scaler and imputer.
with open(f'best_model_{best.replace(" ", "_").lower()}.pkl', 'wb') as f:
    pickle.dump(trained[best], f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('imputer.pkl', 'wb') as f:
    pickle.dump(imputer, f)
print("\nüíæ Models saved")

# ============================================================================
# STEP 9: VISUALIZATIONS
# ============================================================================

# Section banner.
print("\n" + "=" * 80)
print("STEP 9: CREATING VISUALIZATIONS")
print("=" * 80)

# 1. Model Comparison
# One horizontal-bar panel per metric, with models sorted by that metric.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Model Performance', fontsize=18, fontweight='bold')

metrics = ['Accuracy', 'Precision', 'Recall', 'F1']
bar_colors = ['#e74c3c', '#3498db', '#2ecc71']
# axes.flat walks the 2x2 grid row by row, matching the metric order.
for ax, m in zip(axes.flat, metrics):
    data = comp.sort_values(m, ascending=False)
    scores = data[m]
    ax.barh(data['Model'], scores, color=bar_colors)
    ax.set_xlabel(m, fontsize=12, fontweight='bold')
    ax.set_title(m, fontsize=14, fontweight='bold')
    ax.set_xlim(0, 1)
    ax.grid(alpha=0.3, axis='x')
    # Write each score just to the right of its bar.
    for row, score in enumerate(scores):
        ax.text(score + 0.01, row, f'{score:.3f}', va='center', fontweight='bold')

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
print("   ‚úÖ model_comparison.png")
plt.close()

# 2. Confusion Matrix
# Built from y_test and the stored predictions of the top-ranked model.
cm = confusion_matrix(y_test, results[best]['Pred'])
fig, ax = plt.subplots(figsize=(10, 8))
# Annotated heatmap of the matrix; fmt='d' formats each cell as an integer.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'],
            cbar_kws={'label': 'Count'}, ax=ax, linewidths=2,
            annot_kws={'fontsize': 14, 'fontweight': 'bold'})
# Rows are labelled as the actual class, columns as the predicted class.
ax.set_ylabel('Actual', fontsize=12, fontweight='bold')
ax.set_xlabel('Predicted', fontsize=12, fontweight='bold')
ax.set_title(f'Confusion Matrix - {best}', fontsize=14, fontweight='bold')
# Save the figure, then close it.
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
print("   ‚úÖ confusion_matrix.png")
plt.close()

# 3. ROC Curves
# One curve per model that produced probabilities, plus the chance diagonal.
fig, ax = plt.subplots(figsize=(10, 8))
for name, scores in results.items():
    proba = scores['Proba']
    if proba is None:
        continue
    fpr, tpr, _ = roc_curve(y_test, proba)
    ax.plot(fpr, tpr, linewidth=3, label=f'{name} (AUC={scores["AUC"]:.3f})')
ax.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random')

ax.set_title('ROC Curves', fontsize=14, fontweight='bold')
ax.set_xlabel('False Positive Rate', fontsize=12, fontweight='bold')
ax.set_ylabel('True Positive Rate', fontsize=12, fontweight='bold')
ax.legend(loc='lower right', fontsize=11)
ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('roc_curves.png', dpi=300, bbox_inches='tight')
print("   ‚úÖ roc_curves.png")
plt.close()

# 4. Feature Importance
# Drawn only when a Random Forest was trained.
if 'Random Forest' in trained:
    top15 = feat_imp.head(15)
    importances = top15['Importance']
    bar_rows = range(len(top15))
    gradient = plt.cm.viridis(np.linspace(0.3, 0.9, 15))

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.barh(bar_rows, importances, color=gradient)
    ax.set_yticks(bar_rows)
    ax.set_yticklabels(top15['Feature'], fontsize=10)
    ax.set_xlabel('Importance', fontsize=12, fontweight='bold')
    ax.set_title('Top 15 Features', fontsize=14, fontweight='bold')
    # Flip the y axis so the first (largest) row is drawn at the top.
    ax.invert_yaxis()
    ax.grid(alpha=0.3, axis='x')
    # Write each value just past the end of its bar.
    for row, weight in enumerate(importances):
        ax.text(weight + 0.001, row, f'{weight:.4f}', va='center',
                fontweight='bold', fontsize=9)

    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    print("   ‚úÖ feature_importance.png")
    plt.close()

# 5. Class Distribution
# Class counts for three label vectors: training labels, the SMOTE output,
# and the evaluation labels.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('Class Distribution', fontsize=16, fontweight='bold')

class_names = ['Non-Fraud', 'Fraud']
label_sets = [(y_train, 'Training'), (y_train_bal, 'After SMOTE'), (y_test, 'Validation')]
for panel, (data, title) in zip(axes, label_sets):
    counts = [(data == 0).sum(), (data == 1).sum()]
    total = len(data)
    panel.bar(class_names, counts,
              color=['#2ecc71', '#e74c3c'], edgecolor='black', linewidth=2)
    panel.set_title(title, fontsize=12, fontweight='bold')
    panel.set_ylabel('Count', fontweight='bold')
    panel.grid(alpha=0.3, axis='y')
    # Label each bar with its count and its share of the vector.
    for slot, count in enumerate(counts):
        pct = count / total * 100
        panel.text(slot, count + 50, f'{count:,}\n({pct:.1f}%)', ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig('class_distribution.png', dpi=300, bbox_inches='tight')
print("   ‚úÖ class_distribution.png")
plt.close()

# ============================================================================
# FINAL SUMMARY
# ============================================================================

# Closing banner.
print("\n" + "="*80)
print("üéâ PROJECT COMPLETED!")
print("="*80)

# Row counts of the two final frames and the width of the feature matrix.
print(f"\nüìä Summary:")
print(f"   Training: {len(train_final):,} providers, {X_train.shape[1]} features")
print(f"   Validation: {len(test_final):,} providers")

# NOTE(review): the file counts below are literal text, not computed from
# the files written above — confirm they still match what the script saves.
print(f"\nüíæ Generated Files (10):")
print(f"   üìä Analytics: 1 dashboard")
print(f"   üìà ML Visualizations: 5 charts")
print(f"   üìÅ Data & Models: 4 files")

# Author / institution footer.
print(f"\nüéì Mohammed Haqib (RA2512049015044)")
print(f"üè´ SRM Institute of Science and Technology")
print(f"üìÖ October 2025")
print(f"\n{'='*80}\n")


HEALTHCARE FRAUD DETECTION - FINAL WORKING VERSION

🚀 Starting execution...

📦 Installing packages...
✅ Packages installed!

✅ Libraries imported!

STEP 1: LOADING DATA
✅ Loaded 138,556 train beneficiaries
✅ Loaded 63,968 test beneficiaries

STEP 2: FIXING LABELS

🔍 Train labels: ['Provider', 'PotentialFraud']
🔍 Test labels: ['Provider']
⚠️  Test labels missing - will use train/val split

🔧 Converting chronic conditions...
✅ Conversions done

STEP 3: DATA OVERVIEW

📊 Fraud Distribution:
   No: 4,904 (90.6%)
   Yes: 506 (9.4%)

✅ Overview saved

STEP 4: FEATURE ENGINEERING

🔄 TRAIN...
   ✅ 29 features for 5,410 providers

   Split: 4,328 train, 1,082 validation

STEP 5: PREPROCESSING
✅ Missing values handled
✅ Target encoded
💾 Data saved

STEP 6: MODELING

📊 Train: (4328, 29), Fraud: 401 (9.3%)
📊 Validation: (1082, 29), Fraud: 105 (9.7%)

✅ Scaled
✅ SMOTE: 3,927 fraud, 3,927 non-fraud

STEP 7: TRAINING MODELS

Model 1/3: Logistic R