In [None]:
# 01_data_exploration_and_feature_engineering.ipynb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import warnings
import os
warnings.filterwarnings('ignore')


In [None]:

# Global plotting defaults: seaborn grid theme plus a wide default canvas.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Notebook banner: title framed by two 80-character rules.
banner_rule = "=" * 80
print(banner_rule, "HEALTHCARE PROVIDER FRAUD DETECTION - DATA EXPLORATION", banner_rule, sep="\n")

# Create the figure and model output folders up front (no-op when they already exist).
for output_dir in ('reports/figures', 'models'):
    os.makedirs(output_dir, exist_ok=True)


# 1. LOAD DATA

In [None]:

print("\n1. LOADING DATASETS...")

# Training files, read in the order: beneficiary, inpatient, outpatient, labels.
train_beneficiary, train_inpatient, train_outpatient, train_labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Train_Beneficiarydata-1542865627584.csv',
        '../data/Train_Inpatientdata-1542865627584.csv',
        '../data/Train_Outpatientdata-1542865627584.csv',
        '../data/Train-1542865627584.csv',
    )
)

# Test files, same order as above.
test_beneficiary, test_inpatient, test_outpatient, test_labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Test_Beneficiarydata-1542969243754.csv',
        '../data/Test_Inpatientdata-1542969243754.csv',
        '../data/Test_Outpatientdata-1542969243754.csv',
        '../data/Test-1542969243754.csv',
    )
)

# Report (rows, columns) for every table; the blank line separates train from test.
loaded_tables = [
    ("✓ Train Beneficiary", train_beneficiary),
    ("✓ Train Inpatient", train_inpatient),
    ("✓ Train Outpatient", train_outpatient),
    ("✓ Train Labels", train_labels),
    ("\n✓ Test Beneficiary", test_beneficiary),
    ("✓ Test Inpatient", test_inpatient),
    ("✓ Test Outpatient", test_outpatient),
    ("✓ Test Labels", test_labels),
]
for table_label, table in loaded_tables:
    print(f"{table_label}: {table.shape}")


# 2. DATA STRUCTURE ANALYSIS

In [None]:

print("\n2. UNDERSTANDING DATA STRUCTURE...")

# First rows and column list of each raw training table.
for table_title, table in [
    ("Beneficiary Data", train_beneficiary),
    ("Inpatient Data", train_inpatient),
    ("Outpatient Data", train_outpatient),
]:
    print(f"\n--- {table_title} Sample ---")
    print(table.head(3))
    print(f"\nColumns ({len(table.columns)}): {table.columns.tolist()}")

print("\n--- Labels Sample ---")
print(train_labels.head())
print(f"\nColumns: {train_labels.columns.tolist()}")

# Class balance of the training target ('Yes' marks a flagged provider).
print(f"\n--- Training Set Fraud Distribution ---")
print(train_labels['PotentialFraud'].value_counts())
# Mean of the boolean mask == share of 'Yes'; unlike sum()/len() it yields NaN
# instead of raising ZeroDivisionError on an empty label table.
fraud_pct = (train_labels['PotentialFraud'] == 'Yes').mean() * 100
print(f"Fraud Percentage: {fraud_pct:.2f}%")

# Same summary for the test split, guarded like every other optional column
# in this notebook.
# NOTE(review): the test label file may ship with provider IDs only - confirm.
print(f"\n--- Test Set Fraud Distribution ---")
if 'PotentialFraud' in test_labels.columns:
    print(test_labels['PotentialFraud'].value_counts())
    test_fraud_pct = (test_labels['PotentialFraud'] == 'Yes').mean() * 100
    print(f"Fraud Percentage: {test_fraud_pct:.2f}%")
else:
    test_fraud_pct = None
    print("PotentialFraud column not present in test labels - distribution unavailable")



# 3. DATA QUALITY ASSESSMENT


In [None]:

print("\n3. DATA QUALITY ASSESSMENT...")

def assess_quality(df, name):
    """Print a short quality report for ``df`` under the heading ``name``.

    The report lists the frame's shape, the null count of every column that
    has at least one null (or a "No missing values" line), and the number of
    fully duplicated rows. Nothing is returned.
    """
    null_counts = df.isnull().sum()
    columns_with_nulls = null_counts[null_counts > 0]

    if columns_with_nulls.empty:
        missing_line = "No missing values"
    else:
        missing_line = f"Missing values:\n{columns_with_nulls}"

    print(f"\n--- {name} ---")
    print(f"Shape: {df.shape}")
    print(missing_line)
    print(f"Duplicates: {df.duplicated().sum()}")

# Run the same quality report over every table of both splits. The test label
# table is included so that both splits are checked symmetrically.
print("\n=== TRAINING DATA ===")
for table_label, table in [
    ("Train Beneficiary", train_beneficiary),
    ("Train Inpatient", train_inpatient),
    ("Train Outpatient", train_outpatient),
    ("Train Labels", train_labels),
]:
    assess_quality(table, table_label)

print("\n=== TEST DATA ===")
for table_label, table in [
    ("Test Beneficiary", test_beneficiary),
    ("Test Inpatient", test_inpatient),
    ("Test Outpatient", test_outpatient),
    ("Test Labels", test_labels),
]:
    assess_quality(table, table_label)




# 4. KEY RELATIONSHIP ANALYSIS


In [None]:
print("\n4. ANALYZING RELATIONSHIPS...")

# One row per split:
# (heading, beneficiaries, inpatient claims, outpatient claims, labels, print claims-per-provider?)
relationship_views = [
    ("TRAINING", train_beneficiary, train_inpatient, train_outpatient, train_labels, True),
    ("TEST", test_beneficiary, test_inpatient, test_outpatient, test_labels, False),
]

for heading, bene_tbl, inp_tbl, out_tbl, label_tbl, with_claim_rates in relationship_views:
    print(f"\n--- {heading} DATA ---")
    print(f"Unique Beneficiaries: {bene_tbl['BeneID'].nunique()}")
    print(f"Unique Providers in Inpatient: {inp_tbl['Provider'].nunique()}")
    print(f"Unique Providers in Outpatient: {out_tbl['Provider'].nunique()}")
    print(f"Unique Providers in Labels: {label_tbl['Provider'].nunique()}")
    if with_claim_rates:
        # Claim rows divided by distinct providers, reported for the training split only.
        print(f"Avg Inpatient claims/provider: {len(inp_tbl) / inp_tbl['Provider'].nunique():.2f}")
        print(f"Avg Outpatient claims/provider: {len(out_tbl) / out_tbl['Provider'].nunique():.2f}")



# 5. FEATURE ENGINEERING FUNCTIONS


In [None]:

print("\n5. DEFINING FEATURE ENGINEERING FUNCTIONS...")

def create_beneficiary_features(bene_df, reference_date='2009-12-01'):
    """Derive per-beneficiary features from the raw beneficiary table.

    Parameters
    ----------
    bene_df : pandas.DataFrame
        Raw beneficiary rows. Only ``BeneID`` is required; every other input
        column is optional and its feature is skipped when the column is absent.
        The frame is never modified.
    reference_date : str or datetime-like, default '2009-12-01'
        Date against which ``Age`` is measured.

    Returns
    -------
    pandas.DataFrame
        Same index as ``bene_df`` with ``BeneID`` plus, when the inputs exist:
        ``Gender`` and ``Race`` (copied through), ``Age`` (years, days / 365.25),
        ``ChronicConditionsCount``, ``HasRenalDisease`` (1 when the indicator is
        'Y', else 0) and ``IsDeceased`` (1 when ``DOD`` is non-null).
    """
    features = pd.DataFrame({'BeneID': bene_df['BeneID']})

    # Demographics are passed through unchanged.
    for demographic in ('Gender', 'Race'):
        if demographic in bene_df.columns:
            features[demographic] = bene_df[demographic]

    # Age: parse DOB into a local Series so the caller's frame keeps its raw
    # values; unparseable dates become NaT and therefore a NaN age.
    if 'DOB' in bene_df.columns:
        birth_dates = pd.to_datetime(bene_df['DOB'], errors='coerce')
        age_days = (pd.to_datetime(reference_date) - birth_dates).dt.days
        features['Age'] = age_days / 365.25

    # Number of chronic-condition flags that are set. Counting cells equal to 1
    # gives a true count whether "absent" is coded 0 or 2; a plain sum is only
    # a count for 0/1 coding.
    # NOTE(review): the raw files appear to use 1 = present / 2 = absent - confirm
    # against the data dictionary.
    chronic_cols = [col for col in bene_df.columns if 'Chronic' in col]
    if chronic_cols:
        features['ChronicConditionsCount'] = bene_df[chronic_cols].eq(1).sum(axis=1)

    # Renal disease: 'Y' -> 1, anything else (including missing) -> 0.
    if 'RenalDiseaseIndicator' in bene_df.columns:
        features['HasRenalDisease'] = bene_df['RenalDiseaseIndicator'].eq('Y').astype(int)

    # A non-null date of death marks the beneficiary as deceased.
    if 'DOD' in bene_df.columns:
        features['IsDeceased'] = bene_df['DOD'].notna().astype(int)

    return features

def _count_distinct_across(frame, cols):
    """Return how many distinct non-null values occur in ``cols`` of ``frame``, pooled together."""
    pooled = pd.Series(frame[cols].to_numpy().ravel())
    return int(pooled.nunique())


def aggregate_claims(claims_df, claim_type):
    """Aggregate claim-level rows into one feature row per provider.

    Parameters
    ----------
    claims_df : pandas.DataFrame
        Claim rows with at least ``Provider`` and ``BeneID``. All other input
        columns are optional; their features are skipped when absent. The frame
        is never modified.
    claim_type : str
        Prefix for every generated feature name (e.g. 'Inpatient'). Length-of-stay
        features are produced only when this is exactly 'Inpatient'.

    Returns
    -------
    pandas.DataFrame
        One row per provider found in ``claims_df`` with a ``Provider`` column and
        ``{claim_type}_*`` features: claim / beneficiary counts, reimbursement and
        deductible statistics, distinct physicians / diagnosis codes / procedure
        codes, and date-based durations. For input without any provider group the
        result is an empty frame that still carries the ``Provider`` column, so it
        can be merged on that key.
    """
    available = set(claims_df.columns)
    physician_cols = [col for col in claims_df.columns if 'Physician' in col]
    diag_cols = [col for col in claims_df.columns if 'ClmDiagnosisCode' in col]
    proc_cols = [col for col in claims_df.columns if 'ClmProcedureCode' in col]

    has_reimbursed = 'InscClaimAmtReimbursed' in available
    has_deductible = 'DeductibleAmtPaid' in available
    has_los = claim_type == 'Inpatient' and {'AdmissionDt', 'DischargeDt'} <= available
    has_duration = {'ClaimStartDt', 'ClaimEndDt'} <= available

    # Parse the dates once for the whole table instead of once per provider.
    # The durations live in private columns of a working copy so that neither the
    # caller's frame nor a groupby slice is ever written to.
    derived = {}
    if has_los:
        admitted = pd.to_datetime(claims_df['AdmissionDt'], errors='coerce')
        discharged = pd.to_datetime(claims_df['DischargeDt'], errors='coerce')
        derived['_LOS'] = (discharged - admitted).dt.days.to_numpy()
    if has_duration:
        started = pd.to_datetime(claims_df['ClaimStartDt'], errors='coerce')
        ended = pd.to_datetime(claims_df['ClaimEndDt'], errors='coerce')
        derived['_ClaimDuration'] = (ended - started).dt.days.to_numpy()
    working = claims_df.assign(**derived)

    provider_features = []

    for provider, group in working.groupby('Provider'):
        n_claims = len(group)
        n_beneficiaries = group['BeneID'].nunique()

        # Basic claim statistics
        features = {
            'Provider': provider,
            f'{claim_type}_NumClaims': n_claims,
            f'{claim_type}_NumBeneficiaries': n_beneficiaries,
            # nunique() ignores nulls, so a provider whose BeneIDs are all
            # missing would otherwise divide by zero.
            f'{claim_type}_AvgClaimsPerBeneficiary': (
                n_claims / n_beneficiaries if n_beneficiaries else float('nan')
            ),
        }

        # Financial features
        if has_reimbursed:
            reimbursed = group['InscClaimAmtReimbursed']
            features[f'{claim_type}_TotalReimbursed'] = reimbursed.sum()
            features[f'{claim_type}_AvgReimbursed'] = reimbursed.mean()
            # Sample std: NaN for a provider with a single claim.
            features[f'{claim_type}_StdReimbursed'] = reimbursed.std()
            features[f'{claim_type}_MaxReimbursed'] = reimbursed.max()
            features[f'{claim_type}_MinReimbursed'] = reimbursed.min()

        if has_deductible:
            deductible = group['DeductibleAmtPaid']
            features[f'{claim_type}_TotalDeductible'] = deductible.sum()
            features[f'{claim_type}_AvgDeductible'] = deductible.mean()

        # Distinct physicians pooled over every physician role column, so a
        # physician who appears in two roles is counted once - the same pooling
        # used for diagnosis and procedure codes below.
        if physician_cols:
            features[f'{claim_type}_NumUniquePhysicians'] = _count_distinct_across(group, physician_cols)

        # Distinct diagnosis codes across all diagnosis-code columns
        if diag_cols:
            features[f'{claim_type}_UniqueDiagnoses'] = _count_distinct_across(group, diag_cols)

        # Distinct procedure codes across all procedure-code columns
        if proc_cols:
            features[f'{claim_type}_UniqueProcedures'] = _count_distinct_across(group, proc_cols)

        # Length of stay in days (inpatient only)
        if has_los:
            features[f'{claim_type}_AvgLOS'] = group['_LOS'].mean()
            features[f'{claim_type}_TotalLOS'] = group['_LOS'].sum()
            features[f'{claim_type}_MaxLOS'] = group['_LOS'].max()

        # Days between claim start and claim end
        if has_duration:
            features[f'{claim_type}_AvgClaimDuration'] = group['_ClaimDuration'].mean()

        provider_features.append(features)

    if not provider_features:
        # Keep the merge key even when there is nothing to aggregate.
        return pd.DataFrame(columns=['Provider'])

    return pd.DataFrame(provider_features)

def aggregate_bene_by_provider(claims_df, bene_features):
    """Summarise beneficiary features per provider, weighted by claim rows.

    Each claim in ``claims_df`` is left-joined to its beneficiary's row in
    ``bene_features`` on ``BeneID`` and the result is grouped by ``Provider``.
    A beneficiary with several claims at one provider therefore contributes
    once per claim.

    Parameters
    ----------
    claims_df : pandas.DataFrame
        Claim rows containing ``Provider`` and ``BeneID``.
    bene_features : pandas.DataFrame
        Per-beneficiary features keyed by ``BeneID``. Any of ``Age``, ``Gender``,
        ``ChronicConditionsCount``, ``IsDeceased`` and ``HasRenalDisease`` that
        are present are aggregated.

    Returns
    -------
    pandas.DataFrame
        ``Provider`` plus one ``<feature>_<statistic>`` column per aggregation,
        e.g. ``Age_mean`` or ``IsDeceased_sum``. When none of the known features
        is available, only the ``Provider`` column with the distinct providers
        is returned.
    """
    # Merge once and reuse the result for both the column check and the aggregation.
    claims_with_bene = claims_df.merge(bene_features, on='BeneID', how='left')

    # Every entry is a list on purpose: pandas then always returns two-level
    # (feature, statistic) columns. With bare strings the columns come back flat
    # whenever no list-valued feature is present, and flattening them would
    # mangle the names, including the Provider key.
    candidate_stats = {
        'Age': ['mean', 'std', 'min', 'max'],
        'Gender': ['mean'],
        'ChronicConditionsCount': ['mean', 'max', 'sum'],
        'IsDeceased': ['sum'],
        'HasRenalDisease': ['sum'],
    }
    agg_dict = {
        feature: stats
        for feature, stats in candidate_stats.items()
        if feature in claims_with_bene.columns
    }

    if not agg_dict:
        return pd.DataFrame({'Provider': claims_df['Provider'].unique()})

    provider_bene = claims_with_bene.groupby('Provider').agg(agg_dict)

    # Flatten (feature, statistic) pairs to "feature_statistic" before Provider
    # is moved from the index back into a regular column.
    provider_bene.columns = [f'{feature}_{stat}' for feature, stat in provider_bene.columns]

    return provider_bene.reset_index()




# 6. PROCESS TRAINING DATA


In [None]:

print("\n6. PROCESSING TRAINING DATA...")

# Per-beneficiary features first; they feed the provider-level roll-ups below.
print("Creating beneficiary features...")
train_bene_features = create_beneficiary_features(train_beneficiary)

# Claim statistics per provider, one table per claim type.
print("Aggregating inpatient claims...")
train_inpatient_features = aggregate_claims(train_inpatient, 'Inpatient')

print("Aggregating outpatient claims...")
train_outpatient_features = aggregate_claims(train_outpatient, 'Outpatient')

# Beneficiary characteristics per provider, again one table per claim type.
print("Aggregating beneficiary info by provider (inpatient)...")
train_inpatient_bene = aggregate_bene_by_provider(train_inpatient, train_bene_features)

print("Aggregating beneficiary info by provider (outpatient)...")
train_outpatient_bene = aggregate_bene_by_provider(train_outpatient, train_bene_features)

# Left-join everything onto the label table so that every labelled provider is kept.
print("Merging all provider features...")
train_provider_data = (
    train_labels.copy()
    .merge(train_inpatient_features, on='Provider', how='left')
    .merge(train_outpatient_features, on='Provider', how='left')
    .merge(train_inpatient_bene, on='Provider', how='left')
    # The two beneficiary roll-ups share column names; the suffixes mark which
    # claim type each copy came from.
    .merge(train_outpatient_bene, on='Provider', how='left', suffixes=('_Inp', '_Out'))
    # Providers with no claims in one category come out of the joins as NaN.
    .fillna(0)
)

# Two of the columns are not counted as features in the message below.
train_column_count = train_provider_data.shape[1]
print(f"\n✓ Training provider dataset: {train_provider_data.shape}")
print(f"✓ Features created: {train_column_count - 2}")




# 7. PROCESS TEST DATA


In [None]:

print("\n7. PROCESSING TEST DATA...")

# Same pipeline as for the training split, applied to the test tables.
print("Creating beneficiary features...")
test_bene_features = create_beneficiary_features(test_beneficiary)

# Claim statistics per provider, one table per claim type.
print("Aggregating inpatient claims...")
test_inpatient_features = aggregate_claims(test_inpatient, 'Inpatient')

print("Aggregating outpatient claims...")
test_outpatient_features = aggregate_claims(test_outpatient, 'Outpatient')

# Beneficiary characteristics per provider, one table per claim type.
print("Aggregating beneficiary info by provider (inpatient)...")
test_inpatient_bene = aggregate_bene_by_provider(test_inpatient, test_bene_features)

print("Aggregating beneficiary info by provider (outpatient)...")
test_outpatient_bene = aggregate_bene_by_provider(test_outpatient, test_bene_features)

# Left-join everything onto the test provider table so that every listed provider is kept.
print("Merging all provider features...")
test_provider_data = (
    test_labels.copy()
    .merge(test_inpatient_features, on='Provider', how='left')
    .merge(test_outpatient_features, on='Provider', how='left')
    .merge(test_inpatient_bene, on='Provider', how='left')
    # The two beneficiary roll-ups share column names; the suffixes mark which
    # claim type each copy came from.
    .merge(test_outpatient_bene, on='Provider', how='left', suffixes=('_Inp', '_Out'))
    # Missing aggregates (no claims of one type) become 0.
    .fillna(0)
)

# Two of the columns are not counted as features in the message below.
test_column_count = test_provider_data.shape[1]
print(f"\n✓ Test provider dataset: {test_provider_data.shape}")
print(f"✓ Features created: {test_column_count - 2}")




# 8. EXPLORATORY DATA ANALYSIS


In [None]:

print("\n8. EXPLORATORY DATA ANALYSIS...")

def _plot_fraud_counts(ax, provider_data, title):
    """Draw the PotentialFraud class counts of ``provider_data`` as an annotated bar chart.

    Returns the ``value_counts`` Series that was plotted, or ``None`` when the
    frame has no ``PotentialFraud`` column (the axes then only carry a notice).
    """
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Potential Fraud')
    ax.set_ylabel('Count')

    if 'PotentialFraud' not in provider_data.columns:
        ax.text(0.5, 0.5, 'PotentialFraud not available',
                ha='center', va='center', transform=ax.transAxes)
        return None

    counts = provider_data['PotentialFraud'].value_counts()
    # Colour by label rather than by bar position, so 'Yes' is red regardless
    # of which class happens to be more frequent.
    label_colors = {'No': 'green', 'Yes': 'red'}
    ax.bar(counts.index, counts.values,
           color=[label_colors.get(label, 'grey') for label in counts.index])
    # Write each count just above its bar.
    for position, count in enumerate(counts.values):
        ax.text(position, count + 20, str(count), ha='center', fontweight='bold')
    return counts

# Target distribution (training vs. test)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
train_fraud_counts = _plot_fraud_counts(axes[0], train_provider_data,
                                        'Training Set - Fraud Distribution')
# NOTE(review): guarded in case the test split carries no target column - confirm.
test_fraud_counts = _plot_fraud_counts(axes[1], test_provider_data,
                                       'Test Set - Fraud Distribution')

fig.tight_layout()
fig.savefig('reports/figures/target_distribution.png', dpi=300, bbox_inches='tight')
print("✓ Saved: target_distribution.png")
plt.close(fig)

# Fraud comparison - key metrics
fraud_yes = train_provider_data[train_provider_data['PotentialFraud'] == 'Yes']
fraud_no = train_provider_data[train_provider_data['PotentialFraud'] == 'No']

numeric_cols = train_provider_data.select_dtypes(include=[np.number]).columns.tolist()

# Keep only the metrics that were actually engineered as numeric columns.
comparison_metrics = [col for col in [
    'Inpatient_TotalReimbursed', 'Outpatient_TotalReimbursed',
    'Inpatient_NumClaims', 'Outpatient_NumClaims',
    'Age_mean_Inp', 'IsDeceased_sum_Inp'
] if col in numeric_cols]

# The 2x3 grid is drawn only when all six metrics are available.
if len(comparison_metrics) >= 6:
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.ravel()

    for ax, col in zip(axes, comparison_metrics[:6]):
        ax.boxplot([fraud_no[col].dropna(), fraud_yes[col].dropna()])
        # Tick labels are set separately: the boxplot `labels=` keyword is
        # deprecated in Matplotlib 3.9 in favour of `tick_labels=`, and this
        # form works on both sides of that change.
        ax.set_xticklabels(['No Fraud', 'Fraud'])
        ax.set_title(col, fontweight='bold')
        ax.set_ylabel('Value')
        ax.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig('reports/figures/fraud_comparison.png', dpi=300, bbox_inches='tight')
    print("✓ Saved: fraud_comparison.png")
    plt.close(fig)

# Correlation heatmap over the first (at most) 20 numeric features
if len(numeric_cols) > 1:
    fig, ax = plt.subplots(figsize=(12, 10))
    correlation_matrix = train_provider_data[numeric_cols].corr()
    n_features = min(20, len(numeric_cols))
    sns.heatmap(correlation_matrix.iloc[:n_features, :n_features],
                annot=False, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Feature Correlation Heatmap', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig('reports/figures/correlation_heatmap.png', dpi=300, bbox_inches='tight')
    print("✓ Saved: correlation_heatmap.png")
    plt.close(fig)




# 9. SAVE PROCESSED DATA


In [None]:

print("\n9. SAVING PROCESSED DATA...")

# The setup cell only creates reports/figures and models, so create the CSV
# output folder here; without it to_csv fails on a fresh checkout.
# NOTE(review): raw inputs are read from '../data' while the processed files go
# to './data' - confirm which location the next notebook expects.
os.makedirs('data', exist_ok=True)

train_provider_data.to_csv('data/processed_train_provider_data.csv', index=False)
test_provider_data.to_csv('data/processed_test_provider_data.csv', index=False)

print("✓ Saved: processed_train_provider_data.csv")
print("✓ Saved: processed_test_provider_data.csv")

# Summary figures are recomputed from the final frames so this cell does not
# depend on variables left behind by the plotting cell.
n_train_providers = len(train_provider_data)
n_fraudulent = int((train_provider_data['PotentialFraud'] == 'Yes').sum())
n_legitimate = int((train_provider_data['PotentialFraud'] == 'No').sum())
n_numeric_features = train_provider_data.select_dtypes(include=[np.number]).shape[1]
# Count every test column except the identifier and, when present, the target.
n_test_features = sum(
    col not in ('Provider', 'PotentialFraud') for col in test_provider_data.columns
)

print("\n" + "=" * 80)
print("DATA EXPLORATION COMPLETE!")
print("=" * 80)
print(f"\nTraining Dataset Summary:")
print(f"  Total Providers: {n_train_providers}")
print(f"  Fraudulent: {n_fraudulent} ({n_fraudulent/n_train_providers*100:.2f}%)")
print(f"  Legitimate: {n_legitimate} ({n_legitimate/n_train_providers*100:.2f}%)")
print(f"  Total Features: {n_numeric_features}")
print(f"\nTest Dataset Summary:")
print(f"  Total Providers: {len(test_provider_data)}")
print(f"  Total Features: {n_test_features}")
print(f"\nNext Step: Run 02_modeling.ipynb")