In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import warnings

In [None]:

# Load the four training tables. Only a missing file triggers the empty-frame
# fallback; any other failure (e.g. the import cell not having been run, or a
# malformed CSV) is allowed to surface instead of being reported as
# "dataset not downloaded".
try:
    beneficiary_df = pd.read_csv('/content/Train_Beneficiarydata.csv')
    inpatient_df = pd.read_csv('/content/Train_Inpatientdata.csv')
    outpatient_df = pd.read_csv('/content/Train_Outpatientdata.csv')
    labels_df = pd.read_csv('/content/Train_labels.csv')
except FileNotFoundError as missing_file:
    # If files aren't available, fall back to empty frames so the function
    # definitions in the cells below can still be executed.
    print("Please download the dataset from Kaggle and update the file paths")
    print(f"Missing file: {missing_file.filename}")
    # All four frames are reset together so a partial load never leaks through.
    beneficiary_df = pd.DataFrame()
    inpatient_df = pd.DataFrame()
    outpatient_df = pd.DataFrame()
    labels_df = pd.DataFrame()

# %%
# Basic data exploration
def explore_dataset(df, name):
    """Print a structural overview of ``df`` under a heading built from ``name``.

    The heading, shape and column list are always printed. Per-column
    missing-value counts and dtypes are printed only when the frame is
    non-empty. Nothing is returned.
    """
    print(
        f"\n=== {name} Dataset ===",
        f"Shape: {df.shape}",
        f"Columns: {list(df.columns)}",
        sep="\n",
    )
    if df.empty:
        # Nothing more to report for an empty frame.
        return
    print(
        f"Missing values:\n{df.isnull().sum()}",
        f"Data types:\n{df.dtypes}",
        sep="\n",
    )

# Summarise each loaded table in the same order as before.
for _frame, _label in (
    (beneficiary_df, "Beneficiary"),
    (inpatient_df, "Inpatient"),
    (outpatient_df, "Outpatient"),
    (labels_df, "Labels"),
):
    explore_dataset(_frame, _label)

# %%
# Check target distribution
if not labels_df.empty:
    print("\n=== Target Distribution ===")
    # Count each label once and reuse the result for both the table and the chart.
    fraud_counts = labels_df['PotentialFraud'].value_counts()
    print(fraud_counts)

    plt.figure(figsize=(8, 6))
    # Series.plot draws on the figure just created and returns its Axes.
    target_ax = fraud_counts.plot(kind='bar')
    target_ax.set_title('Target Class Distribution')
    target_ax.set_xlabel('Potential Fraud')
    target_ax.set_ylabel('Count')
    plt.show()


# ## 1.2 Data Preprocessing and Cleaning

# %%
def preprocess_beneficiary_data(df, reference_date='2020-12-31'):
    """Preprocess beneficiary demographic data.

    Parameters
    ----------
    df : pandas.DataFrame
        Beneficiary records. An empty frame is returned as-is.
    reference_date : str or datetime-like, default '2020-12-31'
        Date at which ``Age`` is evaluated. The default keeps the date that
        was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which:

        * missing ``DOD`` values are replaced by the sentinel 1900-01-01,
        * ``DOB``, ``DOD``, ``ClaimStartDate`` and ``ClaimEndDate`` (those
          that exist) are parsed to datetimes, unparseable values -> NaT,
        * ``Age`` holds completed years between ``DOB`` and
          ``reference_date`` (NaN where ``DOB`` is missing),
        * each present ``ChronicCond_*`` column gets a ``<name>_Flag``
          integer copy with missing values set to 0.
    """
    if df.empty:
        return df

    # fillna returns a new frame, so every assignment below touches the copy
    # and never the caller's data.
    df = df.fillna({'DOD': '1900-01-01'})  # sentinel for "no death date"

    # Convert dates
    for col in ('DOB', 'DOD', 'ClaimStartDate', 'ClaimEndDate'):
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')

    # Calculate age in completed calendar years. Dividing a day count by 365
    # ignores leap days and over-counts by one year close to a birthday.
    if 'DOB' in df.columns:
        ref = pd.to_datetime(reference_date)
        dob = df['DOB']
        birthday_passed = (dob.dt.month < ref.month) | (
            (dob.dt.month == ref.month) & (dob.dt.day <= ref.day)
        )
        # For NaT birth dates the year is NaN, so the result stays NaN.
        df['Age'] = ref.year - dob.dt.year - (~birthday_passed).astype(int)

    # Create integer copies of the chronic-condition columns.
    # NOTE(review): the raw code is copied unchanged (missing -> 0); if the
    # source data is not coded 0/1 these are not boolean indicators - confirm.
    chronic_conditions = ['ChronicCond_Heartfailure', 'ChronicCond_Alzheimer',
                          'ChronicCond_Cancer', 'ChronicCond_KidneyDisease',
                          'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
                          'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
                          'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
                          'ChronicCond_stroke']
    for condition in chronic_conditions:
        if condition in df.columns:
            df[f'{condition}_Flag'] = df[condition].fillna(0).astype(int)

    return df

# %%
def preprocess_claims_data(df, claim_type):
    """Preprocess inpatient/outpatient claims data.

    Parameters
    ----------
    df : pandas.DataFrame
        Claim records. An empty frame is returned as-is.
    claim_type : str
        Label of the claim table being processed. Currently not used by the
        function; kept so existing call sites keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which:

        * the claim/admission/discharge date columns that exist are parsed
          to datetimes, unparseable values -> NaT,
        * ``ClaimDuration`` holds ``ClaimEndDate - ClaimStartDate`` in days
          when both columns exist,
        * the known amount columns that exist are numeric with missing or
          unparseable values set to 0.
    """
    if df.empty:
        return df

    # Work on a copy so re-running a cell never sees half-converted columns
    # and the caller's raw frame stays available, matching
    # preprocess_beneficiary_data.
    df = df.copy()

    # Convert dates
    for col in ('ClaimStartDate', 'ClaimEndDate', 'AdmissionDate', 'DischargeDate'):
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')

    # Calculate claim duration
    if 'ClaimStartDate' in df.columns and 'ClaimEndDate' in df.columns:
        df['ClaimDuration'] = (df['ClaimEndDate'] - df['ClaimStartDate']).dt.days

    # Coerce amount-related columns to numbers
    amount_cols = ['DeductibleAmtPaid', 'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
                   'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt']
    for col in amount_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    return df


# ## 1.3 Feature Engineering

# %%
def _provider_claim_stats(claims_df, provider_index):
    """Summarise one claims table per provider, aligned to ``provider_index``.

    Returns four arrays in ``provider_index`` order: claim count, mean
    reimbursed amount, total reimbursed amount, and number of distinct
    attending physicians. Providers with no claims get 0 everywhere, and a
    missing amount or physician column yields 0 for the statistics built on it.
    """
    zeros = pd.Series(0, index=provider_index).to_numpy()
    if claims_df.empty:
        return zeros, zeros, zeros, zeros

    # One pass over the table instead of one boolean scan per provider.
    grouped = claims_df.groupby('Provider')

    def aligned(per_provider):
        # fill_value only applies to providers absent from the claims table;
        # a NaN computed for a provider that does have claims is preserved.
        return per_provider.reindex(provider_index, fill_value=0).to_numpy()

    n_claims = aligned(grouped.size())

    if 'InscClaimAmtReimbursed' in claims_df.columns:
        reimbursed = grouped['InscClaimAmtReimbursed']
        avg_amount = aligned(reimbursed.mean())
        total_amount = aligned(reimbursed.sum())
    else:
        avg_amount, total_amount = zeros, zeros

    if 'AttendingPhysician' in claims_df.columns:
        n_physicians = aligned(grouped['AttendingPhysician'].nunique())
    else:
        n_physicians = zeros

    return n_claims, avg_amount, total_amount, n_physicians


def create_provider_level_features(beneficiary_df, inpatient_df, outpatient_df, labels_df):
    """Aggregate claim-level data to provider-level features.

    Parameters
    ----------
    beneficiary_df : pandas.DataFrame
        Currently not used; kept so existing call sites keep working.
    inpatient_df, outpatient_df : pandas.DataFrame
        Claim tables with a ``Provider`` column. ``InscClaimAmtReimbursed``
        and ``AttendingPhysician`` are used when present. Either table may
        be empty.
    labels_df : pandas.DataFrame
        Supplies the provider list through its ``Provider`` column. When it
        is empty, two placeholder provider IDs are used for demonstration.

    Returns
    -------
    pandas.DataFrame
        One row per provider, in first-appearance order, with claim counts,
        average/total reimbursed amounts per claim type, the combined total,
        the inpatient share of claims (0 when there are no claims) and the
        number of distinct attending physicians per claim type.
    """
    if not labels_df.empty:
        providers = labels_df['Provider'].unique()
    else:
        # Sample providers for demonstration
        providers = ['PRV00001', 'PRV00002']
    provider_index = pd.Index(providers)

    ip_claims, ip_avg, ip_total, ip_physicians = _provider_claim_stats(inpatient_df, provider_index)
    op_claims, op_avg, op_total, op_physicians = _provider_claim_stats(outpatient_df, provider_index)

    total_claims = pd.Series(ip_claims + op_claims)
    # Mask zero totals before dividing so providers without claims get 0.
    inpatient_ratio = (pd.Series(ip_claims) / total_claims.where(total_claims > 0)).fillna(0)

    # Column order matches the dictionaries the per-provider loop used to build.
    return pd.DataFrame({
        'Provider': provider_index.to_numpy(),
        'Total_Inpatient_Claims': ip_claims,
        'Total_Outpatient_Claims': op_claims,
        'Total_Claims': total_claims.to_numpy(),
        'Avg_Inpatient_Claim_Amount': ip_avg,
        'Total_Inpatient_Amount': ip_total,
        'Avg_Outpatient_Claim_Amount': op_avg,
        'Total_Outpatient_Amount': op_total,
        'Total_Amount_Reimbursed': ip_total + op_total,
        'Inpatient_Claim_Ratio': inpatient_ratio.to_numpy(),
        'Unique_AttendingPhysicians_Inpatient': ip_physicians,
        'Unique_AttendingPhysicians_Outpatient': op_physicians,
    })

# %%
# Create feature dataset
features_df = create_provider_level_features(
    beneficiary_df, inpatient_df, outpatient_df, labels_df
)

# Attach the fraud labels when both sides have rows; otherwise keep the
# features unchanged.
if labels_df.empty or features_df.empty:
    final_df = features_df
else:
    final_df = features_df.merge(labels_df, on='Provider', how='left')

print("Final feature dataset shape:", final_df.shape)
print("Final features:", list(final_df.columns))


# ## 1.4 Exploratory Data Analysis

# %%
def perform_eda(df):
    """Plot and print a quick exploratory overview of ``df``.

    Does nothing for an empty frame. Otherwise:

    * draws histograms of up to the first six numeric columns on a 2x3 grid,
    * draws an annotated correlation heatmap when there are at least two
      numeric columns,
    * when a ``PotentialFraud`` column exists, prints the per-column means of
      rows labelled 'Yes' and 'No' and their difference.

    Returns None.
    """
    if df.empty:
        return

    numeric_columns = df.select_dtypes(include=[np.number]).columns
    n_numeric = len(numeric_columns)

    # Histograms: one panel per numeric column, capped at the six grid slots.
    if n_numeric > 0:
        _, grid = plt.subplots(2, 3, figsize=(15, 10))
        for panel, column in zip(grid.ravel(), numeric_columns[:6]):
            df[column].hist(bins=30, ax=panel)
            panel.set_title(f'Distribution of {column}')
        plt.tight_layout()
        plt.show()

    # Pairwise correlations only make sense with two or more numeric columns.
    if n_numeric > 1:
        plt.figure(figsize=(12, 8))
        sns.heatmap(df[numeric_columns].corr(), annot=True, cmap='coolwarm', center=0)
        plt.title('Feature Correlation Heatmap')
        plt.show()

    if 'PotentialFraud' not in df.columns:
        return

    # Split by label and compare the numeric column means side by side.
    flagged_rows = df[df['PotentialFraud'] == 'Yes']
    unflagged_rows = df[df['PotentialFraud'] == 'No']
    mean_table = pd.DataFrame({
        'Fraud_Mean': flagged_rows[numeric_columns].mean(),
        'Non_Fraud_Mean': unflagged_rows[numeric_columns].mean()
    })
    mean_table['Difference'] = mean_table['Fraud_Mean'] - mean_table['Non_Fraud_Mean']
    print("\nFeature Comparison (Fraud vs Non-Fraud):")
    print(mean_table)

perform_eda(final_df)

Please download the dataset from Kaggle and update the file paths


NameError: name 'pd' is not defined