In [1]:
def load_clean_data(base_uri="s3://medicare-fraud-data-25-05-2025/clean"):
    """
    Load all clean data from S3.

    Every dataset is read with ``dd.read_csv`` from
    ``<base_uri>/<dataset>/*.csv``.

    Args:
        base_uri (str): Prefix that contains one sub-folder of CSV files per
            dataset. A trailing slash is tolerated. Defaults to the project's
            clean-data location.

    Returns:
        tuple: ``(df_train_in, df_train_out, df_test_in, df_test_out,
        df_train_bene, df_test_bene, df_train_labels, df_test_labels)``.
    """
    # Explicit dtypes for the columns the inpatient and outpatient claim
    # files have in common. Each key is listed exactly once: a repeated key
    # in a dict literal silently overwrites the earlier entry.
    claim_dtypes = {
        'ClaimID': 'object',
        'ClaimStartDt': 'object',
        'ClaimEndDt': 'object',
        'Provider': 'object',
        'InscClaimAmtReimbursed': 'float64',
        'AttendingPhysician': 'object',
        'OperatingPhysician': 'object',
        'OtherPhysician': 'object',
        'ClmAdmitDiagnosisCode': 'object',
        # Deliberately float64 rather than int64.
        # NOTE(review): presumably because some partitions contain missing
        # values, which an integer dtype cannot hold - confirm.
        'DeductibleAmtPaid': 'float64',
        'DiagnosisGroupCode': 'object',
    }
    # ClmDiagnosisCode_1 ... ClmDiagnosisCode_10 are all read as strings.
    claim_dtypes.update(
        {f'ClmDiagnosisCode_{i}': 'object' for i in range(1, 11)}
    )

    # The outpatient schema is exactly the shared one; the inpatient schema
    # additionally pins the two admission/discharge date columns.
    outpatient_dtypes = dict(claim_dtypes)
    inpatient_dtypes = {
        **claim_dtypes,
        'AdmissionDt': 'object',
        'DischargeDt': 'object',
    }

    root = base_uri.rstrip("/")

    def read(dataset, dtypes=None):
        # Beneficiary and label files are read without a dtype override,
        # so the ``dtype`` keyword is only passed when a schema is given.
        path = f"{root}/{dataset}/*.csv"
        if dtypes is None:
            return dd.read_csv(path)
        return dd.read_csv(path, dtype=dtypes)

    df_train_in = read("train_inpatient", inpatient_dtypes)
    df_train_out = read("train_outpatient", outpatient_dtypes)
    df_test_in = read("test_inpatient", inpatient_dtypes)
    df_test_out = read("test_outpatient", outpatient_dtypes)
    df_train_bene = read("train_beneficiary")
    df_test_bene = read("test_beneficiary")
    df_train_labels = read("train_labels")
    df_test_labels = read("test_labels")

    return (df_train_in, df_train_out, df_test_in, df_test_out, df_train_bene, df_test_bene, df_train_labels, df_test_labels)
import dask.dataframe as dd
# Read every cleaned split once; the unpack order mirrors the tuple
# returned by load_clean_data().
(
    df_train_in,
    df_train_out,
    df_test_in,
    df_test_out,
    df_train_bene,
    df_test_bene,
    df_train_labels,
    df_test_labels,
) = load_clean_data()
def merge_all_data(df_train_in, df_train_out, df_train_bene, df_train_labels):
    """
    Merge inpatient, outpatient, beneficiary, and label data for one split.

    Despite the ``df_train_*`` parameter names, the function is
    split-agnostic: this notebook calls it with both the training and the
    test frames.

    Args:
        df_train_in: Inpatient claims.
        df_train_out: Outpatient claims.
        df_train_bene: Beneficiary table; must contain ``BeneID``.
        df_train_labels: Label table; must contain ``Provider``.

    Returns:
        Dask DataFrame: Merged DataFrame containing all relevant data.
    """
    # Step 1: Identify common columns between inpatient and outpatient,
    # keeping the inpatient column order.
    outpatient_cols = set(df_train_out.columns)
    common_cols = [col for col in df_train_in.columns if col in outpatient_cols]
    print(f"✅ Common columns: {len(common_cols)}")

    # Step 2: Outer merge inpatient + outpatient on shared columns
    df_train_in_out = dd.merge(
        df_train_in,
        df_train_out,
        how='outer',
        on=common_cols
    )
    # Printing `.shape` of a Dask frame only shows the repr of an unevaluated
    # scalar for the row count. Report the column count instead: it is known
    # from metadata and does not trigger a computation.
    print(
        f"✅ Merged inpatient + outpatient: "
        f"{len(df_train_in_out.columns)} columns (row count not computed)"
    )

    # Step 3: Inner merge with beneficiary data on 'BeneID'
    df_train_in_out_bene = dd.merge(
        df_train_in_out,
        df_train_bene,
        how='inner',
        on='BeneID'
    )
    print(
        f"✅ Merged with beneficiary info: "
        f"{len(df_train_in_out_bene.columns)} columns (row count not computed)"
    )

    # Step 4: Inner merge with labels
    df_train_full = dd.merge(
        df_train_in_out_bene,
        df_train_labels,
        how='inner',
        on='Provider'  # assuming this is the key for labels
    )

    return df_train_full
# Build the merged frame for the training split ...
df_train_full = merge_all_data(
    df_train_in,
    df_train_out,
    df_train_bene,
    df_train_labels,
)
# ... and run the same pipeline on the test split.
df_test_full = merge_all_data(
    df_test_in,
    df_test_out,
    df_test_bene,
    df_test_labels,
)

✅ Common columns: 28
✅ Merged inpatient + outpatient shape (approx):
(<dask_expr.expr.Scalar: expr=Merge(3590383).size() // 32, dtype=int64>, 32)
✅ Final merged shape with beneficiary info (approx):
(<dask_expr.expr.Scalar: expr=Merge(1478a82).size() // 56, dtype=int64>, 56)
✅ Common columns: 28
✅ Merged inpatient + outpatient shape (approx):
(<dask_expr.expr.Scalar: expr=Merge(af4d745).size() // 32, dtype=int64>, 32)
✅ Final merged shape with beneficiary info (approx):
(<dask_expr.expr.Scalar: expr=Merge(7e2afd3).size() // 56, dtype=int64>, 56)


In [10]:
# On a Dask DataFrame, .shape yields a lazy row count and a concrete column
# count; only the row count needs an explicit compute().
lazy_rows, cols = df_train_full.shape
rows = lazy_rows.compute()
print(f"Rows: {rows}, Columns: {cols}")


Rows: 558211, Columns: 57


In [12]:
# Number of distinct providers in the merged training data
provider_col = df_train_full["Provider"]
provider_col.nunique().compute()

np.int64(5410)

In [14]:
# Number of distinct claim IDs in the merged training data
claim_ids = df_train_full["ClaimID"]
claim_ids.nunique().compute()

np.int64(558211)