In [72]:
# 1. Import necessary libraries
import dask.dataframe as dd

In [73]:
# 2. Define the function to load merged data
def load_merged_data():
    """
    Load the cleaned train and test claim tables from S3 as Dask DataFrames.

    All CSV parts under ``clean/train_full/`` and ``clean/test_full/`` are read
    with an explicit dtype map. Columns not listed in the map keep whatever
    dtype Dask infers for them.

    Returns:
        tuple: ``(df_train, df_test)`` Dask DataFrames.
    """
    # Identifier, date and code columns are kept as raw strings ('object').
    # Note that the date columns are NOT parsed here.
    object_cols = [
        'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider',
        'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician',
        'AdmissionDt', 'DischargeDt',
        'ClmAdmitDiagnosisCode', 'DiagnosisGroupCode',
    ] + [f'ClmDiagnosisCode_{i}' for i in range(1, 11)]

    # Monetary columns are read as floats.
    float_cols = [
        'InscClaimAmtReimbursed', 'DeductibleAmtPaid',
        'IPAnnualReimbursementAmt', 'OPAnnualReimbursementAmt',
        'IPAnnualDeductibleAmt', 'OPAnnualDeductibleAmt',
    ]

    # Building the map from two lists avoids the repeated dictionary key the
    # hand-written literal contained ('ClmAdmitDiagnosisCode' appeared twice).
    merged_dtypes = {col: 'object' for col in object_cols}
    merged_dtypes.update({col: 'float64' for col in float_cols})

    clean_path = "s3://medicare-fraud-data-25-05-2025/clean/"
    df_train = dd.read_csv(clean_path + "train_full/*.csv", dtype=merged_dtypes)
    df_test = dd.read_csv(clean_path + "test_full/*.csv", dtype=merged_dtypes)
    print("Data loaded successfully")

    return (df_train, df_test)

In [74]:
# 3. Load the train and test claim tables from S3 as Dask DataFrames
df_train, df_test = load_merged_data()


Data loaded successfully


In [75]:
# Keep separate handles on the freshly loaded frames: later cells assign new
# columns into df_train / df_test, while the provider aggregation uses these copies.
df_train_copy = df_train.copy()
df_test_copy = df_test.copy()

In [76]:
df_train.columns

Index(['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider',
       'InscClaimAmtReimbursed', 'AttendingPhysician', 'OperatingPhysician',
       'OtherPhysician', 'AdmissionDt', 'ClmAdmitDiagnosisCode',
       'DeductibleAmtPaid', 'DischargeDt', 'DiagnosisGroupCode',
       'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
       'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
       'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
       'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
       'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5',
       'ClmProcedureCode_6', 'ClaimDuration', 'HospitalDuration', 'DOB', 'DOD',
       'Gender', 'Race', 'RenalDiseaseIndicator', 'State', 'County',
       'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
       'ChronicCond_Diabete

In [77]:
from dask.diagnostics import ProgressBar
pbar = ProgressBar(); pbar.register()

In [79]:
def _mode_per_provider(counts_pdf, value_col):
    """
    Reduce a pandas frame of (Provider, value, Count) rows to one row per provider.

    The row kept for each provider is the one with the highest ``Count``. The
    value column is renamed to ``<value_col>_mode`` and ``Count`` is dropped.
    """
    # mergesort is stable, so ties are resolved by input order rather than arbitrarily
    ranked = counts_pdf.sort_values("Count", ascending=False, kind="mergesort")
    top = ranked.drop_duplicates(subset="Provider")
    return top.drop(columns=["Count"]).rename(columns={value_col: f"{value_col}_mode"})


def aggregate_provider_dask(df):
    """
    Aggregate a claim-level Dask DataFrame to one row per provider.

    Produces, per provider:
      * sum / mean / std / max / min of the duration and deductible columns,
      * mean / max of the annual reimbursement and deductible columns,
      * sum / mean of the chronic-condition indicator columns,
      * the number of claims (``ClaimID_count``),
      * ``<col>_nunique`` for selected code, physician and gender columns,
      * ``<col>_mode`` (most frequent value) for Race, State and County.

    Parameters:
        df (Dask DataFrame): claim-level data containing ``Provider`` and the
            columns listed below.

    Returns:
        Dask DataFrame with a ``Provider`` column plus the aggregated features.
    """
    from functools import reduce

    numerical_cols = ['ClaimDuration', 'HospitalDuration', 'DeductibleAmtPaid']
    binary_cols = [
        'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
        'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
        'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
        'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke'
    ]
    annual_cols = [
        'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
        'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt'
    ]
    nunique_cols = [
        'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
        'ClmProcedureCode_1', 'ClmProcedureCode_2',
        'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician',
        'Gender'
    ]
    mode_cols = ['Race', 'State', 'County']

    agg_dict = {}
    for col in numerical_cols:
        agg_dict[col] = ['sum', 'mean', 'std', 'max', 'min']
    for col in annual_cols:
        agg_dict[col] = ['mean', 'max']
    for col in binary_cols:
        agg_dict[col] = ['sum', 'mean']
    agg_dict['ClaimID'] = 'count'

    agg_df = df.groupby('Provider').agg(agg_dict)
    # Flatten (column, statistic) pairs into "column_statistic" names
    agg_df.columns = ['_'.join(col) if isinstance(col, tuple) else col for col in agg_df.columns]
    agg_df = agg_df.reset_index()

    # Compute nunique per column individually and merge
    nunique_dfs = []
    for col in nunique_cols:
        nunique_col_df = df.groupby('Provider')[col].nunique().rename(f"{col}_nunique").reset_index()
        nunique_dfs.append(nunique_col_df)

    nunique_df = reduce(lambda left, right: left.merge(right, on='Provider'), nunique_dfs)

    # Mode per provider. Taking the mode inside each partition (the previous
    # map_partitions approach) yields one row per provider *per partition* and a
    # mode that is only local to that partition. Instead, count every
    # (Provider, value) pair globally and keep the most frequent value.
    mode_dfs = []
    for col in mode_cols:
        value_counts = (
            df.groupby(['Provider', col])
            .size()
            .reset_index()
            .rename(columns={0: 'Count'})
        )
        # The count table is already aggregated, so it is collapsed into a single
        # partition to let the per-provider pick see every candidate value.
        # `col` is passed as an argument (not captured) because evaluation is lazy.
        mode_dfs.append(
            value_counts.repartition(npartitions=1).map_partitions(_mode_per_provider, col)
        )

    merged = agg_df.merge(nunique_df, on='Provider')
    for mode_df in mode_dfs:
        # Left join so a provider without any non-null value keeps its row
        merged = merged.merge(mode_df, on='Provider', how='left')

    return merged


In [80]:
provider_train_df = aggregate_provider_dask(df_train_copy)


In [81]:
provider_test_df = aggregate_provider_dask(df_test_copy)

In [82]:
provider_train_df.head(5)

[########################################] | 100% Completed | 18.00 ss
[########################################] | 100% Completed | 18.09 s


Unnamed: 0,Provider,ClaimDuration_sum,ClaimDuration_mean,ClaimDuration_std,ClaimDuration_max,ClaimDuration_min,HospitalDuration_sum,HospitalDuration_mean,HospitalDuration_std,HospitalDuration_max,...,ClmDiagnosisCode_3_nunique,ClmProcedureCode_1_nunique,ClmProcedureCode_2_nunique,AttendingPhysician_nunique,OperatingPhysician_nunique,OtherPhysician_nunique,Gender_nunique,Race_mode,State_mode,County_mode
0,PRV57172,2968,1.686364,4.935564,35,0,516.0,5.863636,6.282949,35.0,...,366,36,7,104,101,75,2,1,49,430
1,PRV55158,988,1.415473,4.433069,20,0,151.0,5.206897,3.67792,14.0,...,193,16,3,11,11,5,2,1,33,420
2,PRV54966,2287,1.273385,4.421823,20,0,0.0,,,,...,309,1,0,10,9,6,2,1,33,420
3,PRV54890,193,2.506494,6.570715,26,0,35.0,8.75,11.528949,26.0,...,29,2,1,53,12,22,2,1,33,420
4,PRV52145,335,1.367347,4.708267,20,0,0.0,,,,...,74,0,0,50,20,21,2,1,10,150


In [83]:
# Attach the fraud label to the provider-level features.
# The distinct (Provider, PotentialFraud) pairs are materialised to pandas via .compute().
# NOTE(review): assumes each provider carries exactly one PotentialFraud value; a provider
# with more than one distinct label would get one output row per label in the left merge.
label_df = df_train[['Provider', 'PotentialFraud']].drop_duplicates().compute()
provider_train_df = provider_train_df.merge(label_df, on='Provider', how='left')


[########################################] | 100% Completed | 2.42 ss
[########################################] | 100% Completed | 2.51 s


In [84]:
# Lists (one per split) of every column whose name contains "Physician"
physician_cols_test = [col for col in df_test.columns if "Physician" in col]
physician_cols_train = [col for col in df_train.columns if "Physician" in col]

New features (V2.0) start here

In [85]:
# 1. Replace NaNs in the three physician columns with 0 (assigned back into df_test / df_train).
# NOTE(review): these columns are loaded as 'object', so the fill value 0 behaves like a
# physician ID of its own in later group-bys, unique counts and most-frequent lookups —
# confirm this is intended rather than leaving missing physicians out.
cols_to_fill = ['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']
df_test[cols_to_fill] = df_test[cols_to_fill].fillna(0)
df_train[cols_to_fill] = df_train[cols_to_fill].fillna(0)


In [86]:
# 2. Per-provider sum of beneficiary ages (test split)
prv_bene_age_sum_test = (
    df_test.groupby("Provider")["Bene_Age"]
    .sum()
    .reset_index()
    .rename(columns={"Bene_Age": "Bene_Age_Sum"})
)

In [87]:
# Per-provider sum of beneficiary ages (train split)
prv_bene_age_sum_train = (
    df_train.groupby("Provider")["Bene_Age"]
    .sum()
    .reset_index()
    .rename(columns={"Bene_Age": "Bene_Age_Sum"})
)

In [88]:
# 3. Total number of claims per provider (test split).
# The original idea was to derive the number of false claims per provider by
# subtracting the number of fraudulent claims from this total.
prv_total_claims_test = (
    df_test.groupby("Provider")["ClaimID"]
    .count()
    .reset_index()
    .rename(columns={"ClaimID": "TotalClaims"})
)

In [89]:
# Total number of claims per provider (train split)
prv_total_claims_train = (
    df_train.groupby("Provider")["ClaimID"]
    .count()
    .reset_index()
    .rename(columns={"ClaimID": "TotalClaims"})
)

In [90]:
# 4. Define a function to compute total claims per provider for the attending physician
def prv_total_claims_for_physicians(df):
    """
    Total number of claims per provider, counted through the attending physician.

    Claims are first counted for every (Provider, AttendingPhysician) pair and
    those counts are then summed per provider. Only the attending physician is
    used; operating and other physicians are not part of the result.

    Parameters:
        df: DataFrame with ``Provider``, ``AttendingPhysician`` and ``ClaimID``.

    Returns:
        DataFrame with columns ``Provider`` and ``AttendingPhysician_TotalClaims``.
    """
    count_col = "AttendingPhysician_TotalClaims"

    # Step 1: non-null ClaimIDs per provider / attending-physician pair
    per_pair = df.groupby(["Provider", "AttendingPhysician"])["ClaimID"].count()
    per_pair = per_pair.reset_index().rename(columns={"ClaimID": count_col})

    # Step 2: collapse the pairs to a single total per provider
    return per_pair.groupby("Provider")[count_col].sum().reset_index()



In [91]:
prv_total_claims_for_physicians_test =prv_total_claims_for_physicians(df_test)

In [92]:
prv_total_claims_for_physicians_train =prv_total_claims_for_physicians(df_train)

In [93]:
# 7. Prv_Physician_Count
def prv_physician_count(df, physician_col):
    """
    Count unique physicians for each provider.

    Parameters:
        df: DataFrame containing ``Provider`` and the physician column(s).
        physician_col (str | list | tuple): a single physician column name, or
            several names whose IDs are pooled before counting (each physician
            ID is counted once per provider, whichever column it appears in).

    Returns:
        DataFrame with ``Provider`` and either ``Prv_Physician_Count``
        (several columns) or ``<physician_col>_Count`` (single column).
        Missing physician values are not counted.
    """
    if isinstance(physician_col, (list, tuple)):
        # Stack every physician column into one long (Provider, Physician) frame.
        # Rows with a missing value are dropped per column before stacking.
        stacked = dd.concat([
            df[["Provider", col]].rename(columns={col: "Physician"}).dropna()
            for col in physician_col
        ])
        # nunique already ignores duplicates, so no extra dropna / drop_duplicates
        # pass over the stacked frame is required.
        unique_counts = (
            stacked.groupby("Provider")["Physician"]
            .nunique()
            .reset_index()
            .rename(columns={"Physician": "Prv_Physician_Count"})
        )
    else:
        unique_counts = (
            df.groupby("Provider")[physician_col]
            .nunique()
            .reset_index()
            .rename(columns={physician_col: f"{physician_col}_Count"})
        )

    return unique_counts


In [94]:
#prv_Attphysician_count = prv_physician_count(df_test, "AttendingPhysician")
#prv_OPphysician_count = prv_physician_count(df_test, "OperatingPhysician")
#prv_Otphysician_count = prv_physician_count(df_test, "OtherPhysician")
prv_Allphysician_count_test = prv_physician_count(df_test, ["AttendingPhysician", "OperatingPhysician", "OtherPhysician"])
prv_Allphysician_count_train = prv_physician_count(df_train, ["AttendingPhysician", "OperatingPhysician", "OtherPhysician"])


In [95]:
# 10. Provider_Insurance_Clam_Reimbursement_Amt
def prv_insc_claim_reimb_amt(df):
    """
    Sum ``InscClaimAmtReimbursed`` over all claims of each provider.

    Returns:
        DataFrame with ``Provider`` and
        ``Provider_Insurance_Claim_Reimbursement_Amt``.
    """
    reimbursed_total = df.groupby("Provider")["InscClaimAmtReimbursed"].sum()
    new_name = {"InscClaimAmtReimbursed": "Provider_Insurance_Claim_Reimbursement_Amt"}
    return reimbursed_total.reset_index().rename(columns=new_name)



In [96]:
prv_insc_claim_reimb_amt_test = prv_insc_claim_reimb_amt(df_test)
prv_insc_claim_reimb_amt_train = prv_insc_claim_reimb_amt(df_train)

In [97]:
# 11. Provider_Total_Bene
def prv_total_bene(df):
    """
    Count the distinct beneficiaries (``BeneID``) seen by each provider.

    Returns:
        DataFrame with ``Provider`` and ``Provider_Total_Patients``.
    """
    distinct_bene = df.groupby("Provider")["BeneID"].nunique()
    new_name = {"BeneID": "Provider_Total_Patients"}
    return distinct_bene.reset_index().rename(columns=new_name)



In [98]:
provider_total_bene_test = prv_total_bene(df_test)
provider_total_bene_train = prv_total_bene(df_train)

In [99]:
# 12. Provider_Total_Chronic_Beneficiaries

def prv_total_chron_bene(df, chronic_cols):
    """
    Sum each chronic-condition column per provider.

    The sum runs over every row of ``df`` belonging to a provider, so with one
    row per claim a beneficiary contributes once per claim.

    Parameters:
        df (Dask or Pandas DataFrame): must contain ``Provider`` and every
            column named in ``chronic_cols``.
        chronic_cols (list of str): chronic condition columns
            (values are expected to be 0 or 1).

    Returns:
        DataFrame with one row per provider and one
        ``Provider_Total_<col>_Patients`` column per chronic condition.

    Raises:
        ValueError: if any column in ``chronic_cols`` is absent from ``df``.
    """
    # Fail early with the full list of absent columns
    missing = []
    for name in chronic_cols:
        if name not in df.columns:
            missing.append(name)
    if missing:
        raise ValueError(f"The following columns are missing: {missing}")

    new_names = {}
    for name in chronic_cols:
        new_names[name] = f"Provider_Total_{name}_Patients"

    totals = df.groupby("Provider")[chronic_cols].sum()
    return totals.reset_index().rename(columns=new_names)
# Chronic-condition columns passed to prv_total_chron_bene for both splits
chronic_cols = [
    "ChronicCond_Alzheimer",
    "ChronicCond_Heartfailure",
    "ChronicCond_KidneyDisease",
    "ChronicCond_Cancer",
    "ChronicCond_ObstrPulmonary",
    "ChronicCond_Depression",
    "ChronicCond_Diabetes",
    "ChronicCond_IschemicHeart",
    "ChronicCond_Osteoporasis",
    "ChronicCond_rheumatoidarthritis",
    "ChronicCond_stroke"
]



In [100]:
provider_total_chronic_bene_test = prv_total_chron_bene(df_test, chronic_cols)
provider_total_chronic_bene_train = prv_total_chron_bene(df_train, chronic_cols)

In [101]:
# 14. count of diagnosis for every Provider
import dask.dataframe as dd

def prv_diagnosis_count(df, diagnosis_cols):
    """
    Count the non-null entries of each diagnosis column per provider.

    Parameters:
        df (Dask DataFrame): Input DataFrame containing ``Provider`` and the
            diagnosis code columns
        diagnosis_cols (list of str): Diagnosis code columns to count
            (at least one is required)

    Returns:
        Dask DataFrame with ``Provider`` and one ``<col>_Count`` column per
        diagnosis column, combined with outer joins on ``Provider``.
    """
    def _count_for(col):
        # Non-null occurrences of `col` for every provider
        per_provider = df.groupby("Provider")[col].count().reset_index()
        return per_provider.rename(columns={col: f"{col}_Count"})

    first_col, other_cols = diagnosis_cols[0], diagnosis_cols[1:]

    result = _count_for(first_col)
    for col in other_cols:
        result = result.merge(_count_for(col), on="Provider", how="outer")

    return result
# Diagnosis columns passed to prv_diagnosis_count for both splits
diagnosis_cols = [
    "ClmAdmitDiagnosisCode",
    "ClmDiagnosisCode_1",
    "ClmDiagnosisCode_2",
    "ClmDiagnosisCode_3"
]



In [102]:
prv_diagnosis_count_test = prv_diagnosis_count(df_test, diagnosis_cols)
prv_diagnosis_count_train = prv_diagnosis_count(df_train, diagnosis_cols)


In [103]:
# 18. Most frequent Claimcodes for every Provider
from functools import reduce
import dask.dataframe as dd

def _top_claim_code_per_provider(counts_pdf, code_col):
    """
    Keep, for each provider, the row of a (Provider, code, Count) pandas frame
    with the highest ``Count``; rename the code column to
    ``<code_col>_Most_Frequent`` and drop ``Count``.
    """
    # mergesort is stable, so equal counts are resolved by input order
    ranked = counts_pdf.sort_values("Count", ascending=False, kind="mergesort")
    top = ranked.drop_duplicates(subset="Provider")
    return top.rename(columns={code_col: f"{code_col}_Most_Frequent"}).drop(columns=["Count"])


def prv_most_frequent_claim_codes(df, claim_code_cols):
    """
    Find the most frequent claim code for each provider, separately per column.

    Parameters:
        df (Dask DataFrame): Input DataFrame containing ``Provider`` and the claim codes
        claim_code_cols (list of str): List of claim code column names (non-empty)

    Returns:
        Dask DataFrame: one row per provider with a ``<col>_Most_Frequent``
        column for every entry of ``claim_code_cols`` (outer-joined on Provider)

    Raises:
        ValueError: if ``claim_code_cols`` is empty.
    """
    if not claim_code_cols:
        raise ValueError("claim_code_cols must contain at least one column name")

    results = []

    for col in claim_code_cols:
        # Count frequencies per Provider per code
        code_counts = (
            df.groupby(["Provider", col])
            .size()
            .reset_index()
            .rename(columns={0: "Count"})
        )

        # Sorting inside each partition only finds the overall maximum when all
        # counts of a provider sit in the same partition. The count table is
        # already aggregated, so it is collapsed to one partition first.
        # `col` is passed as an argument (not captured) because evaluation is lazy.
        most_frequent = (
            code_counts.repartition(npartitions=1)
            .map_partitions(_top_claim_code_per_provider, col)
        )

        results.append(most_frequent)

    # Merge all the most frequent codes per column
    final_result = reduce(lambda left, right: left.merge(right, on="Provider", how="outer"), results)

    return final_result

# Claim code columns passed to prv_most_frequent_claim_codes for both splits
claim_code_cols = [
    "ClmAdmitDiagnosisCode",
    "ClmDiagnosisCode_1",
    "ClmDiagnosisCode_2",
    "ClmDiagnosisCode_3",
    
]


In [104]:
prv_most_frequent_claim_codes_test = prv_most_frequent_claim_codes(df_test, claim_code_cols)
prv_most_frequent_claim_codes_train = prv_most_frequent_claim_codes(df_train, claim_code_cols)

In [105]:
from functools import reduce

def _top_physician_per_provider(counts_pdf, physician_col):
    """
    Keep, for each provider, the row of a (Provider, physician, Count) pandas
    frame with the highest ``Count``; rename the physician column to
    ``<physician_col>_Most_Frequent`` and drop ``Count``.
    """
    # mergesort is stable, so equal counts are resolved by input order
    ranked = counts_pdf.sort_values("Count", ascending=False, kind="mergesort")
    top = ranked.drop_duplicates(subset="Provider")
    return top.rename(columns={physician_col: f"{physician_col}_Most_Frequent"}).drop(columns=["Count"])


def prv_most_frequent_physicians(df, physician_cols):
    """
    Find the most frequent physician for each provider, separately per physician column.

    Parameters:
        df (Dask DataFrame): Input DataFrame containing provider and physician columns
        physician_cols (list of str): List of physician column names (non-empty)

    Returns:
        Dask DataFrame: one row per provider with a ``<col>_Most_Frequent``
        column for every entry of ``physician_cols`` (outer-joined on Provider)

    Raises:
        ValueError: if ``physician_cols`` is empty.
    """
    if not physician_cols:
        raise ValueError("physician_cols must contain at least one column name")

    results = []

    for col in physician_cols:
        # Count frequencies per Provider per Physician
        physician_counts = (
            df.groupby(["Provider", col])
            .size()
            .reset_index()
            .rename(columns={0: "Count"})
        )

        # Sorting inside each partition only finds the overall maximum when all
        # counts of a provider sit in the same partition. The count table is
        # already aggregated, so it is collapsed to one partition first.
        # `col` is passed as an argument (not captured) because evaluation is lazy.
        most_frequent = (
            physician_counts.repartition(npartitions=1)
            .map_partitions(_top_physician_per_provider, col)
        )

        results.append(most_frequent)

    # Merge all the most frequent physician columns on Provider
    final_df = reduce(lambda left, right: left.merge(right, on="Provider", how="outer"), results)

    return final_df
# Physician columns passed to prv_most_frequent_physicians for both splits
physician_cols = [
    "AttendingPhysician",
    "OperatingPhysician",
    "OtherPhysician"
]


In [106]:
prv_most_frequent_physicians_test = prv_most_frequent_physicians(df_test, physician_cols)
prv_most_frequent_physicians_train = prv_most_frequent_physicians(df_train, physician_cols)

In [107]:
# 16. bene deductible and claimcost amount
def bene_calculate_amount(df):
    """
    Build a per-beneficiary table of allocated, deductible and reimbursed amounts.

    ``AllocatedAmount`` is the row-wise sum of ``IPAnnualReimbursementAmt`` and
    ``OPAnnualReimbursementAmt``; the value of the first row of each ``BeneID``
    is kept (it is assumed to be identical on every row of that beneficiary).
    ``DeductibleAmtPaid`` and ``InscClaimAmtReimbursed`` are summed per ``BeneID``.

    The input frame is not modified.

    Parameters:
        df (Dask DataFrame): Input with reimbursement and deductible fields

    Returns:
        Dask DataFrame with columns: BeneID, AllocatedAmount,
        bene_DeductibleAmtPaid (sum), bene_InscClaimAmtReimbursed (sum)
    """
    # Derive AllocatedAmount on a new frame instead of writing the column into
    # the caller's DataFrame.
    with_allocated = df.assign(
        AllocatedAmount=df["IPAnnualReimbursementAmt"] + df["OPAnnualReimbursementAmt"]
    )

    # Get first AllocatedAmount per BeneID (not summed: it is an annual figure)
    allocated = with_allocated[["BeneID", "AllocatedAmount"]].drop_duplicates(subset="BeneID")

    # Sum the claim-level amounts per BeneID
    summed = (
        df.groupby("BeneID")[["DeductibleAmtPaid", "InscClaimAmtReimbursed"]]
        .sum()
        .reset_index()
        .rename(columns={
            'DeductibleAmtPaid': 'bene_DeductibleAmtPaid',
            'InscClaimAmtReimbursed': 'bene_InscClaimAmtReimbursed',
        })
    )

    # Merge
    result = allocated.merge(summed, on="BeneID", how="left")

    return result



In [108]:
bene_calculate_amount_test = bene_calculate_amount(df_test)
bene_calculate_amount_train = bene_calculate_amount(df_train)

In [109]:
# Per-provider mean of the per-beneficiary amounts (test split). The mean runs over
# claim rows, so a beneficiary with several claims at a provider weighs accordingly.
# The groupby columns are selected with a list: indexing a groupby with a bare
# tuple of column names is rejected by pandas >= 2.0.
prv_bene_amount_avg_test = dd.merge(
    df_test[["BeneID", "Provider"]],
    bene_calculate_amount_test,
    on="BeneID",
    how="left"
).groupby("Provider")[["AllocatedAmount", "bene_DeductibleAmtPaid", "bene_InscClaimAmtReimbursed"]].mean().reset_index()
prv_bene_amount_avg_test = prv_bene_amount_avg_test.rename(
    columns={"AllocatedAmount": "Avg_allocated_Amount_Per_Provider", 'bene_DeductibleAmtPaid': "Avg_Deductible_Amt_Paid_Per_Provider", "bene_InscClaimAmtReimbursed": "Avg_InscClaimAmtReimbursed_Per_Provider" }
)
# Share of the allocated amount that was used, with the deductible removed from both sides
prv_bene_amount_avg_test['perc_allocated_used'] = (
    prv_bene_amount_avg_test["Avg_InscClaimAmtReimbursed_Per_Provider"]
    - prv_bene_amount_avg_test["Avg_Deductible_Amt_Paid_Per_Provider"]
) / (
    prv_bene_amount_avg_test["Avg_allocated_Amount_Per_Provider"]
    - prv_bene_amount_avg_test["Avg_Deductible_Amt_Paid_Per_Provider"]
)
prv_bene_amount_avg_test.head(5)


[########################################] | 100% Completed | 1.07 sms
[########################################] | 100% Completed | 1.16 s


Unnamed: 0,Provider,Avg_allocated_Amount_Per_Provider,Avg_Deductible_Amt_Paid_Per_Provider,Avg_InscClaimAmtReimbursed_Per_Provider,perc_allocated_used
0,PRV51002,7526.0,72.409756,1322.585366,0.167728
1,PRV51006,6169.215686,75.156863,1617.254902,0.253049
2,PRV51009,5445.128205,121.384615,1767.435897,0.309191
3,PRV51010,7044.210526,449.684211,4747.105263,0.651665
4,PRV51018,7551.894737,53.968421,1332.0,0.170451


In [110]:
# Per-provider mean of the per-beneficiary amounts (train split). The mean runs over
# claim rows, so a beneficiary with several claims at a provider weighs accordingly.
# The groupby columns are selected with a list: indexing a groupby with a bare
# tuple of column names is rejected by pandas >= 2.0.
prv_bene_amount_avg_train = dd.merge(
    df_train[["BeneID", "Provider"]],
    bene_calculate_amount_train,
    on="BeneID",
    how="left"
).groupby("Provider")[["AllocatedAmount", "bene_DeductibleAmtPaid", "bene_InscClaimAmtReimbursed"]].mean().reset_index()
prv_bene_amount_avg_train = prv_bene_amount_avg_train.rename(
    columns={"AllocatedAmount": "Avg_allocated_Amount_Per_Provider", 'bene_DeductibleAmtPaid': "Avg_Deductible_Amt_Paid_Per_Provider", "bene_InscClaimAmtReimbursed": "Avg_InscClaimAmtReimbursed_Per_Provider" }
)
# Share of the allocated amount that was used, with the deductible removed from both sides
prv_bene_amount_avg_train['perc_allocated_used'] = (
    prv_bene_amount_avg_train["Avg_InscClaimAmtReimbursed_Per_Provider"]
    - prv_bene_amount_avg_train["Avg_Deductible_Amt_Paid_Per_Provider"]
) / (
    prv_bene_amount_avg_train["Avg_allocated_Amount_Per_Provider"]
    - prv_bene_amount_avg_train["Avg_Deductible_Amt_Paid_Per_Provider"]
)
prv_bene_amount_avg_train.head(5)

[########################################] | 100% Completed | 3.58 ss
[########################################] | 100% Completed | 3.68 s


Unnamed: 0,Provider,Avg_allocated_Amount_Per_Provider,Avg_Deductible_Amt_Paid_Per_Provider,Avg_InscClaimAmtReimbursed_Per_Provider,perc_allocated_used
0,PRV51001,20221.2,957.44,19703.6,0.973131
1,PRV51003,10246.363636,846.69697,8799.166667,0.846037
2,PRV51004,6546.778523,339.463087,5233.825503,0.788483
3,PRV51005,5733.725322,351.278112,5063.519313,0.875483
4,PRV51007,4779.722222,456.805556,4495.833333,0.934329


In [111]:
# creat a new column to indicate the quarter of the year for every claim
def add_quarter_column(df):
    """
    Return a copy of ``df`` with ``ClaimStartDt`` parsed and a 'Quarter' column added.

    Quarter labels (float64):
    - claims starting in 2009 get their calendar quarter, 1–4
    - claims starting in 2008 get 0
    - any other year, or an unparseable / missing date, gets NaN

    The input frame is left untouched; ``ClaimStartDt`` in the returned frame
    is a datetime column (unparseable values become NaT).
    """
    # Convert to datetime safely (invalid strings -> NaT)
    claim_start = dd.to_datetime(df['ClaimStartDt'], errors='coerce')

    year = claim_start.dt.year
    quarter = claim_start.dt.quarter.astype('float64')

    # Vectorised replacement for the former row-wise apply:
    # keep the quarter for 2009, use 0 for everything else ...
    quarter = quarter.where(year == 2009, 0.0)
    # ... then blank out rows that are neither 2008 nor 2009 (including NaT)
    quarter = quarter.where(year.isin([2008, 2009]))

    # assign() builds a new frame, so the caller's DataFrame is not modified
    return df.assign(ClaimStartDt=claim_start, Quarter=quarter)


In [112]:
df_train_qu=add_quarter_column(df_train)
df_test_qu=add_quarter_column(df_test)

In [None]:

#mode_val = df_train_qu['Quarter'].value_counts().nlargest(1).compute()
#
#print("Mode:")
#print(mode_val)

[                                        ] | 0% Completed | 151.35 us

[########################################] | 100% Completed | 16.37 ss
[########################################] | 100% Completed | 16.47 s
Mode:
Quarter
2    144743
Name: count, dtype: int64


Refine the df 

Merging Stage


In [118]:
# Left-join every provider-level feature table onto the aggregated train frame,
# one table after the other, always on "Provider".
df_train_merged = provider_train_df
for feature_df in (
    prv_bene_age_sum_train,
    prv_total_claims_train,
    prv_total_claims_for_physicians_train,
    prv_Allphysician_count_train,
    prv_insc_claim_reimb_amt_train,
    provider_total_bene_train,
    provider_total_chronic_bene_train,
    prv_diagnosis_count_train,
    prv_most_frequent_claim_codes_train,
    prv_most_frequent_physicians_train,
    prv_bene_amount_avg_train,
):
    df_train_merged = dd.merge(df_train_merged, feature_df, on="Provider", how="left")

In [119]:
# Left-join every provider-level feature table onto the aggregated test frame,
# one table after the other, always on "Provider".
df_test_merged = provider_test_df
for feature_df in (
    prv_bene_age_sum_test,
    prv_total_claims_test,
    prv_total_claims_for_physicians_test,
    prv_Allphysician_count_test,
    prv_insc_claim_reimb_amt_test,
    provider_total_bene_test,
    provider_total_chronic_bene_test,
    prv_diagnosis_count_test,
    prv_most_frequent_claim_codes_test,
    prv_most_frequent_physicians_test,
    prv_bene_amount_avg_test,
):
    df_test_merged = dd.merge(df_test_merged, feature_df, on="Provider", how="left")

In [121]:
df_train_merged.head(5)

[########################################] | 100% Completed | 26.25 s
[########################################] | 100% Completed | 26.34 s


Unnamed: 0,Provider,ClaimDuration_sum,ClaimDuration_mean,ClaimDuration_std,ClaimDuration_max,ClaimDuration_min,HospitalDuration_sum,HospitalDuration_mean,HospitalDuration_std,HospitalDuration_max,...,ClmDiagnosisCode_1_Most_Frequent,ClmDiagnosisCode_2_Most_Frequent,ClmDiagnosisCode_3_Most_Frequent,AttendingPhysician_Most_Frequent,OperatingPhysician_Most_Frequent,OtherPhysician_Most_Frequent,Avg_allocated_Amount_Per_Provider,Avg_Deductible_Amt_Paid_Per_Provider,Avg_InscClaimAmtReimbursed_Per_Provider,perc_allocated_used
0,PRV52145,335,1.367347,4.708267,20,0,0.0,,,,...,4019,4019,V5869,PHY373236,0,0,5285.673469,373.779592,4569.55102,0.854206
1,PRV55104,72,1.384615,4.822959,20,0,0.0,,,,...,4019,4019,2724,PHY317949,0,0,8637.307692,577.230769,6325.769231,0.713211
2,PRV54894,1024,4.471616,6.659338,35,0,918.0,6.652174,7.147714,35.0,...,389,4019,4019,PHY424317,0,0,17460.960699,1191.240175,16329.039301,0.930428
3,PRV54927,162,1.588235,5.041124,20,0,0.0,,,,...,4011,4019,4019,PHY345039,0,0,9065.196078,442.431373,7331.470588,0.798936
4,PRV55215,5671,1.671382,4.93539,34,0,792.0,5.538462,5.823422,34.0,...,4011,4019,4019,PHY336787,0,0,7084.815797,429.007663,5706.926024,0.792979


In [122]:
df_train_merged.columns

Index(['Provider', 'ClaimDuration_sum', 'ClaimDuration_mean',
       'ClaimDuration_std', 'ClaimDuration_max', 'ClaimDuration_min',
       'HospitalDuration_sum', 'HospitalDuration_mean', 'HospitalDuration_std',
       'HospitalDuration_max', 'HospitalDuration_min', 'DeductibleAmtPaid_sum',
       'DeductibleAmtPaid_mean', 'DeductibleAmtPaid_std',
       'DeductibleAmtPaid_max', 'DeductibleAmtPaid_min',
       'IPAnnualReimbursementAmt_mean', 'IPAnnualReimbursementAmt_max',
       'IPAnnualDeductibleAmt_mean', 'IPAnnualDeductibleAmt_max',
       'OPAnnualReimbursementAmt_mean', 'OPAnnualReimbursementAmt_max',
       'OPAnnualDeductibleAmt_mean', 'OPAnnualDeductibleAmt_max',
       'ChronicCond_Alzheimer_sum', 'ChronicCond_Alzheimer_mean',
       'ChronicCond_Heartfailure_sum', 'ChronicCond_Heartfailure_mean',
       'ChronicCond_KidneyDisease_sum', 'ChronicCond_KidneyDisease_mean',
       'ChronicCond_Cancer_sum', 'ChronicCond_Cancer_mean',
       'ChronicCond_ObstrPulmonary_sum', 'Chr

Saving the Data Frame

In [123]:
def save_dask_to_s3(df, path, file_format="csv", single_file=False, index=False):
    """
    Write a Dask DataFrame to S3 as CSV or Parquet.

    Parameters:
        df (dask.DataFrame): The Dask DataFrame to save
        path (str): S3 target (e.g. s3://bucket/folder/); for multi-part CSV
            output "part-*.csv" is appended to it as-is
        file_format (str): 'csv' or 'parquet'
        single_file (bool): CSV only — compute the frame and write it with a
            single to_csv call (only for small data)
        index (bool): Whether to save the index

    Raises:
        ValueError: if ``file_format`` is neither 'csv' nor 'parquet'.
    """
    if file_format == "parquet":
        df.to_parquet(path, write_index=index)
        return

    if file_format != "csv":
        raise ValueError("Unsupported file_format: choose 'csv' or 'parquet'")

    if single_file:
        # Materialise first, then write everything in one go
        computed = df.compute()
        computed.to_csv(path, index=index)
        return

    part_pattern = path + "part-*.csv"
    df.to_csv(part_pattern, index=index)


In [124]:
# Destination prefix for the provider-level feature tables
merged_path = "s3://medicare-fraud-data-25-05-2025/merged_ready/"
# Save the merged DataFrames to S3 with the function defaults
# (CSV written as part-*.csv files under the given prefix, index not saved)
save_dask_to_s3(df_train_merged, merged_path + "train/")
save_dask_to_s3(df_test_merged, merged_path + "test/")

[########################################] | 100% Completed | 25.54 s
[########################################] | 100% Completed | 25.54 s
[########################################] | 100% Completed | 6.84 ss
[########################################] | 100% Completed | 6.94 s


Post-merge calculations

In [None]:
# 16. Real Average Claims per Provider we have to run this after merging the dataframes
#df_train['real_avg_claims_per_provider'] = df_train['TotalClaims'] - df_train['TotalClaims'].mean()

In [None]:
# 16. Average of Claimcost for every Provider
#df_train['real_avg_claim_cost_per_provider'] = df_train['Provider_Insurance_Claim_Reimbursement_Amt'] - df_train['Provider_Insurance_Claim_Reimbursement_Amt'].mean()  

In [None]:
# 17. Median of the Claimscost for every Provider
#df_train['real_median_claim_cost_per_provider'] = df_train['Provider_Insurance_Claim_Reimbursement_Amt'] - df_train['Provider_Insurance_Claim_Reimbursement_Amt'].median()