In [1]:
# 1. Import necessary libraries
import dask.dataframe as dd

In [2]:
# 2. Define the function to load merged data
def load_merged_data():
    """
    Load all merged data from S3.
    """
    merged_dtypes = {
    'ClaimID': 'object',
    'ClaimStartDt': 'object',
    'ClaimEndDt': 'object',
    'Provider' : 'object',
    'InscClaimAmtReimbursed' : 'float64',
    'AttendingPhysician' :'object',
    'OperatingPhysician' :'object',
    'OtherPhysician' :'object',
    'AdmissionDt'  :'object',
    'ClmAdmitDiagnosisCode' :'object',
    'DeductibleAmtPaid' :'float64',
    'IPAnnualReimbursementAmt': 'float64',
    'OPAnnualReimbursementAmt': 'float64',
    'DischargeDt' :'object',
    'ClmAdmitDiagnosisCode': 'object',
    'ClmDiagnosisCode_1': 'object',
    'ClmDiagnosisCode_2': 'object',
    'ClmDiagnosisCode_3': 'object',
    'ClmDiagnosisCode_4': 'object',
    'ClmDiagnosisCode_5': 'object',
    'ClmDiagnosisCode_6': 'object',
    'ClmDiagnosisCode_7': 'object',
    'ClmDiagnosisCode_8': 'object',
    'ClmDiagnosisCode_9': 'object',
    'ClmDiagnosisCode_10': 'object', 
    'DiagnosisGroupCode': 'object',
    'IPAnnualDeductibleAmt': 'float64',
    'OPAnnualDeductibleAmt': 'float64',
    }
    #date_columns_in = ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']
    clean_path = "s3://medicare-fraud-data-25-05-2025/clean/"
    df_train = dd.read_csv(clean_path+"train_full/*.csv", dtype=merged_dtypes)
    df_test = dd.read_csv(clean_path+"test_full/*.csv", dtype=merged_dtypes)
    print("Data loaded successfully")
    
    return (df_train, df_test)

In [3]:
# 3. Call the function to load data
df_train, df_test = load_merged_data()


Data loaded successfully


In [None]:
# 4. Convert date columns to datetime format
#def convert_dates(df):
#    """
#    Convert date columns to datetime format.
#    """
#    date_columns_in = ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']
#    for col in date_columns_in:
#        df[col] = dd.to_datetime(df[col], errors='coerce')
#    return df
    

In [None]:
# 5. Uncomment the following lines if you want to revert dates back to strings
#def revert_dates(df):
#    """
#    Konvertiert Datetime-Spalten zurück in Strings im ISO-Format (YYYY-MM-DD).
#    """
#    date_columns_in = ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']
#    for col in date_columns_in:
#        # Prüfen, ob Spalte im DataFrame existiert und vom Datetime-Typ ist
#        if col in df.columns:
#            df[col] = df[col].asstype('object')  # Konvertiert Datetime zurück in String
#    return df


In [8]:
# dictionary to hold physician columns
physician_cols_test = [col for col in df_test.columns if "Physician" in col]
physician_cols_train = [col for col in df_train.columns if "Physician" in col]

Starting from here with the new Features V2.0

In [7]:
# 1. Replacing NANs in all Physician Columns by Zero
cols_to_fill = ['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']
df_test[cols_to_fill] = df_test[cols_to_fill].fillna(0)
df_train[cols_to_fill] = df_train[cols_to_fill].fillna(0)


In [9]:
# 2. Sum of the Beneficiary Age for every Provider
prv_bene_age_sum_test = df_test.groupby("Provider")["Bene_Age"].sum().reset_index()
prv_bene_age_sum_test = prv_bene_age_sum_test.rename(columns={"Bene_Age": "Bene_Age_Sum"})

In [10]:
prv_bene_age_sum_train = df_train.groupby("Provider")["Bene_Age"].sum().reset_index()
prv_bene_age_sum_train = prv_bene_age_sum_train.rename(columns={"Bene_Age": "Bene_Age_Sum"})

In [12]:
# 3. Number of Total Claims per Provider. The original Idea was to identify the Total Number of false Claims by a Provider. For that he subtract the number of fradulent claims from the number of total claims
prv_total_claims_test = df_test.groupby("Provider")["ClaimID"].count().reset_index()
prv_total_claims_test.columns = ["Provider", "TotalClaims"]

In [11]:
prv_total_claims_train = df_train.groupby("Provider")["ClaimID"].count().reset_index()
prv_total_claims_train.columns = ["Provider", "TotalClaims"]

In [14]:
# 4. Define a function to compute total claims per provider for the attending physician
def prv_total_claims_for_physicians(df):
    """
    Compute total claims per provider for each physician type, and return one merged Dask DataFrame.
    """

    # Count total claims per provider-physician type
    att = df.groupby(["Provider", "AttendingPhysician"])["ClaimID"].count().reset_index()
    att = att.rename(columns={"ClaimID": "AttendingPhysician_TotalClaims"})

    #op = df.groupby(["Provider", "OperatingPhysician"])["ClaimID"].count().reset_index()
    #op = op.rename(columns={"ClaimID": "OperatingPhysician_TotalClaims"})
#
    #ot = df.groupby(["Provider", "OtherPhysician"])["ClaimID"].count().reset_index()
    #ot = ot.rename(columns={"ClaimID": "OtherPhysician_TotalClaims"})
#
    ## Now reduce these to provider-level totals by summing claims per provider
    att_sum = att.groupby("Provider")["AttendingPhysician_TotalClaims"].sum().reset_index()
    #op_sum = op.groupby("Provider")["OperatingPhysician_TotalClaims"].sum().reset_index()
    #ot_sum = ot.groupby("Provider")["OtherPhysician_TotalClaims"].sum().reset_index()

    # Merge safely
    #merged = att_sum.merge(op_sum, on="Provider", how="outer")
    #merged = merged.merge(ot_sum, on="Provider", how="outer")

    return att_sum



In [15]:
prv_total_claims_for_physicians_test =prv_total_claims_for_physicians(df_test)

In [16]:
prv_total_claims_for_physicians_train =prv_total_claims_for_physicians(df_train)

In [17]:
# 7. Prv_Physician_Count
def prv_physician_count(df, physician_col):
    """
    Count unique physicians for each provider.
    If multiple columns are provided, all unique physician IDs across them are counted.
    Works with Dask DataFrames.
    """
    if isinstance(physician_col, list):
        # Combine provider with all physician columns, then reshape and deduplicate
        dfs = []
        for col in physician_col:
            temp = df[["Provider", col]].rename(columns={col: "Physician"}).dropna()
            dfs.append(temp)
        
        combined = dd.concat(dfs)
        unique_counts = (
            combined.dropna()
            .drop_duplicates()
            .groupby("Provider")["Physician"]
            .nunique()
            .reset_index()
        )
        unique_counts = unique_counts.rename(columns={"Physician": "Prv_Physician_Count"})

    else:
        unique_counts = (
            df.groupby("Provider")[physician_col]
            .nunique()
            .reset_index()
            .rename(columns={physician_col: f"{physician_col}_Count"})
        )

    return unique_counts


In [18]:
#prv_Attphysician_count = prv_physician_count(df_test, "AttendingPhysician")
#prv_OPphysician_count = prv_physician_count(df_test, "OperatingPhysician")
#prv_Otphysician_count = prv_physician_count(df_test, "OtherPhysician")
prv_Allphysician_count_test = prv_physician_count(df_test, ["AttendingPhysician", "OperatingPhysician", "OtherPhysician"])
prv_Allphysician_count_train = prv_physician_count(df_train, ["AttendingPhysician", "OperatingPhysician", "OtherPhysician"])


In [20]:
# 10. Provider_Insurance_Clam_Reimbursement_Amt
def prv_insc_claim_reimb_amt(df):
    """
    Calculate the total insurance reimbursement amount per provider.
    """
    return df.groupby("Provider")["InscClaimAmtReimbursed"].sum().reset_index().rename(
        columns={"InscClaimAmtReimbursed": "Provider_Insurance_Claim_Reimbursement_Amt"}
    )



In [21]:
prv_insc_claim_reimb_amt_test = prv_insc_claim_reimb_amt(df_test)
prv_insc_claim_reimb_amt_train = prv_insc_claim_reimb_amt(df_train)

In [22]:
# 11. Provider_Total_Bene
def prv_total_bene(df):
    """
    Calculate the total number of unique beneficiaries per provider.
    """
    return df.groupby("Provider")["BeneID"].nunique().reset_index().rename(
        columns={"BeneID": "Provider_Total_Patients"}
    )



In [25]:
provider_total_bene_test = prv_total_bene(df_test)
provider_total_bene_train = prv_total_bene(df_train)

In [24]:
# 12. Provider_Total_Chronic_Beneficiaries

def prv_total_chron_bene(df, chronic_cols):
    """
    Calculates the total number of beneficiaries per provider for each chronic condition.

    Parameters:
        df (Dask or Pandas DataFrame): Input beneficiary DataFrame
        chronic_cols (list of str): List of chronic condition columns (values should be 0 or 1)

    Returns:
        DataFrame with one row per provider and total counts of each chronic condition.
    """
    # Check if all columns exist
    missing = [col for col in chronic_cols if col not in df.columns]
    if missing:
        raise ValueError(f"The following columns are missing: {missing}")
    
    # Group and sum per provider
    agg_df = df.groupby("Provider")[chronic_cols].sum().reset_index()

    # Rename columns
    agg_df = agg_df.rename(columns={col: f"Provider_Total_{col}_Patients" for col in chronic_cols})

    return agg_df
chronic_cols = [
    "ChronicCond_Alzheimer",
    "ChronicCond_Heartfailure",
    "ChronicCond_KidneyDisease",
    "ChronicCond_Cancer",
    "ChronicCond_ObstrPulmonary",
    "ChronicCond_Depression",
    "ChronicCond_Diabetes",
    "ChronicCond_IschemicHeart",
    "ChronicCond_Osteoporasis",
    "ChronicCond_rheumatoidarthritis",
    "ChronicCond_stroke"
]



In [26]:
provider_total_chronic_bene_test = prv_total_chron_bene(df_test, chronic_cols)
provider_total_chronic_bene_train = prv_total_chron_bene(df_train, chronic_cols)

In [27]:
# 14. count of diagnosis for every Provider
import dask.dataframe as dd

def prv_diagnosis_count(df, diagnosis_cols):
    """
    Count non-null occurrences of the ClmDiagnosisCode 1-3 per provider.
    
    Parameters:
        df (Dask DataFrame): Input DataFrame containing diagnosis codes
        diagnosis_cols (list of str): List of diagnosis code columns
    
    Returns:
        Dask DataFrame with counts of each diagnosis column per provider
    """
    # Start with the first column's counts
    result = df.groupby("Provider")[diagnosis_cols[0]].count().reset_index().rename(
        columns={diagnosis_cols[0]: f"{diagnosis_cols[0]}_Count"}
    )
    
    # Iterate through remaining diagnosis columns and join counts
    for col in diagnosis_cols[1:]:
        temp = df.groupby("Provider")[col].count().reset_index().rename(
            columns={col: f"{col}_Count"}
        )
        result = result.merge(temp, on="Provider", how="outer")

    return result
diagnosis_cols = [
    "ClmAdmitDiagnosisCode",
    "ClmDiagnosisCode_1",
    "ClmDiagnosisCode_2",
    "ClmDiagnosisCode_3"
]



In [28]:
prv_diagnosis_count_test = prv_diagnosis_count(df_test, diagnosis_cols)
prv_diagnosis_count_train = prv_diagnosis_count(df_train, diagnosis_cols)


In [29]:
# 18. Most frequent Claimcodes for every Provider
from functools import reduce
import dask.dataframe as dd

def prv_most_frequent_claim_codes(df, claim_code_cols):
    """
    Find the most frequent claim code for each provider across multiple columns.
    
    Parameters:
        df (Dask DataFrame): Input DataFrame containing claim codes
        claim_code_cols (list of str): List of claim code column names
    
    Returns:
        Dask DataFrame: Each row contains Provider and the most frequent code per claim column
    """
    results = []

    for col in claim_code_cols:
        # Count frequencies per Provider per code
        code_counts = (
            df.groupby(["Provider", col])
            .size()
            .reset_index()
            .rename(columns={0: "Count"})
        )

        # Sort within each partition, then drop duplicates to get most frequent
        most_frequent = (
            code_counts.map_partitions(lambda pdf: pdf.sort_values("Count", ascending=False))
            .drop_duplicates(subset="Provider")
            .rename(columns={col: f"{col}_Most_Frequent"})
            .drop(columns=["Count"])
        )

        results.append(most_frequent)

    # Merge all the most frequent codes per column
    final_result = reduce(lambda left, right: left.merge(right, on="Provider", how="outer"), results)

    return final_result

claim_code_cols = [
    "ClmAdmitDiagnosisCode",
    "ClmDiagnosisCode_1",
    "ClmDiagnosisCode_2",
    "ClmDiagnosisCode_3",
    
]


In [30]:
prv_most_frequent_claim_codes_test = prv_most_frequent_claim_codes(df_test, claim_code_cols)
prv_most_frequent_claim_codes_train = prv_most_frequent_claim_codes(df_train, claim_code_cols)

In [31]:
from functools import reduce

def prv_most_frequent_physicians(df, physician_cols):
    """
    Find the most frequent physician for each provider across multiple physician columns.
    
    Parameters:
        df (Dask DataFrame): Input DataFrame containing provider and physician columns
        physician_cols (list of str): List of physician column names
    
    Returns:
        Dask DataFrame: Each row contains Provider and the most frequent physician per column
    """
    results = []

    for col in physician_cols:
        # Count frequencies per Provider per Physician
        physician_counts = (
            df.groupby(["Provider", col])
            .size()
            .reset_index()
            .rename(columns={0: "Count"})
        )

        # Sort by frequency, then get most frequent physician per provider
        most_frequent = (
            physician_counts.map_partitions(lambda pdf: pdf.sort_values("Count", ascending=False))
            .drop_duplicates(subset="Provider")
            .rename(columns={col: f"{col}_Most_Frequent"})
            .drop(columns=["Count"])
        )

        results.append(most_frequent)

    # Merge all the most frequent physician columns on Provider
    final_df = reduce(lambda left, right: left.merge(right, on="Provider", how="outer"), results)

    return final_df
physician_cols = [
    "AttendingPhysician",
    "OperatingPhysician",
    "OtherPhysician"
]


In [32]:
prv_most_frequent_physicians_test = prv_most_frequent_physicians(df_test, physician_cols)
prv_most_frequent_physicians_train = prv_most_frequent_physicians(df_train, physician_cols)

In [37]:
# 16. bene deductible and claimcost amount
def bene_calculate_amount(df):
    """
    Return a Dask DataFrame with BeneID, AllocatedAmount (as-is), and summed Deductible & Reimbursed amounts.

    Parameters:
        df (Dask DataFrame): Input with reimbursement and deductible fields

    Returns:
        Dask DataFrame with columns: BeneID, AllocatedAmount, DeductibleAmtPaid (sum), InscClaimAmtReimbursed (sum)
    """
    

    # Calculate AllocatedAmount (not to be summed)
    df["AllocatedAmount"] = df["IPAnnualReimbursementAmt"] + df["OPAnnualReimbursementAmt"]

    # Get first AllocatedAmount per BeneID (assuming same for all rows of that BeneID)
    allocated = df[["BeneID", "AllocatedAmount"]].drop_duplicates(subset="BeneID")

    # Sum the other columns per BeneID
    summed = df.groupby("BeneID")[["DeductibleAmtPaid", "InscClaimAmtReimbursed"]].sum().reset_index()
   

    # Merge
    result = allocated.merge(summed, on="BeneID", how="left")

    return result



In [38]:
bene_calculate_amount_test = bene_calculate_amount(df_test)
bene_calculate_amount_train = bene_calculate_amount(df_train)

In [40]:
bene_amount_avg_prv_test= dd.merge(
    df_test[["BeneID", "Provider"]],
    bene_calculate_amount_test,
    on="BeneID",
    how="left"
).groupby("Provider")["AllocatedAmount", 'DeductibleAmtPaid', "InscClaimAmtReimbursed"].mean().reset_index()
bene_amount_avg_prv_test = bene_amount_avg_prv_test.rename(
    columns={"AllocatedAmount": "Avg_allocated_Amount_Per_Provider", 'DeductibleAmtPaid': "Avg_Deductible_Amt_Paid_Per_Provider", "InscClaimAmtReimbursed": "Avg_InscClaimAmtReimbursed_Per_Provider" }
)
bene_amount_avg_prv_test['perc_allocated_used'] = (bene_amount_avg_prv_test["Avg_InscClaimAmtReimbursed_Per_Provider"] - bene_amount_avg_prv_test["Avg_Deductible_Amt_Paid_Per_Provider"]) /(bene_amount_avg_prv_test["Avg_allocated_Amount_Per_Provider"] - bene_amount_avg_prv_test["Avg_Deductible_Amt_Paid_Per_Provider"])
bene_amount_avg_prv_test.head(5)

Unnamed: 0,Provider,Avg_allocated_Amount_Per_Provider,Avg_Deductible_Amt_Paid_Per_Provider,Avg_InscClaimAmtReimbursed_Per_Provider,perc_allocated_used
0,PRV51002,7526.0,72.409756,1322.585366,0.167728
1,PRV51006,6169.215686,75.156863,1617.254902,0.253049
2,PRV51009,5445.128205,121.384615,1767.435897,0.309191
3,PRV51010,7044.210526,449.684211,4747.105263,0.651665
4,PRV51018,7551.894737,53.968421,1332.0,0.170451


In [45]:
max_perc=bene_amount_avg_prv_test['perc_allocated_used'].max().compute()
max_perc

np.float64(1.0)

In [46]:
min_perc=bene_amount_avg_prv_test['perc_allocated_used'].min().compute()
min_perc

np.float64(0.0)

In [41]:
df_train_date = df_train.copy()
max_date = dd.to_datetime(df_train_date['ClaimStartDt'], errors='coerce').max().compute()


In [42]:
max_date

Timestamp('2009-12-31 00:00:00')

In [43]:
min_date = dd.to_datetime(df_train_date['ClaimStartDt'], errors='coerce').min().compute()

In [44]:
min_date

Timestamp('2008-11-27 00:00:00')

In [55]:
# creat a new column to indicate the quarter of the year for every claim
def add_quarter_column(df):
    """
    Add a 'Quarter' column based on ClaimStartDt.
    - 2009 quarters are numbered 1–4
    - 2008 quarters are numbered 101–104
    """
    # Convert to datetime safely
    df['ClaimStartDt'] = dd.to_datetime(df['ClaimStartDt'], errors='coerce')

    # Extract year and quarter
    df['Year'] = df['ClaimStartDt'].dt.year
    df['Quarter'] = df['ClaimStartDt'].dt.quarter

    # Apply conditional logic: use different quarter labels for 2008
    df['Quarter'] = df.apply(
        lambda row: row.Quarter if row.Year == 2009 else (0 if row.Year == 2008 else None),
        axis=1,
        meta=('Quarter', 'float64')  # must specify meta for Dask apply
    )

    # Drop temporary year column if not needed
    df = df.drop('Year', axis=1)

    return df


In [56]:
add_quarter_column(df_train)
add_quarter_column(df_test)

Unnamed: 0_level_0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DischargeDt,DiagnosisGroupCode,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,ClmProcedureCode_6,ClaimDuration,HospitalDuration,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,Bene_Age,Bene_Alive,AllocatedAmount,Quarter
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1
,string,string,datetime64[ns],string,string,float64,string,string,string,string,string,float64,string,string,string,string,string,string,string,string,string,string,string,string,float64,float64,float64,float64,float64,float64,int64,float64,string,string,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,float64,float64,float64,float64,int64,int64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [57]:
max=df_train['Quarter'].min().compute() 
max

np.int64(0)

Merging Stage


Saving the Data Frame

After Merging calcution

In [None]:
# 16. Real Average Claims per Provider we have to run this after merging the dataframes
#df_train['real_avg_claims_per_provider'] = df_train['TotalClaims'] - df_train['TotalClaims'].mean()

In [None]:
# 16. Average of Claimcost for every Provider
df_train['real_avg_claim_cost_per_provider'] = df_train['Provider_Insurance_Claim_Reimbursement_Amt'] - df_train['Provider_Insurance_Claim_Reimbursement_Amt'].mean()  

In [None]:
# 17. Median of the Claimscost for every Provider
df_train['real_median_claim_cost_per_provider'] = df_train['Provider_Insurance_Claim_Reimbursement_Amt'] - df_train['Provider_Insurance_Claim_Reimbursement_Amt'].median()