In [None]:
# Import Librarys
import dask.dataframe as dd
import pandas as pd

In [3]:
# Function for loading merged and cleaned data
def load_merged_data():
    """
    Load all merged data from S3.
    """
    merged_dtypes = {
    'ClaimID': 'object',
    'ClaimStartDt': 'object',
    'ClaimEndDt': 'object',
    'Provider' : 'object',
    'InscClaimAmtReimbursed' : 'float64',
    'AttendingPhysician' :'object',
    'OperatingPhysician' :'object',
    'OtherPhysician' :'object',
    'AdmissionDt'  :'object',
    'ClmAdmitDiagnosisCode' :'object',
    'DeductibleAmtPaid' :'float64',
    'DischargeDt' :'object',
    'ClmAdmitDiagnosisCode': 'object',
    'ClmDiagnosisCode_1': 'object',
    'ClmDiagnosisCode_2': 'object',
    'ClmDiagnosisCode_3': 'object',
    'ClmDiagnosisCode_4': 'object',
    'ClmDiagnosisCode_5': 'object',
    'ClmDiagnosisCode_6': 'object',
    'ClmDiagnosisCode_7': 'object',
    'ClmDiagnosisCode_8': 'object',
    'ClmDiagnosisCode_9': 'object',
    'ClmDiagnosisCode_10': 'object',
    'ClmProcedureCode_1' : 'object',
    'ClmProcedureCode_2' : 'object',
    'ClmProcedureCode_3' : 'object',
    'ClmProcedureCode_4' : 'object',
    'ClmProcedureCode_5' : 'object',
    'ClmProcedureCode_6' : 'object',
    'DiagnosisGroupCode': 'object'
    }
    date_columns_in = ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']
    clean_path = "s3://medicare-fraud-data-25-05-2025/clean/"
    df_train = dd.read_csv(clean_path+"train_full/*.csv",parse_dates=date_columns_in, dtype=merged_dtypes)
    df_test = dd.read_csv(clean_path+"test_full/*.csv", parse_dates=date_columns_in, dtype=merged_dtypes)
    print("Data loaded successfully")
    
    return (df_train, df_test)

In [4]:
# Function for converting Dates
def convert_dates(df):
    """
    Convert date columns to datetime format.
    """
    date_columns_in = ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']
    for col in date_columns_in:
        df[col] = dd.to_datetime(df[col], errors='coerce')
    return df

In [None]:
def revert_dates(df):
    """
    Konvertiert Datetime-Spalten zurück in Strings im ISO-Format (YYYY-MM-DD).
    """
    date_columns_in = ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']
    for col in date_columns_in:
        # Prüfen, ob Spalte im DataFrame existiert und vom Datetime-Typ ist
        if col in df.columns:
            df[col] = df[col].dt.strftime('%Y-%m-%d')
    return df


In [5]:
# Loading DataSets
df_train, df_test = load_merged_data()

Data loaded successfully


In [6]:
df_train.dtypes

BeneID                             string[pyarrow]
ClaimID                            string[pyarrow]
ClaimStartDt                       string[pyarrow]
ClaimEndDt                         string[pyarrow]
Provider                           string[pyarrow]
InscClaimAmtReimbursed                     float64
AttendingPhysician                 string[pyarrow]
OperatingPhysician                 string[pyarrow]
OtherPhysician                     string[pyarrow]
AdmissionDt                        string[pyarrow]
ClmAdmitDiagnosisCode              string[pyarrow]
DeductibleAmtPaid                          float64
DischargeDt                        string[pyarrow]
DiagnosisGroupCode                 string[pyarrow]
ClmDiagnosisCode_1                 string[pyarrow]
ClmDiagnosisCode_2                 string[pyarrow]
ClmDiagnosisCode_3                 string[pyarrow]
ClmDiagnosisCode_4                 string[pyarrow]
ClmDiagnosisCode_5                 string[pyarrow]
ClmDiagnosisCode_6             

In [7]:
# Convert Dates
df_train=convert_dates(df_train)
df_test=convert_dates(df_test)

In [8]:
dx_codes = dd.read_csv(
    "s3://icd9cm-bucket-2025/V27LONG_SHORT_DX_110909.csv",
    encoding='latin1',
    dtype={'DIAGNOSIS CODE': 'object'}
)

proc_codes = dd.read_csv(
    "s3://icd9cm-bucket-2025/CMS27_DESC_LONG_SHORT_SG_092709.csv",
    dtype={'PROCEDURE CODE': 'object'}
)


In [9]:
dx_codes.head(10)

Unnamed: 0,DIAGNOSIS CODE,LONG DESCRIPTION,SHORT DESCRIPTION
0,10,Cholera due to vibrio cholerae,Cholera d/t vib cholerae
1,11,Cholera due to vibrio cholerae el tor,Cholera d/t vib el tor
2,19,"Cholera, unspecified",Cholera NOS
3,20,Typhoid fever,Typhoid fever
4,21,Paratyphoid fever A,Paratyphoid fever a
5,22,Paratyphoid fever B,Paratyphoid fever b
6,23,Paratyphoid fever C,Paratyphoid fever c
7,29,"Paratyphoid fever, unspecified",Paratyphoid fever NOS
8,30,Salmonella gastroenteritis,Salmonella enteritis
9,31,Salmonella septicemia,Salmonella septicemia


In [10]:
def reshape_icd_columns(df, cols, value_name):
    # Die Spalten "aufschmelzen", um sie leichter zu validieren
    df_long = dd.melt(df, value_vars=cols, value_name=value_name)
    df_long = df_long.drop('variable', axis=1)
    df_long[value_name] = df_long[value_name].map(normalize_code, meta=(value_name, 'object'))
    return df_long


In [11]:
df_train["ClmDiagnosisCode_1"].head(10)

0    79678
1    71656
2    64881
3    36401
4     2948
5    31400
6    42490
7    11590
8    72089
9     4271
Name: ClmDiagnosisCode_1, dtype: object

In [12]:
codes_sample = df_train["ClmDiagnosisCode_1"].head(10)
print(codes_sample)


: 

In [13]:
codes_sample = df_train["ClmDiagnosisCode_1"].head(10)  # Dask DataFrame, gibt Dask Series zurück
codes_sample_pd = codes_sample.compute()  # Erst jetzt compute auf kleiner Serie

: 

In [18]:
# 1. Erste 10 Codes aus df_train auswählen (Dask DataFrame -> zuerst compute)
codes_sample = df_train["ClmDiagnosisCode_1"].head(10).compute() if hasattr(df_train, "compute") else df_train["ClmDiagnosisCode_1"].head(10)

# 2. In dx_codes nach diesen Codes filtern
valid_codes = dx_codes[dx_codes['normalized_code'].isin(codes_sample)]

# 3. Ergebnis anzeigen
print(valid_codes)

: 

In [11]:
# Sicherstellen, dass die Spalte erzeugt wird
def normalize_code(code):
    if pd.isna(code):
        return ''
    return str(code).replace('.', '').strip()

dx_codes['normalized_code'] = dx_codes['DIAGNOSIS CODE'].map(normalize_code, meta=('normalized_code', 'object'))
proc_codes['normalized_code'] = proc_codes['PROCEDURE CODE'].map(normalize_code, meta=('normalized_code', 'object'))


In [12]:
# Spalten definieren
diag_cols = [f'ClmDiagnosisCode_{i}' for i in range(1, 11)]
proc_cols = [f'ClmProcedureCode_{i}' for i in range(1, 7)]

# Diagnose- & Prozedurcodes extrahieren
diagnoses_long = reshape_icd_columns(df_test, diag_cols, 'code')
procedures_long = reshape_icd_columns(df_test, proc_cols, 'code')

# Nur relevante Referenzcodes vorbereiten
dx_ref = dx_codes[['normalized_code']].rename(columns={'normalized_code': 'code'})
proc_ref = proc_codes[['normalized_code']].rename(columns={'normalized_code': 'code'})

# Validierung durch Join (left join + _merge)
valid_diagnoses = diagnoses_long.merge(dx_ref, on='code', how='left', indicator=True)
valid_procedures = procedures_long.merge(proc_ref, on='code', how='left', indicator=True)

# Ergebnis aggregieren
diag_summary = valid_diagnoses['_merge'].value_counts().compute()
proc_summary = valid_procedures['_merge'].value_counts().compute()

# Anzahl gültiger Codes aus den Referenzlisten
total_valid_dx_codes = dx_codes['normalized_code'].nunique().compute()
total_valid_proc_codes = proc_codes['normalized_code'].nunique().compute()

# Report mit Validierungsstatistiken
report = {
    'Valid Diagnoses': int(diag_summary.get('both', 0)),
    'Invalid Diagnoses': int(diag_summary.get('left_only', 0)),
    'Valid Procedures': int(proc_summary.get('both', 0)),
    'Invalid Procedures': int(proc_summary.get('left_only', 0)),
    'Total Unique Valid Diagnosis Codes (Reference)': int(total_valid_dx_codes),
    'Total Unique Valid Procedure Codes (Reference)': int(total_valid_proc_codes)
}

In [73]:
print("=== ICD Validation Report ===")
for k, v in report.items():
    print(f"{k}: {v:,}")


=== ICD Validation Report ===
Valid Diagnoses: 404,086
Invalid Diagnoses: 949,834
Valid Procedures: 40
Invalid Procedures: 812,312
Total Unique Valid Diagnosis Codes (Reference): 14,315
Total Unique Valid Procedure Codes (Reference): 3,838


In [13]:
print("=== ICD Validation Report ===")
for k, v in report.items():
    print(f"{k}: {v:,}")


=== ICD Validation Report ===
Valid Diagnoses: 404,086
Invalid Diagnoses: 949,834
Valid Procedures: 40
Invalid Procedures: 812,312
Total Unique Valid Diagnosis Codes (Reference): 14,315
Total Unique Valid Procedure Codes (Reference): 3,838


In [74]:
invalid_dx_codes = valid_diagnoses[valid_diagnoses['_merge'] == 'left_only']['code'].dropna().unique().compute()


In [75]:
valid_diagnoses[valid_diagnoses['_merge'] == 'left_only']['code'].value_counts().compute().head(10)


code
        948743
0080         3
0430        12
0431        10
0449        17
1362         1
2398        13
2740        64
2794         4
3488        22
Name: count, dtype: int64