Import libraries

In [24]:
import pandas as pd 
from datetime import datetime

In [25]:
import pandas as pd
from datetime import datetime

# -------------------------
# Source CSV column layouts
# -------------------------
# The source files are read without a header row (load_csv passes header=None),
# so each list below supplies the column names positionally for one file.
patients_cols = [
    'patient_id',
    'first_name',
    'last_name',
    'date_of_birth',
    'gender',
    'mrn',
]
specialties_cols = [
    'specialty_id',
    'specialty_name',
    'specialty_code',
]
departments_cols = [
    'department_id',
    'department_name',
    'floor',
    'capacity',
]
providers_cols = [
    'provider_id',
    'first_name',
    'last_name',
    'credential',
    'specialty_id',
    'department_id',
]
encounters_cols = [
    'encounter_id',
    'patient_id',
    'provider_id',
    'encounter_type',
    'encounter_date',
    'discharge_date',
    'department_id',
]
diagnoses_cols = [
    'diagnosis_id',
    'icd10_code',
    'icd10_description',
]
enc_diag_cols = [
    'encounter_diagnosis_id',
    'encounter_id',
    'diagnosis_id',
    'diagnosis_sequence',
]
procedures_cols = [
    'procedure_id',
    'cpt_code',
    'cpt_description',
]
enc_proc_cols = [
    'encounter_procedure_id',
    'encounter_id',
    'procedure_id',
    'procedure_date',
]
billing_cols = [
    'billing_id',
    'encounter_id',
    'claim_amount',
    'allowed_amount',
    'claim_date',
    'claim_status',
]

In [26]:
# -------------------------
# 2️⃣ Helper Functions
# -------------------------
def load_csv(path, columns):
    """Read a header-less CSV file into a DataFrame.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        columns: Column names to assign, in order, to the file's fields.

    Returns:
        A DataFrame whose columns are named after ``columns``.
    """
    # header=None: the first line of the file is data, not column names.
    frame = pd.read_csv(path, names=columns, header=None)
    return frame

In [27]:
def create_dim_date(encounters_df):
    """Build the date dimension from every date that appears on an encounter.

    Side effect: the ``encounter_date`` and ``discharge_date`` columns of
    ``encounters_df`` are converted to datetimes *in place*, so the caller's
    frame is modified.

    Args:
        encounters_df: Frame with ``encounter_date`` and ``discharge_date``
            columns in any form ``pd.to_datetime`` accepts.

    Returns:
        A DataFrame sorted by ``calendar_date`` with the columns
        ``date_key`` (integer formatted as YYYYMMDD), ``calendar_date``,
        ``year``, ``month`` and ``quarter``; one row per distinct non-null
        date value.
    """
    date_columns = ('encounter_date', 'discharge_date')

    # Deliberately writes back to the caller's frame (see docstring).
    for column in date_columns:
        encounters_df[column] = pd.to_datetime(encounters_df[column])

    stacked = pd.concat([encounters_df[column] for column in date_columns])
    distinct_dates = pd.to_datetime(stacked.dropna().unique())

    calendar = (
        pd.DataFrame({'calendar_date': distinct_dates})
        .sort_values('calendar_date')
        .reset_index(drop=True)
    )

    parts = calendar['calendar_date'].dt
    calendar['date_key'] = parts.strftime('%Y%m%d').astype(int)
    calendar['year'] = parts.year
    calendar['month'] = parts.month
    calendar['quarter'] = parts.quarter

    return calendar[['date_key', 'calendar_date', 'year', 'month', 'quarter']]

def create_dim_patient(patients_df):
    """Build the patient dimension with a display name and an age band.

    Args:
        patients_df: Frame with ``patient_id``, ``first_name``, ``last_name``,
            ``date_of_birth``, ``gender`` and ``mrn`` columns. It is copied,
            not modified.

    Returns:
        A DataFrame sorted by ``patient_id`` with the columns ``patient_key``
        (1-based position after sorting), ``patient_id``, ``full_name``
        ("last first"), ``gender``, ``age_group`` and ``mrn``.

        ``age_group`` is a categorical holding one of '0-17', '18-30',
        '31-45', '46-60', '61-75', '76-100'. It is missing when the date of
        birth is missing or the age falls outside 0-100.
    """
    df = patients_df.copy()
    df['full_name'] = df['last_name'] + ' ' + df['first_name']

    # Age in completed years: subtract one when this year's birthday has not
    # happened yet, instead of using the bare difference between years.
    dob = pd.to_datetime(df['date_of_birth'])
    today = datetime.now()
    birthday_pending = (dob.dt.month > today.month) | (
        (dob.dt.month == today.month) & (dob.dt.day > today.day)
    )
    age = today.year - dob.dt.year - birthday_pending.astype(int)

    # Left-closed bins ([0, 18), [18, 31), ...) so every boundary age falls in
    # the band its label names: 0 and 17 -> '0-17', 18 and 30 -> '18-30', etc.
    df['age_group'] = pd.cut(
        age,
        bins=[0, 18, 31, 46, 61, 76, 101],
        right=False,
        labels=['0-17', '18-30', '31-45', '46-60', '61-75', '76-100'],
    )

    df = df.sort_values('patient_id').reset_index(drop=True)
    df['patient_key'] = df.index + 1
    return df[['patient_key', 'patient_id', 'full_name', 'gender', 'age_group', 'mrn']]

def create_dim_specialty(specialties_df):
    """Build the specialty dimension with a 1-based surrogate key.

    Args:
        specialties_df: Frame with ``specialty_id`` and ``specialty_name``
            columns. It is not modified.

    Returns:
        A DataFrame with the columns ``specialty_key``, ``specialty_id`` and
        ``specialty_name``, in the input's row order.
    """
    # Reset the index first so the key is the row position (1..n) even when
    # the incoming frame was filtered or re-indexed; reset_index also returns
    # a new frame, leaving the caller's data untouched.
    df = specialties_df.reset_index(drop=True)
    df['specialty_key'] = df.index + 1
    return df[['specialty_key', 'specialty_id', 'specialty_name']]

def create_dim_department(departments_df):
    """Build the department dimension with a 1-based surrogate key.

    Args:
        departments_df: Frame with ``department_id`` and ``department_name``
            columns. It is not modified.

    Returns:
        A DataFrame with the columns ``department_key``, ``department_id`` and
        ``department_name``, in the input's row order.
    """
    # Reset the index first so the key is the row position (1..n) even when
    # the incoming frame was filtered or re-indexed; reset_index also returns
    # a new frame, leaving the caller's data untouched.
    df = departments_df.reset_index(drop=True)
    df['department_key'] = df.index + 1
    return df[['department_key', 'department_id', 'department_name']]

def create_dim_encounter_type(encounters_df):
    """Build the encounter-type dimension from the distinct types in use.

    Args:
        encounters_df: Frame with an ``encounter_type`` column. It is not
            modified.

    Returns:
        A DataFrame with one row per distinct ``encounter_type`` value,
        sorted by that value, with the columns ``encounter_type_key``
        (1-based position after sorting) and ``type_name``.
    """
    distinct_types = encounters_df[['encounter_type']].drop_duplicates()
    ordered = distinct_types.sort_values('encounter_type').reset_index(drop=True)

    dim = ordered.rename(columns={'encounter_type': 'type_name'})
    dim['encounter_type_key'] = dim.index + 1
    return dim[['encounter_type_key', 'type_name']]

def create_dim_diagnosis(diagnoses_df):
    """Build the diagnosis dimension with a 1-based surrogate key.

    Args:
        diagnoses_df: Frame with ``diagnosis_id``, ``icd10_code`` and
            ``icd10_description`` columns. It is not modified.

    Returns:
        A DataFrame with the columns ``diagnosis_key``, ``diagnosis_id``,
        ``icd10_code`` and ``description`` (renamed from
        ``icd10_description``), in the input's row order.
    """
    renamed = diagnoses_df.rename(columns={'icd10_description': 'description'})
    # Reset the index so the key is the row position (1..n) even when the
    # incoming frame was filtered or re-indexed.
    df = renamed.reset_index(drop=True)
    df['diagnosis_key'] = df.index + 1
    return df[['diagnosis_key', 'diagnosis_id', 'icd10_code', 'description']]

def create_dim_procedure(procedures_df):
    """Build the procedure dimension with a 1-based surrogate key.

    Args:
        procedures_df: Frame with ``procedure_id``, ``cpt_code`` and
            ``cpt_description`` columns. It is not modified.

    Returns:
        A DataFrame with the columns ``procedure_key``, ``procedure_id``,
        ``cpt_code`` and ``description`` (renamed from ``cpt_description``),
        in the input's row order.
    """
    renamed = procedures_df.rename(columns={'cpt_description': 'description'})
    # Reset the index so the key is the row position (1..n) even when the
    # incoming frame was filtered or re-indexed.
    df = renamed.reset_index(drop=True)
    df['procedure_key'] = df.index + 1
    return df[['procedure_key', 'procedure_id', 'cpt_code', 'description']]


In [28]:

def create_fact_encounters(encounters_df, dim_date_df, dim_patient_df, dim_specialty_df,
                           dim_department_df, dim_enc_type_df, providers_df, billing_df, enc_diag_df, enc_proc_df):
    """Build the encounter fact table: one row per encounter with keys and measures.

    Args:
        encounters_df: Source encounters. Copied, not modified. The date
            columns may be strings or datetimes; they are parsed here.
        dim_date_df: Date dimension (``calendar_date``, ``date_key``).
        dim_patient_df: Patient dimension (``patient_id``, ``patient_key``).
        dim_specialty_df: Specialty dimension (``specialty_id``, ``specialty_key``).
        dim_department_df: Department dimension (``department_id``, ``department_key``).
        dim_enc_type_df: Encounter-type dimension (``type_name``, ``encounter_type_key``).
        providers_df: Providers, used to look up each encounter's ``specialty_id``.
        billing_df: Billing rows (``encounter_id``, ``allowed_amount``, ``claim_amount``).
        enc_diag_df: Encounter-diagnosis links, counted per ``encounter_id``.
        enc_proc_df: Encounter-procedure links, counted per ``encounter_id``.

    Returns:
        A DataFrame sorted by ``encounter_id`` with ``encounter_key`` (1-based
        position after sorting), the dimension keys, the measures and
        ``encounter_id``. Dimension lookups are left joins, so an encounter
        with no match keeps its row with a missing key. Amounts and counts
        default to 0 when an encounter has no billing/diagnosis/procedure
        rows; ``length_of_stay`` is 0 when either date is missing.
    """
    df = encounters_df.copy()

    # Parse the dates on the local copy so this function works on raw string
    # columns and does not rely on another step having converted them first.
    df['encounter_date'] = pd.to_datetime(df['encounter_date'])
    df['discharge_date'] = pd.to_datetime(df['discharge_date'])

    # Date key
    df = df.merge(dim_date_df[['calendar_date', 'date_key']],
                  left_on='encounter_date', right_on='calendar_date', how='left')

    # Patient key
    df = df.merge(dim_patient_df[['patient_id', 'patient_key']], on='patient_id', how='left')

    # Provider -> specialty key
    df = df.merge(providers_df[['provider_id', 'specialty_id']], on='provider_id', how='left')
    df = df.merge(dim_specialty_df[['specialty_id', 'specialty_key']], on='specialty_id', how='left')

    # Department key
    df = df.merge(dim_department_df[['department_id', 'department_key']], on='department_id', how='left')

    # Encounter type key
    df = df.merge(dim_enc_type_df[['type_name', 'encounter_type_key']],
                  left_on='encounter_type', right_on='type_name', how='left')

    # Measures
    df['encounter_count'] = 1

    # Sum billing per encounter before mapping: an encounter may carry several
    # billing rows, and Series.map needs a uniquely indexed lookup.
    billing_totals = billing_df.groupby('encounter_id')[['allowed_amount', 'claim_amount']].sum()
    df['total_allowed_amount'] = df['encounter_id'].map(billing_totals['allowed_amount']).fillna(0)
    df['total_claim_amount'] = df['encounter_id'].map(billing_totals['claim_amount']).fillna(0)

    df['diagnosis_count'] = df['encounter_id'].map(enc_diag_df.groupby('encounter_id').size()).fillna(0).astype(int)
    df['procedure_count'] = df['encounter_id'].map(enc_proc_df.groupby('encounter_id').size()).fillna(0).astype(int)
    df['length_of_stay'] = (df['discharge_date'] - df['encounter_date']).dt.days.fillna(0).astype(int)

    df = df.sort_values('encounter_id').reset_index(drop=True)
    df['encounter_key'] = df.index + 1

    return df[['encounter_key', 'date_key', 'patient_key', 'specialty_key', 'department_key', 'encounter_type_key',
               'encounter_count', 'total_allowed_amount', 'total_claim_amount', 'diagnosis_count', 'procedure_count',
               'length_of_stay', 'encounter_id']]

Transform for OLAP

In [29]:
def create_bridge(enc_diag_df, enc_proc_df, fact_enc_df, dim_diagnosis_df, dim_procedure_df):
    """Build the encounter-diagnosis and encounter-procedure bridge tables.

    Each link row has its natural ids replaced by the matching surrogate keys.
    Both lookups are left joins, so a link whose id has no match keeps its row
    with a missing key.

    Args:
        enc_diag_df: Links with ``encounter_id`` and ``diagnosis_id``.
        enc_proc_df: Links with ``encounter_id`` and ``procedure_id``.
        fact_enc_df: Fact table providing ``encounter_id`` -> ``encounter_key``.
        dim_diagnosis_df: Provides ``diagnosis_id`` -> ``diagnosis_key``.
        dim_procedure_df: Provides ``procedure_id`` -> ``procedure_key``.

    Returns:
        A tuple ``(bridge_diag_df, bridge_proc_df)`` with the columns
        ``[encounter_key, diagnosis_key]`` and ``[encounter_key, procedure_key]``.
    """
    def _to_keys(links, dimension, natural_col, surrogate_col):
        # Resolve the dimension key first, then the encounter key.
        keyed = links.merge(dimension[[natural_col, surrogate_col]], on=natural_col, how='left')
        keyed = keyed.merge(fact_enc_df[['encounter_id', 'encounter_key']], on='encounter_id', how='left')
        return keyed[['encounter_key', surrogate_col]]

    diagnosis_bridge = _to_keys(enc_diag_df, dim_diagnosis_df, 'diagnosis_id', 'diagnosis_key')
    procedure_bridge = _to_keys(enc_proc_df, dim_procedure_df, 'procedure_id', 'procedure_key')
    return diagnosis_bridge, procedure_bridge



In [30]:
def export_csv(df, path, name):
    """Write a DataFrame to ``path + name`` as a header-less CSV.

    Args:
        df: Frame to write.
        path: Directory prefix. It is concatenated with ``name`` as-is, so it
            must already end with a path separator.
        name: File name appended to ``path``.

    The file has no header row and no index column; missing values are
    written as a backslash followed by ``N``.
    """
    destination = path + name
    df.to_csv(
        destination,
        sep=',',
        header=False,
        index=False,
        na_rep='\\N',
    )

In [31]:
# -------------------------
def main(base_path='../../data/oltp/', output_path='../../data/olap/'):
    """Run the full pipeline: load the source CSVs, build the star schema, export it.

    Args:
        base_path: Directory holding the source CSV files. A trailing path
            separator is optional.
        output_path: Directory the dimension, fact and bridge CSVs are written
            to. A trailing separator is optional; the directory is created if
            it does not exist.
    """
    import os  # local import so this function does not depend on another cell's imports

    # os.path.join(x, '') appends a trailing separator only when one is missing.
    # export_csv concatenates path + name, so the separator must be present.
    source_dir = os.path.join(base_path, '')
    target_dir = os.path.join(output_path, '')
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Load CSVs
    patients_df = load_csv(source_dir + 'patients.csv', patients_cols)
    specialties_df = load_csv(source_dir + 'specialties.csv', specialties_cols)
    departments_df = load_csv(source_dir + 'departments.csv', departments_cols)
    providers_df = load_csv(source_dir + 'providers.csv', providers_cols)
    encounters_df = load_csv(source_dir + 'encounters.csv', encounters_cols)
    diagnoses_df = load_csv(source_dir + 'diagnoses.csv', diagnoses_cols)
    enc_diag_df = load_csv(source_dir + 'encounter_diagnoses.csv', enc_diag_cols)
    procedures_df = load_csv(source_dir + 'procedures.csv', procedures_cols)
    enc_proc_df = load_csv(source_dir + 'encounter_procedures.csv', enc_proc_cols)
    billing_df = load_csv(source_dir + 'billing.csv', billing_cols)

    # Transform: dimensions first, then the fact table, then the bridges that
    # need the fact table's encounter keys.
    dim_date_df = create_dim_date(encounters_df)
    dim_patient_df = create_dim_patient(patients_df)
    dim_specialty_df = create_dim_specialty(specialties_df)
    dim_department_df = create_dim_department(departments_df)
    dim_enc_type_df = create_dim_encounter_type(encounters_df)
    dim_diagnosis_df = create_dim_diagnosis(diagnoses_df)
    dim_procedure_df = create_dim_procedure(procedures_df)

    fact_encounters_df = create_fact_encounters(encounters_df, dim_date_df, dim_patient_df, dim_specialty_df,
                                                dim_department_df, dim_enc_type_df, providers_df, billing_df,
                                                enc_diag_df, enc_proc_df)

    bridge_diag_df, bridge_proc_df = create_bridge(enc_diag_df, enc_proc_df, fact_encounters_df,
                                                   dim_diagnosis_df, dim_procedure_df)

    # Export CSVs
    exports = [
        (dim_date_df, 'dim_date.csv'),
        (dim_patient_df, 'dim_patient.csv'),
        (dim_specialty_df, 'dim_specialty.csv'),
        (dim_department_df, 'dim_department.csv'),
        (dim_enc_type_df, 'dim_encounter_type.csv'),
        (dim_diagnosis_df, 'dim_diagnosis.csv'),
        (dim_procedure_df, 'dim_procedure.csv'),
        (fact_encounters_df, 'fact_encounters.csv'),
        (bridge_diag_df, 'bridge_encounter_diagnoses.csv'),
        (bridge_proc_df, 'bridge_encounter_procedures.csv'),
    ]
    for frame, file_name in exports:
        export_csv(frame, target_dir, file_name)

    print("✅ OLAP CSV export complete!")

# Run the whole pipeline when this code executes as the top-level module.
if __name__ == "__main__":
    main()

✅ OLAP CSV export complete!
