Import libraries

In [2]:
import pandas as pd 
from datetime import datetime

In [7]:
# Column names for each OLTP extract, in file column order.
# They are passed as `names=` when the CSVs are read with header=None.
patients_cols = [
    'patient_id',
    'first_name',
    'last_name',
    'date_of_birth',
    'gender',
    'mrn',
]
specialties_cols = [
    'specialty_id',
    'specialty_name',
    'specialty_code',
]
departments_cols = [
    'department_id',
    'department_name',
    'floor',
    'capacity',
]
providers_cols = [
    'provider_id',
    'first_name',
    'last_name',
    'credential',
    'specialty_id',
    'department_id',
]
encounters_cols = [
    'encounter_id',
    'patient_id',
    'provider_id',
    'encounter_type',
    'encounter_date',
    'discharge_date',
    'department_id',
]
diagnoses_cols = [
    'diagnosis_id',
    'icd10_code',
    'icd10_description',
]
enc_diag_cols = [
    'encounter_diagnosis_id',
    'encounter_id',
    'diagnosis_id',
    'diagnosis_sequence',
]
procedures_cols = [
    'procedure_id',
    'cpt_code',
    'cpt_description',
]
enc_proc_cols = [
    'encounter_procedure_id',
    'encounter_id',
    'procedure_id',
    'procedure_date',
]
billing_cols = [
    'billing_id',
    'encounter_id',
    'claim_amount',
    'allowed_amount',
    'claim_date',
    'claim_status',
]

In [8]:
def _read_oltp(file_name, column_names):
    """Read one OLTP extract from ../../data/oltp/ with header=None and the given column names."""
    return pd.read_csv('../../data/oltp/' + file_name, header=None, names=column_names)


# One DataFrame per source table; column names come from the *_cols lists.
patients_df = _read_oltp('patients.csv', patients_cols)
specialties_df = _read_oltp('specialties.csv', specialties_cols)
departments_df = _read_oltp('departments.csv', departments_cols)
providers_df = _read_oltp('providers.csv', providers_cols)
encounters_df = _read_oltp('encounters.csv', encounters_cols)
diagnoses_df = _read_oltp('diagnoses.csv', diagnoses_cols)
enc_diag_df = _read_oltp('encounter_diagnoses.csv', enc_diag_cols)
procedures_df = _read_oltp('procedures.csv', procedures_cols)
enc_proc_df = _read_oltp('encounter_procedures.csv', enc_proc_cols)
billing_df = _read_oltp('billing.csv', billing_cols)

In [9]:
# Sanity check: show the column labels assigned to the encounters frame.
print(encounters_df.columns)

Index(['encounter_id', 'patient_id', 'provider_id', 'encounter_type',
       'encounter_date', 'discharge_date', 'department_id'],
      dtype='object')


Transform for OLAP

In [10]:
# Parse both encounter date columns in place (encounters_df itself is modified).
for _date_col in ('encounter_date', 'discharge_date'):
    encounters_df[_date_col] = pd.to_datetime(encounters_df[_date_col])

# Every distinct non-null value that appears as an encounter or discharge date.
all_dates = (
    pd.concat([encounters_df['encounter_date'], encounters_df['discharge_date']])
    .dropna()
    .unique()
)

# One row per distinct date value; date_key is the integer form of YYYYMMDD.
dim_date_df = (
    pd.DataFrame({'calendar_date': pd.to_datetime(all_dates)})
    .assign(
        date_key=lambda d: d['calendar_date'].dt.strftime('%Y%m%d').astype(int),
        year=lambda d: d['calendar_date'].dt.year,
        month=lambda d: d['calendar_date'].dt.month,
        quarter=lambda d: d['calendar_date'].dt.quarter,
    )
    [['date_key', 'calendar_date', 'year', 'month', 'quarter']]
)

In [12]:
def load_csv(path, columns):
    """Read a CSV that has no header row and label its columns.

    Args:
        path: Location (or file-like object) handed to ``pd.read_csv``.
        columns: Column names to assign, in file order.

    Returns:
        The parsed DataFrame.
    """
    frame = pd.read_csv(path, names=columns, header=None)
    return frame

In [11]:
def create_dim_date(encounters_df):
    """Build the date dimension from encounter and discharge dates.

    Side effect: the ``encounter_date`` and ``discharge_date`` columns of
    ``encounters_df`` are converted to datetime in place, so the caller's
    frame is modified.

    Args:
        encounters_df: Frame with ``encounter_date`` and ``discharge_date`` columns.

    Returns:
        DataFrame with columns ``date_key`` (integer YYYYMMDD), ``calendar_date``,
        ``year``, ``month`` and ``quarter``; one row per distinct non-null date value.
    """
    for col in ('encounter_date', 'discharge_date'):
        encounters_df[col] = pd.to_datetime(encounters_df[col])

    stacked = pd.concat([encounters_df['encounter_date'], encounters_df['discharge_date']])
    distinct_dates = stacked.dropna().unique()

    dim = pd.DataFrame({'calendar_date': pd.to_datetime(distinct_dates)}).assign(
        date_key=lambda d: d['calendar_date'].dt.strftime('%Y%m%d').astype(int),
        year=lambda d: d['calendar_date'].dt.year,
        month=lambda d: d['calendar_date'].dt.month,
        quarter=lambda d: d['calendar_date'].dt.quarter,
    )
    return dim[['date_key', 'calendar_date', 'year', 'month', 'quarter']]

In [13]:
def create_dim_patient(patients_df, current_year=None):
    """Build the patient dimension with a display name and an age band.

    Age is approximated as ``current_year`` minus the birth year (the month
    and day of birth are not taken into account).

    Args:
        patients_df: Frame with ``patient_id``, ``first_name``, ``last_name``,
            ``date_of_birth``, ``gender`` and ``mrn`` columns. It is not modified.
        current_year: Reference year for the age calculation. Defaults to the
            current calendar year; pass a fixed value for reproducible output.

    Returns:
        DataFrame with columns ``patient_id``, ``full_name``, ``gender``,
        ``age_group`` and ``mrn``. ``age_group`` is NaN when the date of birth
        is missing or the computed age falls outside 0-100.
    """
    if current_year is None:
        current_year = datetime.now().year

    df = patients_df.copy()
    df['full_name'] = df['last_name'] + ' ' + df['first_name']

    age_years = current_year - pd.to_datetime(df['date_of_birth']).dt.year
    # Left-closed bins so each label matches its range exactly:
    # [0,18) -> '0-17', [18,31) -> '18-30', ..., [76,101) -> '76-100'.
    # The previous right-closed edges put age 18 into '0-17' and dropped age 0.
    df['age_group'] = pd.cut(
        age_years,
        bins=[0, 18, 31, 46, 61, 76, 101],
        right=False,
        labels=['0-17', '18-30', '31-45', '46-60', '61-75', '76-100'],
    )
    return df[['patient_id', 'full_name', 'gender', 'age_group', 'mrn']]

In [14]:
def create_dim_specialty(specialties_df):
    """Return the specialty dimension: the ``specialty_id`` and ``specialty_name`` columns."""
    keep = ['specialty_id', 'specialty_name']
    return specialties_df.loc[:, keep]

def create_dim_department(departments_df):
    """Return the department dimension: the ``department_id`` and ``department_name`` columns."""
    keep = ['department_id', 'department_name']
    return departments_df.loc[:, keep]

def create_dim_encounter_type(encounters_df):
    """Return the distinct ``encounter_type`` values as a one-column frame named ``type_name``.

    The first occurrence of each value is kept, along with its original row index.
    """
    distinct_types = encounters_df['encounter_type'].drop_duplicates()
    return distinct_types.to_frame(name='type_name')

def create_dim_diagnosis(diagnoses_df):
    """Return the diagnosis dimension with columns ``icd10_code`` and ``description``.

    ``description`` is the source ``icd10_description`` column under a shorter name.

    NOTE(review): ``diagnosis_id`` is not carried over, while ``create_bridge``
    emits ``diagnosis_id`` -- confirm how the bridge is meant to join to this table.
    """
    picked = diagnoses_df.loc[:, ['icd10_code', 'icd10_description']]
    return picked.rename(columns={'icd10_description': 'description'})

def create_dim_procedure(procedures_df):
    """Return the procedure dimension with columns ``cpt_code`` and ``description``.

    ``description`` is the source ``cpt_description`` column under a shorter name.

    NOTE(review): ``procedure_id`` is not carried over, while ``create_bridge``
    emits ``procedure_id`` -- confirm how the bridge is meant to join to this table.
    """
    picked = procedures_df.loc[:, ['cpt_code', 'cpt_description']]
    return picked.rename(columns={'cpt_description': 'description'})

In [15]:
def create_fact_encounters(encounters_df, dim_date_df, dim_patient_df, providers_df, billing_df, enc_diag_df, enc_proc_df):
    """Build the encounter fact table, one row per encounter.

    Args:
        encounters_df: Encounters with ``encounter_id``, ``patient_id``,
            ``provider_id``, ``encounter_type``, ``encounter_date``,
            ``discharge_date`` and ``department_id``. Date columns may be strings
            or datetimes; they are parsed on a local copy, so the input is not modified.
        dim_date_df: Date dimension providing ``date_key`` per ``calendar_date``.
        dim_patient_df: Unused; kept so existing callers keep working. The former
            left merge on ``patient_id`` added no columns and could only
            duplicate fact rows if the dimension held repeated ids.
        providers_df: Providers with ``provider_id`` and ``specialty_id``.
        billing_df: Billing rows with ``encounter_id``, ``claim_amount`` and
            ``allowed_amount``; several rows per encounter are summed.
        enc_diag_df: Encounter/diagnosis links (counted per ``encounter_id``).
        enc_proc_df: Encounter/procedure links (counted per ``encounter_id``).

    Returns:
        DataFrame with columns ``date_key``, ``patient_id``, ``specialty_id``,
        ``department_id``, ``encounter_type``, ``encounter_count``,
        ``total_allowed_amount``, ``total_claim_amount``, ``diagnosis_count``,
        ``procedure_count`` and ``length_of_stay``. Amounts are NaN for encounters
        without billing rows; ``length_of_stay`` is 0 when a date is missing.

    NOTE(review): ``encounter_id`` is not part of the output although
    ``create_bridge`` keys its tables on it -- confirm whether the fact table
    should expose it.
    """
    fact_df = encounters_df.copy()
    # Parse dates here rather than relying on the caller having converted them.
    for date_col in ('encounter_date', 'discharge_date'):
        fact_df[date_col] = pd.to_datetime(fact_df[date_col])

    date_lookup = dim_date_df[['date_key', 'calendar_date']].assign(
        calendar_date=lambda d: pd.to_datetime(d['calendar_date'])
    )
    fact_df = fact_df.merge(date_lookup, left_on='encounter_date', right_on='calendar_date', how='left')
    fact_df = fact_df.merge(providers_df[['provider_id', 'specialty_id']], on='provider_id', how='left')

    fact_df['encounter_count'] = 1

    # Sum per encounter: mapping through a non-unique encounter_id index raises,
    # and the "total_*" columns are meant to cover every claim of the encounter.
    # min_count=1 keeps NaN (rather than 0) when all amounts of an encounter are missing.
    billing_totals = billing_df.groupby('encounter_id')[['allowed_amount', 'claim_amount']].sum(min_count=1)
    fact_df['total_allowed_amount'] = fact_df['encounter_id'].map(billing_totals['allowed_amount'])
    fact_df['total_claim_amount'] = fact_df['encounter_id'].map(billing_totals['claim_amount'])

    diag_counts = enc_diag_df.groupby('encounter_id').size()
    proc_counts = enc_proc_df.groupby('encounter_id').size()
    fact_df['diagnosis_count'] = fact_df['encounter_id'].map(diag_counts).fillna(0).astype(int)
    fact_df['procedure_count'] = fact_df['encounter_id'].map(proc_counts).fillna(0).astype(int)

    stay = fact_df['discharge_date'] - fact_df['encounter_date']
    fact_df['length_of_stay'] = stay.dt.days.fillna(0).astype(int)

    return fact_df[['date_key', 'patient_id', 'specialty_id', 'department_id', 'encounter_type', 'encounter_count',
                    'total_allowed_amount', 'total_claim_amount', 'diagnosis_count', 'procedure_count', 'length_of_stay']]

In [16]:
def create_bridge(enc_diag_df, enc_proc_df):
    """Build the two bridge tables that link encounters to diagnoses and procedures.

    Returns:
        A ``(bridge_diag, bridge_proc)`` tuple of independent copies holding
        ``encounter_id``/``diagnosis_id`` and ``encounter_id``/``procedure_id``.
    """
    diag_links = enc_diag_df.loc[:, ['encounter_id', 'diagnosis_id']].copy()
    proc_links = enc_proc_df.loc[:, ['encounter_id', 'procedure_id']].copy()
    return diag_links, proc_links

In [17]:
def export_csv(df, path, name):
    """Write ``df`` to ``<path>/<name>`` as CSV without the index column.

    Args:
        df: DataFrame to write.
        path: Output directory, with or without a trailing separator. It is
            created if it does not exist. An empty string writes relative to
            the current working directory.
        name: File name of the CSV.
    """
    import os  # local import keeps this cell self-contained

    # Joining (instead of concatenating) avoids writing to "<dir><name>" when
    # the directory is passed without a trailing separator.
    if path:
        os.makedirs(path, exist_ok=True)
    df.to_csv(os.path.join(path, name), index=False)

In [18]:
# -------------------------
# Main ETL
# -------------------------
def main(base_path='../../data/oltp/', output_path='../../data/olap/'):
    """Run the OLTP -> OLAP CSV pipeline: load, transform, export.

    Args:
        base_path: Prefix prepended to each source file name by plain string
            concatenation, so it should end with a path separator.
        output_path: Destination handed to ``export_csv`` together with each
            output file name.
    """
    # Load CSVs
    patients_df = load_csv(base_path + 'patients.csv', ['patient_id','first_name','last_name','date_of_birth','gender','mrn'])
    specialties_df = load_csv(base_path + 'specialties.csv', ['specialty_id','specialty_name','specialty_code'])
    departments_df = load_csv(base_path + 'departments.csv', ['department_id','department_name','floor','capacity'])
    providers_df = load_csv(base_path + 'providers.csv', ['provider_id','first_name','last_name','credential','specialty_id','department_id'])
    encounters_df = load_csv(base_path + 'encounters.csv', ['encounter_id','patient_id','provider_id','encounter_type','encounter_date','discharge_date','department_id'])
    diagnoses_df = load_csv(base_path + 'diagnoses.csv', ['diagnosis_id','icd10_code','icd10_description'])
    enc_diag_df = load_csv(base_path + 'encounter_diagnoses.csv', ['encounter_diagnosis_id','encounter_id','diagnosis_id','diagnosis_sequence'])
    procedures_df = load_csv(base_path + 'procedures.csv', ['procedure_id','cpt_code','cpt_description'])
    enc_proc_df = load_csv(base_path + 'encounter_procedures.csv', ['encounter_procedure_id','encounter_id','procedure_id','procedure_date'])
    billing_df = load_csv(base_path + 'billing.csv', ['billing_id','encounter_id','claim_amount','allowed_amount','claim_date','claim_status'])

    # Transform
    # create_dim_date runs first: it converts the encounter date columns to
    # datetime in place, and the same frame is passed to create_fact_encounters.
    dim_date_df = create_dim_date(encounters_df)
    dim_patient_df = create_dim_patient(patients_df)
    dim_specialty_df = create_dim_specialty(specialties_df)
    dim_department_df = create_dim_department(departments_df)
    dim_enc_type_df = create_dim_encounter_type(encounters_df)
    dim_diagnosis_df = create_dim_diagnosis(diagnoses_df)
    dim_procedure_df = create_dim_procedure(procedures_df)
    fact_encounters_df = create_fact_encounters(encounters_df, dim_date_df, dim_patient_df, providers_df, billing_df, enc_diag_df, enc_proc_df)
    bridge_diag_df, bridge_proc_df = create_bridge(enc_diag_df, enc_proc_df)

    # Export CSVs (written in the order listed)
    outputs = {
        'dim_date.csv': dim_date_df,
        'dim_patient.csv': dim_patient_df,
        'dim_specialty.csv': dim_specialty_df,
        'dim_department.csv': dim_department_df,
        'dim_encounter_type.csv': dim_enc_type_df,
        'dim_diagnosis.csv': dim_diagnosis_df,
        'dim_procedure.csv': dim_procedure_df,
        'fact_encounters.csv': fact_encounters_df,
        'bridge_encounter_diagnoses.csv': bridge_diag_df,
        'bridge_encounter_procedures.csv': bridge_proc_df,
    }
    for file_name, table in outputs.items():
        export_csv(table, output_path, file_name)

    print("✅ OLAP CSV export complete!")

if __name__ == "__main__":
    main()


✅ OLAP CSV export complete!
