In [2]:
# etl/etl_csv_diabetes.py
import pandas as pd
from config import pg_engine

# 1. Read CSV
# The diagnosis columns hold codes, not quantities: step 6 slices them with
# the .str accessor, which only works on text. Pin them to str so the parser
# never infers a numeric dtype for them; every other column keeps pandas'
# default type inference.
df = pd.read_csv(
    r"C:\Users\Admin\Documents\GitHub\Healthcare-data-warehouse\source_data\diabetic_data.csv",
    dtype={"diag_1": str, "diag_2": str, "diag_3": str},
)

# 2. Basic cleaning & missing handling
# -----------------------------------
# The source file marks missing values with "?". Swap them for an explicit
# NaN rather than None: replace(..., None) is version-dependent (before
# pandas 1.4 the explicit None was ignored and the cell was forward-filled
# from the previous row). float("nan") is the same float NaN that numpy.nan
# is, without needing the numpy import.
df = df.replace("?", float("nan"))

# Drop rows that are unusable for the warehouse:
#  - no encounter_id (event)
#  - no patient_nbr (cannot link to patient dimension)
df = df.dropna(subset=["encounter_id", "patient_nbr"])

# Keep a single row per encounter_id; drop_duplicates keeps the first
# occurrence by default.
df = df.drop_duplicates(subset=["encounter_id"])

# Force the count/duration columns to a numeric dtype. errors="coerce" turns
# any value that cannot be parsed into NaN instead of raising.
numeric_cols = [
    "time_in_hospital",
    "num_lab_procedures",
    "num_procedures",
    "num_medications",
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
    "number_diagnoses",
]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# 3. Derive flags and age_group
# Keep the untouched source label alongside the derived boolean.
df["readmitted_raw"] = df["readmitted"]
# Vectorised comparison instead of a per-element lambda: True only where the
# label is exactly "<30"; any other label or a missing value yields False.
df["readmitted_30d_flag"] = df["readmitted"].eq("<30")
# Straight copy of the source "age" column; can be remapped later if needed.
df["age_group"] = df["age"]

# 4. Build dim_patient
# One row per patient_nbr. When a patient appears in several encounters, the
# attribute values come from the first of those rows in df's current order
# (drop_duplicates keeps the first occurrence).
dim_patient = (
    df.loc[:, ["patient_nbr", "race", "gender", "age_group", "payer_code"]]
    .drop_duplicates(subset=["patient_nbr"])
    .assign(source_system="CSV_diabetes")  # constant lineage tag, added as the last column
)

# 5. Build dim_admission
# Every distinct (admission type, discharge disposition, admission source)
# combination present in df, with the "_id" suffix dropped from the column
# names and a constant source_system tag appended.
dim_admission = (
    df.loc[:, ["admission_type_id", "discharge_disposition_id", "admission_source_id"]]
    .drop_duplicates()
    .rename(
        columns={
            "admission_type_id": "admission_type",
            "discharge_disposition_id": "discharge_disposition",
            "admission_source_id": "admission_source",
        }
    )
    .assign(source_system="CSV_diabetes")
)

# 6. Build dim_diagnosis
# Stack diag_1/diag_2/diag_3 into one long "diagnosis_code" column, discard
# missing codes, and keep only the first row seen for each distinct code.
diag_long = (
    df.melt(
        id_vars=["encounter_id"],
        value_vars=["diag_1", "diag_2", "diag_3"],
        var_name="diag_position",
        value_name="diagnosis_code",
    )
    .dropna(subset=["diagnosis_code"])
    .drop_duplicates(subset=["diagnosis_code"])
)
# The category is the first three characters of the code.
diag_long["icd_category"] = diag_long["diagnosis_code"].str[:3]

# Project down to the dimension's columns and tag the source system.
dim_diagnosis = (
    diag_long.loc[:, ["diagnosis_code", "icd_category"]]
    .drop_duplicates()
    .assign(source_system="CSV_diabetes")
)

# 7. Load dimensions
# All three appends run on the one connection opened by pg_engine.begin()
# (assuming pg_engine is a SQLAlchemy Engine, that is a single transaction
# that commits on normal exit and rolls back if an insert raises).
# NOTE(review): if_exists="append" inserts these rows again on every run —
# confirm that re-runs are prevented or that the tables reject duplicates.
dimension_frames = {
    "dim_patient": dim_patient,
    "dim_admission": dim_admission,
    "dim_diagnosis": dim_diagnosis,
}
with pg_engine.begin() as conn:
    # dict order is insertion order, so tables load patient -> admission -> diagnosis.
    for table_name, frame in dimension_frames.items():
        frame.to_sql(table_name, con=conn, if_exists="append", index=False)

# 8. Re-read dimensions with keys to build fact
# The frames written in step 7 carry no *_key columns, so the keys used below
# must come back from the database (presumably generated there — confirm
# against the table definitions).
dimension_queries = (
    "SELECT * FROM dim_patient",
    "SELECT * FROM dim_admission",
    "SELECT * FROM dim_diagnosis",
)
with pg_engine.connect() as conn:
    # A list (not a generator) so every query runs while the connection is open.
    dim_patient_db, dim_admission_db, dim_diag_db = [
        pd.read_sql(query, conn) for query in dimension_queries
    ]

# Natural key -> surrogate key lookups built from the re-read dimension tables.
# NOTE(review): if a natural key occurs more than once in a dimension table,
# later rows overwrite earlier ones when the dict is built — confirm the
# tables enforce uniqueness on these keys.
patient_key_map = dim_patient_db.set_index("patient_nbr")["patient_key"].to_dict()
admission_key_map = (
    dim_admission_db
    .set_index(["admission_type", "discharge_disposition", "admission_source"])["admission_dim_key"]
    .to_dict()
)
diag_key_map = dim_diag_db.set_index("diagnosis_code")["diagnosis_key"].to_dict()

# Unmatched natural keys map to NaN; step 9 drops fact rows that lack a
# patient or admission key.
df["patient_key"] = df["patient_nbr"].map(patient_key_map)

# Composite lookup: zip the three source columns into tuples that match the
# (admission_type, discharge_disposition, admission_source) keys of
# admission_key_map. This replaces df.apply(..., axis=1), which builds a
# Series for every row of the full frame just to read three values.
admission_natural_keys = zip(
    df["admission_type_id"],
    df["discharge_disposition_id"],
    df["admission_source_id"],
)
df["admission_dim_key"] = pd.Series(
    [admission_key_map.get(key) for key in admission_natural_keys],
    index=df.index,  # keep row alignment with df
)

df["primary_diagnosis_key"] = df["diag_1"].map(diag_key_map)
df["secondary_diagnosis_key"] = df["diag_2"].map(diag_key_map)
df["tertiary_diagnosis_key"] = df["diag_3"].map(diag_key_map)

# Columns written to the fact table, in output order.
fact_cols = [
    # encounter identifier from the source file
    "encounter_id",
    # keys looked up from the dimension tables
    "patient_key",
    "admission_dim_key",
    "primary_diagnosis_key",
    "secondary_diagnosis_key",
    "tertiary_diagnosis_key",
    # numeric per-encounter columns
    "time_in_hospital",
    "num_lab_procedures",
    "num_procedures",
    "num_medications",
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
    "number_diagnoses",
    # readmission fields and pass-through source columns
    "readmitted_raw",
    "readmitted_30d_flag",
    "change",
    "diabetesMed",
]

# 9. Drop fact rows that can't join to core dimensions
# Only the patient and admission keys are mandatory; rows with missing
# diagnosis keys are kept. The source_system tag is appended last.
fact_df = (
    df.loc[:, fact_cols]
    .dropna(subset=["patient_key", "admission_dim_key"])
    .assign(source_system="CSV_diabetes")
)

# Append the fact rows to the existing table on a connection opened with
# pg_engine.begin(); the DataFrame index is not written.
with pg_engine.begin() as conn:
    fact_df.to_sql(
        name="fact_hospital_admission_parted", con=conn, if_exists="append", index=False
    )


ModuleNotFoundError: No module named 'config'