In [None]:
import sys
import os

# Parent of the current working directory (dirname of the absolute path of ".").
# It is put on the import path, presumably so the project package imported in the
# following cell can be resolved when the notebook runs from a sub-folder
# -- TODO confirm the expected launch directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath("."))

# Guarded so that re-running this cell does not keep appending duplicate entries.
if SCRIPT_DIR not in sys.path:
    sys.path.append(SCRIPT_DIR)

In [None]:
from mimic.datasets import MIMIC_IV
from mimic.utils.env import Env
from mimic.utils.scaler import MinMax_Scaler, Ordinal_Encoder
from mimic.utils.sheet import Sheet
from mimic.utils.db import DuckDB

# Runs once, before any dataset, sheet or database object is created below.
# NOTE(review): presumably loads environment/configuration values for the project;
# what exactly is read is not visible from this notebook -- confirm in mimic.utils.env.
Env.load()

In [None]:
import pandas as pd
import re


def transform_omr(db: "DuckDB", df: pd.DataFrame) -> pd.DataFrame:
    """Pivot long-format OMR rows into one row of averaged measurements per subject.

    Processing steps:
      1. Collapse the raw ``result_name`` labels onto canonical names
         (``blood_pressure``, ``weight``, ``bmi``, ``height``, ``eGFR``). Labels
         that are not in the mapping become NaN and are dropped by the group-by.
      2. Split every blood-pressure reading ``"A/B"`` into two rows named
         ``blood_pressure_systolic`` (A) and ``blood_pressure_diastolic`` (B).
      3. Strip every character that is not a digit, a decimal point or a slash,
         then convert to numbers. Values that still cannot be parsed (empty
         strings, leftover slashes, ...) become NaN instead of raising.
      4. Average the values per ``(subject_id, result_name)`` and pivot the
         result names into columns.
      5. Replace remaining gaps with the mean of the respective column.

    Args:
        db: Unused by this function; kept so the signature stays compatible with
            whatever invokes the transform. The annotation is a quoted forward
            reference so it is not evaluated when the function is defined.
        df: Frame with at least the columns ``subject_id``, ``result_name`` and
            ``result_value``. It is not modified.

    Returns:
        A frame with a ``subject_id`` column plus one float column for every
        canonical measurement that occurs in ``df``.
    """
    canonical_names = {
        "Blood Pressure": "blood_pressure",
        "Weight (Lbs)": "weight",
        "BMI (kg/m2)": "bmi",
        "Height (Inches)": "height",
        "Blood Pressure Sitting": "blood_pressure",
        "Blood Pressure Standing (1 min)": "blood_pressure",
        "BMI": "bmi",
        "Weight": "weight",
        "Height": "height",
        "Blood Pressure Lying": "blood_pressure",
        "Blood Pressure Standing (3 mins)": "blood_pressure",
        "Blood Pressure Standing": "blood_pressure",
        "eGFR": "eGFR",
    }

    # Work on a copy of the needed columns so the caller's frame stays untouched.
    omr = df[["subject_id", "result_name", "result_value"]].copy()
    omr["result_name"] = omr["result_name"].map(canonical_names)

    is_bp = omr["result_name"] == "blood_pressure"
    bp = omr.loc[is_bp, ["subject_id", "result_value"]]

    # n=1 caps the split at two parts; reindex guarantees both columns exist even
    # when there are no blood-pressure rows or none of them contains a slash.
    bp_parts = (
        bp["result_value"]
        .astype(str)
        .str.split("/", n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    systolic = bp[["subject_id"]].assign(
        result_name="blood_pressure_systolic", result_value=bp_parts[0]
    )
    diastolic = bp[["subject_id"]].assign(
        result_name="blood_pressure_diastolic", result_value=bp_parts[1]
    )

    long = pd.concat([omr.loc[~is_bp], systolic, diastolic])

    # Keep the decimal point: dropping it would turn "25.5" into 255.
    cleaned = long["result_value"].astype(str).str.replace(
        r"[^0-9./]", "", regex=True
    )
    # Anything still malformed is treated as a missing value rather than an error.
    long["result_value"] = pd.to_numeric(cleaned, errors="coerce")

    wide = (
        long.groupby(["subject_id", "result_name"])["result_value"]
        .mean()
        .unstack()
        .reset_index()
    )

    # Impute subjects lacking a measurement with that measurement's overall mean.
    return wide.fillna(wide.mean(numeric_only=True))

In [None]:
# Data directory shared by the raw-folder lookup and the database below.
DATA_DIR = "./data"

# Raw-data folder as reported by the dataset class for that directory.
root = MIMIC_IV.get_raw_folder(DATA_DIR)

# Database handle named "dataset.db", rooted at the same data directory.
db = DuckDB(db_name="dataset.db", root=DATA_DIR)

In [None]:
# Transformers for the patients table: min-max scaling of the age column and
# ordinal encoding of the gender column, both created with train=True.
patients_scalers = [
    MinMax_Scaler(transform_columns=["anchor_age"], train=True),
    Ordinal_Encoder(transform_columns=["gender"], train=True),
]

# Sheet describing the "patients" table, keyed by subject_id.
# NOTE(review): `columns` presumably describes the raw input types and
# `table_fields` the types stored after scaling -- confirm against Sheet.
patients_sheet = Sheet(
    root=root,
    db=db,
    table_name="patients",
    id_column="subject_id",
    columns=dict(subject_id="int", gender="string", anchor_age="int"),
    table_fields=dict(subject_id="int", gender="float", anchor_age="float"),
    scaler=patients_scalers,
    force_insert=False,
    drop_table=True,
)

# Measurement columns of the "omr" table; all of them are stored as floats
# and min-max scaled.
omr_measurements = [
    "blood_pressure_systolic",
    "blood_pressure_diastolic",
    "weight",
    "height",
    "bmi",
    "eGFR",
]

# Sheet describing the "omr" table, keyed by subject_id; the three listed input
# columns are passed through `transform_omr`.
omr_sheet = Sheet(
    root=root,
    db=db,
    table_name="omr",
    id_column="subject_id",
    columns={
        "subject_id": "int",
        "result_name": "string",
        "result_value": "string",
    },
    table_fields={
        "subject_id": "int",
        **{measurement: "float" for measurement in omr_measurements},
    },
    transform=transform_omr,
    scaler=[
        # A fresh list is handed over so the scaler never shares state with
        # `omr_measurements`.
        MinMax_Scaler(transform_columns=list(omr_measurements), train=True),
    ],
    force_insert=False,
    drop_table=True,
)

In [None]:
# Feature columns requested from the dataset: the two patient attributes
# followed by the six OMR measurements.
feature_columns = [
    "gender",
    "anchor_age",
    "blood_pressure_systolic",
    "blood_pressure_diastolic",
    "weight",
    "height",
    "bmi",
    "eGFR",
]

# Sheets handed to the dataset, keyed by the same names used as their table names.
sheets_by_table = {
    "patients": patients_sheet,
    "omr": omr_sheet,
}

# Dataset over both sheets, identified by subject_id, with download=True.
dataset = MIMIC_IV(
    root="./data",
    db=db,
    sheets=sheets_by_table,
    column_id="subject_id",
    columns=feature_columns,
    download=True,
)