In [1]:
import os
os.chdir("/home/julien/Documents/stage")
from src.utils.db_utils import run_query
# from src.utils.patient import Patient
import time
import pandas as pd
import json
from datetime import datetime
from importlib import reload 
from src.utils.patient import Patient

In [2]:
# Hospital admission ids (hadm_id) to extract. The list is bound to the
# IN-clause of the CHARTEVENTS query and iterated when the patients are built.
hadm_ids = [100108]

In [3]:
# Wall-clock reference point; the elapsed time is reported by a later cell.
time_start = time.time()

# Chart events of the selected admissions, each joined to its D_ITEMS row.
# SELECT * over the join returns the columns of both tables, so the join key
# `itemid` appears twice in the result.
query = """
SELECT *
FROM CHARTEVENTS
JOIN D_ITEMS ON CHARTEVENTS.itemid = D_ITEMS.itemid
WHERE hadm_id IN %(hadm_ids)s
"""

# The admission list is passed as a tuple for the %(hadm_ids)s placeholder.
# NOTE(review): presumably the DB driver expands the tuple into an SQL IN-list,
# in which case an empty `hadm_ids` would produce invalid SQL -- confirm
# against run_query.
df_patients = run_query(query, {"hadm_ids": tuple(hadm_ids)})

In [4]:
# Root directory handed to Patient(...) as `root_path`: per-patient folders and
# patients.csv are written beneath it.
# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable data directory so the notebook runs on other machines.
save_path = "/home/julien/Documents/stage/data/MIMIC/cohorts_new"

In [5]:
def timestamp_to_string(timestamp):
    """
    Serialize a datetime object to its ISO-format string.

    Any value that is not a datetime instance (including None) maps to None.
    """
    if not isinstance(timestamp, datetime):
        return None
    return timestamp.isoformat()

def string_to_timestamp(timestamp_str):
    """
    Parse an ISO-format string back into a datetime object.

    A falsy input (None or the empty string) maps to None.
    """
    if not timestamp_str:
        return None
    return datetime.fromisoformat(timestamp_str)

def get_diagnoses(hadm_id):
    """
    Return the diagnosis long titles recorded for one hospital admission.

    Args:
        hadm_id: admission id bound to the query's %(hadm_id)s placeholder.

    Returns:
        A list with one entry per matching DIAGNOSES_ICD row, holding the
        `long_title` of the joined D_ICD_DIAGNOSES row.
    """
    query = """
        SELECT long_title
        FROM DIAGNOSES_ICD
        JOIN D_ICD_DIAGNOSES ON DIAGNOSES_ICD.icd9_code = D_ICD_DIAGNOSES.icd9_code
        WHERE hadm_id = %(hadm_id)s
        """
    result = run_query(query, {"hadm_id": hadm_id})

    # Elsewhere in this notebook run_query's result is used as a DataFrame.
    # Iterating a DataFrame yields its column labels, so list(result) would be
    # ["long_title"] rather than the titles. The query selects exactly one
    # column, so take that column's values by position.
    if isinstance(result, pd.DataFrame):
        return result.iloc[:, 0].tolist()

    # Any other iterable result is materialised as before.
    return list(result)


class Patient:
    """
    Chart events of one hospital admission, reshaped into a wide time-indexed
    table and persisted under ``<root_path>/<subject_id>/<hadm_id>/``.

    Files written on construction:
      * ``data/raw_df.csv``          -- the forward-filled wide table
      * ``patient_info.json``        -- subject_id, hadm_id, diagnoses, time_start
      * ``<root_path>/patients.csv`` -- one appended (subject_id, hadm_id) row

    Attributes set on every instance: subject_id, hadm_id, main_path,
    save_path, diagnoses, time_start, raw_df.

    NOTE(review): this notebook-level class shadows the ``Patient`` imported
    from ``src.utils.patient`` in the first cell.
    """

    def __init__(self, df: pd.DataFrame, root_path):
        """
        Build the patient from raw rows and write all of its files to disk.

        Args:
            df: rows of a single admission. The columns read are
                ``subject_id``, ``hadm_id``, ``charttime``, ``label`` and
                ``value``; the ids are taken from the first row.
            root_path: directory under which the patient folder and the shared
                ``patients.csv`` are created.
        """
        print("starting patient creation")
        self.subject_id = int(df.iloc[0]["subject_id"])
        self.hadm_id = int(df.iloc[0]["hadm_id"])
        print(self.subject_id)
        print(root_path)
        print(self.hadm_id)
        self.main_path = os.path.join(root_path, str(self.subject_id), str(self.hadm_id))
        self.save_path = os.path.join(self.main_path, "data")
        self.diagnoses = get_diagnoses(self.hadm_id)

        print("Middle")

        # save_path is the deepest directory; makedirs creates every missing
        # parent and exist_ok avoids the check-then-create race.
        os.makedirs(self.save_path, exist_ok=True)
        self.time_start = df["charttime"].min()

        # One sparse record per chart event: the event's label becomes the
        # column name and its value the cell; "time" is hours since the first
        # event of the admission.
        rows = [
            {
                "patient_id": self.subject_id,
                "hadm_id": self.hadm_id,
                "time": (charttime - self.time_start).total_seconds() / 3600,
                label: value,
            }
            for charttime, label, value in zip(df["charttime"], df["label"], df["value"])
        ]

        wide_df = pd.DataFrame(rows).sort_values(by="time").reset_index(drop=True)
        # Forward-fill so each row carries the last known value of every label.
        self.raw_df = wide_df.ffill()

        print("ending patient creation")
        self.save_raw_df()
        self.save_infos()
        self.add_patient_to_global_df(root_path)

    def save_infos(self):
        """
        Write the patient's metadata to ``<main_path>/patient_info.json``.
        """
        patient_info = {
            "subject_id": self.subject_id,
            "hadm_id": self.hadm_id,
            "diagnoses": self.diagnoses,
            "time_start": timestamp_to_string(self.time_start)
        }

        with open(os.path.join(self.main_path, "patient_info.json"), "w") as f:
            json.dump(patient_info, f, indent=4)

    def save_raw_df(self):
        """
        Write the wide, forward-filled table to ``<save_path>/raw_df.csv``.
        """
        self.raw_df.to_csv(os.path.join(self.save_path, "raw_df.csv"), index=False)

    def add_patient_to_global_df(self, root_path):
        """
        Append this patient's (subject_id, hadm_id) row to ``<root_path>/patients.csv``.

        The header line is written only when the file is created, so repeated
        calls do not insert extra header rows into the data.
        """
        csv_path = os.path.join(root_path, "patients.csv")
        patient_dic = {"subject_id": self.subject_id,
                       "hadm_id": self.hadm_id}
        is_new_file = not os.path.exists(csv_path)
        pd.DataFrame([patient_dic]).to_csv(
            csv_path,
            index=False,
            mode="w" if is_new_file else "a",
            header=is_new_file,
        )

    @classmethod
    def load(cls, load_dir):
        """
        Rebuild a Patient from a directory previously written by this class.

        Args:
            load_dir: the patient's main directory, i.e. the one that contains
                ``patient_info.json`` and the ``data`` sub-directory.

        Returns:
            A Patient whose attributes are restored from disk. ``__init__`` is
            deliberately bypassed: it expects the raw chart-event rows, queries
            the database and rewrites the files. ``raw_df`` is an empty
            DataFrame when ``data/raw_df.csv`` does not exist.
        """
        with open(os.path.join(load_dir, "patient_info.json"), "r") as f:
            patient_info = json.load(f)

        patient = cls.__new__(cls)
        # Keys mirror exactly what save_infos writes.
        patient.subject_id = patient_info["subject_id"]
        patient.hadm_id = patient_info["hadm_id"]
        patient.diagnoses = patient_info.get("diagnoses", [])
        patient.time_start = string_to_timestamp(patient_info["time_start"])
        patient.main_path = load_dir
        patient.save_path = os.path.join(load_dir, "data")

        raw_df_path = os.path.join(patient.save_path, "raw_df.csv")
        patient.raw_df = pd.read_csv(raw_df_path) if os.path.exists(raw_df_path) else pd.DataFrame()
        return patient

    def __repr__(self):
        """Short identifier built from the attributes every instance has."""
        return f"Patient(subject_id={self.subject_id}, hadm_id={self.hadm_id})"

In [7]:
# `time_start` was taken at the top of the query cell, so this figure is the
# time elapsed since then: the query itself plus whatever ran (or idled)
# between that cell and this one.
print(f"SQL request done in {time.time() - time_start} seconds")
print(hadm_ids)

# Build one Patient per admission. The constructor writes the patient's files
# as a side effect, so the instance itself is not kept.
for hadm_id in hadm_ids:
    df_patient = df_patients[df_patients["hadm_id"] == hadm_id]
    Patient(df_patient, save_path)

SQL request done in 16.354612112045288 seconds
[100108]
starting patient creation
20587
/home/julien/Documents/stage/data/MIMIC/cohorts_new
100108
Middle
ending patient creation
