In [1]:
import os
from typing import Tuple

import numpy as np
import pandas as pd

from sqlalchemy import create_engine
from tqdm import tqdm
from collections import Counter

import openmimic as om

# MySQL 데이터베이스 연결 정보
username = 'root'
password = os.getenv('AIMED_PW')
host = '172.28.8.103'
port = '3306'  # 예: '3306'
database = "MIMIC_III"
db_engine = create_engine(f'mysql+pymysql://{username}:{password}@{host}:{port}/{database}')

# om configuration
om.Config.mimic_path = "../mimic3_csv/"
processed_data_path = "./processed_data/"

In [6]:
from importlib import reload

reload(om)  # mymodule을 다시 로드하여 변경 사항 반영.

<module 'mipipe' from '/home/jgpark/PycharmProjects/MIMIC_preprocessing/mipipe/__init__.py'>

In [2]:
# read json file
import json

with open('X.feature_names.json') as f:
    x_feature_names = json.load(f)

d_items = pd.read_csv("../mimic3_csv/D_ITEMS.csv")  # D_ITEMS.csv
d_labitems = pd.read_csv("../mimic3_csv/D_LABITEMS.csv")  # D_LABITEMS.csv

chartitem_map = {}
labitem_map = {}

for item_id in x_feature_names:
    id = item_id.split("_")[0]
    # if id is number
    if id.isdigit():
        id = int(id)
        try:
            label = d_items[d_items["ITEMID"] == id]
            chartitem_map[id] = label["LABEL"].values[0]
        except:
            try:
                label = d_labitems[d_labitems["ITEMID"] == id]
                labitem_map[id] = label["LABEL"].values[0]
            except:
                print("NOT FOUND: ", id)

# Patient_static preprocessing

In [2]:
# query = "SELECT * FROM patient_static"
# patients_all = pd.read_sql(query, db_engine)
patients_static_csv = pd.read_csv(processed_data_path+"patients_static.csv")
patients_static_T_info_csv = pd.read_csv(processed_data_path+"patients_static_T_info.csv")

patients_static = om.PatientStatic()
patients_static.load_processed(patients_static_csv, patients_static_T_info_csv)
# patients_static.load(patients_all)
# patients_static.patients_T_info

# Chartevents preprocessing

In [12]:
chartevents_items = (769, 220644, 772, 1521, 227456, 773, 225612, 227073, 770, 220587, 227443, 848, 225690, 1538, 225651, 803, 781, 1162, 225624, 225625, 786, 1522, 816, 225667, 116, 89, 90, 220074, 113, 220602, 226536, 1523, 788, 789, 1524, 220603, 787, 857, 225698, 777, 223679, 791, 1525, 220615, 224643, 225310, 220180, 8555, 220051, 8368, 8441, 8440, 227468, 1528, 806, 189, 727, 223835, 190, 198, 220621, 225664, 811, 807, 226537, 1529, 211, 220045, 226707, 226730, 1394, 813, 220545, 220228, 814, 818, 225668, 1531, 220635, 1532, 821, 456, 220181, 224, 225312, 220052, 52, 6702, 224322, 646, 834, 220277, 220227, 226062, 778, 220235, 779, 227466, 825, 1533, 535, 224695, 860, 223830, 1126, 780, 220274, 1534, 225677, 827, 224696, 543, 828, 227457, 224700, 506, 220339, 512, 829, 1535, 227464, 227442, 227467, 1530, 815, 1286, 824, 227465, 491, 492, 220059, 504, 833, 224422, 618, 220210, 224689, 614, 651, 224690, 615, 224688, 619, 837, 1536, 220645, 226534, 626, 442, 227243, 224167, 220179, 225309, 6701, 220050, 51, 455, 223761, 677, 676, 679, 678, 223762, 224685, 682, 224684, 683, 684, 224686, 1539, 849, 851, 227429, 859, 226531, 763, 224639, 226512, 861, 1542, 220546, 1127 )
query = f"SELECT * FROM CHARTEVENTS WHERE ITEMID IN {chartevents_items} ORDER BY CHARTTIME;"
icu_patient_original = pd.read_sql(query, db_engine)

In [16]:
icu_patient_original.to_csv(processed_data_path+"icu_patient_original.csv", index=False)
len(icu_patient_original)

66635422

In [3]:
icu_patient_original = pd.read_csv(processed_data_path+"icu_patient_original.csv")
icu_patient_original["CHARTTIME"] = pd.to_datetime(icu_patient_original["CHARTTIME"])
icu_patient_original["ICUSTAY_ID"].unique()

  icu_patient_original = pd.read_csv(processed_data_path+"icu_patient_original.csv")


In [5]:
# aa = icu_patient_original[icu_patient_original["ICUSTAY_ID"].isin([202134, 293407, 222148, 252411, 202836, 234115])]
chartevents2 = om.Chartevents()
chartevents2.load(icu_patient_original, patients_static.patients_T_info)
chartevents2.filter()
data = chartevents2.data
chartevents2.process(["mean"])

Chartevents data updated!
-----------------------------------
Filtering...
-> filter_remove_unassociated_columns...	 Complete!	1.21s
-> filter_remove_no_ICUSTAY_ID...	 Complete!	2.31s
-> filter_remove_error...	 Complete!	1.79s
-> filter_remove_labitems...	 Complete!	1.26s
Chartevents data updated!
Filtering Complete!
=> Before: 66,635,422, After: 66,534,355 : 99.85% remained.
-----------------------------------
Processing...
-> process_group_variables_from_fiddle...	 Complete!	20.19s
Chartevents data updated!
-> process_aggregator...	 Complete!	6m 23.00s
-> process_interval_shift_alignment...	 Complete!	2.95s
Processing Complete!


# Inputevents_MV preprocessing

In [3]:
columns = "ROW_ID, ICUSTAY_ID, STARTTIME, ENDTIME, ITEMID, AMOUNT, AMOUNTUOM, RATE, RATEUOM, PATIENTWEIGHT"
columns = "*"
query = f"SELECT {columns} FROM INPUTEVENTS_MV"
inputevents_mv_all = pd.read_sql(query, db_engine)

In [4]:
inputevents_mv = om.InputeventsMV()
inputevents_mv.load(inputevents_mv_all, patients_static.patients_T_info)
inputevents_mv.process()

-----------------------------------
Filtering...
-> filter_remove_unassociated_columns...	 Complete!	0.27s
-> filter_remove_no_ICUSTAY_ID...	 Complete!	0.35s
-> filter_remove_error...	 Complete!	0.66s
-> filter_remove_zero_input...	 Complete!	0.61s
-> filter_remove_continuous_uom_missing...	 Complete!	0.74s
Filtering Complete!
=> Before: 3,618,991, After: 2,535,497 : 70.06% remained.
-----------------------------------
Processing...
-> process_rateuom_into_hour_unit...	 Complete!	3.06s
-> process_unite_convertable_uom_by_D_ITEMS...	 Complete!	4.88s
-> process_split_ITEMID_by_unit...	 Complete!	3.73s
-> process_transform_T_cohort...	 Complete!	4m 10.16s
Processing Complete!


In [6]:
print(f"{330712483:,}")

330,712,483


In [5]:
inputevents_mv.data

Unnamed: 0,ICUSTAY_ID,T,220949.0,221347.0,225152.0,225158.0,225837.0,225863.0,225883.0,225907.0,...,226046.0,226045.0,225876.1,227978.0,225991.0,225996.0,227518.0,227691.1,225896.0,225909.1
0,200001.0,0,,,,,,,,,...,,,,,,,,,,
1,200001.0,1,,,,,,,,,...,,,,,,,,,,
2,200001.0,2,,,,,,,,,...,,,,,,,,,,
3,200001.0,3,,,,,,,,,...,,,,,,,,,,
4,200001.0,4,,,,,,2.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,299998.0,42,,,,,,,,,...,,,,,,,,,,
43,299998.0,43,,,,,,,,,...,,,,,,,,,,
44,299998.0,44,,,,,,,,,...,,,,,,,,,,
45,299998.0,45,,,,,,,,,...,,,,,,,,,,


# Cohort Integration


In [46]:
class Cohort:
    def __init__(self, patients_static=None, chartevents=None, inputevents_mv=None, cohort=None):
        self.patients_static = patients_static
        self.chartevents = chartevents
        self.inputevents_mv = inputevents_mv
        self.data = cohort

    def make_cohort(self):
        self.data = self.patients_static.data
        if self.chartevents is not None:
            self.data = self.data.merge(self.chartevents.data, on="ICUSTAY_ID", how="left")
        if self.inputevents_mv is not None:
            self.data = self.data.merge(self.inputevents_mv.data, on="ICUSTAY_ID", how="left")
        return self.data


cohort = Cohort(patients_static, chartevents2, inputevents_mv)
cohort.make_cohort()
cohort.data