In [2]:
import pandas as pd
import json
import os
from tqdm import tqdm
import sys

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Build a reproducible 70/10/20 train/val/test split over the subject folders
# and write `testset.csv`, one "<subject>,<0|1>" line per subject, where 1
# marks membership in the test split.
# NOTE(review): absolute, machine-specific path — consider a configurable root.
subjects_root_path = '/home/fwu/Documents/inferProjects/mimic3-benchmarks/data/root/'

# Only folder names made purely of digits are kept as subjects.
# The names are strings, so the sort is lexicographic ("10" sorts before "9");
# the order is kept as-is because it determines the resulting split.
folders = sorted(filter(str.isdigit, os.listdir(subjects_root_path)))
all_samples = folders

np.random.seed(42)
train_size = 0.7
val_size = 0.1
test_size = 0.2

# First hold out (1 - train_size) of the subjects, then divide that held-out
# part between validation and test in the ratio val_size : test_size.
train_samples, test_val_samples = train_test_split(all_samples, test_size=1 - train_size, random_state=42)
val_samples, test_samples = train_test_split(test_val_samples, test_size=test_size / (test_size + val_size), random_state=42)

# Set lookup keeps the membership test O(1) per subject; testing against the
# list made the loop below quadratic in the number of subjects.
test_ids = set(test_samples)

with open('testset.csv', 'w') as f:
    for item in all_samples:
        f.write("%s,%d\n" % (item, 1 if item in test_ids else 0))

train_val_samples = sorted(set(all_samples) - test_ids)

# Disabled: the analogous 0/1 flag file for validation subjects.
# with open('valset.csv', 'w') as f:
#     for item in train_val_samples:
#         if item in val_samples:
#             f.write("%s,1\n" % item)
#         else:
#             f.write("%s,0\n" % item)
print("length of train set: ", len(train_samples))
print("length of val set: ", len(val_samples))
print("length of test set: ", len(test_samples))
print("length of all samples: ", len(all_samples))
print("length of train_val set: ", len(train_val_samples))


In [3]:

def dataframe_from_csv(path, header=0, index_col=0):
    """Load a CSV file into a DataFrame via ``pandas.read_csv``.

    Args:
        path: Location of the CSV, forwarded unchanged to ``pd.read_csv``.
        header: Row number used for the column names (default: first row).
        index_col: Column used as the row index (default: first column);
            pass ``None`` to keep a default integer index.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path, index_col=index_col, header=header)
    return frame

In [25]:
fn = "./itemid_to_variable_map.csv"
pr_fn = "./clinical_predictor_map.json"
variable_column='LEVEL2'

def _read_itemid_to_variable_map(fn, pr_fn, variable_column='LEVEL2'):
    """Build an ITEMID -> variable/predictor lookup table.

    The CSV at ``fn`` is read with every cell as a string (missing cells become
    ``''``). The JSON at ``pr_fn`` is read as ``{category: {predictor_name:
    spec}}`` where each ``spec`` has the keys ``"LEVEL2"``, ``"LEVEL1"`` and
    ``"MIMIC LABEL"``. Each CSV row is tagged with a predictor name by the first
    usable criterion of the spec: LEVEL2 match, else LEVEL1 match, else
    membership of the row's MIMIC LABEL in the spec's label list
    (``"NOT-FOUND"`` marks an unusable LEVEL2/LEVEL1 entry).

    Rows without a predictor are dropped. Of the rest, rows are kept when
    ``STATUS == 'ready'`` and ``COUNT > 0``; rows of the predictors
    'Arterial Base Excess' and 'Phosphorus' are kept regardless of that filter.

    Progress counts of distinct predictors are printed along the way.

    Args:
        fn: Path of the item-id map CSV (needs the columns LEVEL1, LEVEL2,
            MIMIC LABEL, COUNT, STATUS, ITEMID and ``variable_column``).
        pr_fn: Path of the clinical predictor JSON map.
        variable_column: CSV column exposed as ``VARIABLE`` in the result.

    Returns:
        DataFrame indexed by integer ``ITEMID`` with the columns ``VARIABLE``,
        ``MIMIC_LABEL`` and ``PREDICTOR``.
    """
    var_map = dataframe_from_csv(fn, index_col=None).fillna('').astype(str)
    with open(pr_fn) as f:
        cli_pres_map = json.load(f)

    var_map["PREDICTOR"] = ""
    for category in cli_pres_map.values():
        for predictor_name, predictor in category.items():
            # Exactly one matching rule applies per predictor, in priority order.
            if predictor["LEVEL2"] != "NOT-FOUND":
                matches = var_map["LEVEL2"] == predictor["LEVEL2"]
            elif predictor["LEVEL1"] != "NOT-FOUND":
                matches = var_map["LEVEL1"] == predictor["LEVEL1"]
            else:
                matches = var_map["MIMIC LABEL"].isin(predictor["MIMIC LABEL"])
            var_map.loc[matches, "PREDICTOR"] = predictor_name
    print("length of var_map.PREDICTOR: ", len(var_map.PREDICTOR.unique()))

    # Copy the filtered rows so the COUNT assignment below writes to an
    # independent frame instead of a slice of the original.
    var_map = var_map[var_map["PREDICTOR"] != ""].copy()
    var_map['COUNT'] = var_map['COUNT'].apply(lambda x: round(float(x)) if x else 0)

    is_ready = (var_map.STATUS == 'ready') & (var_map.COUNT > 0)
    is_forced = var_map.PREDICTOR.isin(['Arterial Base Excess', 'Phosphorus'])
    # Append only the forced rows that the filter dropped; appending all of
    # them would duplicate any forced row that is also ready (duplicate ITEMIDs).
    var_map = pd.concat([var_map[is_ready], var_map[is_forced & ~is_ready]], ignore_index=True)

    print("length of var_map.PREDICTOR: ", len(var_map.PREDICTOR.unique()))
    var_map['ITEMID'] = var_map['ITEMID'].astype(int)
    var_map = var_map[[variable_column, 'ITEMID', 'MIMIC LABEL', 'PREDICTOR']].set_index('ITEMID')
    print("length of var_map.PREDICTOR: ", len(var_map.PREDICTOR.unique()))
    print(var_map.PREDICTOR.unique())

    return var_map.rename({variable_column: 'VARIABLE', 'MIMIC LABEL': 'MIMIC_LABEL'}, axis=1)



# var_map.to_csv("itemid_to_variable_map_with_predictor.csv")

In [21]:
# Flag, for one subject, every ICU stay that is followed by another stay
# starting less than 30 days after its OUTTIME (READMISSION_30D = 1).
subject_path = "../data/root/17"

stays = dataframe_from_csv(os.path.join(subject_path, 'stays.csv'), index_col=None)
for time_col in ('INTIME', 'OUTTIME', 'DOB', 'DOD', 'DEATHTIME'):
    stays[time_col] = pd.to_datetime(stays[time_col])

# Reset the index after sorting so row labels equal chronological positions.
# Without this, label-based access (`.loc`, `idx == 0`) and position-based
# access (`.iloc`) refer to different rows whenever the CSV is not already
# sorted by INTIME.
stays = stays.sort_values(by=['INTIME', 'OUTTIME']).reset_index(drop=True)

# Time between the end of each stay and the start of the next one; the last
# stay has no successor, so its gap is NaT and compares as False below.
gap_to_next_stay = stays['INTIME'].shift(-1) - stays['OUTTIME']
stays['READMISSION_30D'] = (gap_to_next_stay < pd.Timedelta(days=30)).astype(int)

print(stays.columns)
for idx, flag in stays['READMISSION_30D'].items():
    print(f"index:{idx}, Readmission_30D: {flag}")

Index(['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'LAST_CAREUNIT', 'DBSOURCE',
       'INTIME', 'OUTTIME', 'LOS', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
       'ETHNICITY', 'DIAGNOSIS', 'GENDER', 'DOB', 'DOD', 'AGE',
       'MORTALITY_INUNIT', 'MORTALITY', 'MORTALITY_INHOSPITAL',
       'READMISSION_30D'],
      dtype='object')
index:0, Readmission_30D: 0
index:1, Readmission_30D: 0


In [5]:
sys.path.append("../../../../")
from dataset.mimic3.mimic3benchmark.subject import read_stays, read_diagnoses, read_events
from dataset.mimic3.mimic3benchmark.preprocessing import map_itemids_to_variables, clean_events , read_itemid_to_variable_map

In [7]:
# Scan every subject under each mode directory and collect, per predictor, the
# distinct values that cannot be parsed by float(); the result is printed.
subjects_root_path = "../../data/root/"
fn = "./itemid_to_variable_map.csv"
pr_fn = "./clinical_predictor_map.json"

modes = ["test"]

str_value4predictor = {}  # predictor name -> list of distinct non-numeric values
ct_map = {}
valueom = {}

var_map = read_itemid_to_variable_map(fn, pr_fn)
# variables = var_map.VARIABLE.unique()
variables = var_map.PREDICTOR.unique()
variable_column = 'PREDICTOR'

for mode in modes:
    root_path = os.path.join(subjects_root_path, mode)
    print("root_path: ", root_path)
    for subject_dir in tqdm(os.listdir(root_path), desc='Iterating over subjects'):
        dn = os.path.join(root_path, subject_dir)
        # Skip anything that is not a directory named with an integer.
        try:
            subject_id = int(subject_dir)
        except ValueError:
            continue
        if not os.path.isdir(dn):
            continue

        try:
            # reading tables of this subject
            stays = read_stays(dn)
            diagnoses = read_diagnoses(dn)
            events = read_events(dn)
        except Exception as err:
            # Best effort: report the cause and move on to the next subject.
            # `Exception` (not a bare except) lets Ctrl-C interrupt the scan.
            sys.stderr.write('Error reading from disk for subject: {} ({!r})\n'.format(subject_id, err))
            continue

        events = map_itemids_to_variables(events, var_map)
        events = clean_events(events)
        if events.shape[0] == 0:
            # no valid events for this subject
            continue

        # One row per CHARTTIME, one column per predictor. When a predictor has
        # several values at the same CHARTTIME, the last one after sorting by
        # VALUE is kept.
        metadata = events[['CHARTTIME', 'ICUSTAY_ID']].sort_values(by=['CHARTTIME', 'ICUSTAY_ID'])\
                .drop_duplicates(keep='first').set_index('CHARTTIME')
        timeseries = events[['CHARTTIME', variable_column, 'VALUE']]\
                        .sort_values(by=['CHARTTIME', variable_column, 'VALUE'], axis=0)\
                        .drop_duplicates(subset=['CHARTTIME', variable_column], keep='last')
        timeseries = timeseries.pivot(index='CHARTTIME', columns=variable_column, values='VALUE')\
                        .merge(metadata, left_index=True, right_index=True)\
                        .sort_index(axis=0).reset_index()

        # Test each distinct non-null value once per column instead of once per
        # (row, predictor) cell; missing cells are numeric NaN and never count
        # as strings.
        for v in variables:
            if v not in timeseries.columns:
                continue
            for raw in timeseries[v].dropna().unique():
                try:
                    float(raw)
                except ValueError:
                    known = str_value4predictor.setdefault(v, [])
                    if raw not in known:
                        known.append(raw)
print(str_value4predictor)
# for k, v in ct_map.items():
#     if 'num' in ct_map[k] and 'str' in ct_map[k]:
#         ct_map[k]['num_rate'] = ct_map[k]['num'] / (ct_map[k]['num'] + ct_map[k]['str'])
# print(ct_map)
# json_string = json.dumps(str_value4predictor)
# json_string.tojson("str_value4predictor.json", lines=True, indent = 4)




            

root_path:  ../../data/root/test


  return pd.read_csv(path, header=header, index_col=index_col)
Iterating over subjects: 100%|██████████| 6760/6760 [05:31<00:00, 20.40it/s]

{}





In [6]:
# Load the item-id map and show which predictor and variable names it holds.
# NOTE(review): `read_itemid_to_variable_map` is imported from the project's
# preprocessing module; its exact filtering is not visible here — confirm.
subjects_root_path = "../../data/root/"
fn = "./itemid_to_variable_map.csv"
pr_fn = "./clinical_predictor_map.json"
modes = ["test"]

# Fresh, independent accumulators.
str_value4predictor, ct_map, valueom = {}, {}, {}

var_map = read_itemid_to_variable_map(fn, pr_fn)
# variables = var_map.VARIABLE.unique()
variables = var_map.PREDICTOR.unique()

print("var_map.PREDICTOR.unique(): ", variables)
print("var_map_VARIABLE.unique(): ", var_map.VARIABLE.unique())

var_map.PREDICTOR.unique():  ['Anion Gap' 'Serum Bicarbonate' 'Blood Urea Nitrogen' 'Ionized Calcium'
 'Serum Chloride' 'Creatinine' 'Diastolic Blood Pressure'
 'Fingerstick Glucose' 'Serum Glucose' 'Heart Rate' 'Serum Hematocrit'
 'Hemoglobin' 'Magnesium' 'Mean Blood Pressure' 'Oxygen Saturation'
 'Arterial Carbon Dioxide Pressure' 'Arterial Oxygen Pressure'
 'Arterial pH' 'Platelet Count' 'Serum Potassium' 'Respiratory Rate'
 'Serum Sodium' 'Systolic Blood Pressure' 'Body Temperature'
 'White Blood Cell Count' 'Arterial Base Excess' 'Phosphorus']
var_map_VARIABLE.unique():  ['Anion gap' 'Bicarbonate' 'Blood urea nitrogen' 'Calcium ionized'
 'Chloride' 'Creatinine' 'Diastolic blood pressure' 'Glucose' 'Heart Rate'
 'Hematocrit' 'Hemoglobin' 'Magnesium' 'Mean blood pressure'
 'Oxygen saturation' 'Partial pressure of carbon dioxide'
 'Partial pressure of oxygen' 'pH' 'Platelets' 'Potassium serum'
 'Respiratory rate' 'Sodium' 'Systolic blood pressure' 'Temperature'
 'White blood cell cou