# MIMIC-IV data – dataset construction: admissions

Code taken from the GRU-ODE-Bayes preprocessing pipeline, simplified and adapted for MIMIC-IV v1.0.

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
import numpy as np

from tqdm import tqdm

# import config
import json

# Folder that receives the tables produced by this notebook.
file_store_path = './preproc_output'

# The location of the raw MIMIC-IV 1.0 files is read from the shared JSON
# configuration (path is relative to the notebook's working directory).
with open('../../../../config/config.json') as config_f:
    config = json.load(config_f)

file_path = config['mimic_iv_1.0_path']

In [None]:
# Load the admissions table from the MIMIC-IV root folder configured above.
fn = file_path + '/core/admissions.csv.gz'
adm = pd.read_csv(fn, compression='gzip')
# Last expression of the cell: shows the first rows as a quick sanity check.
adm.head()

In [3]:
# PREPROC STEP 1: KEEP ADMISSIONS OF PATIENTS WHO
#   (a) are present in the patients table, and
#   (b) have exactly one hospital admission (one distinct hadm_id).
patients_df = pd.read_csv(file_path + '/core/patients.csv.gz')

# Inner join: attaches anchor_age to every admission and drops admissions
# whose subject_id has no row in the patients table.
adm_dob = patients_df[["subject_id", "anchor_age"]].merge(adm, on="subject_id")

# Number of distinct admissions per subject, counted on the raw admissions table.
df = adm.groupby("subject_id")["hadm_id"].nunique()
subj_ids = df.loc[df == 1].index.tolist()

adm_1 = adm_dob[adm_dob["subject_id"].isin(subj_ids)]
print("Number of patients remaining in the dataframe: ", len(adm_1), sep="\n")

Number of patients remaining in the dataframe: 
171080


In [4]:
# PREPROC STEP 2: KEEP ADMISSIONS LASTING MORE THAN 2 AND LESS THAN 30 DAYS
# The duration is dischtime - admittime, both taken from the admissions table,
# so it measures the length of the admission as a whole.
adm_1 = adm_1.copy()  # work on a copy so the column assignments below are safe
for time_col in ("admittime", "dischtime"):
    adm_1[time_col] = pd.to_datetime(adm_1[time_col], format='%Y-%m-%d %H:%M:%S')

adm_1["elapsed_time"] = adm_1["dischtime"] - adm_1["admittime"]
# .dt.days keeps only the whole-day component of the timedelta.
adm_1["elapsed_days"] = adm_1["elapsed_time"].dt.days

longer_than_2 = adm_1["elapsed_days"].gt(2)
shorter_than_30 = adm_1["elapsed_days"].lt(30)
adm_2 = adm_1[shorter_than_30 & longer_than_2]
print("Number of patients remaining in the dataframe: ", len(adm_2), sep="\n")

Number of patients remaining in the dataframe: 
65825


In [5]:
# PREPROC STEP 3: KEEP PATIENTS WITH anchor_age > 15
# NOTE(review): anchor_age comes from the patients table; whether it equals the
# age at admission time should be confirmed against the MIMIC-IV documentation.
is_older_than_15 = adm_2["anchor_age"].gt(15)
adm_2_15 = adm_2[is_older_than_15].copy()
print("Number of patients remaining in the dataframe: ", len(adm_2_15), sep="\n")

Number of patients remaining in the dataframe: 
43967


In [None]:
fn = file_path + '/icu/chartevents.csv.gz'

# chartevents is too large to load in one go, so it is streamed in chunks of
# one million rows. Only the hadm_id column is needed here, so `usecols`
# restricts parsing to that column, which avoids decoding every other column
# of every row.
ids = np.array([])  # all admission ids that have chartevents data available
chunk_reader = pd.read_csv(fn, usecols=['hadm_id'], chunksize=1000000)
for chunk in tqdm(chunk_reader, desc='Reading chunk', unit='chunk'):
    # union1d returns the sorted unique values of both inputs, so `ids`
    # stays de-duplicated (and small) after every chunk.
    ids = np.union1d(ids, chunk['hadm_id'].unique())

In [7]:
# PREPROC STEP 4: KEEP ADMISSIONS THAT HAVE CHARTEVENTS DATA AVAILABLE
# `ids` holds every hadm_id that occurs in the chartevents table.
has_chartevents = adm_2_15["hadm_id"].isin(ids)
adm_2_15_chart = adm_2_15[has_chartevents].copy()
print("Number of patients remaining in the dataframe: ", len(adm_2_15_chart), sep="\n")

Number of patients remaining in the dataframe: 
17874


In [8]:
# Persist the filtered admissions for the downstream preprocessing notebooks.
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs(file_store_path, exist_ok=True)
adm_2_15_chart.to_csv(file_store_path + '/admissions_processed.csv')