# MIMIC 4 data - dataset construction outputevents

Code taken from GRU-ODE-Bayes preprocessing; simplified and adapted for MIMIC 4 1.0

In [31]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
import numpy as np

# add these lines to get the config to work
import json

# Load the project configuration. The file is read explicitly as UTF-8 so that
# JSON parsing does not depend on the platform's default text encoding.
with open('../../../../config/config.json', encoding='utf-8') as config_f:
    config = json.load(config_f)

# Directory of the MIMIC-IV 1.0 files, taken from the configuration.
file_path = config['mimic_iv_1.0_path']
# Directory from which processed CSVs are read and to which they are written.
file_store_path = './preproc_output'

In [32]:
# Notebook display limits: at most 50 rows and up to 300 columns per frame.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 300

In [33]:
# Load the processed admissions table from the preprocessing output directory.
admissions_csv = f"{file_store_path}/admissions_processed.csv"
adm = pd.read_csv(admissions_csv)

In [None]:
# Read the ICU output events and preview the last rows of the table.
outputevents_csv = f"{file_path}/icu/outputevents.csv.gz"
outputs = pd.read_csv(outputevents_csv)
outputs.tail()

In [35]:
# Restrict the output events to the admission ids present in `adm`.
adm_ids = list(adm["hadm_id"])
in_selected_admission = outputs["hadm_id"].isin(adm_ids)
outputs = outputs.loc[in_selected_admission]

# Report how many distinct patients are left after the filter.
print(
    "Number of patients remaining in the database: ",
    outputs["subject_id"].nunique(),
    sep="\n",
)

Number of patients remaining in the database: 
17707


In [36]:
# Attach the item label to every output event.
item_id = pd.read_csv(file_path + '/icu/d_items.csv.gz')
item_id_1 = item_id[["itemid", "label"]]

# Inner join on itemid (the pandas default, spelled out here). `validate`
# makes the merge raise if an itemid occurs more than once in the item
# dictionary, which would otherwise silently duplicate output events.
outputs_2 = pd.merge(
    outputs,
    item_id_1,
    on="itemid",
    how="inner",
    validate="many_to_one",
)

print("Number of patients remaining in the database: ")
print(outputs_2["subject_id"].nunique())

Number of patients remaining in the database: 
17707


In [37]:
# Alternative selection: take only the n most used items (ranked by the number
# of distinct patients per label).
# Disabled: the code is wrapped in a string literal, so it is not executed and
# the cell only evaluates to that string. outputs_3 is built from a fixed label
# list in a later cell instead.
"""
n_best=15
pat_for_item=outputs_2.groupby("label")["subject_id"].nunique()
frequent_labels=pat_for_item.sort_values(ascending=False)[:n_best]
outputs_3=outputs_2.loc[outputs_2["label"].isin(list(frequent_labels.index))].copy()

print("Number of patients remaining in the database: ")
print(outputs_3["subject_id"].nunique())
print("Number of datapoints remaining in the database: ")
print(len(outputs_3.index))

print(frequent_labels)
"""

'\nn_best=15\npat_for_item=outputs_2.groupby("label")["subject_id"].nunique()\nfrequent_labels=pat_for_item.sort_values(ascending=False)[:n_best]\noutputs_3=outputs_2.loc[outputs_2["label"].isin(list(frequent_labels.index))].copy()\n\nprint("Number of patients remaining in the database: ")\nprint(outputs_3["subject_id"].nunique())\nprint("Number of datapoints remaining in the database: ")\nprint(len(outputs_3.index))\n\nprint(frequent_labels)\n'

In [38]:
#outputs_label_list=['Foley', 'Void', 'OR Urine', 'Chest Tube', 'Oral Gastric', 'Pre-Admission', 'TF Residual', 'OR EBL', 'Emesis', 'Nasogastric', 'Stool', 'Jackson Pratt', 'TF Residual Output', 'Fecal Bag', 'Straight Cath']

# Labels chosen to match the variables used for MIMIC-III in GRU-ODE-Bayes.
# Caution: "Urine Out Incontinent" and "Ultrafiltrate Ultrafiltrate" have no
# corresponding label in this list.
gruodebayes_var_list = [
    'Gastric Tube',
    'Stool',
    'Foley',
    'Void',
    'Condom Cath',
    'Fecal Bag',
    'Ostomy (output)',
    'Chest Tube #1',
    'Chest Tube #2',
    'Jackson Pratt #1',
    'OR EBL',
    'Pre-Admission',
    'TF Residual',
]

# Keep only the events whose label is in the list above.
is_selected_label = outputs_2["label"].isin(gruodebayes_var_list)
outputs_bis = outputs_2.loc[is_selected_label].copy()

print(
    "Number of patients remaining in the database: ",
    outputs_bis["subject_id"].nunique(),
    "Number of datapoints remaining in the database: ",
    len(outputs_bis.index),
    sep="\n",
)

# Independent copy under the name used by the following cells.
outputs_3 = outputs_bis.copy()

Number of patients remaining in the database: 
17654
Number of datapoints remaining in the database: 
1148803


In [None]:
# Check the units of the selected output labels: count how often each
# valueuom occurs for every label, to see whether a label uses a single unit.
uom_by_label = outputs_3.groupby("label")["valueuom"]
uom_by_label.value_counts()

In [40]:
# Write the filtered output events to the preprocessing output directory.
# The DataFrame index is written as the first column (the to_csv default).
outputs_processed_csv = f"{file_store_path}/outputs_processed.csv"
outputs_3.to_csv(outputs_processed_csv)