# MIMIC 4 data - dataset construction labevents

Code adapted from the GRU-ODE-Bayes preprocessing pipeline, simplified and updated for MIMIC-IV v1.0.

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
import numpy as np
from sklearn.model_selection import train_test_split

# add these lines to get the config to work
import json

# Read the shared project configuration; it tells us where the raw MIMIC-IV
# files live on this machine.
with open('../../../../config/config.json') as config_handle:
    config = json.load(config_handle)

# Root directory of the raw MIMIC-IV 1.0 release, taken from the config.
file_path = config['mimic_iv_1.0_path']

# Directory used for the intermediate files of this preprocessing pipeline
# (read from and written to by the cells below).
file_store_path = './preproc_output'

In [2]:
# Notebook display limits: show at most 50 rows and 300 columns of a frame.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 300

In [None]:
# Load the admissions table written to the intermediate output directory;
# its hadm_id column is used below to filter the lab events.
admissions_csv = f"{file_store_path}/admissions_processed.csv"
adm = pd.read_csv(admissions_csv)
adm.head()

In [None]:
# Stream labevents in chunks and keep only the rows that belong to one of the
# admissions in `adm`, restricted to the columns used further down.
lab_columns = ["subject_id", "hadm_id", "charttime", "valuenum", "itemid"]

# The admission ids do not change between chunks, so build the lookup once
# instead of once per chunk.
adm_ids = set(adm["hadm_id"])

# Collect the filtered chunks in a list and concatenate a single time at the
# end: calling pd.concat on a growing frame inside the loop re-copies all
# previously accumulated rows on every iteration (quadratic cost).
kept_chunks = []
for chunk in pd.read_csv(
    file_path + "/hosp/labevents.csv.gz",
    usecols=lab_columns,  # skip parsing columns that are discarded anyway
    chunksize=500000,
):
    kept_chunks.append(chunk.loc[chunk["hadm_id"].isin(adm_ids), lab_columns])

if kept_chunks:
    df = pd.concat(kept_chunks, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame that
    # still carries the expected columns.
    df = pd.DataFrame(columns=lab_columns)

In [5]:
# Sanity check: count the distinct patients left in the filtered lab events.
n_patients_filtered = df["subject_id"].nunique()
print("Number of patients remaining in the database: ")
print(n_patients_filtered)

Number of patients remaining in the database: 
17854


In [None]:
# Load the lab item dictionary and keep only the id -> label mapping.
lab_items_csv = f"{file_path}/hosp/d_labitems.csv.gz"
item_id = pd.read_csv(lab_items_csv)
item_id_1 = item_id.loc[:, ["itemid", "label"]]
item_id_1.head()

In [7]:
# Attach the lab test label to every measurement by joining on itemid.
# This is an inner join (the pandas default, spelled out here), so rows whose
# itemid has no entry in the item dictionary are dropped.
# The former mid-cell `lab2.head()` was removed: only the last expression of
# a cell is displayed, so its result was silently discarded.
lab2 = df.merge(item_id_1, on="itemid", how="inner")
print("Number of patients remaining in the database: ")
print(lab2["subject_id"].nunique())

Number of patients remaining in the database: 
17854


In [8]:
# Keep only the n_best lab tests that were measured on the most patients.
n_best = 150

# Number of distinct patients per test label, most common first.
pat_for_item = lab2.groupby("label")["subject_id"].nunique()
frequent_labels = pat_for_item.sort_values(ascending=False).head(n_best)

is_frequent = lab2["label"].isin(list(frequent_labels.index))
lab3 = lab2.loc[is_frequent].copy()

print("Number of patients remaining in the database: ")
print(lab3["subject_id"].nunique())

Number of patients remaining in the database: 
17853


In [9]:
# Restrict to the lab tests used in the paper (INR(PT) is the only one not
# included). MCHC was added so the selection is comparable to the
# GRU-ODE-Bayes MIMIC-III setup.
# The labels must match the item dictionary exactly, so spellings are kept
# as they are.
subset = [
    "Albumin",
    "Alanine Aminotransferase (ALT)",
    "Alkaline Phosphatase",
    "Anion Gap",
    "Asparate Aminotransferase (AST)",
    "Base Excess",
    "Basophils",
    "Bicarbonate",
    "Bilirubin, Total",
    "Calcium, Total",
    "Calculated Total CO2",
    "Chloride",
    "Creatinine",
    "Eosinophils",
    "Glucose",
    "Hematocrit",
    "Hemoglobin",
    "Lactate",
    "Lymphocytes",
    "MCH",
    "MCHC",
    "MCV",
    "Magnesium",
    "Monocytes",
    "Neutrophils",
    "PT",
    "PTT",
    "Phosphate",
    "Platelet Count",
    "Potassium",
    "RDW",
    "Red Blood Cells",
    "Sodium",
    "Specific Gravity",
    "Urea Nitrogen",
    "White Blood Cells",
    "pCO2",
    "pH",
    "pO2",
]

in_subset = lab3["label"].isin(subset)
lab3 = lab3.loc[in_subset].copy()

In [11]:
# Sanity check: count the distinct patients left after the subset filter.
n_patients_subset = lab3["subject_id"].nunique()
print("Number of patients remaining in the database: ")
print(n_patients_subset)

Number of patients remaining in the database: 
17851


In [10]:
# Persist the filtered lab events for the next preprocessing step.
# The DataFrame index is written as the first (unnamed) column.
lab_output_csv = f"{file_store_path}/lab_processed.csv"
lab3.to_csv(lab_output_csv)