In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from tqdm import tqdm

In [2]:
# Root folder of the MIMIC-IV (v2.2) tables. Set the MIMIC_IV_DIR environment
# variable to point at a different copy; otherwise the original local path is used.
ehr_data_dir = os.environ.get("MIMIC_IV_DIR", "/home/mengliang/DatasetFolder/mimiciv/2.2")

In [3]:
def list_files_in_directory(directory):
    """Return the names of the regular files directly inside ``directory``.

    Sub-directories are skipped; entries are tested with ``os.path.isfile``.

    Args:
        directory: Path of the directory to scan.

    Returns:
        list[str]: Bare file names (no directory prefix), in the order
        ``os.listdir`` reports them.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        OSError: For any other failure to read the directory.
    """
    # Errors are allowed to propagate on purpose: returning the error message
    # as a string would make callers that loop over the result iterate over
    # the message one character at a time instead of failing visibly.
    return [
        name
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    ]
    
# Show which tables are available in the ICU module of the dataset.
files = list_files_in_directory(os.path.join(ehr_data_dir, "icu"))
for file in files:
    print(file)

procedureevents.csv.gz
chartevents.csv.gz
index.html
ingredientevents.csv.gz
outputevents.csv.gz
inputevents.csv.gz
caregiver.csv.gz
d_items.csv.gz
icustays.csv.gz
datetimeevents.csv.gz


### ICU stays

In [4]:
# Read the gzip-compressed ICU stay table; index_col=False keeps every CSV
# column as data instead of promoting the first one to the index.
icu_icustays_path = os.path.join(ehr_data_dir, "icu/icustays.csv.gz")
df_icu_icustays = pd.read_csv(
    icu_icustays_path,
    compression="gzip",
    index_col=False,
)
df_icu_icustays.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
0,10000032,29079034,39553978,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 14:00:00,2180-07-23 23:50:47,0.410266
1,10000980,26913865,39765666,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 08:42:00,2189-06-27 20:38:27,0.497535
2,10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.118032
3,10001217,27703517,34592300,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-19 15:42:24,2157-12-20 14:27:41,0.948113
4,10001725,25563031,31205490,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2110-04-11 15:52:22,2110-04-12 23:59:56,1.338588


In [5]:
# Number of distinct patients, hospital admissions and ICU stays, in that order.
# dropna=False makes nunique count a missing value as one distinct value,
# matching len(Series.unique()).
for id_column in ("subject_id", "hadm_id", "stay_id"):
    print(df_icu_icustays[id_column].nunique(dropna=False))

# Column types as parsed by read_csv.
print(df_icu_icustays.dtypes)

50920
66239
73181
subject_id          int64
hadm_id             int64
stay_id             int64
first_careunit     object
last_careunit      object
intime             object
outtime            object
los               float64
dtype: object


### Load discharge summary data

In [6]:
# Root folder of the MIMIC-IV-Note (v2.2) tables. Set the MIMIC_IV_NOTE_DIR
# environment variable to point at a different copy; otherwise the original
# local path is used.
note_data_dir = os.environ.get("MIMIC_IV_NOTE_DIR", "/home/mengliang/DatasetFolder/mimic-iv-note/2.2")

# Read the gzip-compressed discharge summary table; index_col=False keeps
# every CSV column as data instead of promoting the first one to the index.
note_discharge_path = os.path.join(note_data_dir, "note/discharge.csv.gz")
df_note_discharge = pd.read_csv(
    note_discharge_path,
    compression="gzip",
    index_col=False,
)
df_note_discharge.head()

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text
0,10000032-DS-21,10000032,22595853,DS,21,2180-05-07 00:00:00,2180-05-09 15:26:00,\nName: ___ Unit No: _...
1,10000032-DS-22,10000032,22841357,DS,22,2180-06-27 00:00:00,2180-07-01 10:15:00,\nName: ___ Unit No: _...
2,10000032-DS-23,10000032,29079034,DS,23,2180-07-25 00:00:00,2180-07-25 21:42:00,\nName: ___ Unit No: _...
3,10000032-DS-24,10000032,25742920,DS,24,2180-08-07 00:00:00,2180-08-10 05:43:00,\nName: ___ Unit No: _...
4,10000084-DS-17,10000084,23052089,DS,17,2160-11-25 00:00:00,2160-11-25 15:09:00,\nName: ___ Unit No: __...


In [9]:
# Total number of discharge summary rows.
print(df_note_discharge.shape[0])

331794


In [10]:
# Pair every ICU stay with the discharge note(s) that share its subject_id and
# hadm_id. This is an inner join, so rows without a partner on the other side
# are dropped.
icu_discharge_merged = df_icu_icustays.merge(
    df_note_discharge,
    how="inner",
    on=["subject_id", "hadm_id"],
)
icu_discharge_merged.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los,note_id,note_type,note_seq,charttime,storetime,text
0,10000032,29079034,39553978,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 14:00:00,2180-07-23 23:50:47,0.410266,10000032-DS-23,DS,23,2180-07-25 00:00:00,2180-07-25 21:42:00,\nName: ___ Unit No: _...
1,10000980,26913865,39765666,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 08:42:00,2189-06-27 20:38:27,0.497535,10000980-DS-21,DS,21,2189-07-03 00:00:00,2189-07-03 19:50:00,\nName: ___ Unit No: ___\n \nAdmi...
2,10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.118032,10001217-DS-4,DS,4,2157-11-25 00:00:00,2157-11-25 17:26:00,\nName: ___ Unit No: ___\n \n...
3,10001217,27703517,34592300,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-19 15:42:24,2157-12-20 14:27:41,0.948113,10001217-DS-5,DS,5,2157-12-24 00:00:00,2157-12-24 15:57:00,\nName: ___ Unit No: ___\n \n...
4,10001725,25563031,31205490,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2110-04-11 15:52:22,2110-04-12 23:59:56,1.338588,10001725-DS-12,DS,12,2110-04-14 00:00:00,2110-04-19 17:44:00,\nName: ___ Unit No: ___\n \nA...


In [11]:
# Row count of the merged table, then the number of distinct patients in it.
print(icu_discharge_merged.shape[0])
print(icu_discharge_merged["subject_id"].nunique(dropna=False))

72201
50253


In [12]:
# save the merged icu discharge report data to csv file
# to_csv does not create missing parent folders, so make sure the output
# directory exists first (no-op when it is already there).
os.makedirs("outputs", exist_ok=True)
icu_discharge_merged.to_csv("outputs/icu_discharge_merged.csv.gz", 
                            compression="gzip", index=False)

In [None]:
def get_readable_file_size(file_path):
    """Return the size of ``file_path`` as a human-readable string.

    The byte count is divided by 1024 until it drops below 1024, then
    formatted with two decimals and the matching unit (e.g. ``"1.50 KB"``).

    Args:
        file_path: Path of an existing file.

    Returns:
        str: The formatted size. Sizes of 1024 TB or more are reported in PB
        rather than falling off the end of the unit list and returning None.

    Raises:
        OSError: If the file does not exist or is inaccessible (raised by
            ``os.path.getsize``).
    """
    file_size = float(os.path.getsize(file_path))
    units = ('B', 'KB', 'MB', 'GB', 'TB', 'PB')
    for unit in units[:-1]:
        if file_size < 1024:
            return f"{file_size:.2f} {unit}"
        file_size /= 1024
    # Largest known unit: report whatever is left without scaling further.
    return f"{file_size:.2f} {units[-1]}"
    

# Report how large the compressed merge output is on disk.
file_path = "outputs/icu_discharge_merged.csv.gz"
readable_file_size = get_readable_file_size(file_path)
print("File size:", readable_file_size)

File size: 281.51 MB


In [14]:
# Pick the merged rows of a single hospital admission to use as an example.
df_example = icu_discharge_merged.loc[icu_discharge_merged["hadm_id"].eq(27882036)]
df_example.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los,note_id,note_type,note_seq,charttime,storetime,text
84,10012853,27882036,31338022,Trauma SICU (TSICU),Trauma SICU (TSICU),2176-11-26 02:34:49,2176-11-29 20:58:54,3.766725,10012853-DS-10,DS,10,2176-12-03 00:00:00,2176-12-03 13:41:00,\nName: ___ Unit No: _...


In [15]:
# Print the full note text of the first example row.
a = df_example["text"].iloc[0]
print(a)

 
Name:  ___                     Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: MEDICINE
 
Allergies: 
No Known Allergies / Adverse Drug Reactions
 
Attending: ___.
 
Chief Complaint:
Hypoxic and Hypercarbic Respiratory Failure, deep vein 
thromboses
 
Major Surgical or Invasive Procedure:
None
 
History of Present Illness:
___ y/o F with atrial fibrillation on warfarin, PE, CKD III, PVD, 
multinodular goiter s/p biopsy w/ possible follicular neoplasm 
in ___ who was sent to ___ ED after being found to have L 
jugular and subclavian venous thrombosis despite therapeutic INR 
on warfarin. Patient found to have airway stenosis on CT 
imaging, and s/s of hypercarbic respiratory failure, w/ABG 
consistent w/acute on chronic hypercapnic respiratory failure, 
sent to ICU for monitoring and further workup.  

 
Past Medical History:
- T2DM (HbA1c 6.1% in ___, diet controlled)
- HLD
- CKD III
- PVD
- OA
- iron defi