In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from tqdm import tqdm
from utils.utils import get_readable_file_size

In [2]:
# Root folder of the EHR tables that the cells below read from.
# The MIMICIV_DATA_DIR environment variable overrides the location so the
# notebook can run on another machine; the original path stays the default.
ehr_data_dir = os.environ.get("MIMICIV_DATA_DIR", "/home/mengliang/DatasetFolder/mimiciv/2.2")

### icustays

In [3]:
# Read the gzip-compressed icustays table stored under ehr_data_dir.
icu_icustays_path = os.path.join(ehr_data_dir, "icu/icustays.csv.gz")
df_icu_icustays = pd.read_csv(
    icu_icustays_path,
    compression="gzip",
    index_col=False,  # never promote the first CSV column to the index
)
# Show the first rows as the cell output.
df_icu_icustays.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
0,10000032,29079034,39553978,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 14:00:00,2180-07-23 23:50:47,0.410266
1,10000980,26913865,39765666,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 08:42:00,2189-06-27 20:38:27,0.497535
2,10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.118032
3,10001217,27703517,34592300,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-19 15:42:24,2157-12-20 14:27:41,0.948113
4,10001725,25563031,31205490,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2110-04-11 15:52:22,2110-04-12 23:59:56,1.338588


In [4]:
# Count the distinct values of each identifier column (patients, hospital
# admissions, ICU stays), then list the column dtypes.
for id_column in ("subject_id", "hadm_id", "stay_id"):
    print(len(df_icu_icustays[id_column].unique()))
print(df_icu_icustays.dtypes)

50920
66239
73181
subject_id          int64
hadm_id             int64
stay_id             int64
first_careunit     object
last_careunit      object
intime             object
outtime            object
los               float64
dtype: object


In [5]:
# Lower-case the two care-unit label columns and keep only the listed
# columns (this drops "los"), all in one chained expression.
selected_columns = ["subject_id", "hadm_id", "stay_id", "first_careunit", "last_careunit", "intime", "outtime"]
df_icu_icustays = df_icu_icustays.assign(
    first_careunit=lambda frame: frame["first_careunit"].str.lower(),
    last_careunit=lambda frame: frame["last_careunit"].str.lower(),
)[selected_columns]
df_icu_icustays.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime
0,10000032,29079034,39553978,medical intensive care unit (micu),medical intensive care unit (micu),2180-07-23 14:00:00,2180-07-23 23:50:47
1,10000980,26913865,39765666,medical intensive care unit (micu),medical intensive care unit (micu),2189-06-27 08:42:00,2189-06-27 20:38:27
2,10001217,24597018,37067082,surgical intensive care unit (sicu),surgical intensive care unit (sicu),2157-11-20 19:18:02,2157-11-21 22:08:00
3,10001217,27703517,34592300,surgical intensive care unit (sicu),surgical intensive care unit (sicu),2157-12-19 15:42:24,2157-12-20 14:27:41
4,10001725,25563031,31205490,medical/surgical intensive care unit (micu/sicu),medical/surgical intensive care unit (micu/sicu),2110-04-11 15:52:22,2110-04-12 23:59:56


In [6]:
# Load the previously exported icu_cxr table and reduce it to the distinct
# (subject_id, hadm_id) pairs. The original row index is kept, so index
# labels may have gaps after de-duplication.
icu_cxr = pd.read_csv(os.path.join("outputs", "icu_cxr.csv.gz"), compression="gzip")
selected_columns = ["subject_id", "hadm_id"]
icu_subject_hadm_df = icu_cxr[selected_columns].drop_duplicates(keep="first")
icu_subject_hadm_df.head()

Unnamed: 0,subject_id,hadm_id
0,10001217,27703517
1,10001884,26184834
6,10002013,23581541
10,10002428,28662225
11,10002428,23473524


In [8]:
# Keep only the ICU stays whose (subject_id, hadm_id) pair appears in
# icu_subject_hadm_df. One admission can have several ICU stays, so the
# right side may match many rows per key; the left side must be unique per
# key or the join would silently duplicate stays. validate= makes pandas
# raise a MergeError instead if that assumption is ever broken (e.g. by
# running cells out of order).
df_icu_icustays_result = pd.merge(
    icu_subject_hadm_df,
    df_icu_icustays,
    on=["subject_id", "hadm_id"],
    how="inner",
    validate="one_to_many",
)
df_icu_icustays_result.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime
0,10001217,27703517,34592300,surgical intensive care unit (sicu),surgical intensive care unit (sicu),2157-12-19 15:42:24,2157-12-20 14:27:41
1,10001884,26184834,37510196,medical intensive care unit (micu),medical intensive care unit (micu),2131-01-11 04:20:05,2131-01-20 08:27:30
2,10002013,23581541,39060235,cardiac vascular intensive care unit (cvicu),cardiac vascular intensive care unit (cvicu),2160-05-18 10:00:53,2160-05-19 17:33:33
3,10002428,28662225,33987268,medical intensive care unit (micu),medical intensive care unit (micu),2156-04-12 16:24:18,2156-04-17 15:57:08
4,10002428,28662225,38875437,medical intensive care unit (micu),medical intensive care unit (micu),2156-04-19 18:11:19,2156-04-26 18:58:41


In [9]:
# Write the filtered ICU stays to outputs/ as a gzip-compressed CSV,
# without the DataFrame index column.
df_icu_icustays_result.to_csv(
    os.path.join("outputs", "icu_icustays.csv.gz"),
    compression="gzip",
    index=False,
)

In [10]:
# Ask the project helper for the size of the exported file and print it.
readable_file_size = get_readable_file_size(
    os.path.join("outputs", "icu_icustays.csv.gz")
)
print(f"File size: {readable_file_size}")

File size: 527.29 KB


In [11]:
import torch
import ollama

# Restrict this process to the listed GPU indices.
# NOTE(review): torch is imported before this assignment; the setting is
# presumably only honoured if CUDA has not been initialised yet - confirm,
# or move it ahead of the torch import.
os.environ["CUDA_VISIBLE_DEVICES"] = "5, 6"

# Use the GPU backend when torch reports one, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [14]:
# Open the file in read mode and read the content as a string
# Load the table description and the instruction prompt as plain strings.
with open('files/icustays_description.txt', 'r') as description_file:
    description_text = description_file.read()

with open('files/prompt_text.txt', 'r') as prompt_file:
    prompt_text = prompt_file.read()

# Echo both texts so the exact model input is visible in the notebook.
for loaded_text in (description_text, prompt_text):
    print(loaded_text)

This is the description to the icu_icustays.csv.gz file. This file is located in outputs/icu_icustays.csv.gz.
subject_id: This is the unique identifier for each patient in the dataset, allowing tracking of patient-level data across different hospital stays and tables.
hadm_id: The unique identifier for each hospital admission, which is specific to a single hospital stay. This ID links data for a patient's entire admission period.
stay_id: A unique identifier for each ICU stay. Since patients can have multiple ICU stays within a single hospital admission, this ID is used to distinguish between them.
first_careunit: This specifies the name of the first care unit where the patient was admitted during this ICU stay. Care units refer to different specialized sections in the ICU (such as Medical ICU, Surgical ICU, etc.).
last_careunit: This indicates the name of the last care unit where the patient stayed before discharge from the ICU.
intime: The date and time the patient was admitted to th

In [15]:
# extract information from discharge summary
# Build a single user message from the table description, the instruction
# prompt and the question (space-separated), send it to the qwen2.5:14b
# model through ollama.chat without streaming, and print the reply text.
question_text = "How many hours does the patient 10001217 have been in the ICU?"
input_text = " ".join([description_text, prompt_text, question_text])

chat_messages = [{"role": "user", "content": input_text}]
response = ollama.chat(model="qwen2.5:14b", messages=chat_messages, stream=False)
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd
from datetime import timedelta

# Load the data
file_path = 'outputs/icu_icustays.csv.gz'
icu_data = pd.read_csv(file_path)

# Filter for patient 10001217
patient_10001217 = icu_data[icu_data['subject_id'] == 10001217]

# Calculate total hours in ICU
total_hours = timedelta()
for index, row in patient_10001217.iterrows():
    intime = pd.to_datetime(row['intime'])
    outtime = pd.to_datetime(row['outtime'])
    duration = outtime - intime
    total_hours += duration

# Convert total time to hours
result = total_hours.total_seconds() / 3600
print(result)
```


In [16]:
import pandas as pd
from datetime import timedelta

# Load the data
file_path = 'outputs/icu_icustays.csv.gz'
icu_data = pd.read_csv(file_path)

# Filter for patient 10001217
patient_10001217 = icu_data[icu_data['subject_id'] == 10001217]

# Calculate total hours in ICU
total_hours = timedelta()
for index, row in patient_10001217.iterrows():
    intime = pd.to_datetime(row['intime'])
    outtime = pd.to_datetime(row['outtime'])
    duration = outtime - intime
    total_hours += duration

# Convert total time to hours
result = total_hours.total_seconds() / 3600
print(result)

22.754722222222224
