In [1]:
import os
import numpy as numpy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from tqdm import tqdm
from utils.utils import get_readable_file_size

In [2]:
# Root directory of the local MIMIC-IV 2.2 files. It can be overridden with the
# MIMICIV_DATA_DIR environment variable so the notebook also runs on machines
# that do not share this home-directory layout; the default is the original path.
ehr_data_dir = os.environ.get("MIMICIV_DATA_DIR", "/home/mengliang/DatasetFolder/mimiciv/2.2")

In [3]:
# Read the gzip-compressed ICU output-event table located under the EHR data root.
icu_outputevents_path = os.path.join(ehr_data_dir, "icu/outputevents.csv.gz")
df_icu_outputevents = pd.read_csv(
    icu_outputevents_path,
    compression="gzip",
    index_col=False,  # do not promote the first column to the index
)
# Preview the first rows.
df_icu_outputevents.head()

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valueuom
0,10000032,29079034,39553978,88981,2180-07-23 15:00:00,2180-07-23 16:00:00,226560,175.0,ml
1,10000980,26913865,39765666,36518,2189-06-27 09:08:00,2189-06-27 09:08:00,226559,450.0,ml
2,10000980,26913865,39765666,36518,2189-06-27 09:08:00,2189-06-27 09:08:00,226633,400.0,ml
3,10000980,26913865,39765666,36518,2189-06-27 11:00:00,2189-06-27 10:51:00,226559,600.0,ml
4,10000980,26913865,39765666,36518,2189-06-27 13:00:00,2189-06-27 12:55:00,226559,800.0,ml


In [4]:
# Number of distinct values in each identifier column (missing values, if any,
# are counted as one value, exactly as len(unique()) would).
for id_column in ("subject_id", "hadm_id", "stay_id"):
    print(df_icu_outputevents[id_column].nunique(dropna=False))
# Column dtypes as inferred when the CSV was read.
print(df_icu_outputevents.dtypes)

50198
64692
71111
subject_id        int64
hadm_id           int64
stay_id           int64
caregiver_id      int64
charttime        object
storetime        object
itemid            int64
value           float64
valueuom         object
dtype: object


In [5]:
# Read the item dictionary (itemid, label, category, ...) from the same data root.
icu_items_path = os.path.join(ehr_data_dir, "icu/d_items.csv.gz")
df_items = pd.read_csv(icu_items_path)
df_items.head()

Unnamed: 0,itemid,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
0,220001,Problem List,Problem List,chartevents,General,,Text,,
1,220003,ICU Admission date,ICU Admission date,datetimeevents,ADT,,Date and time,,
2,220045,Heart Rate,HR,chartevents,Routine Vital Signs,bpm,Numeric,,
3,220046,Heart rate Alarm - High,HR Alarm - High,chartevents,Alarms,bpm,Numeric,,
4,220047,Heart Rate Alarm - Low,HR Alarm - Low,chartevents,Alarms,bpm,Numeric,,


In [6]:
# Attach the item metadata to every output event; the left join keeps all event
# rows even when an itemid has no match in the dictionary.
df_icu_outputevents_merge = df_icu_outputevents.merge(df_items, on="itemid", how="left")
df_icu_outputevents_merge.head()

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valueuom,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
0,10000032,29079034,39553978,88981,2180-07-23 15:00:00,2180-07-23 16:00:00,226560,175.0,ml,Void,Void,outputevents,Output,mL,Numeric,,
1,10000980,26913865,39765666,36518,2189-06-27 09:08:00,2189-06-27 09:08:00,226559,450.0,ml,Foley,Foley,outputevents,Output,mL,Numeric,,
2,10000980,26913865,39765666,36518,2189-06-27 09:08:00,2189-06-27 09:08:00,226633,400.0,ml,Pre-Admission,Pre-Admission,outputevents,Output,mL,Numeric,,
3,10000980,26913865,39765666,36518,2189-06-27 11:00:00,2189-06-27 10:51:00,226559,600.0,ml,Foley,Foley,outputevents,Output,mL,Numeric,,
4,10000980,26913865,39765666,36518,2189-06-27 13:00:00,2189-06-27 12:55:00,226559,800.0,ml,Foley,Foley,outputevents,Output,mL,Numeric,,


In [7]:
# Keep only the identifier, time, item and value columns and lower-case the item
# labels. `.assign` returns a new frame, so the label column is never written
# into a selection of the merged frame, which is what raises pandas'
# SettingWithCopyWarning when the column is assigned after `df[columns]`.
selected_columns = ["subject_id", "hadm_id", "stay_id", "charttime", "itemid", "label",
                    "value", "valueuom"]
df_icu_outputevents_merge = df_icu_outputevents_merge[selected_columns].assign(
    label=lambda frame: frame["label"].str.lower()
)

df_icu_outputevents_merge.head()

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,itemid,label,value,valueuom
0,10000032,29079034,39553978,2180-07-23 15:00:00,226560,void,175.0,ml
1,10000980,26913865,39765666,2189-06-27 09:08:00,226559,foley,450.0,ml
2,10000980,26913865,39765666,2189-06-27 09:08:00,226633,pre-admission,400.0,ml
3,10000980,26913865,39765666,2189-06-27 11:00:00,226559,foley,600.0,ml
4,10000980,26913865,39765666,2189-06-27 13:00:00,226559,foley,800.0,ml


In [8]:
# Row count, then the frequency of each label and each unit, then the column index.
print(len(df_icu_outputevents_merge))
for column_name in ("label", "valueuom"):
    print(df_icu_outputevents_merge[column_name].value_counts())
print(df_icu_outputevents_merge.columns)

4234967
label
foley                      2977848
void                        269351
chest tube #1               261260
cerebral ventricular #1      98095
tf residual                  86211
                            ...   
penrose #2                      81
davol                           55
anderson                        23
anderson (gastric)              18
ewald                           17
Name: count, Length: 71, dtype: int64
valueuom
ml    4234967
Name: count, dtype: int64
Index(['subject_id', 'hadm_id', 'stay_id', 'charttime', 'itemid', 'label',
       'value', 'valueuom'],
      dtype='object')


In [9]:
# Read outputs/icu_cxr.csv.gz and reduce it to its distinct (subject_id, hadm_id)
# pairs, keeping the first occurrence of each pair and its original row index.
icu_cxr = pd.read_csv(os.path.join("outputs", "icu_cxr.csv.gz"), compression="gzip")
selected_columns = ["subject_id", "hadm_id"]
icu_subject_hadm_df = icu_cxr[selected_columns].drop_duplicates(keep="first")
icu_subject_hadm_df.head()

Unnamed: 0,subject_id,hadm_id
0,10001217,27703517
1,10001884,26184834
6,10002013,23581541
10,10002428,28662225
11,10002428,23473524


In [10]:
# Keep only the output events whose (subject_id, hadm_id) pair is present in
# icu_subject_hadm_df; the inner join drops every other event.
merge_keys = ["subject_id", "hadm_id"]
df_icu_outputevents_result = icu_subject_hadm_df.merge(
    df_icu_outputevents_merge, how="inner", on=merge_keys
)
df_icu_outputevents_result.head()

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,itemid,label,value,valueuom
0,10001217,27703517,34592300,2157-12-19 19:40:00,226560,void,650.0,ml
1,10001217,27703517,34592300,2157-12-19 23:00:00,226560,void,900.0,ml
2,10001217,27703517,34592300,2157-12-20 04:00:00,226560,void,225.0,ml
3,10001217,27703517,34592300,2157-12-20 11:00:00,226560,void,500.0,ml
4,10001217,27703517,34592300,2157-12-20 08:00:00,226560,void,200.0,ml


In [None]:
# Write the filtered output events as a gzip-compressed CSV, without the index column.
icu_outputevents_out_path = os.path.join("outputs", "icu_outputevents.csv.gz")
df_icu_outputevents_result.to_csv(icu_outputevents_out_path, compression="gzip", index=False)

In [12]:
# Report the size of the exported file as formatted by the project helper.
saved_file_path = os.path.join("outputs", "icu_outputevents.csv.gz")
readable_file_size = get_readable_file_size(saved_file_path)
print(f"File size: {readable_file_size}")

File size: 6.30 MB


In [13]:
# Select the visible GPUs BEFORE torch is imported, so the setting is guaranteed
# to be in place before any CUDA initialisation can happen in this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "5, 6"

import torch
import ollama

# Use CUDA when torch can see a device, otherwise fall back to the CPU.
# NOTE(review): ollama.chat presumably talks to a separately running Ollama
# server; if so, neither this variable nor `device` changes where the model
# runs — confirm.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [14]:
# Open the file in read mode and read the content as a string
with open('files/outputevents_description.txt', 'r') as file:
    description_text = file.read()

with open('files/prompt_text.txt', 'r') as file:
    prompt_text = file.read()
# Print the content of the file
print(description_text)
print(prompt_text)

This is the description to the icu_outputevents.csv.gz file. This file is located in outputs/icu_outputevents.csv.gz.
subject_id: A unique identifier for each patient. This ID is specific to each patient to distinguish them from others in the dataset.
hadm_id: A unique identifier for each hospital admission. Each hospital stay has its own hadm_id, allowing for the separation of different hospitalizations for the same patient.
stay_id: In MIMIC-IV, this represents a unique identifier for each ICU stay. Each ICU stay has a distinct stay_id, enabling differentiation between multiple ICU visits for the same patient within or across admissions.
charttime: The recorded time for the output event. This timestamp marks when the output was measured or recorded, often down to the minute or second.
itemid: A unique identifier for the output item. Each itemid code corresponds to a specific type of output, such as urine or drainage.
label: A human-readable name or label for the output item. This pro

In [15]:
# extract information from discharge summary
description_text = description_text
prompt_text = prompt_text
question_text = "How many times foley does the patieng 10001217 have during the hospital stay?"
input_text = description_text + " " + prompt_text + " " + question_text

response = ollama.chat(model="qwen2.5:14b", messages = [{"role": "user", "content": input_text,}], stream=False)
#response = ollama.chat(model="llama3.1:8b", messages = [{"role": "user", "content": input_text,}], stream=False)
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the dataset
file_path = 'outputs/icu_outputevents.csv.gz'
data = pd.read_csv(file_path)

# Filter for patient ID 10001217 and label "foley"
patient_data = data[(data['subject_id'] == 10001217) & (data['label'] == 'foley')]

# Count the number of times foley is recorded
result = len(patient_data)
print(result)
```


In [16]:
# Count the rows recorded for patient 10001217 under the "foley" label.
file_path = 'outputs/icu_outputevents.csv.gz'
data = pd.read_csv(file_path)

# Build the two row filters separately, then combine them.
is_patient = data['subject_id'] == 10001217
is_foley = data['label'] == 'foley'
patient_data = data[is_patient & is_foley]

# The number of matching rows is the answer.
result = patient_data.shape[0]
print(result)


0


In [17]:
# Ask the model a question about the exported output-events table. The request
# is the dataset description, the prompt text and the question, joined by spaces.
question_text = "What is the amount of foley that the patient 10001217 has during the hospital stay?"
input_text = " ".join([description_text, prompt_text, question_text])

# Single non-streaming chat request; "llama3.1:8b" was the alternative model tried here.
response = ollama.chat(
    model="qwen2.5:14b",
    messages=[{"role": "user", "content": input_text}],
    stream=False,
)
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the CSV file
file_path = 'outputs/icu_outputevents.csv.gz'
data = pd.read_csv(file_path)

# Filter for patient with subject_id 10001217 and itemid corresponding to foley
patient_data = data[(data['subject_id'] == 10001217) & (data['label'] == 'foley')]

# Sum the value column to get total amount of foley for patient during hospital stay
result = patient_data['value'].sum()
print(result)
```


In [18]:
# Total of the `value` column for patient 10001217 over rows labelled "foley".
file_path = 'outputs/icu_outputevents.csv.gz'
data = pd.read_csv(file_path)

# Select the rows that match both the patient id and the label.
foley_mask = (data['subject_id'] == 10001217) & (data['label'] == 'foley')
patient_data = data.loc[foley_mask]

# Summing an empty selection yields 0.0.
result = patient_data['value'].sum()
print(result)


0.0
