In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from tqdm import tqdm
from utils.utils import get_readable_file_size

In [2]:
ehr_data_dir = "/home/mengliang/DatasetFolder/mimiciv/2.2"

In [3]:
icu_cxr = pd.read_csv(os.path.join("outputs", "icu_cxr.csv.gz"), compression="gzip")
selected_columns = ["subject_id", "hadm_id"]
icu_subject_hadm_df = icu_cxr[selected_columns].copy()
icu_subject_hadm_df.drop_duplicates(inplace=True, keep="first")
icu_subject_hadm_df.head()

Unnamed: 0,subject_id,hadm_id
0,10001217,27703517
1,10001884,26184834
6,10002013,23581541
10,10002428,28662225
11,10002428,23473524


In [4]:
ehr_labevents_path = os.path.join(ehr_data_dir, "hosp/labevents.csv.gz")
df_ehr_labevents = pd.read_csv(ehr_labevents_path, index_col=False)
df_ehr_labevents.head()

Unnamed: 0,labevent_id,subject_id,hadm_id,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments
0,1,10000032,,45421181,51237,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,1.4,1.4,,0.9,1.1,abnormal,ROUTINE,
1,2,10000032,,45421181,51274,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,___,15.1,sec,9.4,12.5,abnormal,ROUTINE,VERIFIED.
2,3,10000032,,52958335,50853,P28Z0X,2180-03-23 11:51:00,2180-03-25 11:06:00,___,15.0,ng/mL,30.0,60.0,abnormal,ROUTINE,NEW ASSAY IN USE ___: DETECTS D2 AND D3 25-OH ...
3,4,10000032,,52958335,50861,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,102,102.0,IU/L,0.0,40.0,abnormal,ROUTINE,
4,5,10000032,,52958335,50862,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,3.3,3.3,g/dL,3.5,5.2,abnormal,ROUTINE,


In [5]:
print(df_ehr_labevents.columns)
print(len(df_ehr_labevents))
print(df_ehr_labevents["flag"].value_counts())

Index(['labevent_id', 'subject_id', 'hadm_id', 'specimen_id', 'itemid',
       'order_provider_id', 'charttime', 'storetime', 'value', 'valuenum',
       'valueuom', 'ref_range_lower', 'ref_range_upper', 'flag', 'priority',
       'comments'],
      dtype='object')
118171367
flag
abnormal    33554155
Name: count, dtype: int64


In [6]:
df_icu_labevents = pd.merge(icu_subject_hadm_df, df_ehr_labevents, 
                            on=["subject_id", "hadm_id"], 
                            how="inner")

selected_columns = ["subject_id", "hadm_id", "itemid", "charttime", "valuenum", "valueuom",
                    "ref_range_lower", "ref_range_upper"]
df_icu_labevents = df_icu_labevents[selected_columns]
df_icu_labevents.head()

Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,valueuom,ref_range_lower,ref_range_upper
0,10001217,27703517,51237,2157-12-19 04:50:00,1.1,,0.9,1.1
1,10001217,27703517,51274,2157-12-19 04:50:00,12.3,sec,9.4,12.5
2,10001217,27703517,51275,2157-12-19 04:50:00,32.8,sec,25.0,36.5
3,10001217,27703517,51146,2157-12-19 04:50:00,1.5,%,0.0,2.0
4,10001217,27703517,51200,2157-12-19 04:50:00,2.7,%,0.0,4.0


In [7]:
df_labitems = pd.read_csv(os.path.join(ehr_data_dir, "hosp/d_labitems.csv.gz"), 
                         index_col=False, compression='gzip')
df_labitems.head()

Unnamed: 0,itemid,label,fluid,category
0,50801,Alveolar-arterial Gradient,Blood,Blood Gas
1,50802,Base Excess,Blood,Blood Gas
2,50803,"Calculated Bicarbonate, Whole Blood",Blood,Blood Gas
3,50804,Calculated Total CO2,Blood,Blood Gas
4,50805,Carboxyhemoglobin,Blood,Blood Gas


In [8]:
df_icu_labevents = pd.merge(df_icu_labevents, df_labitems, 
                            on=["itemid"], 
                            how="inner")
df_icu_labevents.head()

Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,valueuom,ref_range_lower,ref_range_upper,label,fluid,category
0,10001217,27703517,51237,2157-12-19 04:50:00,1.1,,0.9,1.1,INR(PT),Blood,Hematology
1,10001217,27703517,51237,2157-12-20 01:45:00,1.1,,0.9,1.1,INR(PT),Blood,Hematology
2,10001884,26184834,51237,2131-01-11 06:31:00,1.4,,0.9,1.1,INR(PT),Blood,Hematology
3,10001884,26184834,51237,2131-01-12 03:34:00,1.2,,0.9,1.1,INR(PT),Blood,Hematology
4,10001884,26184834,51237,2131-01-12 10:04:00,1.2,,0.9,1.1,INR(PT),Blood,Hematology


In [9]:
print(len(df_icu_labevents))

7918549


In [10]:
df_icu_labevents_ref = df_icu_labevents[df_icu_labevents["ref_range_lower"].notna() & df_icu_labevents["ref_range_upper"].notna()]
print(len(df_icu_labevents_ref))

6770240


In [11]:
df_icu_labevents_ref.dtypes

subject_id           int64
hadm_id              int64
itemid               int64
charttime           object
valuenum           float64
valueuom            object
ref_range_lower    float64
ref_range_upper    float64
label               object
fluid               object
category            object
dtype: object

In [12]:
df_icu_labevents_ref["label"].value_counts()

label
Glucose                  274453
Sodium                   256908
Potassium                256595
Chloride                 255704
Creatinine               241745
                          ...  
Immature Granulocytes         3
Protein C, Antigen            3
Protein S, Antigen            2
Bleeding Time                 1
Epithelial Casts              1
Name: count, Length: 242, dtype: int64

Print the labtest labels and counts. Print the labtest label and the corresponding value unit, ref value.

In [13]:
itemid_counts = df_icu_labevents_ref['itemid'].value_counts().reset_index()
itemid_counts.columns = ['itemid', 'count']

df_unique = df_icu_labevents_ref[['itemid', 'label', 'valueuom', 'ref_range_lower', 'ref_range_upper']].drop_duplicates()

labevents_label_info = pd.merge(df_unique, itemid_counts, on='itemid', how='left')
labevents_label_info.to_csv("outputs/labevents_label_info.csv", index=False)

In [14]:
blood_itemid_list = [51222, 51221, 51301, 51279, 51265, 51250, 51248, 51249, 51277, 52172]
df_icu_labevents_blood = df_icu_labevents[df_icu_labevents['itemid'].isin(blood_itemid_list)]
df_icu_labevents_blood.reset_index(inplace=True, drop=True)

df_icu_labevents_blood["label"] = df_icu_labevents_blood["label"].str.lower()
df_icu_labevents_blood["fluid"] = df_icu_labevents_blood["fluid"].str.lower()
df_icu_labevents_blood["category"] = df_icu_labevents_blood["category"].str.lower()

df_icu_labevents_blood.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_icu_labevents_blood["label"] = df_icu_labevents_blood["label"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_icu_labevents_blood["fluid"] = df_icu_labevents_blood["fluid"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_icu_labevents_blood["category"] = df_icu_labev

Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,valueuom,ref_range_lower,ref_range_upper,label,fluid,category
0,10001217,27703517,51221,2157-12-19 04:50:00,37.4,%,36.0,48.0,hematocrit,blood,hematology
1,10001217,27703517,51221,2157-12-20 01:45:00,37.5,%,36.0,48.0,hematocrit,blood,hematology
2,10001217,27703517,51221,2157-12-21 04:40:00,37.0,%,36.0,48.0,hematocrit,blood,hematology
3,10001217,27703517,51221,2157-12-23 04:30:00,39.1,%,36.0,48.0,hematocrit,blood,hematology
4,10001217,27703517,51221,2157-12-24 03:59:00,38.5,%,36.0,48.0,hematocrit,blood,hematology


In [15]:
print(len(df_icu_labevents_blood))

2045746


In [17]:
df_icu_labevents_blood.to_csv('outputs/icu_labevents_blood.csv.gz', index=False, compression='gzip')

In [16]:
import os
import torch
import ollama

os.environ["CUDA_VISIBLE_DEVICES"] = "5, 6"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [18]:
num_gpus = torch.cuda.device_count()
print(f"Available GPUs: {num_gpus}")

Available GPUs: 2


In [19]:
# Open the file in read mode and read the content as a string
with open('files/labevents_description.txt', 'r') as file:
    description_text = file.read()

with open('files/prompt_text.txt', 'r') as file:
    prompt_text = file.read()

# Print the content of the file
print(description_text)
print(prompt_text)

This is the description to the icu_labevents_blood.csv.gz file. This file is located in outputs/icu_labevents_blood.csv.gz.
subject_id: A unique identifier assigned to each patient in the dataset. It is used to track and distinguish individual patients across multiple encounters or tests.
hadm_id: The unique identifier for a hospital admission. It associates each patient's data with a specific hospital stay, allowing for tracking of multiple admissions for the same patient.
itemid: A unique identifier for the specific lab test that was recorded for the patient. This ID helps to categorize and access specific types of tests or clinical observations.
charttime: The timestamp when the test or observation was recorded in the medical chart. It provides the time context for the medical data, which is essential for analyzing trends over time.
valuenum: The numerical value of the recorded test result or measurement. This field holds the quantitative data corresponding to the itemid (e.g., lab 

In [24]:
# extract information from discharge summary
description_text = description_text
prompt_text = prompt_text
question_text = "How many times of white blood cells test were done for patient 10001217?"
input_text = description_text + " " + prompt_text + " " + question_text

response = ollama.chat(model="qwen2.5:14b", messages = [{"role": "user", "content": input_text,}], stream=False)
#response = ollama.chat(model="llama3.1:8b", messages = [{"role": "user", "content": input_text,}], stream=False)
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the dataset
filename = 'outputs/icu_labevents_blood.csv.gz'
data = pd.read_csv(filename)

# Filter data for patient with subject_id 10001217 and test label "white blood cells"
filtered_data = data[(data['subject_id'] == 10001217) & (data['label'] == 'white blood cells')]

# Count the number of white blood cell tests
result = len(filtered_data)
print(result)
```


In [25]:
import pandas as pd

# Load the dataset
file_path = 'outputs/icu_labevents_blood.csv.gz'
data = pd.read_csv(file_path)

# Filter data for patient ID 10001217 and white blood cells test
patient_data = data[(data['subject_id'] == 10001217) & (data['label'] == 'white blood cells')]

# Count the number of tests
result = len(patient_data)
print(result)

5


In [22]:
filename = 'outputs/icu_labevents_blood.csv.gz'
df_icu_labevents_blood = pd.read_csv(filename)
blood_test = df_icu_labevents_blood[df_icu_labevents_blood["label"] == "white blood cells"]
blood_test.head()

Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,valueuom,ref_range_lower,ref_range_upper,label,fluid,category
1754736,10001217,27703517,51301,2157-12-19 04:50:00,5.4,K/uL,4.0,11.0,white blood cells,blood,hematology
1754737,10001217,27703517,51301,2157-12-20 01:45:00,8.4,K/uL,4.0,11.0,white blood cells,blood,hematology
1754738,10001217,27703517,51301,2157-12-21 04:40:00,8.4,K/uL,4.0,11.0,white blood cells,blood,hematology
1754739,10001217,27703517,51301,2157-12-23 04:30:00,7.2,K/uL,4.0,11.0,white blood cells,blood,hematology
1754740,10001217,27703517,51301,2157-12-24 03:59:00,8.8,K/uL,4.0,11.0,white blood cells,blood,hematology


In [26]:
blood_test_a = blood_test[blood_test["subject_id"] == 10001217]
blood_test_a.head(10)

Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,valueuom,ref_range_lower,ref_range_upper,label,fluid,category
1754736,10001217,27703517,51301,2157-12-19 04:50:00,5.4,K/uL,4.0,11.0,white blood cells,blood,hematology
1754737,10001217,27703517,51301,2157-12-20 01:45:00,8.4,K/uL,4.0,11.0,white blood cells,blood,hematology
1754738,10001217,27703517,51301,2157-12-21 04:40:00,8.4,K/uL,4.0,11.0,white blood cells,blood,hematology
1754739,10001217,27703517,51301,2157-12-23 04:30:00,7.2,K/uL,4.0,11.0,white blood cells,blood,hematology
1754740,10001217,27703517,51301,2157-12-24 03:59:00,8.8,K/uL,4.0,11.0,white blood cells,blood,hematology


In [27]:
print(len(blood_test_a))

5


In [28]:
# extract information from discharge summary
description_text = description_text
prompt_text = prompt_text
question_text = "What was the  earliest value of red blood cells of the patient 10001217?"
input_text = description_text + " " + prompt_text + " " + question_text

response = ollama.chat(model="qwen2.5:14b", messages = [{"role": "user", "content": input_text,}], stream=False)
#response = ollama.chat(model="llama3.1:8b", messages = [{"role": "user", "content": input_text,}], stream=False)
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the dataset
file_path = 'outputs/icu_labevents_blood.csv.gz'
data = pd.read_csv(file_path)

# Filter for patient with subject_id 10001217 and label "red blood cells"
filtered_data = data[(data['subject_id'] == 10001217) & (data['label'] == 'red blood cells')]

# Find the earliest value based on charttime
earliest_value = filtered_data.loc[filtered_data['charttime'].idxmin(), 'valuenum']

result = earliest_value
print(result)
```


In [29]:
import pandas as pd

# Load the dataset
file_path = 'outputs/icu_labevents_blood.csv.gz'
data = pd.read_csv(file_path)

# Filter for patient with subject_id 10001217 and label "red blood cells"
filtered_data = data[(data['subject_id'] == 10001217) & (data['label'] == 'red blood cells')]

# Find the earliest value based on charttime
earliest_value = filtered_data.loc[filtered_data['charttime'].idxmin(), 'valuenum']

result = earliest_value
print(result)

4.52


In [30]:
blood_test = df_icu_labevents_blood[(df_icu_labevents_blood["label"] == "red blood cells") &  (df_icu_labevents_blood["subject_id"] == 10001217)]
blood_test.head()

Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,valueuom,ref_range_lower,ref_range_upper,label,fluid,category
1538822,10001217,27703517,51279,2157-12-19 04:50:00,4.52,m/uL,4.2,5.4,red blood cells,blood,hematology
1538823,10001217,27703517,51279,2157-12-20 01:45:00,4.53,m/uL,4.2,5.4,red blood cells,blood,hematology
1538824,10001217,27703517,51279,2157-12-21 04:40:00,4.48,m/uL,4.2,5.4,red blood cells,blood,hematology
1538825,10001217,27703517,51279,2157-12-23 04:30:00,4.66,m/uL,4.2,5.4,red blood cells,blood,hematology
1538826,10001217,27703517,51279,2157-12-24 03:59:00,4.63,m/uL,4.2,5.4,red blood cells,blood,hematology


In [31]:
# extract information from discharge summary
description_text = description_text
prompt_text = prompt_text
question_text = "Please check the most recent test of red blood cells for patient 10001217 and determine whether it is within the normal range or if it indicates an abnormal result."
input_text = description_text + " " + prompt_text + " " + question_text

response = ollama.chat(model="qwen2.5:14b", messages = [{"role": "user", "content": input_text,}], stream=False)
#response = ollama.chat(model="llama3.1:8b", messages = [{"role": "user", "content": input_text,}], stream=False)
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the CSV file
file_path = 'outputs/icu_labevents_blood.csv.gz'
data = pd.read_csv(file_path)

# Filter for patient 10001217 and label "red blood cells"
patient_data = data[(data['subject_id'] == 10001217) & (data['label'] == 'red blood cells')]

# Get the most recent test
latest_test = patient_data.loc[patient_data['charttime'].idxmax()]

# Check if the result is within the normal range
valuenum = latest_test['valuenum']
ref_range_lower = latest_test['ref_range_lower']
ref_range_upper = latest_test['ref_range_upper']

if ref_range_lower <= valuenum <= ref_range_upper:
    result = "Normal"
else:
    result = "Abnormal"

print(result)
```


In [32]:
import pandas as pd

# Load the CSV file
file_path = 'outputs/icu_labevents_blood.csv.gz'
data = pd.read_csv(file_path)

# Filter for patient 10001217 and label "red blood cells"
patient_data = data[(data['subject_id'] == 10001217) & (data['label'] == 'red blood cells')]

# Get the most recent test
latest_test = patient_data.loc[patient_data['charttime'].idxmax()]

# Check if the result is within the normal range
valuenum = latest_test['valuenum']
ref_range_lower = latest_test['ref_range_lower']
ref_range_upper = latest_test['ref_range_upper']

if ref_range_lower <= valuenum <= ref_range_upper:
    result = "Normal"
else:
    result = "Abnormal"

print(result)

Normal


In [33]:
blood_test = df_icu_labevents_blood[(df_icu_labevents_blood["label"] == "red blood cells") &  (df_icu_labevents_blood["subject_id"] == 10001217)]
blood_test.head(10)

Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,valueuom,ref_range_lower,ref_range_upper,label,fluid,category
1538822,10001217,27703517,51279,2157-12-19 04:50:00,4.52,m/uL,4.2,5.4,red blood cells,blood,hematology
1538823,10001217,27703517,51279,2157-12-20 01:45:00,4.53,m/uL,4.2,5.4,red blood cells,blood,hematology
1538824,10001217,27703517,51279,2157-12-21 04:40:00,4.48,m/uL,4.2,5.4,red blood cells,blood,hematology
1538825,10001217,27703517,51279,2157-12-23 04:30:00,4.66,m/uL,4.2,5.4,red blood cells,blood,hematology
1538826,10001217,27703517,51279,2157-12-24 03:59:00,4.63,m/uL,4.2,5.4,red blood cells,blood,hematology
