In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from tqdm import tqdm
from utils.utils import get_readable_file_size

In [2]:
ehr_data_dir = "/home/mengliang/DatasetFolder/mimiciv/2.2"

### Load patient.csv.gz file

In [3]:
ehr_patients_path = os.path.join(ehr_data_dir, "hosp/patients.csv.gz")
df_ehr_patients = pd.read_csv(ehr_patients_path, index_col=False)
df_ehr_patients.head()

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000032,F,52,2180,2014 - 2016,2180-09-09
1,10000048,F,23,2126,2008 - 2010,
2,10000068,F,19,2160,2008 - 2010,
3,10000084,M,72,2160,2017 - 2019,2161-02-13
4,10000102,F,27,2136,2008 - 2010,


In [4]:
print(len(df_ehr_patients))

299712


In [6]:
icu_cxr = pd.read_csv(os.path.join("outputs", "icu_cxr.csv.gz"), compression="gzip")
selected_columns = ["subject_id", "hadm_id"]
icu_subject_hadm_df = icu_cxr[selected_columns].copy()
icu_subject_hadm_df.drop_duplicates(inplace=True, keep="first")
icu_subject_hadm_df.head()

Unnamed: 0,subject_id,hadm_id
0,10001217,27703517
1,10001884,26184834
6,10002013,23581541
10,10002428,28662225
11,10002428,23473524


In [7]:
df_icu_patients = pd.merge(icu_subject_hadm_df, df_ehr_patients, on=["subject_id"], 
                           how="inner")
df_icu_patients["gender"] = df_icu_patients["gender"].str.lower()
df_icu_patients.head()

Unnamed: 0,subject_id,hadm_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10001217,27703517,f,55,2157,2011 - 2013,
1,10001884,26184834,f,68,2122,2008 - 2010,2131-01-20
2,10002013,23581541,f,53,2156,2008 - 2010,
3,10002428,28662225,f,80,2155,2011 - 2013,
4,10002428,23473524,f,80,2155,2011 - 2013,


In [8]:
selected_columns = ["subject_id", "hadm_id", "gender", "anchor_age", "anchor_year", "dod", "dob"]

df_icu_patients['dob'] = (df_icu_patients['anchor_year'] - df_icu_patients['anchor_age']).astype(int)
df_icu_patients = df_icu_patients[selected_columns]
df_icu_patients.rename(columns={"anchor_age": "age", "anchor_year": "year"}, inplace=True)

df_icu_patients.head()

Unnamed: 0,subject_id,hadm_id,gender,age,year,dod,dob
0,10001217,27703517,f,55,2157,,2102
1,10001884,26184834,f,68,2122,2131-01-20,2054
2,10002013,23581541,f,53,2156,,2103
3,10002428,28662225,f,80,2155,,2075
4,10002428,23473524,f,80,2155,,2075


In [9]:
print(len(df_icu_patients))
print(len(df_icu_patients["subject_id"].unique()))

16268
13170


In [10]:
df_icu_patients.to_csv(os.path.join("outputs", "icu_patients.csv.gz"), 
                       index=False, compression="gzip")

In [11]:
readable_file_size = get_readable_file_size(os.path.join("outputs", "icu_patients.csv.gz"))
print(f"File size: {readable_file_size}")

File size: 212.42 KB


In [12]:
import os
import torch
import ollama

os.environ["CUDA_VISIBLE_DEVICES"] = "5, 6"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [13]:
# Open the file in read mode and read the content as a string
with open('files/patients_description.txt', 'r') as file:
    description_text = file.read()

with open('files/prompt_text.txt', 'r') as file:
    prompt_text = file.read()
# Print the content of the file
print(description_text)
print(prompt_text)

icu_patients.csv.gz contains information about patients admitted to the ICU. This file is located in the outputs folder. Here’s an explanation of each column in the icu_patients.csv.gz table:
subject_id: A unique identifier for each patient across the database. Each patient only has one subject_id, even if they are admitted multiple times to the ICU.
hadm_id: A unique identifier for each hospital admission for a patient. If a patient is admitted multiple times, each stay will have a distinct hadm_id.
gender: The recorded gender of the patient, generally marked as 'm' for male or 'f' for female.
age: The age of the patient at the time of admission. 
year: the year of the patient’s admission.
dod: Date of death. This field is recorded if known and provides the date on which the patient passed away, in a de-identified format to protect privacy.
dob: Date of birth. This field, while de-identified, gives insight into the patient’s age range and demographics without revealing the actual birt

In [15]:
# extract information from discharge summary
description_text = description_text
prompt_text = prompt_text
question_text = "How many patients are older than 60 years old in the dataset?"
input_text = description_text + " " + prompt_text + " " + question_text

response = ollama.chat(model="qwen2.5:14b", messages = [{"role": "user", "content": input_text,}],
                       stream=False)
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the data from the CSV file
file_path = 'outputs/icu_patients.csv.gz'
icu_data = pd.read_csv(file_path)

# Filter patients who are older than 60 years old at admission
older_than_60 = icu_data[icu_data['age'] > 60]

# Count unique patients (using subject_id to ensure each patient is only counted once)
result = older_than_60['subject_id'].nunique()

print(result)
```


In [16]:
import pandas as pd

# Load the data from the CSV file
file_path = 'outputs/icu_patients.csv.gz'
icu_data = pd.read_csv(file_path)

# Filter patients who are older than 60 years old at admission
older_than_60 = icu_data[icu_data['age'] > 60]

# Count unique patients (using subject_id to ensure each patient is only counted once)
result = older_than_60['subject_id'].nunique()

print(result)

8045


In [17]:
test = df_icu_patients[df_icu_patients["age"]>60]
test = test["subject_id"].unique()
print(len(test))

8045


In [18]:
# extract information from discharge summary
description_text = description_text
prompt_text = prompt_text
question_text = "How many female patients are older than 60 years old when admissions?"
input_text = description_text + " " + prompt_text + " " + question_text

response = ollama.chat(model="qwen2.5:14b", messages = [{"role": "user", "content": input_text,}],
                       stream=False)
answer_text = response["message"]["content"]
print(answer_text)

```python
import gzip
import pandas as pd

# Load the data from the compressed CSV file
file_path = 'outputs/icu_patients.csv.gz'
data = pd.read_csv(gzip.open(file_path, 'rt'))

# Filter the dataframe for female patients older than 60 years old at admission
filtered_data = data[(data['gender'] == 'f') & (data['age'] > 60)]

# Count the number of such patients
result = filtered_data.shape[0]

print(result)
```


In [19]:
answer_text = answer_text
start_marker = "```python"
end_marker = "```"

start_idx = answer_text.find(start_marker) + len(start_marker)
end_idx = answer_text.find(end_marker, start_idx)

if start_idx != -1 and end_idx != -1:
    code = answer_text[start_idx:end_idx].strip()
    print("Extracted Code:\n", code)
else:
    print("No code block found")
    
try:
    local_vars = {}
    # compile code into a code object
    compiled_code = compile(code, "<string>", "exec")

    # execute code object in a new namespace
    exec(compiled_code, globals(), local_vars)

    # obtain output from the code
    output = local_vars.get("result")
    print("Output:", output)
except Exception as e:
    print("Error while executing code:", e)
else:         
    print("Code executed successfully.")

Extracted Code:
 import gzip
import pandas as pd

# Load the data from the compressed CSV file
file_path = 'outputs/icu_patients.csv.gz'
data = pd.read_csv(gzip.open(file_path, 'rt'))

# Filter the dataframe for female patients older than 60 years old at admission
filtered_data = data[(data['gender'] == 'f') & (data['age'] > 60)]

# Count the number of such patients
result = filtered_data.shape[0]

print(result)
4664
Output: 4664
Code executed successfully.
