In [2]:
import os
import numpy as numpy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from tqdm import tqdm
from utils.utils import get_readable_file_size

In [3]:
# Root directory of the MIMIC-IV v2.2 files. Set the MIMIC_IV_DIR environment
# variable to point somewhere else; the original location is used by default.
ehr_data_dir = os.environ.get("MIMIC_IV_DIR", "/home/mengliang/DatasetFolder/mimiciv/2.2")

### Load admissions.csv.gz file

In [3]:
# Read the admissions table stored under the "hosp" sub-directory.
ehr_admissions_path = os.path.join(ehr_data_dir, "hosp/admissions.csv.gz")
df_ehr_admissions = pd.read_csv(
    filepath_or_buffer=ehr_admissions_path,
    index_col=False,  # never promote the first column to the index
)
df_ehr_admissions.head(5)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P874LG,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P09Q6Y,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P60CC5,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P30KEH,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P51VDL,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0


In [4]:
# Number of rows in the full admissions table.
print(df_ehr_admissions.shape[0])

431231


In [5]:
# Load the (subject_id, hadm_id) pairs stored in outputs/icu_subject_hadm.csv.gz.
icu_subject_hadm_df = pd.read_csv(
    os.path.join("outputs", "icu_subject_hadm.csv.gz"),
    compression="gzip",
)
icu_subject_hadm_df.head(5)

Unnamed: 0,subject_id,hadm_id
0,10000032,29079034
1,10000980,26913865
2,10001217,24597018
3,10001217,27703517
4,10001725,25563031


In [6]:
# Keep only the admissions whose (subject_id, hadm_id) pair also appears in
# icu_subject_hadm_df (inner join on both keys).
df_icu_admissions = icu_subject_hadm_df.merge(
    df_ehr_admissions,
    how="inner",
    on=["subject_id", "hadm_id"],
)
df_icu_admissions.head(5)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P30KEH,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
1,10000980,26913865,2189-06-27 07:38:00,2189-07-03 03:00:00,,EW EMER.,P30KEH,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,MARRIED,BLACK/AFRICAN AMERICAN,2189-06-27 06:25:00,2189-06-27 08:42:00,0
2,10001217,24597018,2157-11-18 22:56:00,2157-11-25 18:00:00,,EW EMER.,P4645A,EMERGENCY ROOM,HOME HEALTH CARE,Other,?,MARRIED,WHITE,2157-11-18 17:38:00,2157-11-19 01:24:00,0
3,10001217,27703517,2157-12-18 16:58:00,2157-12-24 14:55:00,,DIRECT EMER.,P99698,PHYSICIAN REFERRAL,HOME HEALTH CARE,Other,?,MARRIED,WHITE,,,0
4,10001725,25563031,2110-04-11 15:08:00,2110-04-14 15:00:00,,EW EMER.,P35SU0,PACU,HOME,Other,ENGLISH,MARRIED,WHITE,,,0


In [7]:
# Columns kept in the exported ICU admissions table.
selected_columns = ["subject_id", "hadm_id", "admittime", "dischtime", "admission_type", "admission_location", 
                    "discharge_location", "insurance", "marital_status", "race"]

# Categorical text columns whose values are normalised to lower case.
lowercase_columns = ["admission_type", "admission_location", "discharge_location",
                     "insurance", "marital_status", "race"]

# .copy() gives an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning for writing into a column subset of the merged frame.
df_icu_admissions = df_icu_admissions[selected_columns].copy()
for column in lowercase_columns:
    # .str.lower() leaves missing values (NaN) untouched.
    df_icu_admissions[column] = df_icu_admissions[column].str.lower()
df_icu_admissions.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,admission_type,admission_location,discharge_location,insurance,marital_status,race
0,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,ew emer.,emergency room,home,medicaid,widowed,white
1,10000980,26913865,2189-06-27 07:38:00,2189-07-03 03:00:00,ew emer.,emergency room,home health care,medicare,married,black/african american
2,10001217,24597018,2157-11-18 22:56:00,2157-11-25 18:00:00,ew emer.,emergency room,home health care,other,married,white
3,10001217,27703517,2157-12-18 16:58:00,2157-12-24 14:55:00,direct emer.,physician referral,home health care,other,married,white
4,10001725,25563031,2110-04-11 15:08:00,2110-04-14 15:00:00,ew emer.,pacu,home,other,married,white


In [8]:
# Row count, then the number of distinct subject_id values, then the column index.
print(df_icu_admissions.shape[0])
print(df_icu_admissions["subject_id"].nunique(dropna=False))

print(df_icu_admissions.columns)

73181
50920
Index(['subject_id', 'hadm_id', 'admittime', 'dischtime', 'admission_type',
       'admission_location', 'discharge_location', 'insurance',
       'marital_status', 'race'],
      dtype='object')


In [9]:
# List the distinct values of the two demographic columns kept in the table
# ("language" is not among the selected columns, so it is not inspected here).
for column in ("marital_status", "race"):
    print(df_icu_admissions[column].unique())

['widowed' 'married' 'single' nan 'divorced']
['white' 'black/african american' 'other' 'unknown' 'unable to obtain'
 'white - russian' 'black/cape verdean' 'portuguese'
 'hispanic/latino - salvadoran' 'hispanic/latino - puerto rican'
 'asian - south east asian' 'white - brazilian' 'hispanic or latino'
 'white - other european' 'black/african' 'patient declined to answer'
 'asian' 'black/caribbean island' 'hispanic/latino - cuban'
 'hispanic/latino - dominican' 'asian - korean' 'asian - chinese'
 'asian - asian indian' 'white - eastern european'
 'hispanic/latino - guatemalan' 'hispanic/latino - honduran'
 'hispanic/latino - columbian' 'native hawaiian or other pacific islander'
 'american indian/alaska native' 'hispanic/latino - central american'
 'hispanic/latino - mexican' 'south american' 'multiple race/ethnicity']


In [10]:
# Write the table to outputs/ as a gzip-compressed CSV without the row index.
df_icu_admissions.to_csv(
    path_or_buf=os.path.join("outputs", "icu_admissions.csv.gz"),
    compression="gzip",
    index=False,
)

In [11]:
# Report the on-disk size of the exported file.
readable_file_size = get_readable_file_size(
    os.path.join("outputs", "icu_admissions.csv.gz")
)
print(f"File size: {readable_file_size}")

File size: 1.62 MB


## QA generation

In [4]:
# Restrict the visible GPUs *before* torch is imported, so the setting is in
# place before any CUDA initialisation can happen.
# NOTE(review): the value has a space after the comma; confirm the CUDA driver
# parses "5, 6" as devices 5 and 6.
os.environ["CUDA_VISIBLE_DEVICES"] = "5, 6"

import torch
import ollama

# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
# Load the column description of the admissions table as a single string.
with open('files/admissions_description.txt', 'r') as description_file:
    file_content = description_file.read()

# Show the description that will be passed to the model.
print(file_content)

This is the description to the icu_admissions.csv.gz file. This file is located in outputs/icu_admissions.csv.gz.
subject_id: A unique identifier for each patient in the dataset. Each patient only has one subject_id.
hadm_id: Hospital admission ID, a unique identifier for each hospital admission. This ID enables differentiation between multiple admissions for the same patient.
admittime: Timestamp for the exact date and time when the patient was admitted to the hospital. This helps establish the start of a hospital stay.
dischtime: Timestamp for the date and time when the patient was discharged from the hospital, marking the end of a specific admission period.
admission_type: Categorical field indicating the type of admission, such as "emergency," "urgent," or "elective." This provides context on the reason or urgency of admission.
admission_location: Describes the location from which the patient was admitted, such as "clinic referral," "emergency department," or "transfer from another

In [6]:
# Ask the LLM to write Python code that answers a question about the admissions
# table. The prompt is: table description + instructions + question.
description_text = file_content
prompt_text = "Please generate python code to answer the question. Only generate code for the question. no explanation and other description. use print to output the result. the final result variable should be named as result."
question_text = "How many patients are white?"
input_text = " ".join([description_text, prompt_text, question_text])

# Alternative model that can be swapped in: "llama3.1:8b".
response = ollama.chat(
    model="qwen2.5:14b",
    messages=[{"role": "user", "content": input_text}],
    stream=False,
)
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the dataset
df = pd.read_csv('outputs/icu_admissions.csv.gz')

# Filter for patients who are white
white_patients = df[df['race'] == 'white']

# Get unique subject_ids to count distinct patients
unique_subject_ids = white_patients['subject_id'].nunique()

# Output the result
result = unique_subject_ids
print(result)
```


In [7]:
import pandas as pd

# Read the exported ICU admissions table.
df = pd.read_csv('outputs/icu_admissions.csv.gz')

# Exact match on the label 'white'; labels such as 'white - russian' are
# separate values of the race column and are not included by this filter.
white_patients = df.loc[df['race'].eq('white')]

# Count distinct subject_id values so each patient is counted once.
result = white_patients['subject_id'].unique().size

print(result)

32893


In [11]:
# Ask the LLM to write Python code that answers a question about the admissions
# table. The prompt is: table description + instructions + question.
description_text = file_content
prompt_text = "Please generate python code to answer the question. Only generate code for the question. no explanation and other description. use print to output the result. the final result variable should be named as result."
question_text = "what is the discharge location of patient 10001725?"
input_text = " ".join([description_text, prompt_text, question_text])

# Alternative model that can be swapped in: "llama3.1:8b".
response = ollama.chat(
    model="qwen2.5:14b",
    messages=[{"role": "user", "content": input_text}],
    stream=False,
)
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the dataset
df = pd.read_csv('outputs/icu_admissions.csv.gz')

# Filter for patient with subject_id 10001725
patient_discharge_location = df[df['subject_id'] == 10001725]['discharge_location']

# Get the unique discharge location if there are multiple entries, or just get the first one.
result = patient_discharge_location.unique()[0] if len(patient_discharge_location.unique()) > 0 else None

print(result)
```


In [9]:
import pandas as pd

# Read the exported ICU admissions table.
df = pd.read_csv('outputs/icu_admissions.csv.gz')

# discharge_location values of every row belonging to subject 10001725.
patient_discharge_location = df.loc[df['subject_id'].eq(10001725), 'discharge_location']

# Report the value of the first matching row, or None when no row matches.
if patient_discharge_location.empty:
    result = None
else:
    result = patient_discharge_location.values[0]
print(result)

home


In [14]:
# Ask the LLM to write Python code that answers a question about the admissions
# table. The prompt is: table description + instructions + question.
description_text = file_content
prompt_text = "Please generate python code to answer the question. Only generate code for the question. no explanation and other description. use print to output the result. the final result variable should be named as result."
question_text = "Suppose now is 2160 year, how many patients have been admitted to the hospital before this year?"
input_text = " ".join([description_text, prompt_text, question_text])

# Alternative model that can be swapped in: "llama3.1:8b".
response = ollama.chat(
    model="qwen2.5:14b",
    messages=[{"role": "user", "content": input_text}],
    stream=False,
)
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the dataset
filename = 'outputs/icu_admissions.csv.gz'
icu_data = pd.read_csv(filename)

# Filter admissions that occurred before the year 2160
icu_data['admittime'] = pd.to_datetime(icu_data['admittime'])
before_2160 = icu_data[icu_data['admittime'].dt.year < 2160]

# Count unique patients (using subject_id to ensure uniqueness)
result = before_2160['subject_id'].nunique()
print(result)
```


In [15]:
import pandas as pd

# Read the exported ICU admissions table and parse the admission timestamps.
filename = 'outputs/icu_admissions.csv.gz'
icu_data = pd.read_csv(filename)
icu_data['admittime'] = pd.to_datetime(icu_data['admittime'])

# Rows whose admission year is strictly earlier than 2160.
before_2160 = icu_data.loc[icu_data['admittime'].dt.year.lt(2160)]

# Distinct subject_id values among those rows.
result = before_2160['subject_id'].nunique()
print(result)

30058
