In [1]:
import os
import numpy as numpy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from tqdm import tqdm
from utils.utils import get_readable_file_size

In [2]:
# Root directory of the local MIMIC-IV download (the default path points at version 2.2).
# The MIMICIV_DATA_DIR environment variable overrides the default so the notebook
# can run on another machine without editing this cell.
ehr_data_dir = os.environ.get("MIMICIV_DATA_DIR", "/home/mengliang/DatasetFolder/mimiciv/2.2")

### Load admissions.csv.gz file

In [3]:
# Location of the gzip-compressed admissions table under the EHR data root.
ehr_admissions_path = os.path.join(ehr_data_dir, "hosp/admissions.csv.gz")

# index_col=False tells pandas not to use the first CSV column as the row index.
df_ehr_admissions = pd.read_csv(filepath_or_buffer=ehr_admissions_path, index_col=False)

# Preview the first five rows.
df_ehr_admissions.head(n=5)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P874LG,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P09Q6Y,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P60CC5,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P30KEH,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P51VDL,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0


In [4]:
# Number of rows in the full admissions table.
print(df_ehr_admissions.shape[0])

431231


In [5]:
# The (subject_id, hadm_id) pairs are derived from outputs/icu_cxr.csv.gz here
# rather than read from a separately saved outputs/icu_subject_hadm.csv.gz.
icu_cxr = pd.read_csv(os.path.join("outputs", "icu_cxr.csv.gz"), compression="gzip")
selected_columns = ["subject_id", "hadm_id"]

# Keep the first occurrence of every pair; surviving rows keep their original index labels.
icu_subject_hadm_df = icu_cxr[selected_columns].drop_duplicates(keep="first")
icu_subject_hadm_df.head()

Unnamed: 0,subject_id,hadm_id
0,10001217,27703517
1,10001884,26184834
6,10002013,23581541
10,10002428,28662225
11,10002428,23473524


In [6]:
# Print the row count, then the number of distinct subject_id and hadm_id values.
# nunique(dropna=False) counts NaN as a value, exactly like len(Series.unique()).
print(len(icu_subject_hadm_df))
for key_column in ("subject_id", "hadm_id"):
    print(icu_subject_hadm_df[key_column].nunique(dropna=False))

16268
13170
16268


In [7]:
# Attach the admission record to every (subject_id, hadm_id) pair.
# how="inner" keeps only pairs present in both frames.
# validate="one_to_one" makes pandas raise MergeError when either side contains
# duplicate keys, so the join can never silently multiply rows.
df_icu_admissions = pd.merge(
    icu_subject_hadm_df,
    df_ehr_admissions,
    on=["subject_id", "hadm_id"],
    how="inner",
    validate="one_to_one",
)
df_icu_admissions.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10001217,27703517,2157-12-18 16:58:00,2157-12-24 14:55:00,,DIRECT EMER.,P99698,PHYSICIAN REFERRAL,HOME HEALTH CARE,Other,?,MARRIED,WHITE,,,0
1,10001884,26184834,2131-01-07 20:39:00,2131-01-20 05:15:00,2131-01-20 05:15:00,OBSERVATION ADMIT,P874LG,EMERGENCY ROOM,DIED,Medicare,ENGLISH,MARRIED,BLACK/AFRICAN AMERICAN,2131-01-07 13:36:00,2131-01-07 22:13:00,1
2,10002013,23581541,2160-05-18 07:45:00,2160-05-23 13:30:00,,SURGICAL SAME DAY ADMISSION,P47E1G,PHYSICIAN REFERRAL,HOME HEALTH CARE,Medicare,ENGLISH,SINGLE,OTHER,,,0
3,10002428,28662225,2156-04-12 14:16:00,2156-04-29 16:26:00,,EW EMER.,P64TOH,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,ENGLISH,WIDOWED,WHITE,2156-04-12 09:56:00,2156-04-12 17:11:00,0
4,10002428,23473524,2156-05-11 14:49:00,2156-05-22 14:16:00,,EW EMER.,P3529J,EMERGENCY ROOM,CHRONIC/LONG TERM ACUTE CARE,Medicare,ENGLISH,WIDOWED,WHITE,2156-05-11 11:29:00,2156-05-11 16:53:00,0


In [8]:
# Columns kept in the exported ICU admissions table.
selected_columns = ["subject_id", "hadm_id", "admittime", "dischtime", "admission_type", "admission_location",
                    "discharge_location", "insurance", "marital_status", "race"]

# Take an explicit copy: the assignments below then modify an independent frame
# instead of a column subset of the merged one, which avoids SettingWithCopyWarning.
df_icu_admissions = df_icu_admissions[selected_columns].copy()

# Lower-case every free-text categorical column; missing values stay missing.
for text_column in ["admission_type", "admission_location", "discharge_location",
                    "insurance", "marital_status", "race"]:
    df_icu_admissions[text_column] = df_icu_admissions[text_column].str.lower()

df_icu_admissions.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,admission_type,admission_location,discharge_location,insurance,marital_status,race
0,10001217,27703517,2157-12-18 16:58:00,2157-12-24 14:55:00,direct emer.,physician referral,home health care,other,married,white
1,10001884,26184834,2131-01-07 20:39:00,2131-01-20 05:15:00,observation admit,emergency room,died,medicare,married,black/african american
2,10002013,23581541,2160-05-18 07:45:00,2160-05-23 13:30:00,surgical same day admission,physician referral,home health care,medicare,single,other
3,10002428,28662225,2156-04-12 14:16:00,2156-04-29 16:26:00,ew emer.,emergency room,skilled nursing facility,medicare,widowed,white
4,10002428,23473524,2156-05-11 14:49:00,2156-05-22 14:16:00,ew emer.,emergency room,chronic/long term acute care,medicare,widowed,white


In [9]:
# Sanity summary after the column selection: row count, distinct patients,
# distinct admissions (NaN would count as a value), and the remaining columns.
print(df_icu_admissions.shape[0])
print(df_icu_admissions["subject_id"].nunique(dropna=False))
print(df_icu_admissions["hadm_id"].nunique(dropna=False))
print(df_icu_admissions.columns)

16268
13170
16268
Index(['subject_id', 'hadm_id', 'admittime', 'dischtime', 'admission_type',
       'admission_location', 'discharge_location', 'insurance',
       'marital_status', 'race'],
      dtype='object')


In [10]:
# Show the distinct values (NaN included) of the demographic columns.
# "language" is not inspected here; it is not among the columns selected for this frame.
for demographic_column in ("marital_status", "race"):
    print(df_icu_admissions[demographic_column].unique())

['married' 'single' 'widowed' 'divorced' nan]
['white' 'black/african american' 'other' 'white - russian'
 'black/cape verdean' 'white - brazilian' 'unable to obtain'
 'white - other european' 'patient declined to answer' 'unknown'
 'hispanic/latino - cuban' 'hispanic/latino - dominican'
 'asian - asian indian' 'asian - chinese' 'hispanic/latino - puerto rican'
 'white - eastern european' 'hispanic or latino' 'asian'
 'hispanic/latino - honduran' 'portuguese' 'hispanic/latino - salvadoran'
 'hispanic/latino - guatemalan' 'american indian/alaska native'
 'hispanic/latino - central american' 'black/african'
 'asian - south east asian' 'multiple race/ethnicity'
 'black/caribbean island' 'asian - korean' 'hispanic/latino - mexican'
 'native hawaiian or other pacific islander' 'south american'
 'hispanic/latino - columbian']


In [11]:
# Write the ICU admissions table as a gzip-compressed CSV, without the row index.
df_icu_admissions.to_csv(
    path_or_buf=os.path.join("outputs", "icu_admissions.csv.gz"),
    compression="gzip",
    index=False,
)

In [12]:
# Report the size of the exported file using the project helper.
exported_csv_path = os.path.join("outputs", "icu_admissions.csv.gz")
readable_file_size = get_readable_file_size(exported_csv_path)
print(f"File size: {readable_file_size}")

File size: 403.89 KB


## QA generation

In [13]:
import torch
import ollama

# Restrict the GPUs visible to CUDA in this process, then pick the torch device.
# NOTE(review): the value has a space after the comma — confirm the CUDA runtime
# parses "5, 6" as devices 5 and 6.
os.environ["CUDA_VISIBLE_DEVICES"] = "5, 6"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [14]:
# Read the table description and the prompt instructions as plain strings.
# The encoding is stated explicitly so decoding does not depend on the machine's locale.
with open('files/admissions_description.txt', 'r', encoding='utf-8') as file:
    description_text = file.read()

with open('files/prompt_text.txt', 'r', encoding='utf-8') as file:
    prompt_text = file.read()

# Print both texts so the exact prompt inputs are visible in the notebook output.
print(description_text)
print(prompt_text)

This is the description to the icu_admissions.csv.gz file. This file is located in outputs/icu_admissions.csv.gz.
subject_id: A unique identifier for each patient in the dataset. Each patient only has one subject_id.
hadm_id: Hospital admission ID, a unique identifier for each hospital admission. This ID enables differentiation between multiple admissions for the same patient.
admittime: Timestamp for the exact date and time when the patient was admitted to the hospital. This helps establish the start of a hospital stay.
dischtime: Timestamp for the date and time when the patient was discharged from the hospital, marking the end of a specific admission period.
admission_type: Categorical field indicating the type of admission, such as "emergency," "urgent," or "elective." This provides context on the reason or urgency of admission.
admission_location: Describes the location from which the patient was admitted, such as "clinic referral," "emergency department," or "transfer from another

In [15]:
# Ask the model a question about the admissions table. The prompt is the table
# description, the prompt instructions and the question, joined by single spaces.
question_text = "How many patients are white?"
input_text = " ".join([description_text, prompt_text, question_text])

# Alternative model that can be swapped in: "llama3.1:8b".
response = ollama.chat(
    model="qwen2.5:14b",
    messages=[{"role": "user", "content": input_text}],
    stream=False,
)

# The reply text sits under message -> content in the response.
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the dataset
df = pd.read_csv('outputs/icu_admissions.csv.gz')

# Count the number of patients who are white
result = df[df['race'] == 'white'].shape[0]
print(result)
```


In [16]:
import pandas as pd

# Read the exported ICU admissions table.
df = pd.read_csv('outputs/icu_admissions.csv.gz')

# subject_id of every admission whose race is exactly 'white'.
# NOTE(review): sub-categories such as 'white - russian' are not matched — confirm this is intended.
white_patients = df.loc[df['race'] == 'white', 'subject_id']

# A patient with several admissions is counted once.
result = white_patients.nunique()

print(result)

8300


In [17]:
# Build the prompt for a single-patient lookup question: table description,
# prompt instructions and question, separated by single spaces.
question_text = "what is the discharge location of patient 10002428?"
input_text = " ".join((description_text, prompt_text, question_text))

# Alternative model that can be swapped in: "llama3.1:8b".
response = ollama.chat(
    model="qwen2.5:14b",
    messages=[{"role": "user", "content": input_text}],
    stream=False,
)

# Print the text of the model's reply.
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the dataset
df = pd.read_csv('outputs/icu_admissions.csv.gz')

# Filter for patient with subject_id 10002428
patient_discharge_location = df[df['subject_id'] == 10002428]['discharge_location']

# Output the discharge location
result = patient_discharge_location.values[0] if not patient_discharge_location.empty else "Patient not found"
print(result)
```


In [18]:
import pandas as pd

# Read the exported ICU admissions table.
df = pd.read_csv('outputs/icu_admissions.csv.gz')

# discharge_location of every admission belonging to subject_id 10002428.
patient_discharge_location = df.loc[df['subject_id'] == 10002428, 'discharge_location']

# Only the first matching row is reported, even when the patient has several admissions.
if patient_discharge_location.empty:
    result = "Patient not found"
else:
    result = patient_discharge_location.values[0]
print(result)

skilled nursing facility


In [19]:
# Build the prompt for a time-based question: table description, prompt
# instructions and question, separated by single spaces.
question_text = "Suppose now is 2160 year, how many patients have been admitted to the hospital before this year?"
input_text = " ".join([description_text, prompt_text, question_text])

# Alternative model that can be swapped in: "llama3.1:8b".
response = ollama.chat(
    model="qwen2.5:14b",
    messages=[{"role": "user", "content": input_text}],
    stream=False,
)

# Print the text of the model's reply.
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the dataset
filename = 'outputs/icu_admissions.csv.gz'
icu_df = pd.read_csv(filename)

# Convert admit time to datetime format and extract the admission year
icu_df['admittime'] = pd.to_datetime(icu_df['admittime'])
icu_df['admission_year'] = icu_df['admittime'].dt.year

# Filter patients admitted before the year 2160
patients_before_2160 = icu_df[icu_df['admission_year'] < 2160]

# Count unique patients (based on subject_id)
result = patients_before_2160['subject_id'].nunique()
print(result)
```


In [20]:
import pandas as pd

# Read the exported ICU admissions table.
filename = 'outputs/icu_admissions.csv.gz'
icu_df = pd.read_csv(filename)

# Parse the admission timestamps and keep the calendar year in its own column.
icu_df['admittime'] = pd.to_datetime(icu_df['admittime'])
icu_df['admission_year'] = icu_df['admittime'].dt.year

# Admissions that started strictly before 2160.
patients_before_2160 = icu_df.loc[icu_df['admission_year'].lt(2160)]

# A patient with several qualifying admissions is counted once.
result = patients_before_2160['subject_id'].nunique()
print(result)

7315
