In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from tqdm import tqdm
from utils.utils import get_readable_file_size

In [2]:
ehr_data_dir = "/home/mengliang/DatasetFolder/mimiciv/2.2"

### diagnoses ICD and DRG codes

In [3]:
ehr_diagnoses_path = os.path.join(ehr_data_dir, "hosp/diagnoses_icd.csv.gz")
df_ehr_diagnoses = pd.read_csv(ehr_diagnoses_path)
df_ehr_diagnoses.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
0,10000032,22595853,1,5723,9
1,10000032,22595853,2,78959,9
2,10000032,22595853,3,5715,9
3,10000032,22595853,4,7070,9
4,10000032,22595853,5,496,9


In [4]:
icu_cxr = pd.read_csv(os.path.join("outputs", "icu_cxr.csv.gz"), compression="gzip")
selected_columns = ["subject_id", "hadm_id"]
icu_subject_hadm_df = icu_cxr[selected_columns].copy()
icu_subject_hadm_df.drop_duplicates(inplace=True, keep="first")
icu_subject_hadm_df.head()

Unnamed: 0,subject_id,hadm_id
0,10001217,27703517
1,10001884,26184834
6,10002013,23581541
10,10002428,28662225
11,10002428,23473524


In [5]:
df_icu_diagnoses = pd.merge(icu_subject_hadm_df, df_ehr_diagnoses, 
                            on=["subject_id", "hadm_id"], 
                            how="inner")
df_icu_diagnoses.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
0,10001217,27703517,1,3240,9
1,10001217,27703517,2,3485,9
2,10001217,27703517,3,340,9
3,10001217,27703517,4,4102,9
4,10001217,27703517,5,4184,9


In [6]:
icd_diagnoses_path = os.path.join(ehr_data_dir, "hosp/d_icd_diagnoses.csv.gz")
df_ehr_icd_diagnoses = pd.read_csv(icd_diagnoses_path)
df_ehr_icd_diagnoses.head()

Unnamed: 0,icd_code,icd_version,long_title
0,10,9,Cholera due to vibrio cholerae
1,11,9,Cholera due to vibrio cholerae el tor
2,19,9,"Cholera, unspecified"
3,20,9,Typhoid fever
4,21,9,Paratyphoid fever A


In [7]:
df_icu_diagnoses_result = pd.merge(df_icu_diagnoses, df_ehr_icd_diagnoses,  
                                   on=["icd_code", "icd_version"], 
                                   how="left")

selected_columns = ["subject_id", "hadm_id", "icd_code", "icd_version", "long_title"]
df_icu_diagnoses_result = df_icu_diagnoses_result[selected_columns]
df_icu_diagnoses_result["long_title"] = df_icu_diagnoses_result["long_title"].str.lower()
df_icu_diagnoses_result.head()

Unnamed: 0,subject_id,hadm_id,icd_code,icd_version,long_title
0,10001217,27703517,3240,9,intracranial abscess
1,10001217,27703517,3485,9,cerebral edema
2,10001217,27703517,340,9,multiple sclerosis
3,10001217,27703517,4102,9,streptococcus infection in conditions classifi...
4,10001217,27703517,4184,9,other specified bacterial infections in condit...


In [8]:
df_icu_diagnoses_result.to_csv(os.path.join("outputs", "icu_diagnoses.csv.gz"),
                               index=False, compression="gzip")

In [9]:
readable_file_size = get_readable_file_size(os.path.join("outputs", "icu_diagnoses.csv.gz"))
print(f"File size: {readable_file_size}")

File size: 3.67 MB


In [10]:
import torch
import ollama

os.environ["CUDA_VISIBLE_DEVICES"] = "5, 6"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [11]:
# Open the file in read mode and read the content as a string
with open('files/diagnoses_description.txt', 'r') as file:
    description_text = file.read()

with open('files/prompt_text.txt', 'r') as file:
    prompt_text = file.read()

# Print the content of the file
print(description_text)
print(prompt_text)

This is the description to the icu_diagnoses.csv.gz file. This file is located in outputs/icu_diagnoses.csv.gz.
subject_id: This unique identifier corresponds to each patient in the dataset. It allows for the association of patient-level data across different tables. Each patient only has one subject_id.
hadm_id: This is the unique hospital admission ID. One patient can have multiple hospital admissions, corresponding to multiple hadm_ids.
icd_code: This column contains the ICD (International Classification of Diseases) code, which is a standardized code used to identify specific diagnoses. This can be used to understand the medical conditions diagnosed during a patientâ€™s hospital admission.
icd_version: Indicates the version of the ICD coding system used (for example, ICD-9 or ICD-10). Different versions have slightly different codes, so this helps interpret the icd_code accurately.
long_title: This is a descriptive label or title associated with each ICD code. It provides a full te

In [12]:
# extract information from discharge summary
description_text = description_text
prompt_text = prompt_text
question_text = "How many patients are diagnosed with dementia?"
input_text = description_text + " " + prompt_text + " " + question_text

response = ollama.chat(model="qwen2.5:14b", messages = [{"role": "user", "content": input_text,}],
                       stream=False)
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the dataset
df = pd.read_csv('outputs/icu_diagnoses.csv.gz')

# Filter for dementia diagnoses (ICD-10: F00, ICD-9: 290)
dementia_icd_codes = df[(df['icd_code'].str.startswith('F00', na=False)) | ((df['icd_version'] == 'ICD-9') & (df['icd_code'] == '290'))]

# Get unique patients diagnosed with dementia
unique_patients_with_dementia = dementia_icd_codes['subject_id'].nunique()

result = unique_patients_with_dementia
print(result)
```


In [13]:
import pandas as pd

# Load the dataset
df = pd.read_csv('outputs/icu_diagnoses.csv.gz')

# Filter for dementia diagnoses (ICD-10: F00, ICD-9: 290)
dementia_icd_codes = df[(df['icd_code'].str.startswith('F00', na=False)) | ((df['icd_version'] == 'ICD-9') & (df['icd_code'] == '290'))]

# Get unique patients diagnosed with dementia
unique_patients_with_dementia = dementia_icd_codes['subject_id'].nunique()

result = unique_patients_with_dementia
print(result)

0


In [20]:
# extract information from discharge summary
description_text = description_text
prompt_text = prompt_text
question_text = "What ar the patient 10001217 is diagnosed with at hospitalization 27703517?"
input_text = description_text + " " + prompt_text + " " + question_text

response = ollama.chat(model="qwen2.5:14b", messages = [{"role": "user", "content": input_text,}],
                       stream=False)
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the dataset
df = pd.read_csv('outputs/icu_diagnoses.csv.gz')

# Filter for the specific subject_id and hadm_id
diagnosis_info = df[(df['subject_id'] == 10001217) & (df['hadm_id'] == 27703517)]

# Extract the long title(s)
result = diagnosis_info['long_title'].tolist()
print(result)
```


In [21]:
import pandas as pd

# Load the dataset
df = pd.read_csv('outputs/icu_diagnoses.csv.gz')

# Filter for subject_id 100011217 and hadm_id 27703517
filtered_df = df[(df['subject_id'] == 10001217) & (df['hadm_id'] == 27703517)]

# Extract the long_title for the filtered data
result = filtered_df['long_title'].tolist()
print(result)

['intracranial abscess', 'cerebral edema', 'multiple sclerosis', 'streptococcus infection in conditions classified elsewhere and of unspecified site, streptococcus, group b', 'other specified bacterial infections in conditions classified elsewhere and of unspecified site, other anaerobes', 'unspecified essential hypertension', 'tobacco use disorder']
