In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from tqdm import tqdm
from utils.utils import get_readable_file_size

In [2]:
# Root directory of the MIMIC-IV v2.2 dump. Set the MIMICIV_DATA_DIR environment
# variable to point the notebook at another location; the original hard-coded
# path is kept as the default so existing setups keep working unchanged.
ehr_data_dir = os.environ.get("MIMICIV_DATA_DIR", "/home/mengliang/DatasetFolder/mimiciv/2.2")

In [3]:
# Load the hospital-wide microbiology events table, keeping only the columns
# used by the rest of the notebook.
ehr_microbiologyevents_path = os.path.join(ehr_data_dir, "hosp/microbiologyevents.csv.gz")

selected_columns = ['subject_id', 'hadm_id', 'charttime', 'spec_type_desc', 'test_name', 'org_name']

# `usecols` makes pandas parse only the needed columns instead of loading the
# whole table and discarding most of it afterwards.
# NOTE(review): the recorded output shows pandas emitted a warning on the
# original full-width read_csv call (presumably a DtypeWarning from a column
# that is not selected here) - confirm it no longer appears.
df_ehr_microbiologyevents = pd.read_csv(
    ehr_microbiologyevents_path,
    index_col=False,
    usecols=selected_columns,
)
# `usecols` keeps the file's column order, so re-index to get the order listed above.
df_ehr_microbiologyevents = df_ehr_microbiologyevents[selected_columns]
df_ehr_microbiologyevents.head()

  df_ehr_microbiologyevents = pd.read_csv(ehr_microbiologyevents_path, index_col=False)


Unnamed: 0,subject_id,hadm_id,charttime,spec_type_desc,test_name,org_name
0,10000032,,2180-03-23 11:51:00,Blood (Toxo),TOXOPLASMA IgG ANTIBODY,
1,10000032,,2180-03-23 11:51:00,SEROLOGY/BLOOD,"RUBEOLA ANTIBODY, IgG",
2,10000032,,2180-03-23 11:51:00,Blood (CMV AB),CMV IgG ANTIBODY,
3,10000032,,2180-03-23 11:51:00,Blood (CMV AB),CMV IgM ANTIBODY,
4,10000032,,2180-03-23 11:51:00,Blood (EBV),EPSTEIN-BARR VIRUS VCA-IgG AB,


In [4]:
# Count how often each organism name occurs (missing names are excluded by
# value_counts' default), most frequent first.
org_name_counts = df_ehr_microbiologyevents["org_name"].value_counts()
print(org_name_counts)

org_name
ESCHERICHIA COLI            453913
STAPH AUREUS COAG +         181628
KLEBSIELLA PNEUMONIAE       119355
PSEUDOMONAS AERUGINOSA       65068
PROTEUS MIRABILIS            53188
                             ...  
CUTIBACTERIUM AVIDUM             1
SALMONELLA KIAMBU                1
PENICILLIUM FELLUTANUM           1
CONIDIOBOLUS SPECIES             1
HAEMOPHILUS HAEMOLYTICUS         1
Name: count, Length: 642, dtype: int64


In [5]:
# Read the ICU/CXR table written to outputs/ and reduce it to the distinct
# (subject_id, hadm_id) pairs. Original row labels are retained.
icu_cxr = pd.read_csv(os.path.join("outputs", "icu_cxr.csv.gz"), compression="gzip")
selected_columns = ["subject_id", "hadm_id"]
icu_subject_hadm_df = icu_cxr[selected_columns].drop_duplicates(keep="first")
icu_subject_hadm_df.head()

Unnamed: 0,subject_id,hadm_id
0,10001217,27703517
1,10001884,26184834
6,10002013,23581541
10,10002428,28662225
11,10002428,23473524


In [6]:
# Keep only the microbiology events whose (subject_id, hadm_id) pair also
# appears in the ICU pair table; the inner join drops everything else.
df_icu_microbiologyevents = icu_subject_hadm_df.merge(
    df_ehr_microbiologyevents,
    how="inner",
    on=["subject_id", "hadm_id"],
)

In [7]:
# Normalise the three free-text columns to lowercase so later string
# comparisons do not depend on the source capitalisation.
for text_column in ("spec_type_desc", "test_name", "org_name"):
    df_icu_microbiologyevents[text_column] = df_icu_microbiologyevents[text_column].str.lower()

df_icu_microbiologyevents.head()

Unnamed: 0,subject_id,hadm_id,charttime,spec_type_desc,test_name,org_name
0,10001217,27703517,2157-12-19 14:55:00,swab,gram stain,
1,10001217,27703517,2157-12-19 14:55:00,swab,wound culture,
2,10001217,27703517,2157-12-19 14:55:00,swab,anaerobic culture,
3,10001217,27703517,2157-12-19 14:55:00,swab,fungal culture,
4,10001217,27703517,2157-12-19 14:55:00,swab,acid fast smear,


In [8]:
# Show the column labels of the ICU microbiology table before it is saved.
icu_microbiology_columns = df_icu_microbiologyevents.columns
print(icu_microbiology_columns)

Index(['subject_id', 'hadm_id', 'charttime', 'spec_type_desc', 'test_name',
       'org_name'],
      dtype='object')


In [9]:
# Write the ICU microbiology table as a gzip-compressed CSV without the row index.
icu_microbiologyevents_path = os.path.join("outputs", "icu_microbiologyevents.csv.gz")
df_icu_microbiologyevents.to_csv(icu_microbiologyevents_path, compression="gzip", index=False)

In [10]:
# Report the on-disk size of the CSV written above, formatted by the project
# helper get_readable_file_size.
saved_csv_path = os.path.join("outputs", "icu_microbiologyevents.csv.gz")
readable_file_size = get_readable_file_size(saved_csv_path)
print(f"File size: {readable_file_size}")

File size: 1.24 MB


In [11]:
import os
import torch
import ollama

# Set the GPU visibility variable for this process, then record whether torch
# reports CUDA as available.
# NOTE(review): ollama is called through its client below; whether this
# variable has any effect on the model it serves is not visible here - confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "5, 6"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [12]:
# Read the table description and the instruction prompt that are sent to the LLM.
# The description contains non-ASCII characters (curly quotes), so the encoding
# is set explicitly instead of depending on the platform's default locale.
with open('files/microbiologyevents_description.txt', 'r', encoding='utf-8') as file:
    description_text = file.read()

with open('files/prompt_text.txt', 'r', encoding='utf-8') as file:
    prompt_text = file.read()

# Print both texts so the exact prompt material is visible in the notebook
print(description_text)
print(prompt_text)

This is the description to the icu_microbiologyevents.csv.gz file. This file is located in outputs/icu_microbiologyevents.csv.gz.
subject_id: A unique identifier for each patient. This ID is specific to each patient and distinguishes them within the database.
hadm_id: A unique identifier for each hospital admission. Each hospital stay is associated with its own hadm_id, differentiating multiple admissions for the same patient.
charttime: The recorded time for the microbiology event. This timestamp reflects when the microbiology sample was collected or logged.
spec_type_desc: A description of the specimen type. This provides information on the type of specimen collected for testing, such as “blood,” “urine,” or “sputum.”. All characters are in lowercase.
test_name: The name of the microbiology test. This specifies the type of test conducted on the specimen, such as a “culture” or “gram stain.”. All characters are in lowercase.
org_name: The name of the organism identified (if any). This

In [13]:
# Ask the LLM a question about the ICU microbiology table.
# The prompt is the table description, the instruction prompt and the question,
# joined by single spaces. The recorded reply is a Python snippet, which is
# pasted into the following cell to be executed.
question_text = "List all the microbiology test names of patient 10001217 during the admission."
input_text = description_text + " " + prompt_text + " " + question_text

response = ollama.chat(
    model="qwen2.5:14b",
    messages=[{"role": "user", "content": input_text}],
    stream=False,
)
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the dataset
file_path = 'outputs/icu_microbiologyevents.csv.gz'
data = pd.read_csv(file_path)

# Filter data for subject_id 10001217 and specific hadm_id (assuming hadm_id needs to be filtered based on subject_id)
patient_data = data[data['subject_id'] == 10001217]

# Get unique test names
test_names = patient_data['test_name'].unique()

# Convert array to list format
result = test_names.tolist()
print(result)
```


In [14]:
import pandas as pd

# Snippet returned by the LLM for the question above: list the distinct
# microbiology test names recorded for patient 10001217.
file_path = 'outputs/icu_microbiologyevents.csv.gz'
data = pd.read_csv(file_path)

# Only subject_id is filtered on; hadm_id is not restricted, so rows from every
# admission of this patient present in the file are included.
is_patient = data['subject_id'] == 10001217
patient_data = data.loc[is_patient]

# Distinct test names in order of first appearance
test_names = pd.unique(patient_data['test_name'])

# Plain Python list for printing
result = test_names.tolist()
print(result)

['gram stain', 'wound culture', 'anaerobic culture', 'fungal culture', 'acid fast smear', 'acid fast culture', 'mrsa screen']


In [15]:
# Ask the LLM a counting question about the ICU microbiology table.
# The prompt is the table description, the instruction prompt and the question,
# joined by single spaces. The recorded reply is a Python snippet, which is
# pasted into the following cell to be executed.
question_text = "How many times was wound culture microbiology event recorded for patient 10001217 during their admission?"
input_text = description_text + " " + prompt_text + " " + question_text

response = ollama.chat(
    model="qwen2.5:14b",
    messages=[{"role": "user", "content": input_text}],
    stream=False,
)
answer_text = response["message"]["content"]
print(answer_text)

```python
import pandas as pd

# Load the dataset
filename = 'outputs/icu_microbiologyevents.csv.gz'
data = pd.read_csv(filename)

# Filter data for patient with subject_id 10001217 and hadm_id (any admission)
patient_data = data[data['subject_id'] == 10001217]

# Further filter data where test_name is 'wound culture'
wound_culture_events = patient_data[patient_data['test_name'] == 'wound culture']

# Count the number of wound culture events
result = len(wound_culture_events)
print(result)
```


In [None]:
import pandas as pd

# Snippet returned by the LLM for the question above: count the 'wound culture'
# rows recorded for patient 10001217.
filename = 'outputs/icu_microbiologyevents.csv.gz'
data = pd.read_csv(filename)

# Rows of this patient (hadm_id is not restricted, so all admissions count)
is_patient = data['subject_id'] == 10001217
patient_data = data.loc[is_patient]

# Of those, the rows whose test name is exactly 'wound culture'
is_wound_culture = patient_data['test_name'] == 'wound culture'
wound_culture_events = patient_data.loc[is_wound_culture]

# Number of matching rows
result = wound_culture_events.shape[0]
print(result)

1
