In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import json

In [2]:
import torch
import ollama

# Expose only GPUs 5 and 6 to CUDA in this process, then pick the torch device.
# NOTE(review): nothing else in the visible cells reads `device` — confirm it is still needed.
os.environ.update({"CUDA_VISIBLE_DEVICES": "5, 6"})
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Load only the first 1000 rows of the merged ICU-stay / discharge-note table.
# A chunked reader keeps the gzip file open until it is closed, so pull the
# first chunk inside a `with` block instead of leaving the handle dangling.
with pd.read_csv("outputs/icu_discharge_merged.csv.gz", chunksize=1000) as df_chunk:
    df_icu_discharge = next(df_chunk)
df_icu_discharge.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los,note_id,note_type,note_seq,charttime,storetime,text
0,10000032,29079034,39553978,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 14:00:00,2180-07-23 23:50:47,0.410266,10000032-DS-23,DS,23,2180-07-25 00:00:00,2180-07-25 21:42:00,\nName: ___ Unit No: _...
1,10000980,26913865,39765666,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 08:42:00,2189-06-27 20:38:27,0.497535,10000980-DS-21,DS,21,2189-07-03 00:00:00,2189-07-03 19:50:00,\nName: ___ Unit No: ___\n \nAdmi...
2,10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.118032,10001217-DS-4,DS,4,2157-11-25 00:00:00,2157-11-25 17:26:00,\nName: ___ Unit No: ___\n \n...
3,10001217,27703517,34592300,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-19 15:42:24,2157-12-20 14:27:41,0.948113,10001217-DS-5,DS,5,2157-12-24 00:00:00,2157-12-24 15:57:00,\nName: ___ Unit No: ___\n \n...
4,10001725,25563031,31205490,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2110-04-11 15:52:22,2110-04-12 23:59:56,1.338588,10001725-DS-12,DS,12,2110-04-14 00:00:00,2110-04-19 17:44:00,\nName: ___ Unit No: ___\n \nA...


In [4]:
# Pick a single admission and show its discharge summary.
# (Alternative: take an id from the data, e.g. df_icu_discharge["hadm_id"].unique()[100].)
test_hadm_id = 28994087
is_test_admission = df_icu_discharge['hadm_id'].eq(test_hadm_id)
df_example = df_icu_discharge.loc[is_test_admission]
# First matching row; raises IndexError if the admission is not in the loaded rows.
discharge_summary = df_example['text'].iloc[0]

print(discharge_summary)

 
Name:  ___                 Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: MEDICINE
 
Allergies: 
Codeine
 
Attending: ___
 
Chief Complaint:
Difficulty in breathing
 
Major Surgical or Invasive Procedure:
none

 
History of Present Illness:
The patient is a ___ year-old female with a history of NSCLC 
(stage IV) who presents with shortness of breath.
.
The patient was in her usual state of health until the evening 
before admission when she began to feel somewhat short of 
breath.  The next morning, this sensation persisted, so she 
became concerned.  She also reports a few day history of a 
non-productive cough.  Denies sick contacts, recent travel or 
sedentary lifestyle.  She denied chest pain, fever, chills, 
dizziness, lightheadedness or syncope.  She presented to the ED 
where she was found to be hypoxic to the ___ on room air.
.  
In the ED, she was placed on a non-rebreather with sats up to 
the h

In [9]:
# Build the prompt: the discharge summary followed by the extraction instruction
# (here: the discharge diagnosis).
summary_text = discharge_summary
question_text = "Given the discharge summary above. extract discharge diagnosis from the given text, just give the answer briefly."
input_text = " ".join((summary_text, question_text))

In [10]:
# Send the prompt as a single user message (no streaming) and print the reply text.
messages = [{"role": "user", "content": input_text}]
response = ollama.chat(model="llama3.1:8b", messages=messages, stream=False)

print(response["message"]["content"])

# post-obstructive pneumonia
# non small cell lung cancer stage IV, progressing


In [11]:
# Same extraction pattern as for the diagnosis, this time asking for the
# medication names mentioned in the discharge summary.
summary_text = discharge_summary
question_text = "Given the discharge summary above. extract medication the patient should take from the given text, just give the medication name."
input_text = " ".join((summary_text, question_text))

# One user message, non-streaming; only the reply text is printed.
user_message = {"role": "user", "content": input_text}
response = ollama.chat(model="llama3.1:8b", messages=[user_message], stream=False)
print(response["message"]["content"])

Here is the list of medications:

1. atorvastatin
2. calcitriol
3. clopidogrel
4. folic acid
5. furosemide
6. loperamide
7. lorazepam
8. metoprolol tartrate
9. tramadol
10. trazodone
11. aspirin
12. ranitidine HCl
13. levofloxacin
14. docusate sodium


In [12]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import random
import json
from tqdm import tqdm
import cv2
import torch
import time
import gc

In [13]:
# Local dataset roots (judging by the paths: MIMIC-IV and MIMIC-IV-Note, v2.2).
# NOTE(review): these are absolute, machine-specific paths.
ehr_data_dir = "/home/mengliang/DatasetFolder/mimiciv/2.2"
note_data_dir = "/home/mengliang/DatasetFolder/mimic-iv-note/2.2"

# ICU stays table (icustays.csv.gz)
icu_icustays_path = os.path.join(ehr_data_dir, "icu/icustays.csv.gz")
df_icu_icustays = pd.read_csv(icu_icustays_path, compression="gzip", index_col=False)

# Discharge notes table (discharge.csv.gz)
note_discharge_path = os.path.join(note_data_dir, "note/discharge.csv.gz")
df_note_discharge = pd.read_csv(note_discharge_path, compression='gzip', index_col=False)

# Join every ICU stay with the discharge note(s) of the same patient and admission.
# NOTE(review): if an admission has several ICU stays, its note will appear once per stay — confirm.
icu_discharge_merged = df_icu_icustays.merge(df_note_discharge, on=['subject_id', 'hadm_id'])
icu_discharge_merged.head()

# Limit the run to the first 10 merged rows.
discharge_test = icu_discharge_merged.iloc[:10]

# load discharge summary question dictionary
def parse_questions(file_path):
    """Parse a question-bank text file into templates and their paraphrases.

    The file is read as groups of non-blank lines separated by one or more
    blank (or whitespace-only) lines.  Within each group the first line is the
    "mother" question (the template) and every following line is a "child"
    question (a paraphrase of that template).  Lines are stripped of
    surrounding whitespace.

    Args:
        file_path: Path to the UTF-8 encoded text file.

    Returns:
        A tuple ``(mother_questions, child_questions)`` where
        ``mother_questions`` is a list of template strings and
        ``child_questions`` is a list of lists; ``child_questions[i]`` holds
        the paraphrases of ``mother_questions[i]`` and may be empty.  Both
        lists are empty for a file without any non-blank line.

    Raises:
        OSError: If the file cannot be opened.
    """
    mother_questions = []
    child_questions = []
    # True at the start of the file and after every blank line: the next
    # non-blank line opens a new group.
    start_new_group = True

    # 'utf-8-sig' reads plain UTF-8 unchanged but also drops a leading byte
    # order mark; str.strip() would leave it glued to the first template.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                start_new_group = True
            elif start_new_group:
                mother_questions.append(line)
                child_questions.append([])
                start_new_group = False
            else:
                child_questions[-1].append(line)

    return mother_questions, child_questions

# Example usage:
file_path = 'files/discharge_summary_questions.txt'  # replace with the path to your txt file
mother_questions, child_questions = parse_questions(file_path)

# Print the parsed templates and their paraphrase lists
print(f"question template list: {mother_questions}")
print(f"question dict list {child_questions}")

# Use DataParallel
#if torch.cuda.device_count() > 1:
#    model = torch.nn.DataParallel(model)

# Extract answers from each discharge summary with the LLM and save one JSON
# file per (subject_id, hadm_id).
output_dir = "outputs/discharge_summary_qa_llama3.1:8b"
# open(..., 'w') does not create missing directories, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)

# The output file name depends only on (subject_id, hadm_id).  Remember which
# (subject, admission, note text) combinations were already handled so an
# exact repeat of a row is not sent through the model again just to overwrite
# the same file.
processed_keys = set()

for _, row in tqdm(discharge_test.iterrows(), total=discharge_test.shape[0]):
    start_time = time.time()
    subject_id = row['subject_id']
    hadm_id = row['hadm_id']
    discharge_summary = row['text']

    row_key = (subject_id, hadm_id, discharge_summary)
    if row_key in processed_keys:
        continue
    processed_keys.add(row_key)

    # Ask one randomly chosen paraphrase per question template.
    question_answer_pairs = []
    for template, child_list in zip(mother_questions, child_questions):
        # A template without paraphrases would make random.randrange(0, 0)
        # raise ValueError; fall back to asking the template itself.
        question = random.choice(child_list) if child_list else template

        input_text = discharge_summary + " " + question

        response = ollama.chat(model="llama3.1:8b",
                               messages=[{"role": "user", "content": input_text}],
                               stream=False)
        answer = response["message"]["content"]
        # No torch.cuda.empty_cache()/gc.collect() here: this kernel allocates
        # no tensors of its own; the model is served through ollama.

        question_answer_pairs.append({"question_template": template,
                                      "question": question,
                                      "answer": answer})

    data = {"hadm_id": hadm_id,
            "subject_id": subject_id,
            "question_answer_pairs": question_answer_pairs}

    # Wall-clock time spent on this discharge summary.
    elapsed_time = time.time() - start_time
    print(f"Processing time for subject_id {subject_id}, hadm_id {hadm_id}: {elapsed_time:.2f} seconds")

    # Build the file name from subject_id and hadm_id and save the data as JSON.
    # A dedicated name is used so the question-bank `file_path` is not clobbered.
    file_name = f"{subject_id}_{hadm_id}.json"
    output_path = os.path.join(output_dir, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


question template list: ['What are the allergies of the patient?', "What is the patient's chief complaint?", 'What is the major surgical or invasive procedure according to the discharge summary?', 'Describe the illness history according to the discharge summary.', 'What is the medical history of the patient?', 'What is the family history of the patient?', 'Describe the hospital course briefly.', 'What medication on admission is given to the patient?', 'What medication on discharge is given to the patient?', 'What is the discharge disposition of the patient?', 'List the discharge diagnosis of the patient.', 'What is the discharge condition of the patient?', 'What follow-up care was recommended after discharge?', 'What are the discharge instructions for the patient?', 'List the medication name and dosage on discharge of the patient. Each medication has one row.']
question dict list [['What allergies does the patient have?', 'What is the patient allergic to?', 'Can you list the patient’s 

 10%|█         | 1/10 [00:28<04:13, 28.15s/it]

Processing time for subject_id 10000032, hadm_id 29079034: 28.15 seconds


 20%|██        | 2/10 [01:03<04:18, 32.35s/it]

Processing time for subject_id 10000980, hadm_id 26913865: 35.29 seconds


 30%|███       | 3/10 [01:28<03:22, 28.99s/it]

Processing time for subject_id 10001217, hadm_id 24597018: 24.99 seconds


 40%|████      | 4/10 [01:48<02:32, 25.46s/it]

Processing time for subject_id 10001217, hadm_id 27703517: 20.05 seconds


 50%|█████     | 5/10 [02:24<02:25, 29.17s/it]

Processing time for subject_id 10001725, hadm_id 25563031: 35.74 seconds


 60%|██████    | 6/10 [02:47<01:48, 27.15s/it]

Processing time for subject_id 10001884, hadm_id 26184834: 23.22 seconds


 70%|███████   | 7/10 [03:18<01:25, 28.42s/it]

Processing time for subject_id 10002013, hadm_id 23581541: 31.05 seconds


 80%|████████  | 8/10 [03:40<00:52, 26.27s/it]

Processing time for subject_id 10002155, hadm_id 20345487: 21.64 seconds


 90%|█████████ | 9/10 [04:12<00:28, 28.09s/it]

Processing time for subject_id 10002155, hadm_id 23822395: 32.11 seconds


100%|██████████| 10/10 [04:39<00:00, 27.98s/it]

Processing time for subject_id 10002155, hadm_id 28994087: 27.53 seconds



