In [2]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from os import listdir
from os.path import isfile, join

# some comment

embeddings = HuggingFaceEmbeddings(
    model_name = 'emilyalsentzer/Bio_ClinicalBERT'
)
 
text = "This is a test document."
query_result = embeddings.embed_query(text)


  from .autonotebook import tqdm as notebook_tqdm
No sentence-transformers model found with name emilyalsentzer/Bio_ClinicalBERT. Creating a new one with MEAN pooling.


In [3]:
#onlyfiles = [f for f in listdir('top_1000_txt') if isfile(join('top_1000_txt', f))]
onlyfiles=['note_1227.txt']
raw_documents = []
for file in onlyfiles:
    print(file)
    raw_doc = TextLoader(f'23 txt/{file}').load()
    raw_documents.extend(raw_doc)

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

note_1227.txt


In [4]:
#onlyfiles = [f for f in listdir('top_1000_txt') if isfile(join('top_1000_txt', f))]
onlyfiles=['note_417.txt']
raw_documents = []
for file in onlyfiles:
    print(file)
    raw_doc = TextLoader(f'23 txt/{file}').load()
    raw_documents.extend(raw_doc)

text_splitter = CharacterTextSplitter(chunk_size=512, chunk_overlap=24)

note_417.txt


In [4]:
documents = text_splitter.split_documents(raw_documents)
db = Chroma.from_documents(documents, embeddings)

In [5]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

In [6]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# 设置 Hugging Face 管道
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=3000,
    temperature=0,
    top_p=0.95,
    repetition_penalty=1.15
)

# 创建本地 LLM
local_llm = HuggingFacePipeline(pipeline=pipe)

# 创建检索器
retriever = db.as_retriever(search_kwargs={"k": 1})

# 创建 QA 链
qa_chain = RetrievalQA.from_chain_type(
    llm=local_llm, 
    chain_type="stuff", 
    retriever=retriever, 
    return_source_documents=True
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [7]:
from operator import itemgetter

from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

In [8]:
local_llm = HuggingFacePipeline(pipeline=pipe)

llm = HuggingFacePipeline.from_model_id(
    model_id="google/flan-t5-small",
    task="text2text-generation",
    model_kwargs={"max_length": 3000},
)

template = """You are an expert clinical assistant. You will receive a collection of clinical notes. Your task is to retrieve relevant information from these notes and give an answer in response to the question. Answer the question accurately based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

chain = (
    {"context": retriever, "question": RunnablePassthrough()} 
    | prompt 
    | llm 
    | StrOutputParser()
)

In [25]:
chain.invoke("Does the patient have diabetes?")

'No'

In [26]:
#1227
from datasets import Dataset
questions= [
    "What will make patient return to the hospital?",
    "What was patient found in left parietal?",
    "What was patient diagnosed with?",
    "What dose patient present from nursing home?",
    "What was patient started on?",
    "Does the patient have diabetes?",
    "Did the patient have hypertension?"
]


ground_truths = [
    ["The patient will return to the hospital if experiencing chest pain, shortness of breath, high fever, or any mental status change."],
    ["He was found to have an embolic stroke in left parietal, left internal capsule region."],
    ["He was diagnosed with HIT and Afib."],
    ["Tachypnea for 2 days, intermittent fevers."],
    ["Dialysis."],
    ["Yes."],
    ["Yes."]
]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-1227txt')

Saving the dataset (1/1 shards): 100%|██████████| 7/7 [00:00<00:00, 1228.35 examples/s]


In [37]:
#1031
from datasets import Dataset
questions= [
    "What is the patient's age and gender?",
    "What did the patient have a history of?",
    "Why did the patient come to the hospital?",
    "What medications was she discharged with?",
    "What were the patient's vital signs on admission?",
    "Did the patient have no known drug allergies?",
    "What did the patient use tobacco, alcohol, and other drug?"
]


ground_truths = [
    ["The patient is a 57-year-old female."],
    ["Hypertension and hypercholesterolemia."],
    ["She came due to chest pain that started at 4:00 p.m."],
    ["She was discharged on Aspirin, Lisinopril, Toprol XL, Coumadin, Plavix, Lovenox, and Pravastatin."],
    ["Vital signs: On admission, the patient was afebrile, blood pressure 146/88, pulse 71, respirations 18."],
    ["Yes"],
    ["The patient reported no tobacco use and denied any alcohol or other drug use."]
]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-1031txt')

Saving the dataset (1/1 shards): 100%|██████████| 7/7 [00:00<00:00, 1395.51 examples/s]


In [33]:
#1025
from datasets import Dataset
questions= [
    "What did the chest x-ray show?",
    "What is the patient’s code status?",
    "What did the patient undergo?",
    "What are the patient's primary medical problems?",
    "What imaging studies were performed?",
    "When was the patient made DNR/DNI?",
    "What did serial chest x-ray show?"
]


ground_truths = [
    ["Moderate cardiomegaly with some bilateral opacifications in the left upper and right upper lobes with multifocal pneumonia."],
    ["he patient was made DNR/DNI after multiple discussions with her family."],
    ["The patient underwent a sleep and swallow study and modified barium study."],
    ["Multiple medical problems including atrial fibrillation, chronic obstructive pulmonary disease, coronary artery disease, congestive heart failure."],
    ["Serial Chest X-rays, chest CT Scan."],
    ["the patient was made DNR/DNI after multiple discussions with her family, as well as proxyholder."],
    ["Serial chest x-ray showed moderate cardiomegaly with some bilateral opacifications in the left upper and right upper lobes with multifocal pneumonia."]
]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-1025txt')

Saving the dataset (1/1 shards): 100%|██████████| 7/7 [00:00<00:00, 1465.00 examples/s]


In [19]:
#1019
from datasets import Dataset
questions= [
    "What is the patient's age and gender?",
    "What is the patient's past medical history?",
    "Does the patient have any known drug allergies?",
    "What is the patient's vital signs on arrival at the hospital?",
    "What is the patient's diagnosis?",
    "What were the results of the electrolyte panel?",
    "What were the results of the CBC (Complete Blood Count)?"
]


ground_truths = [
    ["The patient is an 83-year-old female."],
    ["The patient has a history of hypertension and hypercholesterolemia."],
    ["No known drug allergies."],
    ["Temperature 97.0, blood pressure 133/77, pulse 51, respiratory rate 14."],
    ["The patient is an 83-year-old female with a history of hypertension, hypercholesterolemia, who presented with a large intraparenchymal cerebral hemorrhage complicated by obstructive hydrocephalus."],
    ["Sodium: 142 mmol/L, potassium: 4.0 mmol/L, chloride: 106 mmol/L, bicarb: 21 mmol/L, BUN: 15 mg/dL, creatinine: 0.6 mg/dL, glucose: 145 mg/dL."],
    ["CBC is as follows: White blood cell count 11.1, hematocrit 42.7, platelets 183."]
]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-1019txt')

Saving the dataset (1/1 shards): 100%|██████████| 7/7 [00:00<00:00, 1277.81 examples/s]


In [30]:
#832
from datasets import Dataset
questions= [
    "What were the patient's conditions on discharge?",
    "What medications is the patient currently taking?",
    "Does the patient have a history of diabetes?",
    "What is the patient’s blood pressure on examination?",
    "What is the patient’s baseline creatinine level? ",
    "What is the patient's age?",
    "What follow-up instructions were given to the patient?"
]


ground_truths = [
    ["The patient was discharged home in stable condition post-CABG, with a history of myocardial infarction, hypercholesterolemia, insulin-dependent diabetes mellitus, hypertension, and possible chronic renal insufficiency."],
    ["The patient is taking NPH insulin 45 units subcutaneously twice per day, Lipitor 80 mg by mouth once per day, Aspirin, Norvasc, Zocor, Lopressor 25 mg by mouth twice per day, Heparin, and Intravenous Integrilin."],
    ["Yes, the patient has a history of insulin-dependent diabetes mellitus."],
    ["139/65"],
    ["1.3"],
    ["62"],
    ["He was continued on his heparin, nitroglycerin, and Integrilin drips.|The patient was discharged to home on .|1.  Metoprolol 20 mg by mouth twice per day.|.  Lasix 20 mg by mouth twice per day (times seven days).|.  Colace 100 mg by mouth twice per day.|.  Aspirin 325 mg by mouth once per day.|.  Percocet 5/325-mg tablets one to two tablets by mouth q.4h. as needed (for pain).|.  Lipitor 80 mg by mouth once per day.|.  Captopril 25 mg by mouth three times per day.|1.  Status post coronary artery bypass graft times four.|.  Status post myocardial infarction times two.|.  Status post percutaneous transluminal coronary angioplasty with stent in .|.  Hypercholesterolemia.|.  Insulin-dependent diabetes mellitus.|.  Hypertension.|.  Question chronic renal insufficiency.|The patient was discharged to home.|Condition on discharge was stable on ."]
]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-832txt')

Saving the dataset (1/1 shards): 100%|██████████| 7/7 [00:00<00:00, 408.77 examples/s]


In [22]:
#826
from datasets import Dataset
questions= [
    "Does the patient have any known drug allergies?",
    "What is the patient's smoking history?",
    "What was the cause of the patient's death?",
    "What was the patient admitted for?",
    "Has the patient got a history of diabetes?",
    "Does the patient use cigarettes or alcohol?",
    "Did the patient take imaging studies?"
]


ground_truths = [
    ["Patient recorded as having No Known Allergies to Drugs."],
    ["Prior 70 pack year smoking history, no EtOH."],
    ["GI bleed respiratory failure"],
    ["Evaluation of new right-sided weakness."],
    ["Yes."],
    ["Smoke."],
    ["Yes."]
]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-826txt')

Saving the dataset (1/1 shards): 100%|██████████| 7/7 [00:00<00:00, 1369.09 examples/s]


In [19]:
#629
from datasets import Dataset
questions= [
    "Does the patient have any known drug allergies?",
    "How often does the patient consume alcohol?",
    "What symptoms should prompt the patient to call the doctor's office or seek immediate medical attention?",
    "What are the current findings regarding the chest tubes?",
    "Are there any new lung findings on the recent imaging?",
    "What should the patient do regarding tube feeds?",
    "Did the patient take imaging studies?"
]


ground_truths = [
    ["No,patient recorded as having no known allergies to drugs."],
    ["Five times per week(+EtOH 5x/wk)."],
    ["Fever, chest pain, shortness of breath, nausea, vomiting, constipation, abdominal pain, diarrhea."],
    ["Two left-sided chest tubes are in unchanged position. A tiny left apical pneumothorax is stable in appearance. A small left-sided pleural effusion and atelectasis within the left lower lobe are unchanged."],
    ["No new opacities are seen within the lungs."],
    ["The patient should remain NPO (nothing by mouth) and continue with tube feeds for the next 3 weeks, as per VNA instructions."],
    ["Yes."]
]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-629txt')

Saving the dataset (1/1 shards): 100%|██████████| 7/7 [00:00<00:00, 1046.67 examples/s]


In [13]:
#615
from datasets import Dataset
questions= [
    "What is the patient's age and gender?",
    "What caused the patient's motor vehicle collision?",
    "What was the patient's GCS score upon arrival?",
    "What were the injuries sustained by the patient?",
    "What did the patient's official CT read show?",
    "What are the patient's current medications?",
    "Does the patient have any known drug allergies?"
]


ground_truths = [
    ["The patient is a 22 year old male."],
    ["The patient fell asleep at the wheel."],
    ["The patient arrived with a GCS of 5."],
    ["The patient sustained a grade III/IV splenic laceration, small left pneumothorax, pulmonary contusions, left rib fractures (ribs eight, nine, and ten), small pelvic rami superior and inferior fracture with intramuscular hematoma."],
    ["The patient's official CT read showed a splattered spleen with hemoperitoneum, grade III/IV splenic laceration, the hilum appeared to be intact."],
    ["Wellbutrin 200 mg orally twice a day, Zoloft 50 mg orally once daily, and Ambien as needed."],
    ["No."]
]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-615txt')

Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors
Saving the dataset (1/1 shards): 100%|██████████| 7/7 [00:00<00:00, 1011.37 examples/s]


In [37]:
#601
from datasets import Dataset
questions= [
    "What is the patient's age?",
    "What were her vital signs in the emergency room?",
    "What did CXR show?",
    "Why was she transferred to the CCU?",
    "What did the patient transfer to the CCU for?",
    "What did the patient diagnosed with?",
    "What did the patient discharge?"
  
]


ground_truths = [
    ["The patient is a 22 year old male."],
    ["The patient fell asleep at the wheel."],
    ["The patient arrived with a GCS of 5."],
    ["The patient sustained a grade III/IV splenic laceration, small left pneumothorax, pulmonary contusions, left rib fractures (ribs eight, nine, and ten), small pelvic rami superior and inferior fracture with intramuscular hematoma."],
    ["The patient was transferred to the CCU for further monitoring and management."],
    ["Rheumatic heart disease, rapid atrial-fibrillation, strong smoking history, and COPD who presents with shortness of breath."],
    ["The patient was discharged on diltiazem for rate control and with a heart monitor."]
                 ]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-601txt')

Saving the dataset (1/1 shards): 100%|██████████| 7/7 [00:00<00:00, 1482.09 examples/s]


In [40]:
#563
from datasets import Dataset
questions= [
    "What is the patient's age?",
    "What were her vital signs in the emergency room?",
    "Has the patient got a history of diabetes?",
    "Is the patient scheduled to follow up with a pulmonologist?",
    "Did the patient receive medications in the emergency room?",
    "Did the patient take imaging studies?",
    "Was imaging done to assess the patient's lung condition?"
]

ground_truths = [
    ["55"],
    ["Temperature 97.3°F, heart rate 109, blood pressure 101/69, respiratory rate 20, oxygen level 98% on 6 liters of oxygen, 94% on room air."],
    ["No."],
    ["Yes, she is scheduled to follow up with a pulmonologist."],
    ["Yes, she was treated with Combivent nebulizations three times, received 5 liters of normal saline intravenously, was given Levofloxacin 500 mg intravenously for suspected pneumonia, and received Solumedrol 125 mg intravenously to reduce airway inflammation."],
    ["Yes"],
    ["Yes, chest X-ray was performed."]
                 ]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-563txt')

Saving the dataset (1/1 shards): 100%|██████████| 7/7 [00:00<00:00, 1107.80 examples/s]


In [41]:
#417
from datasets import Dataset
questions= [
    "What is the patient's age?",
    "What is the patient's medical history?",
    "What allergies did the patient have?",
    "What were the patient's main symptoms upon presenting to the Emergency Department?",
    "What were the patient's vital signs in the Emergency Department? ",
    "What did the patient's chest X-ray show?",
    "What were the main diagnoses for the patient?"
]

ground_truths = [
    ["89"],
    ["A history of chronic obstructive pulmonary disease and ITP, who presented to the Emergency Department after a few hour history of chest and abdominal discomfort, increasing shortness of breath, and nausea with an episode of vomiting x1. "],
    ["Penicillin produces a rash,Aspirin produces GI irritation."],
    ["He presented to the Emergency Department after a few hour history of chest and abdominal discomfort, increasing shortness of breath, and nausea with an episode of vomiting x1. He notes chest pressure with radiation to the back into the left arm, severity and associated epigastric discomfort with nausea and vomiting x1 in the Emergency Department."],
    ["Temperature 100.5, blood pressure 99/50, heart rate 126, respiratory rate 32 decreasing to 34 with nebulizer treatment, and O2 saturation 94% on 2 liters."],
    ["Left lower lung zone opacity, mild congestive heart failure."],
    ["An 89-year-old male with a history of chronic obstructive pulmonary disease and ITP, who presented with fever, elevated white count, and evidence of pneumonia on chest x-ray with suspected sepsis and chronic obstructive pulmonary disease exacerbation."]
                 ]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-417txt')

Saving the dataset (1/1 shards): 100%|██████████| 7/7 [00:00<00:00, 1528.38 examples/s]


In [11]:
#403
from datasets import Dataset
questions= [
    "What is the patient's age?",
    "Did the patient have any allergies to drugs?",
    "What was the temperature of the patient at nursing home?",
    "What findings were noted on the CXR?",
    "Who was scheduled for the patient's follow-up care?",
    "What measures were taken for the patient during his hospital stay?"
]

ground_truths = [
    ["89"],
    ["A history of chronic obstructive pulmonary disease and ITP, who presented to the Emergency Department after a few hour history of chest and abdominal discomfort, increasing shortness of breath, and nausea with an episode of vomiting x1. "],
    ["99.6"],
    ["CXR showed new LLL opacity."],
    ["neurologist Dr."],
    ["During his hospital stay his LFTs/bili and coags remained stable.  He underwent an abdominal U/S of liver w/ normal TIPS evaluation with wall-to-wall flow. No ascites identified.  He was continued on lactulose and rifaxamin."]
                 ]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-403txt')

Saving the dataset (1/1 shards): 100%|██████████| 6/6 [00:00<00:00, 1154.98 examples/s]


In [28]:
#371
from datasets import Dataset
questions= [
    "What is the patient's age?",
    "What are the discharge instructions for the patient?",
    "What surgery did the patient undergo?",
    "What medications is the patient prescribed?",
    "What imaging studies were performed on the patient?",
    "How was the patient's lung and heart status postoperatively?",
    "What conditions did patient get a history of?"

]

ground_truths = [["66"],
                 ["Patient will be discharged to home with instructions to followup with Dr.  in  weeks, and he will have a follow-up CT angiogram here in about one month."],
                 ["An endovascular repair for an abdominal aortic aneurysm."],
                 ["Atorvastatin 20 mg daily, Colchicine 0.6 mg daily, Digoxin 0.25 mg daily, Lasix 120 mg daily, Lopressor XL 150 mg daily, Moexipril 7.5 mg daily, Percocet 1-2 tablets every 4-6 hours as needed, Coumadin 5 mg daily at bedtime, Levaquin 500 mg daily for 10 days"],
                 ["CT scan."],
                 ["The patient's pulmonary status improved after the diuresis, and patient subsequently underwent a bronch, which showed no plugging, no secretions, and no signs of CHF."],
                 ["Coronary artery disease, CHF,Abdominal aortic aneurysm"]
                 ]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-371txt')

Saving the dataset (1/1 shards): 100%|██████████| 7/7 [00:00<00:00, 1596.70 examples/s]


In [40]:
#365
from datasets import Dataset
questions= [
    "How old is the patient?",
    "What medications are the patient taking?",
    "Has the patient got a history of diabetes?",
    "What did initial lab data show?",
    "What is the plan for the patient's follow-up?",
    "What did the patiet do well in?"

]

ground_truths = [
    ['52'],
    ['Cyclosporin 100 mg b.i.d., Neurontin 100 mg t.i.d., Lipitor 20 mg q.d., Fentanyl patch 25 q.72, sliding scale Insulin, Cefazolin IV, Cipro 500 q.d., Mucomyst 600 b.i.d., Colace 100 b.i.d., Lasix 80 p.o. b.i.d., Lopressor 75 mg p.o. b.i.d., Prednisone taper, Zantac, Enteric Coated Aspirin q.d., NPH 4 U subcue at 6:30 a.m. and 4:30 p.m.'],
    ['Yes, he has a history of diabetes.'],
    ['A white count which was 19.1, hematocrit 30.0, platelet count 66,000; INR 1.1, PTT 58.0; sodium 128, potassium 4.3, chloride 93, bicarb 23, BUN 78, creatinine 1.8, glucose 255, CK 45, calcium 8.5, magnesium 2.2, phosphorus 4.2; cyclosporin level was normal at 151.'],
    ['The patient should follow-up with his primary cardiologist Dr.   in one month.  He will be evaluated at the  Hospital for possible AKA. Cardiology recommended that the surgery be done emergently as our Vascular surgeons feel that it should be postponed for two weeks to one month until the Plavix can be discontinued, as it is imperative that the Aspirin and Plavix not be stopped even for an operation, as he at high risk for stent reclosure.'],
    ['He did well continuing on his Cyclosporin for status post renal transplant.']
]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-365txt')

Token indices sequence length is longer than the specified maximum sequence length for this model (1218 > 512). Running this sequence through the model will result in indexing errors
Saving the dataset (1/1 shards): 100%|██████████| 6/6 [00:00<00:00, 2696.43 examples/s]


In [35]:
#359
from datasets import Dataset
questions= [
    "How old is the patient?",
    "What surgery did the patient undergo?",
    "What is the patient's blood pressure?",
    "What allergies did the patient have?",
    "What imaging studies were performed on the patient?",
    "Has the patient got a history of diabetes",
    "Were there any problems after the operation?"
]

ground_truths = [
    ['76'],
    ['Elective hip surgery, specifically an open reduction and internal fixation of the left hip, for revision due to nonunion of a previous hip fracture'],
    ['BP 156/70'],
    ['The patient is allergic to sulfa drugs, which cause a rash.'],
    ['Chest x-ray,CT scan of the head'],
    ['Yes'],
    ['No, there were no complications.']
]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-359txt')

Saving the dataset (1/1 shards): 100%|██████████| 7/7 [00:00<00:00, 1255.40 examples/s]


In [35]:
#239
from datasets import Dataset
questions= [
    "What are the patient's vital signs upon admission?",
    "Describe the patient's cardiovascular status",
    "What is the patient's blood pressure reading?",
    "Does the patient have any allergies?",
    "What are the patient's conditions?",
    "What medication is the patient taking?"

]

ground_truths = [["BP 149/86, heart rate 72, respiratory rate 16, and saturations 100%, intubated on ventilator."],
                 ["Cardiovascular: Regular rate and rhythm, harsh S1 and S2 sounds, no murmur."],
                 ["BP 149/86"],
                 ["No known allergies."],
                 ["Intubated, young-appearing man attempting to pull at the ET tube with his left hand. HEENT: Nonicteric. Neck: Supple, no carotid bruits. Chest was clear to auscultation. Cardiovascular: Regular rate and rhythm, harsh S1 and S2 sounds, no murmur. Abdomen: Soft, nontender, positive bowel sounds. Extremities: No edema. Neurologically: Does not open eyes to voice or painful stimulation. Cranial nerves: Pupils| mm down to 1 bilaterally. EOMs full. Positive doll's eyes. Corneal reflexes: Absent bilaterally. Facial symmetry: ET restricts the lower face, but upper face appears wrinkling, symmetrically. Gag reflex: Gagging on the ET. Motor: Increased tone in all four extremities. Moves left side spontaneously, reaching and grabbing for the ET tube with the left hand. No spontaneous movement of the right hemibody. Decerebrate posturing of the right arm with pain and flexes knees and ankle with pain applied to both legs. Purposely withdraws, localizes with the left arm."],
                 ["Coumadin"]
]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-239txt')

Saving the dataset (1/1 shards): 100%|██████████| 6/6 [00:00<00:00, 271.97 examples/s]


In [48]:
#205
from datasets import Dataset
questions= [
    "Why was the patient admitted to the hospital?",
    "What allergie did the patient have?",
    "What imaging studies were performed on the patient?",
    "What medication is the patient taking?",
    "What were the results of chest CT?",
    "What follow-up instructions were given to the patient?"

]

ground_truths = [["He was stabbed in the back three times."],
                 ["No known drug allergies."],
                 ["Chest and pelvic x-rays were normal. Head CT was negative. Abdominal CT was within normal limits. Chest CT showed a right pneumothorax."],
                 ["Methadone 5 mg po b.i.d. for five days only with Ibuprofen 400 mg t.i.d. Cefalexin 500 mg b.i.d. times four days. Pantoprazole 40 mg po q.d. Dilaudid 2 mg po q 6 hours prn for five days."],

                 ["Chest CT showed a right pneumothorax."],
                 ["The patient can follow up at the Trauma Clinic if there are any new developments with the wounds or problems relating to this injury. Otherwise he is to obtain a primary care physician for long term healthcare."]
]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-205txt')

Saving the dataset (1/1 shards): 100%|██████████| 6/6 [00:00<00:00, 432.66 examples/s]


In [None]:
#198
from datasets import Dataset
questions= [
    "Why was the patient admitted to the hospital?",
    "What allergie did the patient have?",
    "What imaging studies were performed on the patient?",
    "What medication is the patient taking?",
    "What were the results of chest CT?",
    "What follow-up instructions were given to the patient?"

]

ground_truths = [["He was stabbed in the back three times."],
                 ["No known drug allergies."],
                 ["Chest and pelvic x-rays were normal. Head CT was negative. Abdominal CT was within normal limits. Chest CT showed a right pneumothorax."],
                 ["Methadone 5 mg po b.i.d. for five days only with Ibuprofen 400 mg t.i.d. Cefalexin 500 mg b.i.d. times four days. Pantoprazole 40 mg po q.d. Dilaudid 2 mg po q 6 hours prn for five days."],

                 ["Chest CT showed a right pneumothorax."],
                 ["The patient can follow up at the Trauma Clinic if there are any new developments with the wounds or problems relating to this injury. Otherwise he is to obtain a primary care physician for long term healthcare."]
]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-198txt')

Saving the dataset (1/1 shards): 100%|██████████| 6/6 [00:00<00:00, 1405.52 examples/s]


In [None]:

#167
from datasets import Dataset
questions= [
    "What is the patient's age?",
    "What medications is the patient currently taking for maintenance?",
    "Has the patient got a history of drug?",
    "What allergies did the patient have?",
    "What were the results of the patient's chest X-ray on admission?",
    "How many loose stools does the patient report per day?"

]

ground_truths = [["51 years old"],
                 ["Methadone 145 mg q. d.|Lives in .|History of alcohol abuse.  Denies any cigarette smoking.|On admission, temperature 98.1, heart rate 65, blood pressure 122/84 with a respiratory rate of 18,|8% on room air.  He was alert and oriented, no acute distress.  Lungs clear to auscultation bilaterally.  Heart regular rate and rhythm.  Abdomen soft, obese, positive bowel sounds, nonfocal abdomen, tender, negative rebound, negative hemorrhoids, positive umbilical hernia.  Rectal, guaiac negative, no hemorrhoids.  Extremities, positive venous changes and no edema.|On admission, the patient had a chest x-ray that showed no acute process, no suspicious nodules.  EKG on , sinus bradycardia, rate 49, inferior lateral flat T-waves.  On , CT of the thorax was stable, no lesions.|The patient was admitted to the transplant service.  He was made NPO after midnight.  IV fluid was started.  Lab work was sent off to use preop for the OR. Labs preop:  White count 3.7, hematocrit"],
                 ["Yes, he had a history of alcohol and IV drug abuse."],
                 ["Penicillin"],
                 ["The chest x-ray showed no acute process, no suspicious nodules."],
                 ["x10 per 24 hours"]
]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-167txt')

Saving the dataset (1/1 shards): 100%|██████████| 6/6 [00:00<00:00, 2442.10 examples/s]


In [None]:
#79
from datasets import Dataset
questions= [
    "What were the maternal conditions during pregnancy?",
    "What were the newborn's initial clinical findings?",
    "What respiratory support did the newborn receive?",
    "What is the screen status of the newborn?",
    "What medication is the patient taking?",
    "What imaging studies were performed?"

]

ground_truths = [["She was born to a 30-year-old G1/P0 (to 1) woman whose pregnancy was notable for an admission to with cervical shortening on. She was treated with bed rest and betamethasone. Mother's history is notable for insulin-dependent diabetes and a seizure disorder (for which she is being treated with Trileptal)."],
                 ["Birth weight was 775 grams. She is a patent nondysmorphic infant with a foul smell noted. Skin with bruising noted about the trunk. HEENT exam was within normal limits. Cardiovascular exam revealed S1 and S2 without murmur. Lungs revealed coarse breath sounds bilaterally. The abdomen was benign. Neurologic exam was nonfocal. Tone was slightly decreased throughout. The patient was moving all 4 extremities. Hips were normal. Anus was patent. The spine was intact."],
                 ["The patient was intubated in the delivery room. Received 2 doses of surfactant and is currently in SIMV at settings of 16/5 at a rate of 18. FiO2 is room air."],
                 ["Has not been sent."],
                 ["The patient is currently on ampicillin and gentamicin."],
                 ["The patient was started on phototherapy for an 8-hour bilirubin level of 3.5/4/0.2."]
]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
dataset.save_to_disk('t5-79txt')

Token indices sequence length is longer than the specified maximum sequence length for this model (675 > 512). Running this sequence through the model will result in indexing errors
Saving the dataset (1/1 shards): 100%|██████████| 6/6 [00:00<00:00, 2249.56 examples/s]


In [None]:
#45
from datasets import Dataset
questions= [
    "Does the patient have any known drug allergies?",
    "What languages does the patient speak?",
    "What was the patient's sodium level at 04:51 PM?",
    "What was the patient's albumin level?",
    "What abnormalities were found on the MRI?",
    "What were the findings on the CT?"
]

ground_truths = [["No."],
                 ["Italian, some english"],
                 ["SODIUM-137"],
                 ["ALBUMIN-3.2*"],
                 ["On diffusion-weighted images there is a small area of restricted diffusion along the falx within the left occipitotemporal lobe. It is also bright on FLAIR-weighted images and may represent a subacute infarct. Clinical correlation is recommended. On gradient echo images there is a large area of intraparenchymal hemorrhage within the right parietal lobe and left thalamus which following administration of gadolinium reveals ring-enhancing lesions. These are suspicious for hemorrhagic metastases given the patient's history. Additional ring-enhancing lesions throughout the supra- and infratentorial compartments are visualized."],
                 ["Stable appearance of right parietal lobe and left thalamic hemorrhages, which are concerning for hemorrhagic metastasis in this patient with known metastatic lung carcinoma to the brain."]

]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)


In [None]:
dataset.save_to_disk('t5-45txt')

Saving the dataset (1/1 shards): 100%|██████████| 6/6 [00:00<00:00, 781.98 examples/s]


In [38]:
dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truths'],
    num_rows: 7
})

In [39]:
ragas_input_df = dataset.to_pandas()
display(ragas_input_df.head())

Unnamed: 0,question,answer,contexts,ground_truths
0,What is the patient's age and gender?,57-year-old,[The patient is a 57-year-old female with a hi...,[The patient is a 57-year-old female.]
1,What did the patient have a history of?,Hypertension and hypercholesterolemia,[The patient is a 57-year-old female with a hi...,[Hypertension and hypercholesterolemia.]
2,Why did the patient come to the hospital?,She was diagnosed with GERD-like symptoms. She...,[The patient is a 57-year-old female with a hi...,[She came due to chest pain that started at 4:...
3,What medications was she discharged with?,Coumadin,[The patient is a 57-year-old female with a hi...,"[She was discharged on Aspirin, Lisinopril, To..."
4,What were the patient's vital signs on admission?,"afebrile, blood pressure 146/88, pulse 71, res...",[The patient is a 57-year-old female with a hi...,"[Vital signs: On admission, the patient was af..."


In [2]:
from datasets import Dataset
dataset= Dataset.load_from_disk('t5-826txt')

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)
     

In [28]:


def evaluate_ragas_dataset(dataset):
  result = evaluate(
    dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        context_relevancy,
        answer_correctness,
        answer_similarity
    ],
    
  )
  return result

In [1]:
import os
os.environ["OPENAI_API_KEY"] = ""

In [29]:
qa_result = evaluate_ragas_dataset(dataset)

evaluating with [context_precision]


100%|██████████| 1/1 [00:01<00:00,  1.09s/it]


evaluating with [faithfulness]


100%|██████████| 1/1 [00:06<00:00,  6.78s/it]


evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:05<00:00,  5.64s/it]


evaluating with [context_recall]


100%|██████████| 1/1 [00:02<00:00,  2.22s/it]


evaluating with [context_relevancy]


100%|██████████| 1/1 [00:03<00:00,  3.55s/it]


evaluating with [answer_correctness]


100%|██████████| 1/1 [00:08<00:00,  8.40s/it]


evaluating with [answer_similarity]


100%|██████████| 1/1 [00:00<00:00,  1.16it/s]


In [30]:
#1227
print(qa_result)

{'context_precision': 0.0000, 'faithfulness': 0.6667, 'answer_relevancy': 0.7758, 'context_recall': 0.8571, 'context_relevancy': 0.2000, 'answer_correctness': 0.8061, 'answer_similarity': 1.0000}


In [43]:
#1031
print(qa_result)

{'context_precision': 0.7143, 'faithfulness': 0.8810, 'answer_relevancy': 0.8762, 'context_recall': 1.0000, 'context_relevancy': 0.0202, 'answer_correctness': 0.8482, 'answer_similarity': 1.0000}


In [37]:
#1025
print(qa_result)

{'context_precision': 0.2857, 'faithfulness': 0.7429, 'answer_relevancy': 0.8435, 'context_recall': 0.9524, 'context_relevancy': 0.0251, 'answer_correctness': 0.8048, 'answer_similarity': 1.0000}


In [24]:
#1019
print(qa_result)

{'context_precision': 0.4286, 'faithfulness': 0.7041, 'answer_relevancy': 0.8465, 'context_recall': 1.0000, 'context_relevancy': 0.0437, 'answer_correctness': 0.7270, 'answer_similarity': 1.0000}


In [34]:
#832
print(qa_result)

{'context_precision': 0.5714, 'faithfulness': 0.7226, 'answer_relevancy': 0.8399, 'context_recall': 1.0000, 'context_relevancy': 0.1670, 'answer_correctness': 0.6854, 'answer_similarity': 1.0000}


In [6]:
#826
print(qa_result)

{'context_precision': 0.2857, 'faithfulness': 0.6508, 'answer_relevancy': 0.8236, 'context_recall': 0.5000, 'context_relevancy': 0.0857, 'answer_correctness': 0.6825, 'answer_similarity': 1.0000}


In [24]:
#629
print(qa_result)

{'context_precision': 0.4286, 'faithfulness': 0.8082, 'answer_relevancy': 0.7927, 'context_recall': 0.8571, 'context_relevancy': 0.0840, 'answer_correctness': 0.7673, 'answer_similarity': 1.0000}


In [17]:
#615
print(qa_result)

{'context_precision': 0.5714, 'faithfulness': 0.7381, 'answer_relevancy': 0.8399, 'context_recall': 1.0000, 'context_relevancy': 0.0714, 'answer_correctness': 0.8357, 'answer_similarity': 1.0000}


In [43]:
#601
print(qa_result)

{'context_precision': 0.2857, 'faithfulness': 0.8571, 'answer_relevancy': 0.8438, 'context_recall': 0.8571, 'context_relevancy': 0.0434, 'answer_correctness': 0.8730, 'answer_similarity': 1.0000}


In [44]:
#563
print(qa_result)

{'context_precision': 0.2857, 'faithfulness': 0.6063, 'answer_relevancy': 0.8243, 'context_recall': 0.6857, 'context_relevancy': 0.0155, 'answer_correctness': 0.7254, 'answer_similarity': 1.0000}


In [7]:
#417
print(qa_result)

{'context_precision': 0.5714, 'faithfulness': 0.7520, 'answer_relevancy': 0.8560, 'context_recall': 0.8571, 'context_relevancy': 0.0260, 'answer_correctness': 0.6875, 'answer_similarity': 1.0000}


In [15]:
#403
print(qa_result)

{'context_precision': 0.3333, 'faithfulness': 0.7000, 'answer_relevancy': 0.8255, 'context_recall': 1.0000, 'context_relevancy': 0.0476, 'answer_correctness': 0.7803, 'answer_similarity': 1.0000}


In [6]:
#365
print(qa_result)

{'context_precision': 0.6667, 'faithfulness': 0.7500, 'answer_relevancy': 0.8268, 'context_recall': 0.8889, 'context_relevancy': 0.0090, 'answer_correctness': 1.0000, 'answer_similarity': 1.0000}


In [6]:
#359
print(qa_result)

{'context_precision': 0.1429, 'faithfulness': 0.7143, 'answer_relevancy': 0.7828, 'context_recall': 0.9643, 'context_relevancy': 0.0373, 'answer_correctness': 0.9048, 'answer_similarity': 1.0000}


In [6]:
#239
print(qa_result)

{'context_precision': 0.1667, 'faithfulness': 0.9028, 'answer_relevancy': 0.8545, 'context_recall': 1.0000, 'context_relevancy': 0.0682, 'answer_correctness': 0.9167, 'answer_similarity': 1.0000}


In [6]:
#205
print(qa_result)

{'context_precision': 0.0000, 'faithfulness': 0.6667, 'answer_relevancy': 0.8100, 'context_recall': 0.1667, 'context_relevancy': 0.0154, 'answer_correctness': 0.5625, 'answer_similarity': 1.0000}


In [None]:
#198
print(qa_result)

{'context_precision': 0.5000, 'faithfulness': 0.9167, 'answer_relevancy': 0.8500, 'context_recall': 1.0000, 'context_relevancy': 0.0458, 'answer_correctness': 0.8125, 'answer_similarity': 1.0000}


In [None]:
#45
print(qa_result)

{'context_precision': 1.0000, 'faithfulness': 0.5069, 'answer_relevancy': 0.7992, 'context_recall': 0.8333, 'context_relevancy': 0.1275, 'answer_correctness': 0.7917, 'answer_similarity': 1.0000}


In [None]:
#167
print(qa_result)

{'context_precision': 0.5000, 'faithfulness': 0.8333, 'answer_relevancy': 0.8241, 'context_recall': 0.9524, 'context_relevancy': 0.0871, 'answer_correctness': 0.8438, 'answer_similarity': 1.0000}


In [None]:
#79
print(qa_result)

{'context_precision': 0.6667, 'faithfulness': 0.9833, 'answer_relevancy': 0.8422, 'context_recall': 0.9583, 'context_relevancy': 0.1239, 'answer_correctness': 0.8583, 'answer_similarity': 1.0000}


In [31]:
df = qa_result.to_pandas()

In [32]:
#df.to_csv("t5_45txt_result.csv")
#df.to_csv("t5_79txt_result.csv")
# #df.to_csv("t5_167txt_result.csv")
#df.to_csv("t5_198txt_result.csv")
#df.to_csv("t5_205txt_result.csv")
#df.to_csv("t5_239txt_result.csv")
#df.to_csv("t5_359txt_result.csv")
#df.to_csv("t5_365txt_result.csv")
#df.to_csv("t5_371txt_result.csv")
#df.to_csv("t5_403txt_result.csv")
#df.to_csv("t5_417txt_result.csv")
#df.to_csv("t5_563txt_result.csv")
#df.to_csv("t5_601txt_result.csv")
#df.to_csv("t5_615txt_result.csv")
#df.to_csv("t5_629txt_result.csv")
#df.to_csv("t5_826txt_result.csv")
#df.to_csv("t5_832txt_result.csv")
#df.to_csv("t5_1019txt_result.csv")
#df.to_csv("t5_1025txt_result.csv")
#df.to_csv("t5_1031txt_result.csv")
df.to_csv("t5_1227txt_result.csv")
