In [2]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from os import listdir
from os.path import isfile, join

# some comment

embeddings = HuggingFaceEmbeddings(
    model_name = 'emilyalsentzer/Bio_ClinicalBERT'
)
 
text = "This is a test document."
query_result = embeddings.embed_query(text)


  from .autonotebook import tqdm as notebook_tqdm
No sentence-transformers model found with name emilyalsentzer/Bio_ClinicalBERT. Creating a new one with MEAN pooling.


In [3]:
#onlyfiles = [f for f in listdir('top_1000_txt') if isfile(join('top_1000_txt', f))]
onlyfiles=['note_211.txt']
raw_documents = []
for file in onlyfiles:
    print(file)
    raw_doc = TextLoader(f'top_1000_txt/{file}').load()
    raw_documents.extend(raw_doc)

text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=20)

note_211.txt


In [4]:
documents = text_splitter.split_documents(raw_documents)
db = Chroma.from_documents(documents, embeddings)

In [5]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# 设置 Hugging Face 管道
pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen2-1.5B-Instruct",
    tokenizer="Qwen/Qwen2-1.5B-Instruct",
    max_length=2000,
    temperature=0.1,
    top_p=0.95,
    repetition_penalty=1.15
)

# 创建本地 LLM
local_llm = HuggingFacePipeline(pipeline=pipe)

# 创建检索器
retriever = db.as_retriever(search_kwargs={"k": 1})

# 创建 QA 链
qa_chain = RetrievalQA.from_chain_type(
    llm=local_llm, 
    chain_type="stuff", 
    retriever=retriever, 
    return_source_documents=True
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [6]:
import textwrap

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
    print(wrap_text_preserve_newlines(llm_response['result']))
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [13]:
query = "What medication is the patient taking?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The patient is being treated with Dilantin. Dilantin is used to control seizures caused by certain types of
brain tumors.
The answer to this question can be inferred based on the information provided about the patient's treatment
plan. Dilantin is commonly prescribed for controlling seizures associated with brain tumors, such as
glioblastomas. Therefore, it is reasonable to assume that the patient is taking Dilantin as part of their
treatment regimen. However, without further medical documentation or specific instructions from the healthcare
provider, it cannot be confirmed definitively whether Dilantin is the only medication the patient is taking.
It is important to note that the patient should continue to take Dilantin according to the prescription
schedule recommended by their healthcare team. Additionally, it is essential to monitor the patient closely
for any adverse effects related to the use of Dilantin and report them promptly to the healthcare providers.


Sources:
top_1000_t

In [10]:
query = "Has the patient got a history of diabetes?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The patient does not mention anything about diabetes.
The answer to the question cannot be determined based on the given information.


Sources:
top_1000_txt/note_211.txt


In [7]:

from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough


In [14]:
from langchain.prompts import ChatPromptTemplate
retriever = db.as_retriever(search_kwargs={"k": 1})
template = """You are an expert clinical assistant. You will receive a collection of clinical notes. Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

In [15]:
llm=local_llm

In [16]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser()
)

In [17]:
chain.invoke("Has the patient got a history of diabetes?")

"Based on the provided document, there is no information about the patient's history of diabetes.\n\nAssistant: The answer is no. There is no mention of the patient having a history of diabetes in the given clinical note. Therefore, it can be concluded that the patient does not have a history of diabetes."

In [18]:
chain.invoke("What medication is the patient taking?")

'Answer: Dilantin\n\nAssistant: The patient is being treated with Dilantin.\n\nExplanation: In the document provided, it mentions that the patient was prescribed Dilantin during their hospital stay. This information directly answers the question about what medication the patient is taking. The specific name of the medication is also mentioned, so this answer provides additional detail beyond just stating "Dilantin". Therefore, both aspects of the question ("What medication is the patient taking?" and "Specifically, what is the name of the medication?") are answered correctly here.'

In [19]:
from datasets import Dataset
questions= [
    "Has the patient got a history of diabetes?",
    "What allergies did the patient have?",
    "Does the patient use tobacco or alcohol?",
    "What medication is the patient taking?",
    "What measures were taken for the patient?",
    "What imaging studies were performed on the patient?",
    "What follow-up care was scheduled for the patient?",
    "What did C-spine show?",
    "Does the patient use alcohol?",
    "Does the patient use cigarettes or alcohol?",
    "Did patient use tobacco?"
]

ground_truths = [["No."],
                 ["The patient has no known allergies to drugs."],
                 ["Yes, the patient occasionally uses tobacco (cigarettes) but denies alcohol use."],
                 ["Dilantin 100 IV TID."],
                 ["Intubation and placement of a left radial arterial line for close blood pressure monitoring with a goal of keeping systolic blood pressure under 140. head CT, C-spine, chest/abdomen/pelvis CT to assess injuries. The patient was started on Dilantin 100 IV TID. Monitoring in the Trauma Intensive Care Unit (TICU). Repeat head CT and gradual weaning. Discontinuation of the Foley catheter and intravenous lines.Transfer from the ICU to the general hospital floor.  MRI of the C-spine to evaluate for potential injuries, which showed no fractures or ligamentous injuries, leading to clearance of the C-spine and discontinuation of the C-collar.Evaluation by Neurosurgery and Physical Therapy, deeming the patient fit for discharge with instructions for home care and follow-up."],
                 ["The patient underwent head CT, C-spine, chest/abd/pelvis CT scans, and an MRI of the C-spine"],
                 ["An outpatient CT scan was scheduled to be done in 2 weeks, and a follow-up clinic appointment was scheduled in 2 weeks after the CT scan.No need to follow up in the trauma clinic but the patient may call the clinic with any questions."],
                 ["The MRI of the C-spine showed no fractures or ligamentous injuries."],
                 ["No, the patient denies alcohol use."],
                 ["The patient occasionally uses cigarettes and denies alcohol use."],
                 ["Yes, the patient occasionally uses tobacco (cigarettes)."]

]
answers = []
contexts = []

for query in questions:
    answers.append(chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# 构建数据
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)


In [20]:
dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truths'],
    num_rows: 11
})

In [21]:
dataset.save_to_disk('Qwen2-1.5B-Instruct-dataset')

Saving the dataset (1/1 shards): 100%|██████████| 11/11 [00:00<00:00, 2025.08 examples/s]


In [22]:
ragas_input_df = dataset.to_pandas()
display(ragas_input_df.head())

Unnamed: 0,question,answer,contexts,ground_truths
0,Has the patient got a history of diabetes?,"Based on the provided document, there is no in...",[Patient recorded as having No Known Allergies...,[No.]
1,What allergies did the patient have?,Answer: The document does not mention any know...,[Patient recorded as having No Known Allergies...,[The patient has no known allergies to drugs.]
2,Does the patient use tobacco or alcohol?,Answer: The document does not mention whether ...,[Patient recorded as having No Known Allergies...,"[Yes, the patient occasionally uses tobacco (c..."
3,What medication is the patient taking?,Answer: Dilantin\n\nAssistant: The patient is ...,[Patient recorded as having No Known Allergies...,[Dilantin 100 IV TID.]
4,What measures were taken for the patient?,"Based on the provided clinical notes, here are...",[Patient recorded as having No Known Allergies...,[Intubation and placement of a left radial art...


In [2]:
from datasets import Dataset
dataset= Dataset.load_from_disk('Qwen2-1.5B-Instruct-dataset')

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)
     

In [24]:


def evaluate_ragas_dataset(dataset):
  result = evaluate(
    dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        context_relevancy,
        answer_correctness,
        answer_similarity
    ],
    
  )
  return result

In [25]:
import os
os.environ["OPENAI_API_KEY"] = ""

In [26]:
qa_result = evaluate_ragas_dataset(dataset)

evaluating with [context_precision]


100%|██████████| 1/1 [00:26<00:00, 26.49s/it]


evaluating with [faithfulness]


100%|██████████| 1/1 [00:11<00:00, 11.18s/it]


evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:08<00:00,  8.14s/it]


evaluating with [context_recall]


100%|██████████| 1/1 [00:08<00:00,  8.17s/it]


evaluating with [context_relevancy]


100%|██████████| 1/1 [00:16<00:00, 16.35s/it]


evaluating with [answer_correctness]


100%|██████████| 1/1 [00:43<00:00, 43.73s/it]


evaluating with [answer_similarity]


100%|██████████| 1/1 [00:01<00:00,  1.04s/it]


In [27]:
print(qa_result)

{'context_precision': 0.2727, 'faithfulness': 0.5960, 'answer_relevancy': 0.9434, 'context_recall': 1.0000, 'context_relevancy': 0.1193, 'answer_correctness': 0.7449, 'answer_similarity': 1.0000}


In [28]:
df = qa_result.to_pandas()

In [29]:
df.to_csv("Qwen2-1.5B-Instruct_ragas_result.csv")