In [1]:
generative_model="meta-llama/Llama-3.2-1B-Instruct"
sentence_model = "dmis-lab/biobert-base-cased-v1.1"
chunk_size=1000
chunk_overlap=200
k=10

In [2]:
from google.colab import userdata

In [None]:
!pip install langchain_huggingface
!pip install -qU langchain-text-splitters
!pip install -qU "langchain-chroma>=0.1.2"
!pip install -U langchain-community
!pip install nltk

In [4]:
!wget https://raw.githubusercontent.com/pubmedqa/pubmedqa/refs/heads/master/data/ori_pqal.json

--2024-12-16 16:50:53--  https://raw.githubusercontent.com/pubmedqa/pubmedqa/refs/heads/master/data/ori_pqal.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2584787 (2.5M) [text/plain]
Saving to: ‘ori_pqal.json’


2024-12-16 16:50:54 (34.3 MB/s) - ‘ori_pqal.json’ saved [2584787/2584787]



We collect two datasets:

    ‘questions’: the questions with corresponding gold long answer, gold document ID, and year.
    ‘documents’: the abstracts (contexts+long_answer concatenated), and year.


In [5]:
import pandas as pd
tmp_data = pd.read_json("/content/ori_pqal.json").T
# some labels have been defined as "maybe", only keep the yes/no answers
tmp_data = tmp_data[tmp_data.final_decision.isin(["yes", "no"])]

documents = pd.DataFrame({"abstract": tmp_data.apply(lambda row: (" ").join(row.CONTEXTS+[row.LONG_ANSWER]), axis=1),
             "year": tmp_data.YEAR})
questions = pd.DataFrame({"question": tmp_data.QUESTION,
             "year": tmp_data.YEAR,
             "gold_label": tmp_data.final_decision,
             "gold_context": tmp_data.LONG_ANSWER,
             "gold_document_id": documents.index})

For an example of a query:

In [6]:
questions.iloc[0].question

'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?'

For an example of a document to leverage for the queries:

In [8]:
from langchain_core.documents import Document

docs=[]
docs_chunks=[]

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

from langchain.text_splitter import NLTKTextSplitter
import re

# Initialize the NLTKTextSplitter
text_splitter = NLTKTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

child_parent_doc_dict=dict()

for doc in documents.itertuples(index=True, name='Row'):
  # Split the text
  chunks = text_splitter.split_text(doc.abstract)
  chunks = [re.sub(r'\s+', ' ', chunk) for chunk in chunks]
  for i, chunk in enumerate(chunks):
   chunk_id=str(doc.Index)+"-"+str(i)
   child_parent_doc_dict[chunk_id]=doc.Index
   temp_doc = Document(
       page_content=chunk,
       metadata={"source": "pubmedqa"},
       id=chunk_id,
   )
   docs_chunks.append(temp_doc)

for doc in documents.itertuples(index=True, name='Row'):
    temp_doc = Document(
        page_content=doc.abstract,
        metadata={"source": "pubmedqa"},
        id=doc.Index,
    )
    docs.append(temp_doc)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [9]:
from langchain_huggingface import HuggingFaceEmbeddings

model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=sentence_model,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

If you want to get best in-class automated tracing of your model calls you can also set your LangSmith API key by uncommenting below:

In [10]:
from langchain.vectorstores import Chroma

try:
  vector_store.delete_collection() # Delete vector_store collection if it already exists
  chunks_vector_store.delete_collection() # Delete vector_store collection if it already exists
except:
  pass # Ignore if it doesnt exist

vector_store = Chroma.from_documents(docs, hf)
chunks_vector_store=Chroma.from_documents(docs_chunks, hf)
#retriever = vector_store.as_retriever(...)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [11]:
results = vector_store.similarity_search_with_score(
    "What is programmed cell death?", k=k
)
for res, score in results:
    print(f"* [SIM={score:3f}] {res.page_content} [{res.metadata}]")

* [SIM=44.339890] Although body dysmorphic disorder (BDD) is classified in DSM-III-R as a nonpsychotic somatoform disorder, controversy exists as to whether BDD can present with psychotic features. If it can, this raises the possibility that its DSM-III-R psychotic counterpart-delusional disorder, somatic type--may not be a separate disorder. The purpose of this study was to determine whether patients with nonpsychotic BDD (defined according to DSM-III-R criteria, i.e., with maintenance of some insight) were different from patients with psychotic BDD (those whose preoccupation was without insight and of delusional intensity). Fifty consecutive patients meeting DSM-III-R criteria A and C for BDD were assessed with a semistructured interview and the Structured Clinical Interview for DSM-III-R (SCID). Family histories of psychiatric disorders were blindly assessed. [{'source': 'pubmedqa'}]
* [SIM=44.548996] However, the theory that physicians' early adoption of new drugs is a personal tra

In [12]:
results = chunks_vector_store.similarity_search_with_score(
    "What is programmed cell death?", k=k
)
for res, score in results:
    print(f"* [SIM={score:3f}] {res.page_content} [{res.metadata}]")

* [SIM=44.339890] Although body dysmorphic disorder (BDD) is classified in DSM-III-R as a nonpsychotic somatoform disorder, controversy exists as to whether BDD can present with psychotic features. If it can, this raises the possibility that its DSM-III-R psychotic counterpart-delusional disorder, somatic type--may not be a separate disorder. The purpose of this study was to determine whether patients with nonpsychotic BDD (defined according to DSM-III-R criteria, i.e., with maintenance of some insight) were different from patients with psychotic BDD (those whose preoccupation was without insight and of delusional intensity). Fifty consecutive patients meeting DSM-III-R criteria A and C for BDD were assessed with a semistructured interview and the Structured Clinical Interview for DSM-III-R (SCID). Family histories of psychiatric disorders were blindly assessed. [{'source': 'pubmedqa'}]
* [SIM=44.548996] However, the theory that physicians' early adoption of new drugs is a personal tra

### Evaluating the retriever

In [13]:
questions.head()

Unnamed: 0,question,year,gold_label,gold_context,gold_document_id
21645374,Do mitochondria play a role in remodelling lac...,2011,yes,Results depicted mitochondrial dynamics in viv...,21645374
16418930,Landolt C and snellen e acuity: differences in...,2006,no,"Using the charts described, there was only a s...",16418930
9488747,"Syncope during bathing in infants, a pediatric...",1997,yes,"""Aquagenic maladies"" could be a pediatric form...",9488747
17208539,Are the long-term results of the transanal pul...,2007,no,Our long-term study showed significantly bette...,17208539
10808977,Can tailored interventions increase mammograph...,2000,yes,The effects of the intervention were most pron...,10808977


In [23]:
from tqdm import tqdm
q_doc = zip(questions.question, questions.gold_context)
count_correct=0
for question, gold_context in tqdm(q_doc, total =890):
  results = vector_store.similarity_search_with_score(
      question, k=k
  )
  for result in results:
    context = result[0].page_content
    if gold_context in context: # Count as correct if the retrieved context is in gold context
     count_correct += 1
     break

print(f'no chunking Accuracy: {count_correct/len(questions.question)}')

100%|██████████| 890/890 [00:21<00:00, 41.31it/s]

no chunking Accuracy: 0.6258426966292134





In [24]:
count_correct=0
q_doc = zip(questions.question, questions.gold_context)
for question, gold_context in tqdm(q_doc, total =890):
  results = chunks_vector_store.similarity_search_with_score(
      question, k=k
  )
  for result in results:
    context = result[0].page_content
    if gold_context in context: # Count as correct if the retrieved context is in gold context
     count_correct += 1
     break

print(f'chunking Accuracy: {count_correct/len(questions.question)}')

100%|██████████| 890/890 [00:17<00:00, 50.38it/s]

chunking Accuracy: 0.6258426966292134





## Creating the model

In [None]:
from huggingface_hub import login
login(userdata.get('huggingface_token'))

In [None]:
! huggingface-cli download $generative_model --local-dir $generative_model

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(generative_model)
model_hf = AutoModelForCausalLM.from_pretrained(generative_model)

In [None]:
retriever = vector_store.as_retriever(k=k)

## Step 4: Evaluation

### Full evalution using f1 metric

In [None]:
import warnings
import logging
from tqdm import tqdm
logging.getLogger("transformers").setLevel(logging.ERROR) # Ignoring warnings
logging.getLogger("chromadb.segment.impl.vector.local_hnsw").setLevel(logging.ERROR) # Ignoring warnings


def f_1_score(q_a, retrieval_chain):
  TN, FP, FN, TP = 0, 0, 0, 0
  for question, gold_label in tqdm(q_a, total=890):
    answer = retrieval_chain.invoke(question)
    y_n = answer.splitlines()[-1].split()[-1]
    if gold_label.lower() == "yes" and "yes" in y_n.lower():
        TP += 1
    elif gold_label.lower() == "no" and "no" in y_n.lower():
        TN += 1
    elif gold_label.lower() == "yes" and "no" in y_n.lower():
        FN += 1
    elif gold_label.lower() == "no" and "yes" in y_n.lower():
        FP += 1

  f1 = 2*TP / (2*TP + FP + FN)
  return f1


In [None]:
template = """Answer the question, only with yes or no, based only on the following context:
{context}
Do not write anything else. Do not explain your answer.
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Create a pipeline for text generation
pipe = pipeline(
    "text-generation",
    model=model_hf,
    tokenizer=tokenizer,
    max_new_tokens=10,
    device="cuda"
    )

# Wrap the pipeline in a LangChain HuggingFacePipeline object
llm = HuggingFacePipeline(pipeline=pipe)

retrieval_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Evaluate
q_a = zip(questions.question, questions.gold_label)
score = f_1_score(q_a, retrieval_chain)
print(score)




 15%|█▌        | 135/890 [01:35<08:56,  1.41it/s]


KeyboardInterrupt: 

### Baseline comparison, f1 metric without using the retriever

In [None]:
# Baseline, without retrieving documents
template = """Answer the question, only with yes or no.
Do not write anything else. Do not explain your answer.
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Create a pipeline for text generation
pipe = pipeline(
    "text-generation",
    model=model_hf,
    tokenizer=tokenizer,
    max_new_tokens=10,
    device="cuda"
    )

# Wrap the pipeline in a LangChain HuggingFacePipeline object
llm = HuggingFacePipeline(pipeline=pipe)

retrieval_chain = (
    {"question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Evaluate
q_a = zip(questions.question, questions.gold_label)
score = f_1_score(q_a, retrieval_chain)
print(score)

## Step 5: Make improvements

### Prompt engineering

In [None]:
# Investigating the effects of prompt engineering
# Few shot prompting
from langchain.prompts.example_selector import LengthBasedExampleSelector
from langchain.prompts import FewShotPromptTemplate
from langchain import PromptTemplate


# From the data
examples = [
    {
        "query": """Answer the question, only with yes or no, based only on the following context:
                  Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.
                  The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and cells in late stages of PCD (LPCD). Window stage leaves were stained with the mitochondrial dye MitoTracker Red CMXRos and examined. Mitochondrial dynamics were delineated into four categories (M1-M4) based on characteristics including distribution, motility, and membrane potential (\u0394\u03a8m). A TUNEL assay showed fragmented nDNA in a gradient over these mitochondrial stages. Chloroplasts and transvacuolar strands were also examined using live cell imaging. The possible importance of mitochondrial permeability transition pore (PTP) formation during PCD was indirectly examined via in vivo cyclosporine A (CsA) treatment. This treatment resulted in lace plant leaves with a significantly lower number of perforations compared to controls, and that displayed mitochondrial dynamics similar to that of non-PCD cells.
                  Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?""",
        "answer": "Yes."
    }, {
        "query": """Answer the question, only with yes or no, based only on the following context:
                  Assessment of visual acuity depends on the optotypes used for measurement. The ability to recognize different optotypes differs even if their critical details appear under the same visual angle. Since optotypes are evaluated on individuals with good visual acuity and without eye disorders, differences in the lower visual acuity range cannot be excluded. In this study, visual acuity measured with the Snellen E was compared to the Landolt C acuity.
                  100 patients (age 8 - 90 years, median 60.5 years) with various eye disorders, among them 39 with amblyopia due to strabismus, and 13 healthy volunteers were tested. Charts with the Snellen E and the Landolt C (Precision Vision) which mimic the ETDRS charts were used to assess visual acuity. Three out of 5 optotypes per line had to be correctly identified, while wrong answers were monitored. In the group of patients, the eyes with the lower visual acuity, and the right eyes of the healthy subjects, were evaluated.
                  Differences between Landolt C acuity (LR) and Snellen E acuity (SE) were small. The mean decimal values for LR and SE were 0.25 and 0.29 in the entire group and 0.14 and 0.16 for the eyes with strabismus amblyopia. The mean difference between LR and SE was 0.55 lines in the entire group and 0.55 lines for the eyes with strabismus amblyopia, with higher values of SE in both groups. The results of the other groups were similar with only small differences between LR and SE.
                  Landolt C and snellen e acuity: differences in strabismus amblyopia?""",
        "answer": "No."
    }, {
        "query": """Answer the question, only with yes or no, based only on the following context:
                  Apparent life-threatening events in infants are a difficult and frequent problem in pediatric practice. The prognosis is uncertain because of risk of sudden infant death syndrome.
                  Eight infants aged 2 to 15 months were admitted during a period of 6 years; they suffered from similar maladies in the bath: on immersion, they became pale, hypotonic, still and unreactive; recovery took a few seconds after withdrawal from the bath and stimulation. Two diagnoses were initially considered: seizure or gastroesophageal reflux but this was doubtful. The hypothesis of an equivalent of aquagenic urticaria was then considered; as for patients with this disease, each infant's family contained members suffering from dermographism, maladies or eruption after exposure to water or sun. All six infants had dermographism. We found an increase in blood histamine levels after a trial bath in the two infants tested. The evolution of these \"aquagenic maladies\" was favourable after a few weeks without baths. After a 2-7 year follow-up, three out of seven infants continue to suffer from troubles associated with sun or water.
                  Syncope during bathing in infants, a pediatric form of water-induced urticaria?""",
        "answer": "Yes."
    }
]

example_template = """
User: {query}
AI: {answer}
"""

example_prompt = PromptTemplate(
    input_variables=["query", "answer"],
    template=example_template
)

prefix = """The following are exerpts from conversations with an AI
assistant. The assistant is a biomedical research assistant
that answers yes or no questions based on some context. Here are some
examples:
"""

suffix = """
User: Answer the question, only with yes or no, based only on the following context:
{context}
{query}
AI: """


few_shot_prompt_template = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["query"],
    example_separator="\n\n"
)

prompt = few_shot_prompt_template

retrieval_chain = (
    {"context": retriever, "query": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [None]:
# Evaluate again
q_a = zip(questions.question, questions.gold_label)
score = f_1_score(q_a, retrieval_chain)
print(score)


### Retriever

In [None]:
# Investigating the effects of a different retriever
from langchain.retrievers import MultiQueryRetriever
template = """Answer the question, only with yes or no, based only on the following context:
{context}
Do not write anything else. Do not explain your answer.
Question: {question}
"""

pipe = pipeline(
    "text-generation",
    model=model_hf,
    tokenizer=tokenizer,
    max_new_tokens=10,
    pad_token_id=tokenizer.pad_token_id,
    device="cuda"
    )

llm = HuggingFacePipeline(pipeline=pipe)

retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vector_store.as_retriever(), llm=llm,k=k
)

prompt = ChatPromptTemplate.from_template(template)

retrieval_chain = (
    {"context": retriever_from_llm, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Evaluate
q_a = zip(questions.question, questions.gold_label)
score = f_1_score(q_a, retrieval_chain)
print(score)

In [None]:
# Test again but with ParentDocument
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain.retrievers import ParentDocumentRetriever
from langchain_text_splitters import RecursiveCharacterTextSplitter

#child_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)

new_vector_store = Chroma(
    collection_name="full_documents", embedding_function=hf
)

# The storage layer for the parent documents
store = InMemoryStore()
parent_doc_retriever = ParentDocumentRetriever(
    vectorstore=new_vector_store,
    docstore=store,
    child_splitter=text_splitter,
    k=k
)

full_docs = []
ids = documents.index
for i, doc in enumerate(documents.abstract):
    temp_doc = Document(
        page_content=doc,
        id=ids[i],
    )
    full_docs.append(temp_doc)
parent_doc_retriever.add_documents(full_docs)

In [None]:
template = """Answer the question, only with yes or no, based only on the following context:
{context}
Do not write anything else. Do not explain your answer.
Question: {question}
"""

pipe = pipeline(
    "text-generation",
    model=model_hf,
    tokenizer=tokenizer,
    max_new_tokens=10,
    pad_token_id=tokenizer.pad_token_id,
    device="cuda"
    )

llm = HuggingFacePipeline(pipeline=pipe)


prompt = ChatPromptTemplate.from_template(template)

retrieval_chain = (
    {"context": parent_doc_retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
q_a = zip(questions.question, questions.gold_label)
score = f_1_score(q_a, retrieval_chain)
print(score)