In [None]:
!wget https://raw.githubusercontent.com/pubmedqa/pubmedqa/refs/heads/master/data/ori_pqal.json

In [None]:
!pip install langchain-huggingface
!pip install -qU langchain-text-splitters
!pip install -U langchain-community
!pip install chromadb

In [None]:
import pandas as pd

# The download cell saves the file into the current working directory, so read it
# from there instead of a hard-coded Colab-only absolute path.
DATA_PATH = "ori_pqal.json"

# Transpose so that the per-record fields (QUESTION, CONTEXTS, ...) become columns.
tmp_data = pd.read_json(DATA_PATH).T
# Some labels have been defined as "maybe"; only keep the yes/no answers.
tmp_data = tmp_data[tmp_data.final_decision.isin(["yes", "no"])]

# One text per record: all context passages followed by the long answer.
documents = pd.DataFrame({
    "abstract": tmp_data.apply(
        lambda row: " ".join(row.CONTEXTS + [row.LONG_ANSWER]), axis=1
    ),
    "year": tmp_data.YEAR,
})

# Questions with their gold label, gold context and the index label of the
# document they belong to (used later to score the retriever).
questions = pd.DataFrame({
    "question": tmp_data.QUESTION,
    "year": tmp_data.YEAR,
    "gold_label": tmp_data.final_decision,
    "gold_context": tmp_data.LONG_ANSWER,
    "gold_document_id": documents.index,
})

In [None]:
# Peek at the first question.
questions["question"].iloc[0]

In [None]:
# Peek at the first assembled document text.
documents["abstract"].iloc[0]

In [None]:
# Step 1: Configure your LangChain LM

In [None]:
import os
from getpass import getpass

# The tokenizer below is loaded with the token read from HF_TOKEN.
# Keep a token that is already exported in the environment and prompt for one
# otherwise, so the secret is never written into the notebook itself and an
# existing value is not overwritten with an empty string.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B"
hf_token = os.environ.get("HF_TOKEN")

# The tokenizer is loaded here to look up the EOS id, which is reused as the padding id.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

llm_model_kwargs = {
    "torch_dtype": "auto",
    "pad_token_id": tokenizer.eos_token_id,
}
llm_generation_kwargs = {"max_new_tokens": 40}  # generation parameters

# Build the text-generation pipeline through LangChain's HuggingFacePipeline wrapper.
lm = HuggingFacePipeline.from_model_id(
    model_id=model_id,
    task="text-generation",
    device=0,  # 0 selects the GPU
    model_kwargs=llm_model_kwargs,
    pipeline_kwargs=llm_generation_kwargs,
)


In [None]:
# Example: smoke-test the language model on a single prompt.
prompt = "What is programmed cell death?"
# Use the `invoke` API (as the no-context baseline loop in this notebook does)
# rather than calling the LLM object directly, which LangChain has deprecated.
output = lm.invoke(prompt)
print("Generated Output:")
print(output)

In [None]:
# Step 2: Set up the document database and retriever

In [None]:
# Step 2.1: Embedding model

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# Checkpoint used by the baseline vector store to embed text.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [None]:
# Sanity check: embed one sentence and report the length of the resulting vector.
sample_text = "Programmed cell death is a vital process in biological organisms."
embedding = embedding_model.embed_query(sample_text)
embedding_dim = len(embedding)

print(f"Embedding shape: {embedding_dim}")

In [None]:
# Step 2.2: Chunking

In [None]:
# Compute the average length of the gold contexts to choose a suitable chunk size.
gold_context_lengths = list(map(len, questions["gold_context"].tolist()))

total_context_length = sum(gold_context_lengths)
average_length = total_context_length / len(gold_context_lengths)

print(f"Average Length of Gold Context: {average_length:.2f} characters")

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,       # Maximum characters in each chunk
    chunk_overlap=30,     # Overlap between chunks to preserve context
)

# Chunk the documents, remembering which document each chunk came from
chunked_documents = []    # Store chunks as a list of dictionaries
for idx, row in documents.iterrows():
    chunks = text_splitter.split_text(row['abstract'])  # Split the abstract
    for chunk in chunks:
        chunked_documents.append({"doc_id": idx, "chunk": chunk})

# Print a few samples for sanity check.
# Slicing (instead of indexing range(10)) cannot raise IndexError when fewer
# than ten chunks were produced.
print("Sample chunks:")
for sample in chunked_documents[:10]:
    print(f"Doc ID: {sample['doc_id']}")
    print(f"Chunk: {sample['chunk']}")
    print()


In [None]:
# Step 2.3: Define a vector store and retriever

In [None]:
from langchain.vectorstores import Chroma

# Parallel lists: chunk text and the metadata recording the source document.
texts = [entry["chunk"] for entry in chunked_documents]
metadatas = [{"doc_id": entry["doc_id"]} for entry in chunked_documents]

# Index the chunks in a Chroma vector store using the baseline embedding model.
vector_store = Chroma.from_texts(
    texts=texts,
    metadatas=metadatas,
    embedding=embedding_model,
)

# Retriever that returns only the single closest chunk per query.
retriever_search_kwargs = {"k": 1}
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs=retriever_search_kwargs,
)


In [None]:
# Perform similarity search with scores
query = "What is programmed cell death?"
# query = "Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?"
results = vector_store.similarity_search_with_score(query, k=3)

# Display results.
# The score returned by Chroma is a distance (lower means more similar), so it is
# labelled DIST rather than SIM to avoid reading it the wrong way round.
print("Query Results:")
for res, score in results:
    content_excerpt = res.page_content[:600]  # Limit the output for readability
    print(f"* [DIST={score:.3f}] {content_excerpt} [{res.metadata}]")

In [None]:
# Step 3: Define the full RAG pipeline

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Instruction, retrieved context and question. The trailing "Answer (Yes or No):"
# cue is the marker the evaluation loop searches for in the model output.
RAG_TEMPLATE_PARTS = [
    "Based on the following retrieved context, answer the question with 'Yes' or 'No' only. ",
    "Do not provide any explanation or additional information. If the context does not contain enough evidence, answer 'No'. ",
    "Your answer must be a single word: 'Yes' or 'No'.\n\n",
    "Context: {context}\n",
    "Question: {question}\n\n",
    "Answer (Yes or No):",
]
rag_prompt = ChatPromptTemplate.from_template(template="".join(RAG_TEMPLATE_PARTS))

In [None]:
from langchain.chains import RetrievalQA

# Re-create the LM wrapper, this time limited to a single generated token.
single_token_generation = {"max_new_tokens": 1}
lm = HuggingFacePipeline.from_model_id(
    model_id=model_id,
    task="text-generation",
    device=0,
    model_kwargs={
        "torch_dtype": "auto",
        "pad_token_id": tokenizer.eos_token_id,
    },
    pipeline_kwargs=single_token_generation,
)

# RetrievalQA chain: the retrieved chunk is stuffed into the prompt and the
# source documents are returned alongside the answer.
rag_chain = RetrievalQA.from_chain_type(
    llm=lm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": rag_prompt},
    return_source_documents=True,
)

In [None]:
# Smoke-test the RAG chain on a single question.
test_input = {"query": "What is programmed cell death?"}
response = rag_chain.invoke(test_input)

answer_text = response["result"]
retrieved_docs = response["source_documents"]

print("--- Test Question ---")
print(f"Answer: {answer_text}")
print(f"Retrieved Document: {retrieved_docs}\n")


In [None]:
# Step 4: Evaluate the RAG pipeline on the dataset

In [None]:
from sklearn.metrics import f1_score

# Function to evaluate retriever accuracy
def evaluate_retriever(all_doc_ids, gold_document_ids):
    """Return the fraction of queries whose retrieved document is the gold one.

    Args:
        all_doc_ids: Retrieved document id per query, positionally aligned with
            ``gold_document_ids``.
        gold_document_ids: Gold document id per query.

    Returns:
        Accuracy in ``[0, 1]``; ``0.0`` when there are no queries (instead of
        raising ``ZeroDivisionError``).

    Raises:
        IndexError: If ``all_doc_ids`` has fewer entries than ``gold_document_ids``.
    """
    total_queries = len(gold_document_ids)
    if total_queries == 0:
        return 0.0

    # Count positions where the retrieved id equals the gold id.
    correct_matches = sum(
        1 for i in range(total_queries) if gold_document_ids[i] == all_doc_ids[i]
    )
    return correct_matches / total_queries


# Function to evaluate generator using F1 score
def evaluate_generator(predictions, gold_labels):
    """Return the binary F1 score of yes/no predictions against gold labels.

    "yes" (case-insensitive) is the positive class. Every other value — "no",
    "invalid", or a non-string such as ``None`` — is counted as negative, so a
    failed answer extraction lowers the score instead of crashing the evaluation.

    Args:
        predictions: Predicted answers, one per question; entries may be ``None``.
        gold_labels: Gold answers, one per question.

    Returns:
        The value returned by ``f1_score(gold_binary, predictions_binary)``.
    """
    def _is_yes(value):
        # Non-string values have no .lower(); treat them as "not yes".
        return isinstance(value, str) and value.lower() == "yes"

    predictions_binary = [1 if _is_yes(pred) else 0 for pred in predictions]
    gold_binary = [1 if _is_yes(label) else 0 for label in gold_labels]
    return f1_score(gold_binary, predictions_binary)

# Function to extract answer
def extract_answer_from_result(result, marker):
    """Extract a normalized yes/no answer that follows ``marker`` in ``result``.

    The text after the first (case-insensitive) occurrence of ``marker`` is
    stripped, and its first line is taken as the candidate answer.

    Args:
        result: Full text returned by the model (prompt plus completion).
        marker: Cue that precedes the answer, e.g. ``"answer:"``.

    Returns:
        ``"yes"`` or ``"no"`` when the candidate is exactly one of those words
        (case-insensitive); ``"invalid"`` otherwise, including when the marker
        does not occur at all. The function always returns a string.
    """
    # Compare case-insensitively on both sides so a mixed-case marker still matches.
    marker_index = result.lower().find(marker.lower())
    if marker_index == -1:
        # Previously this path fell through and returned None implicitly.
        return "invalid"

    # Text after the marker; keep only its first line.
    answer_start = marker_index + len(marker)
    answer = result[answer_start:].strip().split("\n")[0].strip().lower()

    # Anything other than a bare yes/no is reported as invalid.
    return answer if answer in ("yes", "no") else "invalid"


In [None]:
def compute_performance(rag_chain, question_frame=None, marker="answer (yes or no):"):
    """Run every question through a RAG chain and collect its outputs.

    Args:
        rag_chain: Chain whose ``invoke({"query": ...})`` returns a mapping with
            ``"result"`` and ``"source_documents"`` entries.
        question_frame: DataFrame with a ``question`` column. Defaults to the
            notebook-level ``questions`` frame.
        marker: Cue after which the answer is looked up in the result text;
            it must match the end of the prompt used by ``rag_chain``.

    Returns:
        Tuple ``(responses, predictions, all_doc_ids)`` of equal-length lists:
        the raw chain responses, the extracted answers, and the ``doc_id``
        metadata of the first retrieved document (``None`` when the chain
        returned no source documents).
    """
    if question_frame is None:
        question_frame = questions

    responses = []
    predictions = []
    all_doc_ids = []

    for question_text in question_frame["question"]:
        response = rag_chain.invoke({"query": question_text})

        # Guard against an empty retrieval result instead of raising IndexError.
        source_documents = response["source_documents"]
        doc_id = source_documents[0].metadata["doc_id"] if source_documents else None

        cleaned_result = extract_answer_from_result(response["result"], marker=marker)

        responses.append(response)
        all_doc_ids.append(doc_id)
        predictions.append(cleaned_result)

    return responses, predictions, all_doc_ids

In [None]:
# Test a single question
# test_input = {"query": questions.iloc[5]['question']}
# response = rag_chain.invoke(test_input)
# print(f"--- Test Question ---")
# print(f"Answer: {response['result']}")
# print(f"Retrieved Document: {response['source_documents']}\n")


In [None]:
# Evaluate the baseline RAG chain on every question.
responses, predictions, all_doc_ids = compute_performance(
    rag_chain=rag_chain
)

In [None]:
gold_context = questions['gold_context'].tolist()
# Preview a handful of entries instead of dumping the whole list into the cell output.
print(gold_context[:5])

In [None]:
# Score the RAG predictions against the gold yes/no labels.
gold_labels = questions["gold_label"].tolist()
f1 = evaluate_generator(predictions=predictions, gold_labels=gold_labels)
print(f"F1 Score: {f1:.2f}")

In [None]:
# Compare the retrieved document ids with the gold document ids.
gold_document_ids = questions["gold_document_id"].tolist()
accuracy = evaluate_retriever(
    all_doc_ids=all_doc_ids,
    gold_document_ids=gold_document_ids,
)
print(f"Retriever Accuracy: {accuracy:.2f}")

In [None]:
# Baseline - without context: query the LM directly, with no retrieval step.

responses_baseline = []
predictions_baseline = []

for question_text in questions["question"]:
    # Plain yes/no prompt ending in the "Answer:" cue used for extraction below.
    prompt = f"Answer the question with 'Yes' or 'No' only: {question_text}\nAnswer:"
    response = lm.invoke(prompt)

    responses_baseline.append(response)
    predictions_baseline.append(
        extract_answer_from_result(response, marker="answer:")
    )

In [None]:
# Evaluate F1 score for the no-context baseline
f1_baseline = evaluate_generator(predictions_baseline, gold_labels)
# Report the baseline score itself (the RAG score `f1` was printed here by mistake).
print(f"F1 Score: {f1_baseline:.2f}")

In [None]:
# Step 5: Make improvements

In [None]:
#====================================================
# Improvement 1 - Change embedding model

In [None]:
# Alternative embedding checkpoint for the first improvement experiment.
IMPROVED_EMBEDDING_MODEL_NAME = "intfloat/e5-large"
embeddings_improve = HuggingFaceEmbeddings(model_name=IMPROVED_EMBEDDING_MODEL_NAME)

In [None]:
# Re-index the same chunks with the alternative embedding model.
# NOTE(review): with a persist_directory, re-running this cell presumably appends
# the same chunks to the stored collection again - confirm.
vector_store_improve = Chroma.from_texts(
    texts=texts,
    metadatas=metadatas,
    embedding=embeddings_improve,
    persist_directory="./improved_chroma_data",
)

# Top-1 similarity retriever over the re-embedded chunks.
retriever_improve = vector_store_improve.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 1},
)


In [None]:
# Create the RetrievalQA chain
rag_chain_improve = RetrievalQA.from_chain_type(
    retriever=retriever_improve,
    llm=lm,
    chain_type="stuff",       # Combine context and LM generation
    return_source_documents=True,  # Return retrieved documents
    chain_type_kwargs={"prompt": rag_prompt}
)

In [None]:
# Evaluate the chain that uses the alternative embedding model.
responses_imp, predictions_imp, all_doc_ids_imp = compute_performance(
    rag_chain=rag_chain_improve
)

In [None]:
# F1 of improvement 1 against the gold labels.
f1_imp = evaluate_generator(predictions=predictions_imp, gold_labels=gold_labels)
print(f"F1 Score: {f1_imp:.2f}")

In [None]:
#====================================================
# Improvement 2 - Change Chunker(Semantic)

In [None]:
!pip install --quiet langchain_experimental langchain_openai

In [None]:
from langchain_experimental.text_splitter import SemanticChunker

# Chunker driven by the improved embedding model, using the interquartile
# breakpoint threshold type.
semantic_chunker = SemanticChunker(
    embeddings_improve,
    breakpoint_threshold_type="interquartile",
)

In [None]:
# Split every abstract semantically, tagging each chunk with its document's index label.
semantic_chunked_docs = []
for doc_id, abstract in documents["abstract"].items():
    semantic_chunked_docs.extend(
        {"doc_id": doc_id, "chunk": piece}
        for piece in semantic_chunker.split_text(abstract)
    )

In [None]:
# Show up to ten semantic chunks; slicing cannot raise IndexError when fewer
# than ten chunks exist.
print("Sample Semantic Chunks:")
for sample in semantic_chunked_docs[:10]:
    print(f"Doc ID: {sample['doc_id']}")
    print(f"Chunk: {sample['chunk']}")
    print()


In [None]:
# Parallel lists of semantic chunk texts and their source-document metadata.
texts_imp2 = [entry["chunk"] for entry in semantic_chunked_docs]
metadatas_imp2 = [{"doc_id": entry["doc_id"]} for entry in semantic_chunked_docs]

# Index the semantic chunks with the improved embedding model.
vector_store_imp2 = Chroma.from_texts(
    texts=texts_imp2,
    metadatas=metadatas_imp2,
    embedding=embeddings_improve,
    persist_directory="./improved_chroma_data2",
)

# Top-1 similarity retriever over the semantic chunks.
retriever_imp2 = vector_store_imp2.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 1},
)

In [None]:
# RetrievalQA chain backed by the semantic-chunk retriever.
rag_chain_improve2 = RetrievalQA.from_chain_type(
    llm=lm,
    chain_type="stuff",
    retriever=retriever_imp2,
    chain_type_kwargs={"prompt": rag_prompt},
    return_source_documents=True,
)

In [None]:
# Evaluate the chain that retrieves from the semantic chunks.
responses_imp2, predictions_imp2, all_doc_ids_imp2 = compute_performance(
    rag_chain=rag_chain_improve2
)

In [None]:
# F1 of improvement 2 against the gold labels.
f1_imp2 = evaluate_generator(predictions=predictions_imp2, gold_labels=gold_labels)
print(f"F1 Score: {f1_imp2:.2f}")

In [None]:
#====================================================
# Improvement 3 - Prompt

In [None]:
# Alternative prompt for the third experiment: adds a "medical expert" role
# description in front of the yes/no instruction.
# The template is a triple-quoted string, so the leading indentation of each
# line inside it is part of the prompt text sent to the model.
# It ends with the same "Answer (Yes or No):" cue that compute_performance
# searches for when extracting the answer.
prompt_improve = ChatPromptTemplate.from_template(
    template="""You are a medical expert tasked with answering questions based on provided research abstracts.
    Given the following context, answer the question with 'Yes' or 'No' only. Do not provide any explanation or additional information.
    If the context does not contain enough evidence to support a 'Yes' answer, respond with 'No'.

    Context: {context}
    Question: {question}

    Answer (Yes or No):"""
)

In [None]:
# Baseline retriever combined with the alternative prompt.
rag_chain_improve3 = RetrievalQA.from_chain_type(
    llm=lm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_improve},
    return_source_documents=True,
)

In [None]:
# Smoke-test the alternative-prompt chain on one dataset-style question.
test_input = {
    "query": "Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?"
}
response = rag_chain_improve3.invoke(test_input)

answer_text = response["result"]
retrieved_docs = response["source_documents"]

print("--- Test Question ---")
print(f"Answer: {answer_text}")
print(f"Retrieved Document: {retrieved_docs}\n")

In [None]:
# Evaluate the chain that uses the alternative prompt.
responses_imp3, predictions_imp3, all_doc_ids_imp3 = compute_performance(
    rag_chain=rag_chain_improve3
)

In [None]:
# F1 of improvement 3 against the gold labels.
f1_imp3 = evaluate_generator(predictions=predictions_imp3, gold_labels=gold_labels)
print(f"F1 Score: {f1_imp3:.2f}")