In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import re
import time
from tqdm import tqdm
import pandas as pd
from uuid import uuid4
from typing import List,Tuple
from hashlib import md5
from ragas import evaluate
from json import dumps, loads
from unidecode import unidecode
from collections import defaultdict
from langchain.chains import LLMChain
from langchain.schema import Document
from transformers import AutoTokenizer
from qdrant_client import QdrantClient
from langchain_ollama import OllamaLLM
from elasticsearch import Elasticsearch
from langchain.chains import RetrievalQA
from qdrant_client.models import HnswConfig
from langchain_ollama import OllamaEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain.chains import QAGenerationChain
from langchain.retrievers import EnsembleRetriever
from langchain_elasticsearch import ElasticsearchStore
from langchain.chains import HypotheticalDocumentEmbedder
from langchain.schema.output_parser import StrOutputParser
from langchain.retrievers import ElasticSearchBM25Retriever
from langchain.document_transformers import LongContextReorder
from langchain.prompts import PromptTemplate,ChatPromptTemplate
from langchain.retrievers import ContextualCompressionRetriever
from qdrant_client.conversions import common_types as RestToGrpc
from langchain.retrievers import BM25Retriever,EnsembleRetriever
from langchain_community.document_loaders import PDFPlumberLoader
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.retrievers.self_query.qdrant import QdrantTranslator
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain.prompts import ChatMessagePromptTemplate,PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter,TokenTextSplitter
from langchain.schema.runnable import RunnableMap,RunnableLambda, RunnablePassthrough
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.document_transformers import EmbeddingsClusteringFilter,EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline,LLMChainExtractor
from langchain.chains.query_constructor.base import StructuredQueryOutputParser, get_query_constructor_prompt

In [None]:
import ast
import asyncio
import nest_asyncio
import pandas as pd
import evaluate as ev
from ranx import evaluate
from langchain_ollama import OllamaLLM
from ragas.llms import LangchainLLMWrapper
from ragas.dataset_schema import SingleTurnSample
from ragas.embeddings import LangchainEmbeddingsWrapper
from sentence_transformers import SentenceTransformer, util
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from ragas.metrics import BleuScore, ExactMatch, RougeScore, ResponseRelevancy, Faithfulness

nest_asyncio.apply()

evaluator_llm        = LangchainLLMWrapper(OllamaLLM(model="gemma3:1b") )
embeddings           = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",model_kwargs={"device": "cpu"})
evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings)

bleu_metric   = BleuScore()
em_metric     = ExactMatch()
rouge_metric  = RougeScore()
meteor_metric = ev.load("meteor")
bertscore     = ev.load("bertscore")
FF            = Faithfulness(llm=evaluator_llm)
Relevancy     = ResponseRelevancy(llm=evaluator_llm, embeddings=evaluator_embeddings)

# Embedding and Generative Model 

In [None]:
tokenizer       = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")

In [None]:
embedding_model = OllamaEmbeddings(model="nomic-embed-text",num_gpu=1)
llm_model       = OllamaLLM(model="gpt-oss:20b", num_gpu=1)

# Data loader

In [None]:
path     = "Medical_book.pdf"

In [None]:
%%time
document = PDFPlumberLoader(path).load()

In [None]:
docs = document[14:635]

# Data Cleaning

In [None]:
preprocessed_docs = []

def preprocess_page(doc: Document) -> Document:
    """Clean the text of one PDF page and attach bookkeeping metadata.

    "Page <n>" markers are removed, line breaks and runs of whitespace are
    collapsed to single spaces, non-ASCII characters are transliterated with
    ``unidecode`` and the text is lower-cased.

    Args:
        doc: Page-level ``Document``. It is left unmodified.

    Returns:
        A new ``Document`` with the cleaned text and a copy of the input
        metadata extended with ``source``, ``page_number``, ``length`` and
        ``hash`` (MD5 hex digest of the cleaned text).
    """
    text = doc.page_content
    # Copy the metadata: assigning ``doc.metadata`` directly would alias the
    # caller's dict and silently rewrite the metadata of the input page.
    metadata = dict(doc.metadata)

    text = re.sub(r'\n?\s*Page\s*\d+\s*\n?', '\n', text, flags=re.IGNORECASE)
    text = text.replace("\n", " ")
    text = re.sub(r'\s{2,}', ' ', text)
    text = text.strip()
    text = unidecode(text)
    text = text.replace('\xa0', ' ')
    text = text.strip()
    text = text.lower()

    # Merge short lines that do not end a sentence into the following line.
    # NOTE(review): newlines were already replaced by spaces above, so this
    # loop normally sees a single line and leaves the text unchanged. Confirm
    # whether flattening the page before this step was intended.
    lines = text.split('\n')
    joined_lines = []
    buffer = ""

    for line in lines:
        line = line.strip()
        if not line:
            continue
        if len(line) < 60 and not line.endswith(('.', '?', '!', ':')):
            buffer += line + " "
        else:
            buffer += line
            joined_lines.append(buffer.strip())
            buffer = ""
    if buffer:
        joined_lines.append(buffer.strip())

    text = '\n'.join(joined_lines)

    metadata["source"] = "Medical_book"
    metadata["page_number"] = metadata.get("page", "N/A")
    metadata["length"] = len(text)
    # The hash of the cleaned text is used by the caller to drop duplicate pages.
    metadata["hash"] = md5(text.encode()).hexdigest()

    return Document(page_content=text, metadata=metadata)

seen_hashes = set()

# Keep each cleaned page once, and only if it carries more than 20 characters.
for doc in docs:
    processed_doc = preprocess_page(doc)
    # Already kept a page with exactly this cleaned text.
    if processed_doc.metadata["hash"] in seen_hashes:
        continue
    # Too little text to be useful.
    if len(processed_doc.page_content.strip()) <= 20:
        continue
    preprocessed_docs.append(processed_doc)
    seen_hashes.add(processed_doc.metadata["hash"])

In [None]:
preprocessed_docs[0].dict()

# Splitter and Chunking Method

In [None]:
R_chunks = RecursiveCharacterTextSplitter(chunk_size=1500,chunk_overlap=200,separators=["\n\n", "\n", ".", " ", ""]).split_documents(preprocessed_docs)
chunks   = TokenTextSplitter.from_huggingface_tokenizer(tokenizer=tokenizer,chunk_size=800,chunk_overlap=200).split_documents(R_chunks)

In [None]:
for i, doc in enumerate(chunks):
    num_tokens = len(tokenizer(doc.page_content)["input_ids"])
    if num_tokens > 1024:
        print(f"Chunk {i} is too long: {num_tokens} tokens")

In [None]:
print(f"Split blog post into {len(chunks)} sub-documents.")

In [None]:
chunks[9].dict()

In [None]:
chunk_data = [
    {
        "Chunk": i+1,
        "Page": doc.metadata.get("page", "N/A"),
        "Start Index": doc.metadata.get("start_index", "N/A"),
        "Length": len(doc.page_content),
        "Preview": doc.page_content[:200]
    }
    for i, doc in enumerate(chunks)
]

In [None]:
df = pd.DataFrame(chunk_data)

In [None]:
df.head()

---

# Dense DB

In [None]:
# `Qdrant` is not among the notebook's imports (only `QdrantVectorStore` is),
# so this cell raised NameError. Bring the class into scope here.
# NOTE(review): `Qdrant` is the legacy langchain_qdrant wrapper whose
# `from_documents` accepts the keyword arguments used below — confirm it is
# available in the installed langchain_qdrant version.
from langchain_qdrant import Qdrant

# Embed every chunk and (re)build the dense collection on the local Qdrant server.
qdrant_DB = Qdrant.from_documents(
    documents=chunks,
    embedding=embedding_model,
    location="http://localhost:6333",
    collection_name="medical_data_denses",
    prefer_grpc=False,
    hnsw_config={
        "m": 16,
        "ef_construct": 128,
        "full_scan_threshold": 10000
    },
    force_recreate=True  # drops an existing collection with the same name
)

In [None]:
client = QdrantClient(host="localhost", port=6333,timeout=120.0)
client.get_collections()

In [None]:
# Optional: create a snapshot of the dense collection.
# (The bare text "optional thing" was not a comment and made this cell a SyntaxError.)
snapshot = client.create_snapshot(collection_name="medical_data_denses")
snapshot

# Loading the Qdrant DB

In [None]:
client = QdrantClient(url="http://localhost:6333", prefer_grpc=True)

In [None]:
qdrant = QdrantVectorStore(client=client,collection_name="medical_data_denses",embedding=embedding_model)

In [None]:
%%time
dense         = "What is Alzheimer’s disease?"
dense_results = qdrant.similarity_search_with_score(dense, k=3)

In [None]:
# Print each hit with its similarity score rounded to three decimals.
# (`:3f` is a minimum width of 3 with the default 6 decimals; `.3f` sets the precision.)
for doc, score in dense_results:
    print(f"* [SIM={score:.3f}] {doc.page_content} [{doc.metadata}]")

---

# Sparse DB

In [None]:
elastic_DB = ElasticsearchStore(
                                embedding=embedding_model,
                                index_name="medical_data_spares",
                                es_url="http://localhost:9200", # http://localhost:9201
                                strategy=ElasticsearchStore.BM25RetrievalStrategy(k1=1.2, b=0.75)
                               )

In [None]:
uuids = [str(uuid4()) for _ in range(len(chunks))]

In [None]:
elastic_DB.add_documents(documents=chunks, ids=uuids)

# Loading the ElasticSearch DB

In [None]:
es_client = Elasticsearch("http://localhost:9200")

In [None]:
elastic   = ElasticsearchStore(
                            embedding=embedding_model,
                            index_name="medical_data_spares",
                            es_url="http://localhost:9200",
                            strategy=ElasticsearchStore.BM25RetrievalStrategy()
                            )

In [None]:
spare_retriever = elastic.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [None]:
%%time
dense         = "What is Alzheimer disease?"
dense_results = spare_retriever.invoke(dense)

In [None]:
for i, doc in enumerate(dense_results):
    print(f"\nResult {i+1}:\n{doc.page_content}")

---

In [None]:
vectorstore_retreiver = qdrant.as_retriever(search_kwargs={"k": 3})
spare_retriever       = elastic.as_retriever(search_kwargs={"k": 3})

---

# Ensemble Retriever

In [None]:
# Hybrid retriever combining the Qdrant (dense) retriever and the
# Elasticsearch BM25 retriever defined above.
# NOTE(review): EnsembleRetriever is documented as weighted Reciprocal Rank
# Fusion (rank-based rather than raw-score fusion) — confirm against the
# installed LangChain version.
ensemble_retriever = EnsembleRetriever(
                                      retrievers=[vectorstore_retreiver,spare_retriever],
                                      weights=[0.3, 0.7]  # presumably 0.3 -> dense, 0.7 -> BM25 (same order as `retrievers`)
                                      ) 

In [None]:
print(ensemble_retriever)

In [None]:
def query_embedding(query: str, k: int = 3) -> list[str]:
    """Return the text of the first ``k`` documents from the ensemble retriever.

    Any exception raised while retrieving is printed and an empty list is
    returned instead.
    """
    try:
        retrieved = ensemble_retriever.invoke(query)
        page_texts = []
        for hit in retrieved[:k]:
            page_texts.append(hit.page_content)
        return page_texts
    except Exception as err:
        print(f"Error during similarity search: {err}")
        return []

In [None]:
es = query_embedding("What is Alzheimer disease?")

In [None]:
chain = RetrievalQA.from_chain_type(llm=llm_model, chain_type="map_reduce", retriever=ensemble_retriever)

In [None]:
%%time
response = chain.invoke("What is Alzheimer disease?")

In [None]:
print(response['result'])

---

# Synthetic Question Generator

In [None]:
full_text = "\n\n".join(doc.page_content for doc in preprocessed_docs)

len(full_text)

In [None]:
R_chunks = RecursiveCharacterTextSplitter(chunk_size=1500,chunk_overlap=200,separators=["\n\n", "\n", ".", " ", ""]).split_text(full_text)
splitter = TokenTextSplitter.from_huggingface_tokenizer(tokenizer=tokenizer,chunk_size=800,chunk_overlap=200)

In [None]:
chunks = []
for chunk in R_chunks:
    chunks.extend(splitter.split_text(chunk))

In [None]:
chunks[:3]

In [None]:
qa_chain  = QAGenerationChain.from_llm(llm_model, max_questions=3)

In [None]:
%%time
qa_pairs  = []
for i, chunk in enumerate(chunks):
    try:
        result = qa_chain.invoke({"text": chunk})
        qa_pairs.extend(result["questions"])  
    except Exception as e:
        print(f"[Chunk {i}] Error generating QA: {e}")
        

# QAGenerationChain yields dicts keyed "question"/"answer", while the
# "Testing Questions" cells read the columns "user_query"/"ground_truth".
# Rename on export so the saved workbook matches what is read back.
df = pd.DataFrame(qa_pairs).rename(columns={"question": "user_query", "answer": "ground_truth"})
df.to_excel("Synthetic Question Generator.xlsx", index=False)

---

# Testing Questions

In [None]:
df = pd.read_excel('Synthetic Question Generator.xlsx')

In [None]:
sample_queries      = df['user_query'].tolist()
expected_responses  = df['ground_truth'].tolist()

In [None]:
sample_queries

---

# Testing Ensemble Retriever 

In [None]:
# Build the evaluation set for the ensemble retriever: for every synthetic
# question, store the retrieved contexts, the reference answer and the
# generated answer.
dataset = []

# A progress bar with an explicit total, consistent with the other evaluation
# loops (zip() has no length, so tqdm cannot infer it).
for query, reference in tqdm(zip(sample_queries, expected_responses),
                             total=min(len(sample_queries), len(expected_responses))):
    relevant_docs = query_embedding(query)
    response      = chain.invoke(query)
    result        = response["result"]

    dataset.append({
        "user_input": query,
        "retrieved_contexts": relevant_docs,
        "reference": reference,
        "response": result
    })

df = pd.DataFrame(dataset)
df.to_excel("Ensemble Retriever.xlsx", index=False)
dataset = df.to_dict('records')

---

# RAG Fusion

In [None]:
prompt = ChatPromptTemplate(
    input_variables=["original_query"],
    messages=[
        SystemMessagePromptTemplate(
            prompt=PromptTemplate(
                input_variables=[],
                template="You are a helpful assistant that generates multiple search queries based on a single input query."
            )
        ),
        HumanMessagePromptTemplate(
            prompt=PromptTemplate(
                input_variables=["original_query"],
                template="Generate multiple search queries related to: {original_query} \nOUTPUT (4 queries):"
            )
        )
    ]
)

In [None]:
generate_queries = (prompt | llm_model | StrOutputParser() | (lambda x: [q.strip() for q in x.split("\n") if q.strip()]))

### Reciprocal Rank Fusion

In [None]:
def reciprocal_rank_fusion(results: List[List[Document]], k: int = 60) -> List[Tuple[Document, float]]:
    """
    Fuse several ranked Document lists into one ranking (Reciprocal Rank Fusion).

    A document found at 1-based position ``p`` of a list contributes
    ``1 / (p + k)`` to its score. Documents are identified by the JSON dump of
    ``doc.dict()``, so identical documents from different lists are merged.

    Args:
        results (List[List[Document]]): Ranked lists of Documents, best first.
        k (int): Rank-smoothing constant. Default is 60.

    Returns:
        List[Tuple[Document, float]]: Unique Documents with their fused
        scores, highest score first.
    """
    score_by_key = {}

    for ranking in results:
        for position, document in enumerate(ranking, start=1):
            key = dumps(document.dict(), sort_keys=True)
            if key not in score_by_key:
                score_by_key[key] = 0
            score_by_key[key] += 1 / (position + k)

    ordered = sorted(score_by_key.items(), key=lambda item: item[1], reverse=True)

    # Rebuild Document objects from their serialised form.
    fused = []
    for key, score in ordered:
        fused.append((Document.parse_obj(loads(key)), score))
    return fused

In [None]:
rrf_runnable    = RunnableLambda(lambda results: [doc for doc, _ in reciprocal_rank_fusion(results)])

In [None]:
ragfusion_chain = generate_queries | RunnableLambda(lambda queries: [vectorstore_retreiver.invoke(q) for q in queries]) | rrf_runnable

In [None]:
def rff_query_embedding(query: str, k: int = 3) -> list[str]:
    """Return the text of the first ``k`` documents from the RAG-Fusion chain.

    Any exception raised while running the chain is printed and an empty list
    is returned instead.
    """
    try:
        fused_docs = ragfusion_chain.invoke(query)
        page_texts = []
        for hit in fused_docs[:k]:
            page_texts.append(hit.page_content)
        return page_texts
    except Exception as err:
        print(f"Error during similarity search: {err}")
        return []

In [None]:
rrf_content = rff_query_embedding("What is Alzheimer disease?")

In [None]:
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

In [None]:
full_rag_fusion_chain = (
    {
        "context":  ragfusion_chain,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm_model
    | StrOutputParser()
)

In [None]:
%%time
op = full_rag_fusion_chain.invoke("What is Alzheimer disease?")

In [None]:
print(op)

# Testing RAG Fusion with RRF

In [None]:
# Build the evaluation set for RAG Fusion + RRF.
# NOTE: `rff_query_embedding` and `full_rag_fusion_chain` each invoke
# `ragfusion_chain`, so query expansion runs twice per question and the
# recorded contexts are not guaranteed to be the ones the answer was built on.
dataset = []
# zip() has no length; pass `total` so tqdm can show percentage and ETA.
for query, reference in tqdm(zip(sample_queries, expected_responses),
                             total=min(len(sample_queries), len(expected_responses))):
    relevant_docs = rff_query_embedding(query)
    result = full_rag_fusion_chain.invoke(query)
    dataset.append({
        "user_input": query,
        "retrieved_contexts": relevant_docs,
        "reference": reference,
        "response": result
    })

df = pd.DataFrame(dataset)
df.to_excel("RAG Fusion with RRF.xlsx", index=False)

---

# Contextual Compression Retrieval

In [None]:
filters     = EmbeddingsRedundantFilter(embeddings=embedding_model) 

# Removes near-duplicate documents/chunks using embedding similarity.

In [None]:
def custom_get_input(query: str, doc: Document) -> dict:
    """Build the prompt variables for the summarising chain from one document.

    ``query`` is accepted but not used; only the document text is forwarded
    under the ``input_text`` key.
    """
    return dict(input_text=doc.page_content)

In [None]:
small_llm  = OllamaLLM(model="gemma3:1b")

prompt     = PromptTemplate.from_template("Summarize the following text:\n\n{input_text}")

llm_chain  = LLMChain(llm=small_llm, prompt=prompt)

summarizer = LLMChainExtractor(llm_chain=llm_chain,get_input=custom_get_input)  

# LLMChainExtractor summarizes each document/chunk separately, not the entire document set.

In [None]:
reordering = LongContextReorder() 

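# Reorders the chunks so that the most relevant ones sit at the beginning and end of the context and the least relevant ones in the middle (mitigates the "lost in the middle" effect).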

In [None]:
pipeline   = DocumentCompressorPipeline(transformers=[filters, summarizer, reordering]) 

# Combines multiple transformers into a single pipeline; think of it as a document post-processing chain. The generally preferred order is:

#     i) Remove redundancy

##   ii) Reorder for relevance

### iii) Summarize

# The pipeline above instead summarizes chunks before reordering them.

# This sometimes works better when summaries are uniform in length and easier to compare for ranking.

# But generally, summarizing last is preferred.

In [None]:
compression_retriever_reordered = ContextualCompressionRetriever(
                                                                 base_compressor = pipeline, 
                                                                 base_retriever  = vectorstore_retreiver,
                                                                 search_kwargs   = {"k": 3, "include_metadata": True}
                                                                 ) 

# Wraps the base retriever and applies the compression pipeline after retrieval but before passing context to the LLM.

In [None]:
docs = compression_retriever_reordered.invoke("What is Alzheimer disease?")

In [None]:
def CRR_query_embedding(query: str, k: int = 3) -> list[str]:
    """Return the text of the first ``k`` documents from the compression retriever.

    Any exception raised while retrieving is printed and an empty list is
    returned instead.
    """
    try:
        compressed = compression_retriever_reordered.invoke(query)
        page_texts = []
        for hit in compressed[:k]:
            page_texts.append(hit.page_content)
        return page_texts
    except Exception as err:
        print(f"Error during similarity search: {err}")
        return []

In [None]:
compressed_docs = CRR_query_embedding("What is Parkinson’s disease?")

In [None]:
chain = RetrievalQA.from_chain_type(llm=llm_model, retriever=compression_retriever_reordered)

In [None]:
query = "What is Alzheimer disease?"

In [None]:
print(chain.invoke(query)['result'])

In [None]:
# Build the evaluation set for Contextual Compression Retrieval.
dataset = []
# zip() has no length; pass `total` so tqdm can show percentage and ETA.
for query, reference in tqdm(zip(sample_queries, expected_responses),
                             total=min(len(sample_queries), len(expected_responses))):
    relevant_docs = CRR_query_embedding(query)
    result        = chain.invoke(query)['result']
    dataset.append({
        "user_input": query,
        "retrieved_contexts": relevant_docs,
        "reference": reference,
        "response": result
    })

df = pd.DataFrame(dataset)
df.to_excel("Contextual Compression Retrieval.xlsx", index=False)

---

# Hypothetical Document Embedder

In [None]:
hyde  = HypotheticalDocumentEmbedder.from_llm(llm = llm_model,base_embeddings = embedding_model, prompt_key = "web_search")

In [None]:
query   = 'What is Alzheimer disease?'

In [None]:
results = qdrant.similarity_search_by_vector(hyde.embed_query(query), k=3)

In [None]:
def hyde_query_embedding(query: str, k: int = 3) -> list[str]:
    """Return the text of the ``k`` nearest chunks to a HyDE embedding of ``query``.

    The query is embedded through ``hyde.embed_query`` and the resulting
    vector is searched in the Qdrant store.

    Args:
        query: User question.
        k: Number of chunks to return. It is now forwarded to the vector
            search; previously the search was hard-coded to 3, so any larger
            ``k`` was silently capped.

    Returns:
        Page contents of the retrieved chunks, or an empty list if retrieval
        raised (the exception is printed).
    """
    try:
        results = qdrant.similarity_search_by_vector(hyde.embed_query(query), k=k)
        return [doc.page_content for doc in results[:k]]
    except Exception as e:
        print(f"Error during similarity search: {e}")
        return []

In [None]:
doc_op = hyde_query_embedding("What is Parkinson’s disease?")

In [None]:
hyde_embed_query = RunnableLambda(hyde.embed_query)

qdrant_retriever = RunnableLambda(lambda vector: qdrant.similarity_search_by_vector(vector, k=3))

In [None]:
prompt = PromptTemplate.from_template("""
Use the following context to answer the question:

{context}

Question: {question}
Answer:
""")

In [None]:
def format_docs(docs: list[Document]) -> str:
    """Concatenate the page contents of ``docs``, separated by blank lines."""
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

In [None]:
rag_chain = (
    RunnableMap({
        "question": RunnablePassthrough(),  
        "context" : hyde_embed_query | qdrant_retriever | RunnableLambda(format_docs)
    })
    | prompt
    | llm_model
    | StrOutputParser()
)

In [None]:
response = rag_chain.invoke("What is Parkinson’s disease?")

In [None]:
print(response)

In [None]:
# Build the evaluation set for HyDE retrieval.
# NOTE: `hyde_query_embedding` and `rag_chain` each call `hyde.embed_query`,
# so the recorded contexts come from a separate embedding call than the one
# used to produce the answer.
dataset = []
# zip() has no length; pass `total` so tqdm can show percentage and ETA.
for query, reference in tqdm(zip(sample_queries, expected_responses),
                             total=min(len(sample_queries), len(expected_responses))):
    relevant_docs = hyde_query_embedding(query)
    result        = rag_chain.invoke(query)
    dataset.append({
        "user_input": query,
        "retrieved_contexts": relevant_docs,
        "reference": reference,
        "response": result
    })

df = pd.DataFrame(dataset)
df.to_excel("hyde.xlsx", index=False)

---

# Flash Reranker

In [None]:
compressor = FlashrankRerank(model="ms-marco-MiniLM-L-12-v2")

In [None]:
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=vectorstore_retreiver)

In [None]:
def FR_query_embedding(query: str, k: int = 3) -> list[str]:
    """Return the text of the first ``k`` documents from the Flashrank retriever.

    Any exception raised while retrieving is printed and an empty list is
    returned instead.
    """
    try:
        reranked = compression_retriever.invoke(query)
        page_texts = []
        for hit in reranked[:k]:
            page_texts.append(hit.page_content)
        return page_texts
    except Exception as err:
        print(f"Error during similarity search: {err}")
        return []

In [None]:
compressed_docs = FR_query_embedding("What is Parkinson’s disease?")

In [None]:
prompt = PromptTemplate.from_template("""
Use the following context to answer the question:

{context}

Question: {question}
Answer:
""")

In [None]:
chain = (
    {"context": compression_retriever, "question": RunnablePassthrough()}
    | prompt
    | llm_model
    | StrOutputParser()
)

In [None]:
query  = "What is Alzheimer disease?"

output = chain.invoke(query)

In [None]:
print(output)

In [None]:
# Build the evaluation set for the Flashrank re-ranking retriever.
dataset = []
# zip() has no length; pass `total` so tqdm can show percentage and ETA.
for query, reference in tqdm(zip(sample_queries, expected_responses),
                             total=min(len(sample_queries), len(expected_responses))):
    relevant_docs = FR_query_embedding(query)
    result        = chain.invoke(query)
    dataset.append({
        "user_input": query,
        "retrieved_contexts": relevant_docs,
        "reference": reference,
        "response": result
    })

df = pd.DataFrame(dataset)
df.to_excel("Flash Reranker.xlsx", index=False)

---

---

---

# Evaluation metrics

In [None]:
def reteriver_metrics(df, model=SentenceTransformer("all-MiniLM-L6-v2"), threshold=0.55, k=5):
    """
    Evaluate retrieval quality with per-query precision/recall and top-k ranking metrics.

    A retrieved context counts as relevant when the cosine similarity between
    its embedding and the embedding of the reference answer exceeds
    ``threshold``. These similarity-derived labels serve as the ground truth
    (qrels) and the raw similarities as the ranking scores (run) for ranx.

    Args:
        df (pd.DataFrame): Must have columns 'reference' (string) and
            'retrieved_contexts' (list of strings; a non-list value is treated
            as a single context).
        model: Embedding model exposing ``encode``. The default model is
            instantiated once, when this function is defined.
        threshold (float): Cosine similarity threshold to count a retrieved doc as relevant.
        k (int): Cut-off for the ranking metrics.

    Returns:
        dict: 'context_precision', 'context_recall' and the ranx metrics
        precision@k, recall@k, mrr@k and ndcg@k.

    Raises:
        ValueError: If ``df`` has no rows (previously a ZeroDivisionError).
    """
    if len(df) == 0:
        raise ValueError("reteriver_metrics() needs at least one row to evaluate")

    metric_names = [f"precision@{k}", f"recall@{k}", f"mrr@{k}", f"ndcg@{k}"]

    qrels = {}  # ground-truth relevance
    run   = {}  # predicted similarity scores

    per_query_precision = []
    per_query_recall    = []

    for idx, row in df.iterrows():
        qid       = f"q{idx}"
        reference = row["reference"]
        retrieved = row["retrieved_contexts"]

        if not isinstance(retrieved, list):
            retrieved = [retrieved]

        if not retrieved:
            # Nothing was retrieved: score the query as a miss instead of
            # failing on an empty embedding batch. The query is left out of
            # the ranx inputs, so the ranking metrics average over queries
            # that returned at least one context.
            per_query_precision.append(0.0)
            per_query_recall.append(0.0)
            continue

        # Compute embeddings
        ref_emb  = model.encode(reference, convert_to_tensor=True)
        ctx_embs = model.encode(retrieved, convert_to_tensor=True)

        # Cosine similarity between the reference and every retrieved context
        sims = [float(s) for s in util.cos_sim(ref_emb, ctx_embs).flatten().tolist()]

        # Create qrels and run entries
        rels = {f"d{idx}_{i}": int(sim > threshold) for i, sim in enumerate(sims)}
        qrels[qid] = rels
        run[qid]   = {f"d{idx}_{i}": sim for i, sim in enumerate(sims)}

        relevant_count = sum(rels.values())

        # Precision = relevant retrieved / total retrieved
        per_query_precision.append(relevant_count / len(retrieved))

        # Recall = 1 if at least one relevant doc retrieved, else 0 (binary relevance assumption)
        per_query_recall.append(1.0 if relevant_count > 0 else 0.0)

    # Context-level aggregated metrics
    context_precision = sum(per_query_precision) / len(per_query_precision)
    context_recall    = sum(per_query_recall) / len(per_query_recall)

    # Top-k ranking metrics using Ranx
    if qrels:
        ranking_metrics = evaluate(qrels, run, metrics=metric_names)
    else:
        # No query returned any context, so there is nothing to rank.
        ranking_metrics = {name: 0.0 for name in metric_names}

    # Combine all results
    final_results = {
        "context_precision": context_precision,
        "context_recall": context_recall,
        **ranking_metrics
    }

    return final_results


def _as_context_list(contexts):
    """Return ``contexts`` as a list of strings, wrapping a single value."""
    if isinstance(contexts, (list, tuple)):
        return [str(context) for context in contexts]
    return [str(contexts)]


async def generator_Metrics(predictions, references, user_input, retrieved_docs):
    """
    Score generated answers with several metrics and return one row per query.

    Uses the module-level metric objects ``Relevancy``, ``FF``, ``bleu_metric``,
    ``em_metric``, ``rouge_metric``, ``meteor_metric`` and ``bertscore``.

    Args:
        predictions (list): Generated answers.
        references (list): Ground-truth answers (a string, or a sequence whose
            first item is used).
        user_input (list): Queries.
        retrieved_docs (list): Retrieved contexts per query (a list of strings
            or a single string).

    Returns:
        pd.DataFrame: DataFrame with all metrics per query.
    """
    data = []

    for i, prediction in enumerate(predictions):
        # Ensure reference is a string
        ref_str = references[i] if isinstance(references[i], str) else references[i][0]

        sample = SingleTurnSample(
            user_input=user_input[i],
            response=prediction,
            reference=ref_str,
            # Faithfulness must be judged against what was actually retrieved.
            # Passing the reference here (as before) measured agreement with
            # the ground truth instead of grounding in the retrieved context.
            retrieved_contexts=_as_context_list(retrieved_docs[i])
        )

        # Evaluate the Ragas metrics (awaited one after another)
        relevancyy   = await Relevancy.single_turn_ascore(sample)
        faith        = await FF.single_turn_ascore(sample)
        bleu_score   = await bleu_metric.single_turn_ascore(sample)
        em_score     = await em_metric.single_turn_ascore(sample)
        rouge_score  = await rouge_metric.single_turn_ascore(sample)
        meteor_score = meteor_metric.compute(predictions=[prediction], references=[ref_str])["meteor"]
        bertscore_results  = bertscore.compute(predictions=[prediction], references=[ref_str], lang="en")
        # Reported as the complement of faithfulness.
        hallucination_rate = 1 - faith

        row = {
            "query": user_input[i],
            "retrieved_doc": retrieved_docs[i],
            "prediction": prediction,
            "reference": ref_str,
            "BLEU": bleu_score,
            "ExactMatch": em_score,
            "ROUGE": rouge_score,
            "BERTScore": bertscore_results["f1"][0],
            "METEOR": meteor_score,
            "ResponseRelevancy": relevancyy,
            "Faithfulness": faith,
            "hallucination_rate": hallucination_rate,
        }
        data.append(row)

    return pd.DataFrame(data)

---

In [None]:
from pathlib import Path

# Folder holding the per-method result workbooks. Building paths with pathlib
# avoids the invalid escape sequences of ".\excel files\..." string literals
# and also works on non-Windows systems.
RESULTS_DIR = Path("excel files")


def _load_results(filename):
    """Read one result workbook and turn its stored context lists back into lists."""
    frame = pd.read_excel(RESULTS_DIR / filename)
    # Excel stores each list of contexts as its string repr; parse it back.
    frame["retrieved_contexts"] = frame["retrieved_contexts"].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )
    return frame


ES   = _load_results("Ensemble Retriever.xlsx")
RRF  = _load_results("RAG Fusion with RRF.xlsx")
CCR  = _load_results("Contextual Compression Retrieval.xlsx")
# NOTE: this rebinds `hyde`, which earlier cells use for the HypotheticalDocumentEmbedder.
hyde = _load_results("hyde.xlsx")
RR   = _load_results("Flash Reranker.xlsx")

## Retriever Metrics

In [None]:
# Retrieval metrics for every method, keyed by method name.
results = {
    method: reteriver_metrics(frame)
    for method, frame in [
        ("EnsembleRetriever", ES),
        ("RAGFusionRRF", RRF),
        ("ContextualCompression", CCR),
        ("HyDE", hyde),
        ("FlashReranker", RR),
    ]
}

# One row per method, with the method name in a "Method" column.
df_results = (
    pd.DataFrame.from_dict(results, orient="index")
    .reset_index()
    .rename(columns={"index": "Method"})
)
df_results.to_excel("RAG_Retrieval_Metrics.xlsx", index=False)

In [None]:
df_results

## Generator Metrics

| **Metric**             | **Range**            | **Good When**      | **Bad When** | **Tip / Trick**                                                                                          |
| ---------------------- | -------------------- | ------------------ | ------------ | -------------------------------------------------------------------------------------------------------- |
| **BLEU**               | 0 → 1                | 🔼 Higher = Better | 🔽 Lower     | Good for short, n-gram overlap (machine translation style). Works best if answers are short and factual. |
| **Exact Match (EM)**   | 0 or 1 (sometimes %) | 🔼 Higher = Better | 🔽 Lower     | Strict metric — requires generated text = reference exactly. Only makes sense for factoid Q\&A.          |
| **ROUGE**              | 0 → 1                | 🔼 Higher = Better | 🔽 Lower     | Focuses on recall/coverage. Good if you want the model to capture most of the important reference words. |
| **BERTScore**          | 0 → 1                | 🔼 Higher = Better | 🔽 Lower     | Semantic similarity (uses embeddings). Better than BLEU/ROUGE when wording differs but meaning is same.  |
| **METEOR**             | 0 → 1                | 🔼 Higher = Better | 🔽 Lower     | Balances precision & recall with synonym matching. More forgiving than BLEU.                             |
| **Response Relevancy** | 0 → 1                | 🔼 Higher = Better | 🔽 Lower     | Checks if answer is relevant to the query/context. Low = answer is off-topic.                            |
| **Faithfulness**       | 0 → 1                | 🔼 Higher = Better | 🔽 Lower     | Checks if answer is grounded in retrieved docs. Low = hallucination risk.                                |
| **Hallucination Rate** | 0 → 1                | 🔽 Lower = Better  | 🔼 Higher    | Measures how much the model "makes up" stuff. Ideally close to 0.                                        |


##### EnsembleRetriever

In [None]:
ES_user_input     = ES["user_input"].tolist()
ES_retrieved_docs = ES["retrieved_contexts"].tolist()
ES_predictions    = ES["response"].tolist()
ES_references     = ES["reference"].tolist()

In [None]:
ES_metrics        = asyncio.run(generator_Metrics(ES_predictions,ES_references,ES_user_input,ES_retrieved_docs))

In [None]:
ES_metrics

##### RAG Fusion RRF

In [None]:
RRF_user_input     = RRF["user_input"].tolist()
RRF_retrieved_docs = RRF["retrieved_contexts"].tolist()
RRF_predictions    = RRF["response"].tolist()
RRF_references     = RRF["reference"].tolist()

In [None]:
RRF_metrics        = asyncio.run(generator_Metrics(RRF_predictions,RRF_references,RRF_user_input,RRF_retrieved_docs))

In [None]:
RRF_metrics

##### Contextual Compression

In [None]:
CCR_user_input     = CCR["user_input"].tolist()
CCR_retrieved_docs = CCR["retrieved_contexts"].tolist()
CCR_predictions    = CCR["response"].tolist()
CCR_references     = CCR["reference"].tolist()

In [None]:
CCR_metrics        = asyncio.run(generator_Metrics(CCR_predictions,CCR_references,CCR_user_input,CCR_retrieved_docs))

In [None]:
CCR_metrics

##### HyDE

In [None]:
hyde_user_input     = hyde["user_input"].tolist()
hyde_retrieved_docs = hyde["retrieved_contexts"].tolist()
hyde_predictions    = hyde["response"].tolist()
hyde_references     = hyde["reference"].tolist()

In [None]:
hyde_metrics        = asyncio.run(generator_Metrics(hyde_predictions,hyde_references,hyde_user_input,hyde_retrieved_docs))

In [None]:
hyde_metrics

##### FlashReranker

In [None]:
RR_user_input     = RR["user_input"].tolist()
RR_retrieved_docs = RR["retrieved_contexts"].tolist()
RR_predictions    = RR["response"].tolist()
RR_references     = RR["reference"].tolist()

In [None]:
RR_metrics        = asyncio.run(generator_Metrics(RR_predictions,RR_references,RR_user_input,RR_retrieved_docs))

In [None]:
RR_metrics

In [None]:
# Write each method's generation metrics to its own sheet of one workbook.
with pd.ExcelWriter("RAG_Generation_Metrics.xlsx") as writer:
    for sheet_name, metrics_frame in (
        ("EnsembleRetriever", ES_metrics),
        ("RAGFusionRRF", RRF_metrics),
        ("ContextualCompression", CCR_metrics),
        ("HyDE", hyde_metrics),
        ("FlashReranker", RR_metrics),
    ):
        metrics_frame.to_excel(writer, sheet_name=sheet_name, index=False)

# Conclusion

We systematically evaluated five RAG retrieval approaches for **medical domain QA**:

- **EnsembleRetriever** (Dense + Sparse fusion)  
- **RAG Fusion (RRF)** (query expansion + reciprocal rank fusion)  
- **Contextual Compression Retrieval (CCR)** (summarization + redundancy filtering)  
- **HyDE** (hypothetical document generation)  
- **Flash Reranker** (neural re-ranking on top of base retriever)  

##  Retrieval Metrics
From the `RAG_Retrieval_Metrics.xlsx` results:  
- **HyDE** achieved the **highest context precision (1.0)** while maintaining perfect recall and ranking scores (`mrr@5 = 1.0`, `ndcg@5 = 1.0`).  
- Other methods (EnsembleRetriever, RRF, CCR, FlashReranker) performed almost identically, with strong recall (1.0) but slightly lower precision (≈0.89).  

 **HyDE is the best retriever overall** because it not only retrieves relevant documents consistently but also reduces noise (higher precision).  

##  Generation Metrics
From the `RAG_Generation_Metrics.xlsx` results:  
- **BLEU, ROUGE, METEOR** show how much generated answers overlap with ground-truth text.  
- **BERTScore & Response Relevancy** confirm semantic alignment with the reference.  
- **Faithfulness & Hallucination Rate** are critical in the medical domain. Models with higher faithfulness (closer to 1) and lower hallucination (<0.2) are more reliable.  

##### Across all methods, **HyDE and Contextual Compression Retrieval** tend to give more **faithful and relevant responses**, while **Flash Reranker** improves ranking but sometimes introduces hallucinations.