In [1]:
!pip3 install python-dotenv huggingface-hub llama-index transformers sentence-transformers llama-index-llms-huggingface llama-index-embeddings-huggingface llama-index-llms-openrouter llama-index-retrievers-bm25 tabula-py  jpype1 pystemmer


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# =========================================================
# RAG Evaluation Notebook
# =========================================================
# Evaluates RAGIndex retrieval performance using auto-generated question-context pairs
# Metrics: MRR, Hit Rate, Precision, Recall, Faithfulness, Relevancy
# =========================================================

from llama_index.core import (Document, Settings)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openrouter import OpenRouter
from dotenv import load_dotenv
import os, sys
import nest_asyncio

# Put the directory above the notebook's working directory on the import path
# (presumably where the local `rag` module imported later lives).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

# Read OPENROUTER_API_KEY from a .env file / the process environment.
load_dotenv()
api_key = os.getenv("OPENROUTER_API_KEY")
if api_key:
    # Never echo any part of the secret: cell outputs are saved inside the
    # notebook file and travel with it when it is committed or shared.
    print("✅ API Key Loaded Successfully")
else:
    print("⚠️ API Key is missing! Check your .env file.")


# Generator LLM: writes the synthetic questions and answers the RAG queries.
llm = OpenRouter(api_key=api_key, model="mistralai/mistral-7b-instruct", max_tokens=512, context_window=4096)
# Judge LLM: a different model, used only by the faithfulness / relevancy evaluators.
Judge_llm = OpenRouter(api_key=api_key, model="qwen/qwen-turbo", max_tokens=512, context_window=4096)
Settings.llm = llm

# Apply nest_asyncio to fix event loop issues in Jupyter
nest_asyncio.apply()

# Embedding model registered as the LlamaIndex default.
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
Settings.embed_model = embed_model

✅ API Key Loaded Successfully: sk-or...94d80


In [17]:
# ---------------------------------------------------------
# Step 1: Prepare the documents
# ---------------------------------------------------------
from pathlib import Path
from rag import RAGIndex

# Data folders are resolved relative to the directory above the notebook's
# working directory.
project_root = Path(os.getcwd()).parent
parent_dir = str(project_root)
rag_doc_dir = project_root.joinpath("data", "rag_docs")
faiss_dir = project_root.joinpath("data", "faiss_index")

RAG = RAGIndex(str(rag_doc_dir), str(faiss_dir))

# Pick the chunk store that matches the backend the index is running on.
# An unknown mode (or a missing store) yields no chunks at all.
if RAG.mode == "faiss" and RAG.vs is not None:
    # Every chunk held in the FAISS docstore
    stored_chunks = RAG.vs.docstore._dict.values()
elif RAG.mode == "bm25" and RAG.retriever is not None:
    stored_chunks = RAG.retriever.docstore
else:
    stored_chunks = ()

# Re-wrap each stored chunk as a LlamaIndex Document; these chunks are the
# basis for the generated test questions.
docs = []
docs.extend(
    Document(text=chunk.page_content, metadata=chunk.metadata)
    for chunk in stored_chunks
)
print(f"Loaded {len(docs)} chunks from RAG index.")

[RAG] Loaded TXT: activity_frequency.txt (1 docs) [category=activity]
[RAG] Loaded TXT: field_standards.txt (1 docs) [category=field]
[RAG] Loaded TXT: mowing_standard.txt (1 docs) [category=mowing]
[RAG] Total raw documents loaded: 3
[RAG] Loaded existing FAISS index (up-to-date)
Loaded 44 chunks from RAG index.


In [18]:
# ---------------------------------------------------------
# Step 2: Prepare nodes from RAG's ORIGINAL chunks
# ---------------------------------------------------------
from llama_index.core.schema import TextNode

# Wrap every loaded chunk in a TextNode without re-splitting it, so the
# evaluation corpus holds exactly the text stored in the RAG index.
# Node IDs are left for LlamaIndex to generate.
nodes = [TextNode(text=doc.text, metadata=doc.metadata) for doc in docs]

print(f"✅ Created {len(nodes)} nodes (preserving RAG's original 900-char chunks)")
if nodes:
    print(f"   First node length: {len(nodes[0].get_content())} chars")
else:
    # Without this guard an empty index fails with a bare IndexError on nodes[0].
    print("⚠️ No chunks were loaded from the RAG index - nothing to evaluate.")

✅ Created 44 nodes (preserving RAG's original 900-char chunks)
   First node length: 900 chars


In [19]:
# ---------------------------------------------------------
# Step 3: Auto-generate question–context pairs
# ---------------------------------------------------------
from llama_index.core.evaluation import generate_question_context_pairs

# Have the generator LLM (OpenRouter mistral-7b-instruct) write questions for
# every node; 2 per chunk here, 1-3 is a reasonable range when trading
# coverage for speed.
qa_dataset = generate_question_context_pairs(
    nodes=nodes,
    llm=llm,
    num_questions_per_chunk=2,
)

qa_data_dict = qa_dataset.model_dump()
corpus = qa_data_dict["corpus"]

print(f"✅ Generated {len(qa_dataset.queries)} question-context pairs")
print(f"   Corpus size: {len(corpus)} chunks")
sample_corpus_text = list(corpus.values())[0]
print(f"   First corpus text length: {len(sample_corpus_text)} chars")

# Spot check: the first corpus entry should be the first RAG chunk.
print("\n=== Verification ===")
sample_rag_text = docs[0].text.strip()

if sample_corpus_text.strip() == sample_rag_text:
    report = ["✅ Corpus text MATCHES RAG chunks perfectly!"]
elif sample_corpus_text in sample_rag_text or sample_rag_text in sample_corpus_text:
    # One text contains the other: same chunk, different boundaries.
    report = [
        "⚠️  Corpus text partially matches RAG chunks",
        f"   Corpus length: {len(sample_corpus_text)}",
        f"   RAG length: {len(sample_rag_text)}",
    ]
else:
    report = [
        "❌ Corpus text DOES NOT match RAG chunks",
        f"   Corpus preview: {sample_corpus_text[:100]}",
        f"   RAG preview: {sample_rag_text[:100]}",
    ]

for line in report:
    print(line)

100%|██████████| 44/44 [01:33<00:00,  2.11s/it]

✅ Generated 86 question-context pairs
   Corpus size: 44 chunks
   First corpus text length: 900 chars

=== Verification ===
✅ Corpus text MATCHES RAG chunks perfectly!





In [20]:
# =========================================================
# Step 4: Automatic Evaluation
# =========================================================

import pandas as pd
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import NodeWithScore, TextNode, QueryBundle
from llama_index.core.evaluation import RetrieverEvaluator
from typing import List

# ===================================================================
# Wrapper: Makes RAG.retrieve() compatible with LlamaIndex
# ===================================================================

class DirectRAGRetriever(BaseRetriever):
    """Adapter that exposes ``rag_obj.retrieve()`` as a LlamaIndex retriever.

    Every hit is traced back to an evaluation-corpus ID through its text, so
    retrieval metrics can compare retrieved IDs with the expected ones. Hits
    whose text is not part of the corpus are dropped.

    Args:
        rag_obj: Object offering ``retrieve(query, k=...)`` that returns an
            iterable of dict-like hits with a "text" entry and, optionally,
            a "score" entry.
        corpus_text_to_id: Mapping from corpus text to corpus ID.
        top_k: Number of hits requested from ``rag_obj`` per query.
    """

    def __init__(self, rag_obj, corpus_text_to_id, top_k=2):
        self._rag = rag_obj
        # Hits are looked up by their *stripped* text, so a stripped alias is
        # registered for every corpus text. Exact keys are inserted first and
        # therefore keep priority over aliases.
        text_to_id = dict(corpus_text_to_id)
        for text, corpus_id in corpus_text_to_id.items():
            text_to_id.setdefault(text.strip(), corpus_id)
        self._text_to_id = text_to_id
        self._top_k = top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Run the wrapped RAG retrieval and return the hits as scored nodes.

        Args:
            query_bundle: Query whose ``query_str`` is forwarded to the RAG object.

        Returns:
            One ``NodeWithScore`` per hit that could be mapped to a corpus ID,
            in the order the RAG object returned them.
        """
        hits = self._rag.retrieve(query_bundle.query_str, k=self._top_k)

        scored_nodes = []
        for hit in hits:
            # `or ""` also covers a hit whose "text" entry is present but None.
            text = (hit.get("text") or "").strip()
            corpus_id = self._text_to_id.get(text)
            if not corpus_id:
                # Text unknown to the evaluation corpus: cannot be scored.
                continue

            node = TextNode(
                text=text,
                id_=corpus_id,
                metadata=hit,
                # The raw hit stays available as metadata, but its "text" and
                # "score" entries must not be rendered into LLM / embedding
                # input - otherwise the chunk text appears twice in the prompt.
                excluded_llm_metadata_keys=["text", "score"],
                excluded_embed_metadata_keys=["text", "score"],
            )
            scored_nodes.append(NodeWithScore(node=node, score=hit.get("score", 0.0)))

        return scored_nodes

# Build ID mapping
corpus_text_to_id = {text: cid for cid, text in qa_data_dict["corpus"].items()}

# Create retriever
retriever = DirectRAGRetriever(RAG, corpus_text_to_id, top_k=2)

print(f"✅ Created automatic retriever for {RAG.mode.upper()} system\n")
print("="*70)
print("RETRIEVAL METRICS (Automatic Evaluation)")
print("="*70)

evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate", "precision", "recall"],
    retriever=retriever
)

eval_results = await evaluator.aevaluate_dataset(qa_dataset)
retrieval_df = pd.DataFrame([r.metric_vals_dict for r in eval_results if r.metric_vals_dict])

print(f"\n{'Metric':<15} {'Score':<10}")
print("-" * 25)
for metric in ["mrr", "hit_rate", "precision", "recall"]:
    print(f"{metric.upper():<15} {retrieval_df[metric].mean():.4f}")

print(f"\nEvaluated {len(retrieval_df)}/{len(qa_dataset.queries)} queries")

# # Save
# retrieval_df.to_csv("rag_retrieval_auto.csv", index=False)
# print("✅ Saved to 'rag_retrieval_auto.csv'")

✅ Created automatic retriever for FAISS system

RETRIEVAL METRICS (Automatic Evaluation)

Metric          Score     
-------------------------
MRR             0.7209
HIT_RATE        0.8023
PRECISION       0.4012
RECALL          0.8023

Evaluated 86/86 queries


In [None]:
# =========================================================
# Step 5: Faithfulness & Relevancy with RetrieverQueryEngine
# =========================================================

from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator
import pandas as pd

print("="*70)
print("GENERATION METRICS (Faithfulness & Relevancy)")
print("="*70)

# Complete RAG pipeline: the custom retriever fetches chunks and the generator
# LLM answers from them ("compact" response mode).
query_engine = RetrieverQueryEngine.from_args(
    retriever=retriever,
    llm=llm,
    response_mode="compact",
)

print(f"\n✅ Created RetrieverQueryEngine with {RAG.mode.upper()} retriever")

# The answers are scored by the separate judge model.
faithfulness_evaluator = FaithfulnessEvaluator(llm=Judge_llm)
relevancy_evaluator = RelevancyEvaluator(llm=Judge_llm)

LOW_SCORE_THRESHOLD = 0.5   # scores below this are listed as low-scoring
MAX_SAMPLES_SHOWN = 3       # low-scoring examples printed in the report

# The three lists below always grow together - exactly one entry per processed
# query - so they stay aligned row by row.
evaluated_queries = []
faithfulness_scores = []
relevancy_scores = []
failed_queries = []
error_count = 0

total_queries = len(qa_dataset.queries)
print("\nEvaluating responses...")
print(f"(Processing {total_queries} questions)\n")

for i, query_text in enumerate(qa_dataset.queries.values()):
    if i % 10 == 0:
        print(f"Progress: {i}/{total_queries}")

    try:
        # Retrieval + answer generation
        response = query_engine.query(query_text)

        # The evaluators pull the retrieved contexts out of the response object.
        faith_result = faithfulness_evaluator.evaluate_response(
            query=query_text,
            response=response,
        )
        rel_result = relevancy_evaluator.evaluate_response(
            query=query_text,
            response=response,
        )
        # A missing score counts as 0.0
        faith_score = faith_result.score if faith_result.score is not None else 0.0
        rel_score = rel_result.score if rel_result.score is not None else 0.0

        # Track low-scoring queries
        if faith_score < LOW_SCORE_THRESHOLD or rel_score < LOW_SCORE_THRESHOLD:
            failed_queries.append({
                "query": query_text,
                "answer": str(response)[:200],
                "faithfulness": faith_score,
                "relevancy": rel_score,
                "faith_feedback": getattr(faith_result, 'feedback', ''),
                "rel_feedback": getattr(rel_result, 'feedback', ''),
                "num_sources": len(response.source_nodes)
            })
    except Exception as e:
        # Best effort: keep going and score the failed query as 0.0, but count
        # it so errors can be told apart from genuine zero scores.
        print(f"⚠️  Error on query {i}: {str(e)}")
        error_count += 1
        faith_score = rel_score = 0.0

    # Scores are recorded only here, after the whole query succeeded or failed.
    # Appending inside the try block would leave a half-recorded query behind
    # whenever the second evaluator raised.
    evaluated_queries.append(query_text)
    faithfulness_scores.append(faith_score)
    relevancy_scores.append(rel_score)

# Print results
print("\n" + "="*70)
print("RESULTS")
print("="*70)

if faithfulness_scores and relevancy_scores:
    print(f"\n{'Metric':<20} {'Score':<10} {'Min':<10} {'Max':<10}")
    print("-" * 50)
    for label, scores in (("Faithfulness", faithfulness_scores), ("Relevancy", relevancy_scores)):
        print(f"{label:<20} {sum(scores)/len(scores):.4f}     "
              f"{min(scores):.4f}     {max(scores):.4f}")

    print(f"\nEvaluated: {len(faithfulness_scores)}/{total_queries} queries")
    if error_count:
        print(f"Errors (scored as 0.0): {error_count}")
    print(f"Low-scoring (<{LOW_SCORE_THRESHOLD}): {len(failed_queries)}")

    if failed_queries:
        print("\n=== Sample Low-Scoring Responses ===")
        for rank, item in enumerate(failed_queries[:MAX_SAMPLES_SHOWN], start=1):
            print(f"\n{rank}. Query: {item['query'][:80]}...")
            print(f"   Answer: {item['answer']}...")
            print(f"   Scores: Faithfulness={item['faithfulness']:.2f}, Relevancy={item['relevancy']:.2f}")
            print(f"   Sources used: {item['num_sources']}")

    # Save results
    results_df = pd.DataFrame({
        "query": evaluated_queries,
        "faithfulness": faithfulness_scores,
        "relevancy": relevancy_scores,
    })
    results_df.to_csv("generation_metrics.csv", index=False)
    print("\n✅ Saved to 'generation_metrics.csv'")

else:
    print("❌ No results to display - check for errors above")

GENERATION METRICS (Faithfulness & Relevancy)

✅ Created RetrieverQueryEngine with FAISS retriever

Evaluating responses...
(Processing 86 questions)

Progress: 0/86
Progress: 10/86
Progress: 20/86
Progress: 30/86
Progress: 40/86
Progress: 50/86
Progress: 60/86
Progress: 70/86
Progress: 80/86

RESULTS

Metric               Score      Min        Max       
--------------------------------------------------
Faithfulness         0.9651     0.0000     1.0000
Relevancy            0.6744     0.0000     1.0000

Evaluated: 86/86 queries
Low-scoring (<0.5): 28

=== Sample Low-Scoring Responses ===

1. Query: What is the primary difference between the tasks described in "Paper Picking" (1...
   Answer:  The primary difference between "Paper Picking" (115) and "Regular Maintenance" (120) lies in their descriptions and frequencies. "Paper Picking" involves the manual removal of litter and small debris...
   Scores: Faithfulness=1.00, Relevancy=0.00
   Sources used: 2

2. Query: How often is the Be