In [1]:
!pip3 install python-dotenv huggingface-hub llama-index transformers sentence-transformers llama-index-llms-huggingface llama-index-embeddings-huggingface llama-index-llms-openrouter llama-index-retrievers-bm25 tabula-py  jpype1 pystemmer

Collecting llama-index
  Downloading llama_index-0.14.7-py3-none-any.whl.metadata (13 kB)
Collecting llama-index-llms-huggingface
  Downloading llama_index_llms_huggingface-0.6.1-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.6.1-py3-none-any.whl.metadata (458 bytes)
Collecting llama-index-llms-openrouter
  Downloading llama_index_llms_openrouter-0.4.2-py3-none-any.whl.metadata (2.3 kB)
Collecting llama-index-retrievers-bm25
  Downloading llama_index_retrievers_bm25-0.6.5-py3-none-any.whl.metadata (446 bytes)
Collecting tabula-py
  Downloading tabula_py-2.10.0-py3-none-any.whl.metadata (7.6 kB)
Collecting jpype1
  Downloading jpype1-1.6.0-cp313-cp313-macosx_10_13_universal2.whl.metadata (5.0 kB)
Collecting pystemmer
  Downloading pystemmer-3.0.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (4.0 kB)
Collecting llama-index-cli<0.6,>=0.5.0 (from llama-index)
  Downloading llama_index_cli-0.5.3-py3-none-any.

In [1]:
# =========================================================
# RAG Evaluation Notebook
# =========================================================
# Evaluates RAGIndex retrieval performance using auto-generated question-context pairs
# Metrics: MRR, Hit Rate, Precision, Recall, Relevance, Faithfulness
# =========================================================

from llama_index.core import (Document, Settings)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openrouter import OpenRouter
from dotenv import load_dotenv
import os, sys
import nest_asyncio

# Put the notebook's parent directory on sys.path (presumably where the
# project-local `rag` module lives -- confirm). Guarded so that re-running
# this cell does not keep appending duplicate entries.
parent_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
if parent_path not in sys.path:
    sys.path.append(parent_path)

# Load variables from a local .env file, then read the OpenRouter key.
load_dotenv()
api_key = os.getenv("OPENROUTER_API_KEY")
if api_key:
    # Never echo any part of the key: cell outputs are saved with the
    # notebook and end up in version control / shared copies.
    print("✅ API Key Loaded Successfully")
else:
    print("⚠️ API Key is missing! Check your .env file.")


# Initialize OpenRouter LLMs
# `llm` is the default model and is the one used to generate the evaluation questions.
llm = OpenRouter(api_key=api_key, model="mistralai/mistral-7b-instruct", max_tokens=512, context_window=4096)
# `Judge_llm` is a second model; presumably the judge for the relevance /
# faithfulness metrics -- it is not used in the cells visible here.
Judge_llm = OpenRouter(api_key=api_key, model="qwen/qwen-turbo", max_tokens=512, context_window=4096)
Settings.llm = llm

# Apply nest_asyncio to fix event loop issues in Jupyter
nest_asyncio.apply()

# Set up embedding model
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
Settings.embed_model = embed_model

  from .autonotebook import tqdm as notebook_tqdm


✅ API Key Loaded Successfully: sk-or...94d80


In [2]:
# ---------------------------------------------------------
# Step 1: Prepare the documents
# ---------------------------------------------------------
from pathlib import Path
from rag import RAGIndex

# Resolve the data folders relative to the directory above the notebook.
project_root = Path(os.getcwd()).parent
parent_dir = str(project_root)
rag_doc_dir = project_root.joinpath("data", "rag_docs")
faiss_dir = project_root.joinpath("data", "faiss_index")

RAG = RAGIndex(str(rag_doc_dir), str(faiss_dir))

# Pick the collection of stored chunks that matches the active RAG mode.
# These chunks are the basis for generating the test questions.
if RAG.mode == "faiss" and RAG.vs is not None:
    # Every chunk held in the FAISS docstore (read through its private `_dict`).
    source_chunks = RAG.vs.docstore._dict.values()
elif RAG.mode == "bm25" and RAG.retriever is not None:
    source_chunks = RAG.retriever.docstore
else:
    # Neither backend is available: nothing to load.
    source_chunks = ()

# Re-wrap each chunk as a LlamaIndex `Document`, keeping its text and metadata.
docs = [
    Document(text=chunk.page_content, metadata=chunk.metadata)
    for chunk in source_chunks
]
print(f"Loaded {len(docs)} chunks from RAG index.")

[RAG] Total raw documents loaded: 0
[RAG] No documents found in /Users/Yibing/Desktop/01 NEU/CS7980/capstone_mvp/experiment/data/rag_docs
[RAG] Loaded TXT: field_standards.txt (1 docs) [category=field]
[RAG] Loaded TXT: mowing_standard.txt (1 docs) [category=mowing]
[RAG] Total raw documents loaded: 2


  self.emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


[RAG] Loaded existing FAISS index (up-to-date)
Loaded 36 chunks from RAG index.


In [3]:
# ---------------------------------------------------------
# Step 2: Prepare nodes from RAG's ORIGINAL chunks
# ---------------------------------------------------------
from llama_index.core.schema import TextNode

# Wrap each loaded `Document` in a `TextNode` without re-splitting it, so the
# evaluation corpus consists of exactly the chunks loaded from the RAG index.
# Node IDs are left for LlamaIndex to generate.
nodes = [TextNode(text=doc.text, metadata=doc.metadata) for doc in docs]

print(f"✅ Created {len(nodes)} nodes (preserving RAG's original 900-char chunks)")
if nodes:
    print(f"   First node length: {len(nodes[0].get_content())} chars")
else:
    # Without this guard an empty index surfaces as an opaque IndexError on nodes[0].
    print("⚠️ No nodes were created - check that the RAG index loaded any chunks.")

✅ Created 36 nodes (preserving RAG's original 900-char chunks)
   First node length: 839 chars


In [4]:
# ---------------------------------------------------------
# Step 3: Auto-generate question–context pairs
# ---------------------------------------------------------
from llama_index.core.evaluation import generate_question_context_pairs

# Generate one question per node with the default LLM (`llm`, the OpenRouter
# mistral-7b-instruct model). Raise num_questions_per_chunk (1-3) for a larger
# dataset at the cost of speed.
qa_dataset = generate_question_context_pairs(
    nodes=nodes,
    llm=llm,
    num_questions_per_chunk=1,
)

qa_data_dict = qa_dataset.model_dump()
corpus_texts = list(qa_data_dict["corpus"].values())

print(f"✅ Generated {len(qa_dataset.queries)} question-context pairs")
print(f"   Corpus size: {len(corpus_texts)} chunks")
print(f"   First corpus text length: {len(corpus_texts[0])} chars")

# Spot-check: compare the first corpus entry with the first RAG chunk.
print("\n=== Verification ===")
sample_corpus_text = corpus_texts[0]
sample_rag_text = docs[0].text.strip()

is_exact_match = sample_corpus_text.strip() == sample_rag_text
is_partial_match = (not is_exact_match) and (
    sample_corpus_text in sample_rag_text or sample_rag_text in sample_corpus_text
)

if is_exact_match:
    print("✅ Corpus text MATCHES RAG chunks perfectly!")
elif is_partial_match:
    print("⚠️  Corpus text partially matches RAG chunks")
    print(f"   Corpus length: {len(sample_corpus_text)}")
    print(f"   RAG length: {len(sample_rag_text)}")
else:
    print("❌ Corpus text DOES NOT match RAG chunks")
    print(f"   Corpus preview: {sample_corpus_text[:100]}")
    print(f"   RAG preview: {sample_rag_text[:100]}")

100%|██████████| 36/36 [00:43<00:00,  1.20s/it]

✅ Generated 36 question-context pairs
   Corpus size: 36 chunks
   First corpus text length: 839 chars

=== Verification ===
✅ Corpus text MATCHES RAG chunks perfectly!





In [5]:
# =========================================================
# Step 4: Automatic Evaluation
# =========================================================

import pandas as pd
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import NodeWithScore, TextNode, QueryBundle
from llama_index.core.evaluation import RetrieverEvaluator
from typing import List

# ===================================================================
# Wrapper: Makes YOUR RAG.retrieve() compatible with LlamaIndex
# ===================================================================

class DirectRAGRetriever(BaseRetriever):
    """Adapter that exposes the project RAG object through LlamaIndex's retriever API.

    Each hit returned by ``rag_obj.retrieve`` is mapped back to the ID of the
    corpus entry with the same (whitespace-stripped) text, so that retrieved
    node IDs can be compared against the expected IDs of the QA dataset.

    Args:
        rag_obj: Object exposing ``retrieve(query_str, k=...)`` that returns an
            iterable of dict-like hits; the ``"text"`` and ``"score"`` keys are read.
        corpus_text_to_id: Mapping from corpus text to corpus/node ID.
        top_k: Number of hits requested from ``rag_obj`` per query.
    """

    def __init__(self, rag_obj, corpus_text_to_id, top_k=4):
        self._rag = rag_obj
        # Hit text is stripped before lookup, so the keys must be stripped too;
        # otherwise a corpus entry with leading/trailing whitespace could never
        # match and its hits would be dropped silently.
        self._text_to_id = {
            text.strip(): corpus_id for text, corpus_id in corpus_text_to_id.items()
        }
        self._top_k = top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Run the wrapped RAG retrieval and return the hits as scored nodes.

        Hits whose text cannot be mapped to a corpus ID are skipped, so the
        returned list may be shorter than ``top_k``.
        """
        hits = self._rag.retrieve(query_bundle.query_str, k=self._top_k)

        results: List[NodeWithScore] = []
        for hit in hits:
            # `or ""` also covers a hit whose "text" is present but None.
            text = (hit.get("text") or "").strip()
            corpus_id = self._text_to_id.get(text)
            if not corpus_id:
                # No ID mapping for this text: it cannot be scored against the dataset.
                continue

            # The whole hit dict is kept as the node's metadata.
            node = TextNode(text=text, id_=corpus_id, metadata=hit)
            results.append(NodeWithScore(node=node, score=hit.get("score", 0.0)))

        return results

# Build ID mapping
corpus_text_to_id = {text: cid for cid, text in qa_data_dict["corpus"].items()}

# Create retriever
retriever = DirectRAGRetriever(RAG, corpus_text_to_id, top_k=4)

print(f"✅ Created automatic retriever for {RAG.mode.upper()} system\n")
print("="*70)
print("RETRIEVAL METRICS (Automatic Evaluation)")
print("="*70)

evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate", "precision", "recall"],
    retriever=retriever
)

eval_results = await evaluator.aevaluate_dataset(qa_dataset)
retrieval_df = pd.DataFrame([r.metric_vals_dict for r in eval_results if r.metric_vals_dict])

print(f"\n{'Metric':<15} {'Score':<10}")
print("-" * 25)
for metric in ["mrr", "hit_rate", "precision", "recall"]:
    print(f"{metric.upper():<15} {retrieval_df[metric].mean():.4f}")

print(f"\nEvaluated {len(retrieval_df)}/{len(qa_dataset.queries)} queries")

# # Save
# retrieval_df.to_csv("your_rag_retrieval_auto.csv", index=False)
# print("✅ Saved to 'your_rag_retrieval_auto.csv'")

✅ Created automatic retriever for FAISS system

RETRIEVAL METRICS (Automatic Evaluation)

Metric          Score     
-------------------------
MRR             0.8426
HIT_RATE        0.9167
PRECISION       0.2292
RECALL          0.9167

Evaluated 36/36 queries
