# End-to-End RAG Pipeline - Evaluation

Full RAG pipeline: PDF ingestion → Improved chunking → Embedding → ChromaDB → Retrieval → LLM answer

In [1]:
import sys
import os
from pathlib import Path
import json

# Make the project root the working directory so config.yaml and the local
# packages imported below can be resolved.
# The original unconditionally moved one level up, so re-running this cell (or
# starting the kernel at the root) climbed out of the project. Only step up
# when config.yaml is not already visible from the current directory.
# NOTE(review): assumes config.yaml sits in the project root - confirm.
parent_dir = Path.cwd()
if not (parent_dir / "config.yaml").exists():
    parent_dir = parent_dir.parent
os.chdir(parent_dir)
# Avoid stacking a duplicate sys.path entry on every execution of the cell
if str(parent_dir) not in sys.path:
    sys.path.insert(0, str(parent_dir))

from pdfProcessing.docling_PDF_processor import DoclingPDFProcessor
from pdfProcessing.chunking import create_chunks_from_sections
from embeddingModels.ModernBertEmbedder import ModernBertEmbedder
from embeddingModels.QwenEmbedder import QwenEmbedder
from backend.services.embedder import EmbeddingService
from backend.services.vector_db import VectorDBService
from backend.services.rag_answer_service import ChromaRagRetriever
from llmAG.rag.pipeline import RagPipeline
from llmAG.llm import build_llm
from zotero_integration.metadata_loader import ZoteroMetadataLoader

import pandas as pd
import numpy as np

print(f"Working directory: {os.getcwd()}")

Working directory: c:\Users\kronask\OneDrive - TU Wien\TU Wien\3. Semester\GenAI\GenAI


## 1. Initialize Services

In [2]:
# --- Tunable settings ---
EMBEDDER_TYPE = "bert"  # either "bert" or "qwen"
CHROMA_PATH = "./backend/chroma_db"  # same database the backend uses
MAX_CHUNK_SIZE = 2500
OVERLAP_SIZE = 200
TOP_K_RETRIEVAL = 5

# Flip to True to clear the DB and re-ingest all PDFs
CLEAR_DB_ON_RUN = False

# Point at the local Ollama server (this notebook runs outside Docker)
os.environ["OLLAMA_BASE_URL"] = "http://localhost:11434"

# --- Zotero metadata (optional; None means "not available") ---
print("Initializing Zotero metadata loader...")
try:
    zotero_loader = ZoteroMetadataLoader()
except Exception as exc:
    zotero_loader = None
    print(f"⚠ Zotero metadata not available: {exc}")
    print("  Will fall back to Docling extraction")
else:
    print("✓ Zotero metadata loaded")

# --- PDF processor ---
print("Initializing PDF processor...")
processor = DoclingPDFProcessor()

# --- Embeddings ---
print("Initializing embedding service...")
embed_service = EmbeddingService()
# Keep the object returned by load_model for direct, manual embedding calls
embedder = embed_service.load_model(EMBEDDER_TYPE)

# --- Vector store: one collection name per embedder key ---
print("Initializing ChromaDB...")
db_service = VectorDBService(
    db_path=CHROMA_PATH,
    collection_names={
        model_key: f"scientific_papers_{model_key}" for model_key in ("bert", "qwen")
    },
)

# --- LLM (None signals that initialisation failed) ---
print("Initializing LLM (Ollama mistral-nemo)...")
try:
    llm = build_llm(model="mistral-nemo", temperature=0.1)
except Exception as exc:
    llm = None
    print(f"✗ LLM initialization failed: {exc}")
    print("  Make sure Ollama app is running (check system tray)")
else:
    print("✓ LLM initialized")

Initializing Zotero metadata loader...
Loaded 24 items from zotero_export_20260112_191851.json
✓ Zotero metadata loaded
Initializing PDF processor...
Initializing Docling Converter...
CUDA detected. Using GPU for PDF Processing.
Initializing embedding service...
Loading Model Key: bert...
Loading Alibaba-NLP/gte-modernbert-base on cuda...


2026-01-17 19:59:01,098 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Initializing ChromaDB...
Initializing LLM (Ollama mistral-nemo)...
✓ LLM initialized


## 2. Database Status

In [3]:
# Report how many chunks the collection for the active embedder holds
print("=" * 80)
print("DATABASE STATUS")
print("=" * 80)

try:
    collection = db_service.get_collection(EMBEDDER_TYPE)
    chunk_count = collection.count()

    print(f"Current database status (model: {EMBEDDER_TYPE})")
    print(f"  Chunks in database: {chunk_count}")

    # An empty collection means there is nothing to evaluate against yet
    print(
        "  ⚠ Database is empty - run ingestion first"
        if chunk_count == 0
        else "  ✓ Database ready for evaluation"
    )

    print("\n" + "=" * 80)
except Exception as exc:
    print(f"Error checking database: {exc}")

DATABASE STATUS
Current database status (model: bert)
  Chunks in database: 538
  ✓ Database ready for evaluation



## 3. Load Evaluation Dataset

In [4]:
def load_eval_dataset(filename="eval_dataset.json"):
    """Read the evaluation questions from a JSON file in the working directory.

    Args:
        filename: Name of the JSON file, resolved against ``Path.cwd()``.

    Returns:
        The decoded JSON content, or an empty list (after printing a warning)
        when the file does not exist.
    """
    dataset_path = Path.cwd().joinpath(filename)

    if dataset_path.exists():
        questions = json.loads(dataset_path.read_text(encoding="utf-8"))
        print(f"✓ Loaded {len(questions)} questions from {filename}")
        return questions

    # Missing file is reported, not raised, so the notebook keeps running
    print(f"⚠ Warning: {filename} not found in {Path.cwd()}")
    return []

# Load the evaluation questions from eval_dataset.json in the working directory.
# load_eval_dataset returns an empty list (and prints a warning) if the file is missing.
eval_dataset = load_eval_dataset()

✓ Loaded 32 questions from eval_dataset.json


## 4. Initialize RAG Pipeline

In [5]:
# Wire the retriever to the embedding service and vector DB for the active embedder
retriever = ChromaRagRetriever(
    model_name=EMBEDDER_TYPE,
    db_service=db_service,
    embed_service=embed_service,
)

# The pipeline is given a model name and temperature and builds its LLM internally
rag_pipeline = RagPipeline(temperature=0.1, model="mistral-nemo", retriever=retriever)
print("✓ RAG pipeline initialized")

✓ RAG pipeline initialized


## 5. RAG Evaluation

In [None]:
import pandas as pd
import time
from tqdm import tqdm

class RAGEvaluator:
    """Run a RAG pipeline over an evaluation set and score retrieval by title match.

    For every dataset item the pipeline is queried, the titles of the returned
    sources are collected, and the question counts as a retrieval "hit" when any
    title contains one of the search terms associated with the item's
    ``target_tag``. Items without a ``target_tag`` get ``Hit = None`` so they
    can be excluded from hit-rate statistics.
    """

    # Lower-case title fragments accepted as a hit for each dataset `target_tag`.
    # Tags missing from this table fall back to the lower-cased tag itself.
    # NOTE(review): matching is a plain substring test, so short terms such as
    # "fast" also match unrelated titles containing those letters - confirm
    # this is acceptable for the reported hit rate.
    TAG_SEARCH_TERMS = {
        "FAST": ["fast", "autonomous high-resolution scanning"],
        "liquid lenses": ["liquid lenses", "zhang"],
        "autofocus": ["autofocus", "zhang", "rebuffi"],
        "ptychography": ["ptychography", "schloz"],
        "alignment": ["alignment", "morris", "beamlines"],
        "optics": ["adaptive optics", "nousiainen", "mareev"]
    }

    def __init__(self, pipeline):
        """Store the pipeline; it must expose ``run(question, k=..., include_sources=True)``."""
        self.pipeline = pipeline
        self.results = []

    def _is_hit(self, target_tag, retrieved_titles):
        """Return True/False for a targeted question, or None when there is no tag."""
        if not target_tag:
            return None  # No target tag defined (Synthesis questions)
        search_terms = self.TAG_SEARCH_TERMS.get(target_tag, [target_tag.lower()])
        return any(term in title for title in retrieved_titles for term in search_terms)

    def evaluate(self, dataset, top_k=5, show_progress=True):
        """Query the pipeline for every item and collect one result row per question.

        Args:
            dataset: Sequence of dicts with a ``question`` key and optional
                ``target_tag`` and ``tier`` keys.
            top_k: Number of sources requested from the pipeline (passed as ``k``).
            show_progress: Wrap the loop in a tqdm progress bar when True.

        Returns:
            pandas.DataFrame with the columns Tier, Question, Target_Tag, Hit,
            Answer, Sources and Latency (seconds, rounded to 2 decimals).
            Questions whose pipeline call raised are kept as rows whose Answer
            starts with ``ERROR:``.
        """
        print(f"Starting evaluation of {len(dataset)} questions...")
        self.results = []

        # `tqdm` comes from the notebook's import cell; only needed when requested
        items = tqdm(dataset) if show_progress else dataset

        for item in items:
            question = item['question']
            target_tag = item.get('target_tag')
            tier = item.get('tier')

            start_time = time.perf_counter()
            try:
                # Run RAG Pipeline
                response = self.pipeline.run(question, k=top_k, include_sources=True)
                elapsed = time.perf_counter() - start_time

                # Retrieval evaluation: compare lower-cased source titles with the
                # tag's search terms. A missing or None title counts as an empty
                # title instead of aborting the whole question.
                retrieved_titles = [
                    str((src.metadata or {}).get('title') or '').lower()
                    for src in (response.sources or [])
                ]

                self.results.append({
                    "Tier": tier,
                    "Question": question,
                    "Target_Tag": target_tag,
                    "Hit": self._is_hit(target_tag, retrieved_titles),
                    "Answer": response.answer,
                    "Sources": " | ".join([t[:50] + "..." for t in retrieved_titles]),
                    "Latency": round(elapsed, 2)
                })

            except Exception as e:
                elapsed = time.perf_counter() - start_time
                print(f"Error on question: {question[:30]}... {e}")
                self.results.append({
                    "Tier": tier,
                    "Question": question,
                    "Target_Tag": target_tag,
                    # A failed targeted question is a miss; an untargeted one stays
                    # unmeasurable (None) so it does not distort the hit rate.
                    "Hit": False if target_tag else None,
                    "Answer": f"ERROR: {str(e)}",
                    "Sources": "",
                    # Record the time actually spent rather than 0, which would
                    # pull the average latency down.
                    "Latency": round(elapsed, 2)
                })

        return pd.DataFrame(self.results)

# Initialize and run, using the retrieval depth defined in the configuration cell
evaluator = RAGEvaluator(rag_pipeline)
df_results = evaluator.evaluate(eval_dataset, top_k=TOP_K_RETRIEVAL)

# Display Summary
print("\n=== Evaluation Summary ===")
if df_results.empty:
    # An empty dataset produces a frame without columns, so indexing
    # 'Hit' or 'Latency' would raise KeyError.
    print("No questions were evaluated - check that the evaluation dataset was loaded.")
else:
    # Filter out synthesis questions (Hit=None) for accuracy calc
    measurable = df_results.dropna(subset=['Hit'])
    if measurable.empty:
        print("Retrieval Hit Rate (Targeted Questions): n/a (no targeted questions)")
    else:
        hit_rate = measurable['Hit'].astype(bool).mean()
        print(f"Retrieval Hit Rate (Targeted Questions): {hit_rate:.2%}")

    print(f"Average Latency: {df_results['Latency'].mean():.2f}s")

# Last expression of the cell: rendered as the cell's rich output
df_results.head()

Starting evaluation of 32 questions...


  0%|          | 0/32 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
2026-01-17 19:59:19,848 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
  3%|▎         | 1/32 [00:15<08:02, 15.56s/it]2026-01-17 19:59:29,349 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
  6%|▋         | 2/32 [00:32<08:07, 16.26s/it]2026-01-17 19:59:46,789 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
  9%|▉         | 3/32 [00:40<05:58, 12.36s/it]2026-01-17 19:59:53,247 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
 12%|█▎        | 4/32 [00:46<04:41, 10.06s/it]2026-01-17 20:00:01,427 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
 16%|█▌        | 5/32 [00:55<04:23,  9.77s/it]2026-01-17 20:00:10,922 - INFO - HTTP Request: POST http://localhost:11434/api/ch


=== Evaluation Summary ===
Retrieval Hit Rate (Targeted Questions): 74.07%
Average Latency: 28.24s





Unnamed: 0,Tier,Question,Target_Tag,Hit,Answer,Sources,Latency
0,1,"What does the acronym ERD stand for, and what ...",FAST,False,I do not know based on the provided context be...,a general bayesian algorithm for the autonomou...,15.56
1,1,What is the size of the test dark-field image ...,FAST,False,Based on the provided context:\n\n- The size o...,autofocus: ai-driven alignment of nanofocusing...,16.76
2,1,Name the static sampling baselines used for co...,FAST,False,The static sampling baselines used for compari...,a general bayesian algorithm for the autonomou...,7.7
3,1,What is the initial scan coverage (%) used bef...,FAST,False,I do not know based on the provided context be...,a general bayesian algorithm for the autonomou...,6.54
4,1,How many new points are acquired per iteration...,FAST,True,The context does not mention how many new poin...,deep reinforcement learning for data-driven ad...,9.25


## 6. Save and Analyze Results

In [None]:
# Persist the complete result table; the relative name resolves against the working directory
output_filename = "rag_evaluation_results.csv"
df_results.to_csv(output_filename, index=False)
print(f"Results saved to {output_filename}")

# List the targeted questions whose retrieved titles did not match, to debug retrieval
print("\n=== Missed Retrieval Questions ===")
missed = df_results.loc[df_results['Hit'].eq(False) & df_results['Target_Tag'].notna()]
if missed.empty:
    print("Great! No retrieval misses on targeted questions.")
else:
    for row in missed.to_dict("records"):
        print(f"Q: {row['Question']}")
        print(f"Target: {row['Target_Tag']}")
        print(f"Got Sources: {row['Sources']}\n")

Results saved to rag_evaluation_results.csv

=== Missed Retrieval Questions ===
Q: What does the acronym ERD stand for, and what role does it play in the sampling pipeline?
Target: FAST
Got Sources: a general bayesian algorithm for the autonomous al... | performance metrics to unleash the power of self-d... | self-driving laboratories for chemistry and materi... | performance metrics to unleash the power of self-d... | self-driving laboratories for chemistry and materi...

Q: What is the size of the test dark-field image (in pixels), and how many candidate measurement positions does that imply?
Target: FAST
Got Sources: autofocus: ai-driven alignment of nanofocusing x-r... | a general bayesian algorithm for the autonomous al... | a general bayesian algorithm for the autonomous al... | laboratory experiments of model-based reinforcemen... | laboratory experiments of model-based reinforcemen...

Q: Name the static sampling baselines used for comparison.
Target: FAST
Got Sources: a genera

## 7. Detailed Evaluation by Tier

In [8]:
# Hit rate and latency, reported separately for each tier
print("\n" + "=" * 80)
print("EVALUATION BREAKDOWN BY TIER")
print("=" * 80 + "\n")

for tier in sorted(df_results['Tier'].unique()):
    tier_data = df_results.loc[df_results['Tier'].eq(tier)]
    print(f"\nTier {tier}:")
    print(f"  Total Questions: {len(tier_data)}")

    # Only questions that carry a target tag have a measurable hit/miss
    with_tags = tier_data.dropna(subset=['Target_Tag'])
    if not with_tags.empty:
        hit_count = int(with_tags['Hit'].sum())
        hit_rate = with_tags['Hit'].mean()
        print(f"  Retrieval Hit Rate: {hit_rate:.2%} ({hit_count}/{len(with_tags)})")

    print(f"  Avg Latency: {tier_data['Latency'].mean():.2f}s")


EVALUATION BREAKDOWN BY TIER


Tier 1:
  Total Questions: 11
  Retrieval Hit Rate: 63.64% (7/11)
  Avg Latency: 15.33s

Tier 2:
  Total Questions: 12
  Retrieval Hit Rate: 75.00% (9/12)
  Avg Latency: 26.88s

Tier 3:
  Total Questions: 9
  Retrieval Hit Rate: 100.00% (4/4)
  Avg Latency: 45.83s


## 8. Question-Level Analysis

In [9]:
# Print one line per question with the columns needed for a quick scan
display_cols = ['Tier', 'Target_Tag', 'Question', 'Hit', 'Latency']
print("\n" + "=" * 80)
print("ALL EVALUATION RESULTS")
print("=" * 80 + "\n")

# Lift the pandas display limits; set_option is global, so these settings
# stay in effect for every later cell in this kernel session.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.width', None),
    ('display.max_colwidth', 80),
):
    pd.set_option(option_name, option_value)

print(df_results[display_cols].to_string())


ALL EVALUATION RESULTS

    Tier     Target_Tag                                                                                                                                                        Question    Hit  Latency
0      1           FAST                                                                       What does the acronym ERD stand for, and what role does it play in the sampling pipeline?  False    15.56
1      1           FAST                                        What is the size of the test dark-field image (in pixels), and how many candidate measurement positions does that imply?  False    16.76
2      1           FAST                                                                                                         Name the static sampling baselines used for comparison.  False     7.70
3      1           FAST                                                                                    What is the initial scan coverage (%) used before adaptive selection