# End-to-End RAG Pipeline - Evaluation

Full RAG pipeline: PDF ingestion → Improved chunking → Embedding → ChromaDB → Retrieval → LLM answer

In [1]:
import sys
import os
from pathlib import Path
import json
import time
from tqdm import tqdm

# Change to the parent directory so config.yaml and the project packages resolve.
# Guarded on the kernel namespace: `parent_dir` only exists after the first run of
# this cell, so re-running it no longer climbs one directory further each time.
if "parent_dir" not in globals():
    parent_dir = Path.cwd().parent
    os.chdir(parent_dir)
# Avoid stacking duplicate sys.path entries when the cell is executed again.
if str(parent_dir) not in sys.path:
    sys.path.insert(0, str(parent_dir))

from pdfProcessing.docling_PDF_processor import DoclingPDFProcessor
from pdfProcessing.chunking import create_chunks_from_sections
from embeddingModels.ModernBertEmbedder import ModernBertEmbedder
from embeddingModels.QwenEmbedder import QwenEmbedder
from backend.services.embedder import EmbeddingService
from backend.services.vector_db import VectorDBService
from backend.services.rag_answer_service import ChromaRagRetriever
from llmAG.rag.pipeline import RagPipeline
from llmAG.llm import build_llm
from zotero_integration.metadata_loader import ZoteroMetadataLoader

import pandas as pd
import numpy as np

print(f"Working directory: {os.getcwd()}")

Working directory: c:\Users\kronask\OneDrive - TU Wien\TU Wien\3. Semester\GenAI\GenAI


## 1. Initialize Services

In [2]:
# Configuration
# EMBEDDER_TYPE selects the embedding model and must be one of the keys of the
# collection_names mapping passed to VectorDBService below.
EMBEDDER_TYPE = "bert"  # "bert" or "qwen"
# Relative path — presumably resolved against the current working directory; confirm
# against VectorDBService.
CHROMA_PATH = "./backend/chroma_db"  # Use same DB as backend
# NOTE(review): MAX_CHUNK_SIZE, OVERLAP_SIZE and CLEAR_DB_ON_RUN are not read anywhere in
# the visible part of this notebook — presumably consumed by an ingestion step; confirm.
MAX_CHUNK_SIZE = 2500
OVERLAP_SIZE = 200
TOP_K_RETRIEVAL = 5

# Database Management
CLEAR_DB_ON_RUN = False  # Set to True to clear DB and re-ingest all PDFs

# Set Ollama URL for local execution (not Docker)
# This assignment unconditionally overwrites any OLLAMA_BASE_URL already set in the environment.
os.environ["OLLAMA_BASE_URL"] = "http://localhost:11434"

# Initialize Zotero metadata loader
# Best effort: on any failure zotero_loader is set to None and execution continues.
print("Initializing Zotero metadata loader...")
try:
    zotero_loader = ZoteroMetadataLoader()
    print(f"✓ Zotero metadata loaded")
except Exception as e:
    print(f"⚠ Zotero metadata not available: {e}")
    print("  Will fall back to Docling extraction")
    zotero_loader = None

# Initialize PDF processor
print("Initializing PDF processor...")
processor = DoclingPDFProcessor()

# Initialize embedding service
print("Initializing embedding service...")
embed_service = EmbeddingService()
# Load the model to have direct access to embedder for manual operations
embedder = embed_service.load_model(EMBEDDER_TYPE)

# Initialize ChromaDB service
# One collection name is registered per supported embedder key.
print("Initializing ChromaDB...")
db_service = VectorDBService(
    db_path=CHROMA_PATH,
    collection_names={
        "bert": "scientific_papers_bert",
        "qwen": "scientific_papers_qwen"
    }
)

# Initialize LLM
# Best effort: on failure llm is set to None so the remaining cells can still be run.
# NOTE(review): the RAG pipeline cell states that it builds its LLM internally, so this
# `llm` object may only act as a connectivity check here — confirm before relying on it.
print("Initializing LLM (Ollama mistral-nemo)...")
try:
    llm = build_llm(model="mistral-nemo", temperature=0.1)
    print("✓ LLM initialized")
except Exception as e:
    print(f"✗ LLM initialization failed: {e}")
    print("  Make sure Ollama app is running (check system tray)")
    llm = None

Initializing Zotero metadata loader...
Loaded 24 items from zotero_export_20260112_191851.json
✓ Zotero metadata loaded
Initializing PDF processor...
Initializing Docling Converter...
CUDA detected. Using GPU for PDF Processing.
Initializing embedding service...
Loading Model Key: bert...
Loading Alibaba-NLP/gte-modernbert-base on cuda...


2026-01-18 19:06:17,240 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Initializing ChromaDB...
Initializing LLM (Ollama mistral-nemo)...
✓ LLM initialized


## 2. Database Status

In [3]:
# Report how many chunks the collection of the active embedder currently holds
print("=" * 80)
print("DATABASE STATUS")
print("=" * 80)

try:
    collection = db_service.get_collection(EMBEDDER_TYPE)
    chunk_count = collection.count()

    # Pick the readiness line up front; an empty collection means nothing was ingested yet
    status_line = (
        "  ✓ Database ready for evaluation"
        if chunk_count != 0
        else "  ⚠ Database is empty - run ingestion first"
    )

    print(f"Current database status (model: {EMBEDDER_TYPE})")
    print(f"  Chunks in database: {chunk_count}")
    print(status_line)

    print("\n" + "=" * 80)
except Exception as exc:
    # Any failure while querying the collection is reported instead of stopping the notebook
    print(f"Error checking database: {exc}")

DATABASE STATUS
Current database status (model: bert)
  Chunks in database: 538
  ✓ Database ready for evaluation



## 3. Load Evaluation Dataset

In [4]:
def load_eval_dataset(filename="eval_dataset.json"):
    """Read the evaluation questions from a JSON file in the current working directory.

    Args:
        filename: Name of the JSON file, resolved against ``Path.cwd()``.

    Returns:
        The parsed JSON content, or an empty list (after printing a warning)
        when the file does not exist.
    """
    working_dir = Path.cwd()
    dataset_path = working_dir / filename

    if dataset_path.exists():
        questions = json.loads(dataset_path.read_text(encoding="utf-8"))
        print(f"✓ Loaded {len(questions)} questions from {filename}")
        return questions

    # Missing file is not fatal: warn and hand back an empty dataset
    print(f"⚠ Warning: {filename} not found in {working_dir}")
    return []

# Load the data
# eval_dataset is an empty list when eval_dataset.json is missing from the working directory.
eval_dataset = load_eval_dataset()

✓ Loaded 16 questions from eval_dataset.json


## 4. Initialize RAG Pipeline

In [5]:
# Initialize RAG components
# The retriever reuses the embedding service and vector DB service created during
# service initialization; model_name is the same EMBEDDER_TYPE key that was loaded there.
retriever = ChromaRagRetriever(
    embed_service=embed_service,
    db_service=db_service,
    model_name=EMBEDDER_TYPE
)

# Initialize RAG pipeline (builds LLM internally)
# NOTE(review): model and temperature repeat the literals passed to build_llm() in the
# service initialization cell — keep the two in sync if either one changes.
rag_pipeline = RagPipeline(
    retriever=retriever,
    model="mistral-nemo",
    temperature=0.1
)
print("✓ RAG pipeline initialized")

✓ RAG pipeline initialized


## 5. RAG Evaluation

In [6]:
# import pandas as pd
# import time
# from tqdm import tqdm

# class RAGEvaluator:
#     def __init__(self, pipeline):
#         self.pipeline = pipeline
#         self.results = []

#     def evaluate(self, dataset, top_k=5):
#         print(f"Starting evaluation of {len(dataset)} questions...")
#         self.results = []
        
#         for item in tqdm(dataset):
#             question = item['question']
#             target_tag = item.get('target_tag')
#             tier = item.get('tier')
            
#             start_time = time.time()
#             try:
#                 # Run RAG Pipeline
#                 response = self.pipeline.run(question, k=top_k, include_sources=True)
#                 elapsed = time.time() - start_time
                
#                 # 1. Retrieval Evaluation (Source Matching)
#                 # Check if ANY of the retrieved docs contain the target tag in their title
#                 retrieved_titles = [src.metadata.get('title', '').lower() for src in response.sources]
                
#                 hit = False
#                 if target_tag:
#                     tag_map = {
#                         "FAST": ["fast", "autonomous high-resolution scanning"],
#                         "liquid lenses": ["liquid lenses", "zhang"],
#                         "autofocus": ["autofocus", "zhang", "rebuffi"],
#                         "ptychography": ["ptychography", "schloz"],
#                         "alignment": ["alignment", "morris", "beamlines"],
#                         "optics": ["adaptive optics", "nousiainen", "mareev"]
#                     }
                    
#                     search_terms = tag_map.get(target_tag, [target_tag.lower()])
                    
#                     # Check for hit
#                     for title in retrieved_titles:
#                         if any(term in title for term in search_terms):
#                             hit = True
#                             break
#                 else:
#                     hit = None # No target tag defined (Synthesis questions)

#                 # Store Result
#                 self.results.append({
#                     "Tier": tier,
#                     "Question": question,
#                     "Target_Tag": target_tag,
#                     "Hit": hit,
#                     "Answer": response.answer,
#                     "Sources": " | ".join([t[:50] + "..." for t in retrieved_titles]),
#                     "Latency": round(elapsed, 2)
#                 })
                
#             except Exception as e:
#                 print(f"Error on question: {question[:30]}... {e}")
#                 self.results.append({
#                     "Tier": tier,
#                     "Question": question,
#                     "Target_Tag": target_tag,
#                     "Hit": False,
#                     "Answer": f"ERROR: {str(e)}",
#                     "Sources": "",
#                     "Latency": 0
#                 })

#         return pd.DataFrame(self.results)

# # Initialize and Run
# evaluator = RAGEvaluator(rag_pipeline)
# df_results = evaluator.evaluate(eval_dataset, top_k=5)

# # Display Summary
# print("\n=== Evaluation Summary ===")
# if 'Hit' in df_results.columns:
#     # Filter out synthesis questions (Hit=None) for accuracy calc
#     measurable = df_results.dropna(subset=['Hit'])
#     print(f"Retrieval Hit Rate (Targeted Questions): {measurable['Hit'].mean():.2%}")

# print(f"Average Latency: {df_results['Latency'].mean():.2f}s")
# df_results.head()

## 6. Save and Analyze Results

In [7]:
# output_filename = "rag_evaluation_results.csv"
# df_results.to_csv(output_filename, index=False)
# print(f"Results saved to {output_filename}")

# # Inspect specifically the "Missed" items to debug retrieval
# print("\n=== Missed Retrieval Questions ===")
# missed = df_results[(df_results['Hit'] == False) & (df_results['Target_Tag'].notna())]
# if not missed.empty:
#     for _, row in missed.iterrows():
#         print(f"Q: {row['Question']}")
#         print(f"Target: {row['Target_Tag']}")
#         print(f"Got Sources: {row['Sources']}\n")
# else:
#     print("Great! No retrieval misses on targeted questions.")

## 7. Detailed Evaluation by Tier

In [8]:
# # Breakdown by tier and target tag
# print(f"\n{'='*80}")
# print("EVALUATION BREAKDOWN BY TIER")
# print(f"{'='*80}\n")

# for tier in sorted(df_results['Tier'].unique()):
#     tier_data = df_results[df_results['Tier'] == tier]
#     print(f"\nTier {tier}:")
#     print(f"  Total Questions: {len(tier_data)}")
    
#     with_tags = tier_data[tier_data['Target_Tag'].notna()]
#     if len(with_tags) > 0:
#         hit_rate = with_tags['Hit'].mean()
#         print(f"  Retrieval Hit Rate: {hit_rate:.2%} ({int(with_tags['Hit'].sum())}/{len(with_tags)})")
    
#     print(f"  Avg Latency: {tier_data['Latency'].mean():.2f}s")

## 8. Question-Level Analysis

In [9]:
# # Show all questions with their results
# display_cols = ['Tier', 'Target_Tag', 'Question', 'Hit', 'Latency']
# print(f"\n{'='*80}")
# print("ALL EVALUATION RESULTS")
# print(f"{'='*80}\n")

# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', 80)

# print(df_results[display_cols].to_string())

## 9. Enhanced Evaluation (Chunk + Multi-Paper Tracking)

In [12]:
# Enhanced evaluator: tracks exact chunks (Tier 1-2) AND multi-paper retrieval (Tier 3)

class EnhancedRAGEvaluator:
    """Run a RAG pipeline over an evaluation dataset and score its retrieval.

    Two retrieval checks are recorded per question:

    * chunk match - whether a retrieved source's ``parent_id`` matches the
      question's ``expected_chunk_id`` (only when the question defines one);
    * multi-paper match - whether at least two distinct papers were retrieved
      (only recorded for questions whose ``tier`` equals 3).

    A check that does not apply to a question is stored as ``None`` - also when
    the pipeline call failed - so it never enters a hit-rate denominator.
    """

    # Columns of the DataFrame returned by evaluate(). Passing them explicitly keeps
    # the schema stable even when the dataset is empty.
    RESULT_COLUMNS = [
        "Tier",
        "Question",
        "Target_Tag",
        "Exact_Chunk_Match",
        "Chunk_Rank",
        "Num_Papers",
        "Multi_Paper_Match",
        "Papers",
        "Latency",
    ]

    def __init__(self, pipeline):
        """Store the pipeline under test.

        Args:
            pipeline: Object exposing ``run(question, k=..., include_sources=True)``
                whose result has a ``sources`` sequence; each source carries a
                dict-like ``metadata`` attribute.
        """
        self.pipeline = pipeline
        self.results = []

    @staticmethod
    def _shorten_question(question, limit=60):
        """Return the question cut to ``limit`` characters plus "..." when it is longer."""
        return question[:limit] + "..." if len(question) > limit else question

    @staticmethod
    def _unique_papers(sources):
        """Return the distinct, non-empty source filenames in retrieval order."""
        filenames = (src.metadata.get('filename') or '' for src in sources)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the result is
        # deterministic; sources without a filename are not counted as a paper.
        return list(dict.fromkeys(name for name in filenames if name))

    @staticmethod
    def _find_chunk_rank(sources, expected_chunk_id):
        """Return the 1-based rank of the first source matching the expected chunk, else None."""
        if not expected_chunk_id:
            return None

        # NOTE(review): besides full equality, a source also matches when the part of the
        # expected id before '#' occurs in its parent_id. Presumably that prefix identifies
        # the document, which would make this a document-level match - confirm the id format.
        id_prefix = expected_chunk_id.split('#')[0]
        for rank, src in enumerate(sources, 1):
            parent_id = src.metadata.get('parent_id') or ''
            # An empty prefix is skipped: '' is a substring of every parent_id.
            if parent_id == expected_chunk_id or (id_prefix and id_prefix in parent_id):
                return rank
        return None

    def evaluate(self, dataset, top_k=5):
        """Run every question through the pipeline and collect retrieval metrics.

        Args:
            dataset: Sequence of dicts with a ``question`` key and optional
                ``target_tag``, ``tier`` and ``expected_chunk_id`` keys.
            top_k: Number of chunks requested from the pipeline per question.

        Returns:
            pandas.DataFrame with one row per question and the columns listed in
            ``RESULT_COLUMNS``; it has those columns even when ``dataset`` is empty.
            A question whose pipeline call raises is reported and recorded as a miss
            instead of aborting the run.
        """
        print(f"Starting enhanced evaluation of {len(dataset)} questions...")
        self.results = []

        for item in tqdm(dataset):
            question = item['question']
            target_tag = item.get('target_tag')
            tier = item.get('tier')
            expected_chunk_id = item.get('expected_chunk_id')  # Ground truth
            short_question = self._shorten_question(question)

            start_time = time.perf_counter()
            try:
                # Run RAG Pipeline
                response = self.pipeline.run(question, k=top_k, include_sources=True)
                elapsed = time.perf_counter() - start_time

                unique_papers = self._unique_papers(response.sources)
                num_unique_papers = len(unique_papers)

                # 1. Chunk match (only meaningful when a ground-truth id is given)
                chunk_rank = self._find_chunk_rank(response.sources, expected_chunk_id)

                # 2. Multi-paper match for Tier 3 (synthesis questions)
                multi_paper_match = num_unique_papers >= 2

                self.results.append({
                    "Tier": tier,
                    "Question": short_question,
                    "Target_Tag": target_tag,
                    "Exact_Chunk_Match": (chunk_rank is not None) if expected_chunk_id else None,
                    "Chunk_Rank": chunk_rank,
                    "Num_Papers": num_unique_papers,
                    "Multi_Paper_Match": multi_paper_match if tier == 3 else None,
                    "Papers": " | ".join([p.split(' - ')[0][:30] for p in unique_papers[:2]]),
                    "Latency": round(elapsed, 2)
                })

            except Exception as e:
                print(f"Error on question: {question[:30]}... {e}")
                self.results.append({
                    "Tier": tier,
                    "Question": short_question,
                    "Target_Tag": target_tag,
                    # A failure is a miss only for the checks that apply to this question.
                    "Exact_Chunk_Match": False if expected_chunk_id else None,
                    "Chunk_Rank": None,
                    "Num_Papers": 0,
                    "Multi_Paper_Match": False if tier == 3 else None,
                    "Papers": f"ERROR: {str(e)[:20]}",
                    # Time spent until the failure, so errors do not pull the mean toward 0.
                    "Latency": round(time.perf_counter() - start_time, 2)
                })

        return pd.DataFrame(self.results, columns=self.RESULT_COLUMNS)

# Run enhanced evaluation
print("\n" + "="*80)
print("ENHANCED EVALUATION (Chunk-level + Multi-paper)")
print("="*80 + "\n")

evaluator_enhanced = EnhancedRAGEvaluator(rag_pipeline)
# Retrieval depth comes from the configuration cell so it is tuned in one place
df_results_enhanced = evaluator_enhanced.evaluate(eval_dataset, top_k=TOP_K_RETRIEVAL)

# Display Summary
print("\n=== EVALUATION SUMMARY ===\n")

if df_results_enhanced.empty:
    # Nothing was evaluated (e.g. the dataset file was missing); the per-tier
    # statistics below would otherwise fail on the missing rows/columns.
    print("No evaluation results - was the evaluation dataset loaded?")
else:
    # Tier 1-2: Exact chunk matching
    tier_12 = df_results_enhanced[df_results_enhanced['Tier'].isin([1, 2])]
    # Only questions that define an expected chunk (non-null match flag) can be scored
    scorable_12 = int(tier_12['Exact_Chunk_Match'].notna().sum())
    if scorable_12 > 0:
        found_ranks = tier_12[tier_12['Exact_Chunk_Match'] == True]['Chunk_Rank']
        chunk_hits = len(found_ranks)
        chunk_match_rate = chunk_hits / scorable_12
        print(f"Tier 1-2 (Single Paper) - Exact Chunk Hit Rate: {chunk_match_rate:.2%} ({chunk_hits}/{scorable_12})")

        # Show average rank when chunk is found
        if chunk_hits > 0:
            print(f"  → Avg rank of correct chunk: {found_ranks.mean():.1f}")
    elif len(tier_12) > 0:
        # Avoids dividing by zero when no Tier 1-2 question carries ground truth
        print("Tier 1-2 (Single Paper) - no question defines an expected_chunk_id; hit rate not computed")

    # Tier 3: Multi-paper matching
    tier_3 = df_results_enhanced[df_results_enhanced['Tier'] == 3]
    if len(tier_3) > 0:
        multi_hits = int((tier_3['Multi_Paper_Match'] == True).sum())
        multi_match_rate = multi_hits / len(tier_3)
        print(f"\nTier 3 (Synthesis) - Multi-Paper Hit Rate: {multi_match_rate:.2%} ({multi_hits}/{len(tier_3)})")

        avg_papers = tier_3['Num_Papers'].mean()
        print(f"  → Avg papers retrieved: {avg_papers:.1f}")

    print(f"\nAverage Latency (all): {df_results_enhanced['Latency'].mean():.2f}s")

    # Show detailed results
    print("\n" + "="*80)
    print("DETAILED RESULTS")
    print("="*80)
    # NOTE: these options are set globally and stay in effect for later cells
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', 50)

    print(df_results_enhanced.to_string(index=False))



ENHANCED EVALUATION (Chunk-level + Multi-paper)

Starting enhanced evaluation of 16 questions...


  0%|          | 0/16 [00:00<?, ?it/s]2026-01-18 19:07:48,352 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
  6%|▋         | 1/16 [00:13<03:15, 13.06s/it]2026-01-18 19:08:03,289 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
 12%|█▎        | 2/16 [00:36<04:26, 19.03s/it]2026-01-18 19:08:27,475 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
 19%|█▉        | 3/16 [01:00<04:37, 21.38s/it]2026-01-18 19:08:49,825 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
 25%|██▌       | 4/16 [01:13<03:36, 18.07s/it]2026-01-18 19:09:01,860 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
 31%|███▏      | 5/16 [01:35<03:33, 19.38s/it]2026-01-18 19:09:24,268 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
 38%|███▊      | 6/16 [02:10<04:06, 24.69s/it]2026-01-18 19:09:59,724 - INFO - HTTP Request: POST http://localhost:11434


=== EVALUATION SUMMARY ===

Tier 1-2 (Single Paper) - Exact Chunk Hit Rate: 100.00% (12/12)
  → Avg rank of correct chunk: 1.0

Tier 3 (Synthesis) - Multi-Paper Hit Rate: 100.00% (4/4)
  → Avg papers retrieved: 2.5

Average Latency (all): 29.79s

DETAILED RESULTS
 Tier                                                        Question    Target_Tag  Exact_Chunk_Match  Chunk_Rank  Num_Papers Multi_Paper_Match                              Papers  Latency
    1 What physical quantity is the controller changing (the actua... liquid lenses               True           1           2              None       Rebuffi et al. | Zhang et al.    13.06
    1 Which classic search methods are used as baselines in the DR...     autofocus               True           1           2              None       Rebuffi et al. | Zhang et al.    23.21
    1 What is the main objective of 'adaptive scanning' compared t...  ptychography               True           1           1              None                     




## Enhanced Evaluation with Ground Truth Chunk IDs

In [None]:
# Debug helper: run one question through the pipeline and print what each retrieved
# source object exposes, to find out which metadata fields are actually available
print("Debugging chunk structure from pipeline response...\n")

# Test with one question
test_question = eval_dataset[0]['question']
print(f"Test question: {test_question}\n")

response = rag_pipeline.run(test_question, k=3, include_sources=True)

print(f"Number of sources: {len(response.sources)}\n")

for i, src in enumerate(response.sources):
    # Read metadata once and defensively, so a source without (or with empty) metadata
    # is reported instead of raising on the last print of the loop body
    src_metadata = getattr(src, 'metadata', None)
    print(f"Source {i+1}:")
    print(f"  Type: {type(src)}")
    print(f"  Available attributes: {dir(src) if hasattr(src, '__dict__') else 'N/A'}")
    print(f"  Metadata keys: {src_metadata.keys() if src_metadata is not None else 'No metadata'}")
    print(f"  Full metadata: {src_metadata if src_metadata is not None else 'No metadata'}")
    print()