# Evaluation Notebook

This notebook is for evaluating and benchmarking the document reasoning agent.

## Features:
- Metric calculation
- Benchmark execution
- Model comparison
- Results visualization


In [None]:
# Import necessary modules
import sys
from pathlib import Path
import json

# Add parent directory to path
sys.path.insert(0, str(Path().absolute().parent))

from src.evaluation.metrics import simple_similarity, chunk_relevance, measure_latency
from src.evaluation.benchmark import Benchmark
from src.agent.agent import Agent
from src.agent.planner import plan
from src.agent.worker import Worker
from src.retrieval.retriever import Retriever
from src.retrieval.chunker import chunk_text
from src.llm.local_model_client import LocalModelClient


## 1. Metric Testing


In [None]:
# Exercise simple_similarity on two sentences that both begin with
# "Machine learning", then show the inputs alongside the resulting score.
text1, text2 = (
    "Machine learning is a subset of artificial intelligence",
    "Machine learning uses algorithms to learn from data",
)
similarity = simple_similarity(text1, text2)

# The two input lines are emitted by one call; the score keeps its own call.
print(f"Text 1: '{text1}'", f"Text 2: '{text2}'", sep="\n")
print(f"Similarity: {similarity:.3f}")


In [None]:
# Exercise chunk_relevance: pass three short chunks together with a query and
# print the query, the number of chunks, and the value the metric returns.
query = "machine learning artificial intelligence"
chunks = [
    "Machine learning is a subset of AI",
    "Deep learning uses neural networks",
    "Natural language processing handles text",
]
relevance = chunk_relevance(chunks, query)

# Query and chunk count share one call; the metric value keeps its own call.
print(f"Query: '{query}'", f"Chunks: {len(chunks)}", sep="\n")
print(f"Average relevance: {relevance:.3f}")


## 2. Benchmark Execution


In [None]:
# Setup for benchmark
# Builds a tiny corpus from an inline string, indexes it in a Retriever, and
# wires the retriever, model client, worker and agent into a Benchmark.
# The statements are order-dependent: each object is passed to the next one.
retriever = Retriever()
# Four one-sentence facts; the literal starts and ends with a newline.
test_text = """
Machine learning is a subset of artificial intelligence.
Deep learning uses neural networks with multiple layers.
Natural language processing handles text data.
Supervised learning uses labeled data to train models.
"""
# NOTE: this rebinds the global `chunks`, replacing the list assigned in the
# chunk-relevance cell earlier in the notebook.
# NOTE(review): the unit of `size=50` (characters vs. words/tokens) is defined
# by chunk_text and is not visible here - confirm before tuning.
chunks = chunk_text(test_text, size=50)
retriever.index_chunks(chunks)

model = LocalModelClient()
worker = Worker(retriever, model)
# `plan` is the function imported from src.agent.planner; it is passed
# uncalled, so the Agent receives the callable itself.
agent = Agent(plan, worker, model)

# The same retriever instance that was indexed above is handed to Benchmark.
benchmark = Benchmark(agent, retriever)


In [None]:
# Benchmark inputs: every entry pairs a question ("query") with its
# ground-truth answer ("ground_truth"). The dicts are built from
# (question, answer) pairs so each pair reads on a single line.
test_cases = [
    {"query": question, "ground_truth": answer}
    for question, answer in (
        (
            "What is machine learning?",
            "Machine learning is a subset of artificial intelligence",
        ),
        (
            "What is deep learning?",
            "Deep learning uses neural networks with multiple layers",
        ),
        (
            "What is supervised learning?",
            "Supervised learning uses labeled data to train models",
        ),
    )
]

print(f"Test cases: {len(test_cases)}")


In [None]:
# Run benchmark
# Runs `benchmark.run_benchmark` over `test_cases` with k=3, then prints the
# aggregate metrics followed by one entry per query. The cell is best-effort:
# failures are reported instead of aborting the notebook.
import traceback

# Reset before running so that, if this cell fails on a re-run, later cells
# see `None` rather than a stale summary left over from an earlier run (or an
# undefined name on the first run).
summary = None
try:
    summary = benchmark.run_benchmark(test_cases, k=3)

    print("=" * 60)
    print("BENCHMARK RESULTS")
    print("=" * 60)
    print(f"Number of queries: {summary['num_queries']}")
    print(f"Average latency: {summary['avg_latency']:.4f} seconds")
    print(f"Average similarity: {summary['avg_similarity']:.3f}")
    print(f"Average chunk relevance: {summary['avg_chunk_relevance']:.3f}")

    print("\nDetailed Results:")
    # Queries are numbered from 1 for display.
    for i, result in enumerate(summary['results'], 1):
        print(f"\nQuery {i}: '{result['query']}'")
        print(f"  Latency: {result['latency']:.4f}s")
        print(f"  Similarity: {result['similarity']:.3f}")
        print(f"  Chunk relevance: {result['chunk_relevance']:.3f}")

except NotImplementedError:
    # NOTE(review): presumably raised by the local model client when no model
    # is available - confirm against LocalModelClient.
    print("(Note: Local model not loaded - benchmark structure is correct)")
except Exception as e:
    # Keep the notebook running, but include the exception type and the
    # traceback: str(e) alone is cryptic (a KeyError prints only the key).
    print(f"Error: {type(e).__name__}: {e}")
    traceback.print_exc()
