# 05 — Evaluation: Baseline vs Agentic

Lightweight, repeatable evaluation over a golden question set.

In [None]:
from pathlib import Path
import os
import pandas as pd

from src.eval import run_eval, build_comparison_report, top_failures

In [None]:
# Golden question set, given relative to the notebook's working directory.
GOLDEN_PATH = Path('../eval/golden_questions.jsonl')

# Opt-in switch for the LLM judge: read from the environment, 0 when unset.
USE_LLM_EVAL = int(os.environ.get('USE_LLM_EVAL', '0'))

# Echo the effective configuration so a run can be reproduced from its output.
print(
    f'Golden set: {GOLDEN_PATH.resolve()}',
    f'USE_LLM_EVAL={USE_LLM_EVAL}',
    sep='\n',
)

In [None]:
# Run both pipelines over the golden set; run_eval returns one result frame
# per pipeline (baseline first, agentic second).
baseline_df, agentic_df = run_eval(
    golden_path=GOLDEN_PATH,
    top_k=6,
    max_retries=2,
    # Forward the USE_LLM_EVAL flag instead of hard-coding False, so enabling
    # the flag actually reaches run_eval.
    # NOTE(review): assumes use_llm_grader is the switch that adds the
    # llm_judge_* columns shown at the end of this notebook — confirm against
    # src.eval.run_eval.
    use_llm_grader=(USE_LLM_EVAL == 1),
)

print(f'baseline rows: {len(baseline_df)}')
print(f'agentic rows: {len(agentic_df)}')

In [None]:
# Combine the baseline and agentic result frames into a single report.
report_df = build_comparison_report(baseline_df, agentic_df)
# Bare trailing expression: the notebook renders it as the cell's output.
report_df

## Drift-only comparison

In [None]:
# Keep only the report rows whose 'scope' column equals 'drift'.
report_df.loc[report_df['scope'] == 'drift']

## Top failures (3 examples per pipeline)

Includes query, retrieved doc titles/dates, answer, citations, and failed deterministic checks.

In [None]:
def compact_retrieved(chunks):
    """Reduce retrieved chunks to their title, date, and chunk id.

    Args:
        chunks: Iterable of dict-like chunk records (anything with ``.get``).
            ``None``, an empty container, or a float (the NaN that pandas
            stores for a missing cell) all mean "no chunks".

    Returns:
        A list with one dict per chunk holding only ``doc_title``,
        ``doc_date`` and ``chunk_id``; a key missing from a chunk defaults
        to ``''``. Returns ``[]`` when there are no chunks.
    """
    # NaN is a truthy float, so `chunks or []` alone would let it through
    # and the comprehension would raise TypeError on a missing cell.
    if isinstance(chunks, float):
        return []
    return [
        {
            'doc_title': c.get('doc_title', ''),
            'doc_date': c.get('doc_date', ''),
            'chunk_id': c.get('chunk_id', ''),
        }
        for c in (chunks or [])
    ]

# Columns shown in the per-pipeline summary table.
summary_cols = ['question', 'answer', 'citations', 'checks_failed']
pipelines = {'baseline': baseline_df, 'agentic': agentic_df}

for label, frame in pipelines.items():
    print(f'\n=== {label.upper()} ===')
    worst = top_failures(frame, n=3)
    display(worst[summary_cols])
    # Follow the table with a readable dump of each failing example,
    # including a compact view of the chunks that were retrieved for it.
    for _, record in worst.iterrows():
        retrieved = compact_retrieved(record['retrieved_chunks'])
        print('---')
        print('query:', record['question'])
        print('retrieved (title/date/chunk):', retrieved)
        print('answer:', record['answer'])
        print('citations:', record['citations'])
        print('failed checks:', record['checks_failed'])

## Optional LLM-as-judge (disabled by default)

Set `USE_LLM_EVAL=1` in the environment (for example via `.env`) before running the configuration cell to append `llm_judge_score` and `llm_judge_rationale` columns during `run_eval`.

In [None]:
# Columns shown when the LLM judge is enabled.
judge_cols = ['id', 'question', 'llm_judge_score', 'llm_judge_rationale']

if USE_LLM_EVAL != 1:
    print('LLM judge disabled. Set USE_LLM_EVAL=1 to enable.')
else:
    # Preview the judge output for each pipeline, baseline first.
    for judged in (baseline_df, agentic_df):
        display(judged[judge_cols].head())