In [None]:
# Colab-only: install required packages
# (skip this cell if running locally)
# Version specifiers below are lower bounds (>=), not exact pins.
# NOTE(review): the API-key cell falls back to `from dotenv import load_dotenv`,
# and python-dotenv is not in this install list — confirm it is available
# wherever that fallback is expected to run.
%pip install -q "rouge-score>=0.1.2" "weaviate-client>=4.4.0" "sentence-transformers>=2.7.0"

# Part 4: The Showdown — Evaluation Arena

Compare **The Intern** (fine-tuned Llama-3) vs **The Librarian** (Hybrid RAG + gpt-4o-mini)
across three dimensions:

| Metric | What it measures |
|--------|------------------|
| **ROUGE-L** | Textual overlap between generated and ground-truth answers |
| **LLM-as-a-Judge** | Faithfulness & Accuracy scored 1-5 by `o3-mini` |
| **Latency** | Response time in milliseconds |

**Bonus:** Monthly cost projection for 500 daily users × 10 queries each.

## Setup: Imports and Environment

In [None]:
import os
import sys
import json
import time
import random
import warnings
from pathlib import Path
from collections import defaultdict

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

# Quiet library warnings and render figures at a higher resolution.
warnings.filterwarnings('ignore')
matplotlib.rcParams.update({'figure.dpi': 120})

# Resolve the project root (same pattern as other notebooks): when the kernel
# was started inside notebooks/, step up one level; otherwise use the cwd as-is.
launch_dir = Path.cwd()
if launch_dir.name == 'notebooks':
    project_root = launch_dir.parent
else:
    project_root = launch_dir

# Work from the root so relative paths resolve, and make `src.*` importable.
os.chdir(project_root)
sys.path.insert(0, str(project_root))

print(f"‚úì Project root: {project_root}")

In [None]:
# Populate OPENAI_API_KEY: try Colab's secret store first, otherwise read the
# project's .env file.
try:
    from google.colab import userdata
    os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
    print("‚úì OpenAI API key loaded from Colab secrets")
except Exception:
    # Any exception raised above falls through to the .env path.
    from dotenv import load_dotenv
    dotenv_file = project_root / '.env'
    load_dotenv(dotenv_file)
    print("‚úì Environment loaded from .env")

# Report only whether a key is present; the key itself is never printed.
key_status = 'Yes' if os.environ.get('OPENAI_API_KEY') else 'No'
print(f"‚úì OPENAI_API_KEY set: {key_status}")

In [None]:
# Project-local modules under src/: config loading, evaluation metrics and the
# cost model used by the cells below.
from src.utils.config_loader import load_config
from src.evaluation.metrics import compute_rouge_l, llm_judge_score, measure_latency
from src.evaluation.cost_analysis import estimate_monthly_cost

# Part 3 – Librarian
from src.rag.librarian_inference import query_librarian

print("‚úì All imports successful")

In [None]:
# Load the project configuration and pull out the evaluation settings that the
# later cells rely on. Every lookup falls back to a default when a key is absent.
config_path = project_root / 'config' / 'config.yaml'
config = load_config(config_path)

eval_cfg = config.get('evaluation', {})
metrics_cfg = eval_cfg.get('metrics', {})
judge_model = metrics_cfg.get('llm_judge', {}).get('model', 'o3-mini')
latency_runs = eval_cfg.get('latency', {}).get('runs_per_query', 3)

print("‚úì Configuration loaded")
print(f"  LLM Judge model : {judge_model}")
print(f"  Latency runs    : {latency_runs}")
rouge_enabled = metrics_cfg.get('rouge_l', {}).get('enabled', True)
print(f"  ROUGE-L enabled : {rouge_enabled}")

## Step 1: Load & Sample the Golden Test Set

We stratified-sample **20 questions** across the three categories
(`hard_facts`, `strategic_summary`, `stylistic_creative`) so that
each category is proportionally represented.

In [None]:
# Read the golden test set: one JSON object per non-blank line.
golden_path = project_root / 'data' / 'output' / 'golden_test_set.jsonl'
assert golden_path.exists(), f"Golden test set not found: {golden_path}"

golden_data = []
with open(golden_path, 'r', encoding='utf-8') as f:
    for raw_line in f:
        if raw_line.strip():
            golden_data.append(json.loads(raw_line))

print(f"‚úì Loaded {len(golden_data)} golden test entries")

# Tally entries per category; entries without one are counted as 'unknown'.
cat_counts = defaultdict(int)
for entry in golden_data:
    cat_counts[entry.get('category', 'unknown')] += 1

print("\nCategory distribution:")
n_entries = len(golden_data)
for cat in sorted(cat_counts):
    count = cat_counts[cat]
    print(f"  {cat}: {count} ({count/n_entries*100:.1f}%)")

In [None]:
# Stratified sample: SAMPLE_SIZE questions, proportional to category sizes
SAMPLE_SIZE = 20
SEED = 42
random.seed(SEED)  # seed the global RNG so the draw below is reproducible

# Group by category; entries without one are bucketed under 'unknown'.
by_category = defaultdict(list)
for entry in golden_data:
    by_category[entry.get('category', 'unknown')].append(entry)

# Walk the categories in sorted order and give each its proportional share.
sample = []
total = len(golden_data)
remaining = SAMPLE_SIZE

sorted_cats = sorted(by_category.keys())
for i, cat in enumerate(sorted_cats):
    if i == len(sorted_cats) - 1:
        n = remaining  # last category gets whatever is left
    else:
        # At least one question per category, otherwise the rounded share.
        n = max(1, round(SAMPLE_SIZE * len(by_category[cat]) / total))
    # Never exceed the remaining budget or the size of the category itself.
    n = min(n, remaining, len(by_category[cat]))
    sample.extend(random.sample(by_category[cat], n))
    remaining -= n

random.shuffle(sample)

print(f"‚úì Sampled {len(sample)} questions (seed={SEED})")
if remaining > 0:
    # The last category could not absorb the leftover budget; say so instead of
    # silently evaluating on fewer questions than requested.
    print(f"  Note: sample is {remaining} short of the requested {SAMPLE_SIZE}")

print("\nSample category breakdown:")
sample_cats = defaultdict(int)
for s in sample:
    # Same fallback as the grouping above, so an entry with no 'category' key
    # is counted as 'unknown' rather than raising KeyError.
    sample_cats[s.get('category', 'unknown')] += 1
for cat, count in sorted(sample_cats.items()):
    print(f"  {cat}: {count}")

In [None]:
# Preview the sample: one row per question, with long text clipped for display.
preview_rows = []
for idx, s in enumerate(sample, start=1):
    preview_rows.append({
        'idx': idx,
        'category': s['category'],
        # Clip to 80 / 60 characters and mark the cut with an ellipsis.
        'question': s['question'][:80] + ('...' if len(s['question']) > 80 else ''),
        'answer_preview': s['answer'][:60] + ('...' if len(s['answer']) > 60 else ''),
    })

sample_df = pd.DataFrame(preview_rows)
sample_df

## Step 2: Run The Intern (Fine-Tuned Model)

The Intern uses `query_intern(question, chunk_text=...)` with the ground-truth
chunk as context — matching its fine-tuning setup.

> **Note:** Requires a GPU with the fine-tuned adapter files. If unavailable,
> this step will be gracefully skipped.

In [None]:
# Check if Intern is available: it needs a CUDA GPU and a non-empty adapter directory.
try:
    import torch
    has_gpu = torch.cuda.is_available()
except ImportError:
    # Without PyTorch the Intern cannot run at all; treat that as "no GPU" so
    # the notebook skips it instead of failing in this cell.
    has_gpu = False

adapter_dir = project_root / config.get('finetuning', {}).get('output_dir', 'models/intern_adapter')
# is_dir() rather than exists(): iterdir() raises if the path is a regular file.
has_adapter = adapter_dir.is_dir() and any(adapter_dir.iterdir())
intern_available = has_gpu and has_adapter

print(f"GPU available     : {has_gpu}")
print(f"Adapter directory : {adapter_dir}")
print(f"Adapter files     : {has_adapter}")
print(f"\n{'‚úì Intern is AVAILABLE ‚Äì will run evaluation' if intern_available else '‚ö†Ô∏è  Intern UNAVAILABLE ‚Äì will be skipped (need GPU + adapter files)'}")

# Import the inference entry point only when it can actually be used.
if intern_available:
    from src.finetuning.intern_inference import query_intern

In [None]:
intern_results = []


def _intern_row(entry, prediction, median_ms=None, min_ms=None, max_ms=None, error=None):
    """Build one Intern result record.

    `entry` is a golden-set item (its 'question' and 'answer' are copied over);
    latency fields default to None for skipped or failed questions.
    """
    return {
        'question': entry['question'],
        'ground_truth': entry['answer'],
        'prediction': prediction,
        'latency_median_ms': median_ms,
        'latency_min_ms': min_ms,
        'latency_max_ms': max_ms,
        'error': error,
    }


if not intern_available:
    # Emit placeholder rows so later cells can still index both result lists by position.
    print("‚è≠Ô∏è  Skipping Intern evaluation (GPU or adapter files not available)")
    for entry in sample:
        intern_results.append(_intern_row(
            entry,
            '[SKIPPED] Intern not available',
            error='Intern not available (no GPU or adapter)',
        ))
else:
    n_questions = len(sample)
    print(f"üîÑ Running Intern on {n_questions} questions (latency_runs={latency_runs})...\n")
    for position, entry in enumerate(sample, start=1):
        print(f"  [{position}/{n_questions}] {entry['question'][:70]}...", end=" ")
        try:
            # The Intern is given the golden entry's chunk text as context.
            lat = measure_latency(
                query_intern,
                entry['question'],
                chunk_text=entry.get('chunk_text', ''),
                config_path=str(config_path),
                runs=latency_runs,
            )
            intern_results.append(_intern_row(
                entry,
                lat['result'],
                median_ms=lat['median_ms'],
                min_ms=lat['min_ms'],
                max_ms=lat['max_ms'],
            ))
            print(f"‚úì {lat['median_ms']:.0f}ms")
        except Exception as e:
            # Record the failure and keep going with the remaining questions.
            intern_results.append(_intern_row(entry, f'[ERROR] {e}', error=str(e)))
            print(f"‚úó {e}")
    succeeded = sum(1 for r in intern_results if r['error'] is None)
    print(f"\n‚úì Intern complete: {succeeded}/{n_questions} succeeded")

## Step 3: Run The Librarian (RAG Pipeline)

The Librarian uses `query_librarian(question)` — it retrieves its own context
via hybrid search (Dense + BM25 → RRF → Cross-Encoder reranking) and then
generates an answer with `gpt-4o-mini`.

In [None]:
librarian_results = []

print(f"üîÑ Running Librarian on {len(sample)} questions (latency_runs={latency_runs})...\n")

for i, entry in enumerate(sample):
    print(f"  [{i+1}/{len(sample)}] {entry['question'][:70]}...", end=" ")
    try:
        # Time the Librarian with the same helper and run count as the Intern,
        # so the latency rows of the comparison are measured the same way
        # (median / min / max over `latency_runs` calls instead of one timing).
        # Note: this issues `latency_runs` API calls per question.
        lat = measure_latency(
            query_librarian,
            entry['question'],
            config_path=str(config_path),
            generator_mode='openai',
            verbose=False,
            runs=latency_runs,
        )
        result = lat['result']

        librarian_results.append({
            'question': entry['question'],
            'ground_truth': entry['answer'],
            'prediction': result['answer'],
            'latency_median_ms': round(lat['median_ms'], 1),
            'latency_min_ms': round(lat['min_ms'], 1),
            'latency_max_ms': round(lat['max_ms'], 1),
            # Per-stage timings as reported by the pipeline itself (may be absent).
            'retrieval_ms': result['stats'].get('retrieval_ms'),
            'generation_ms': result['stats'].get('generation_ms'),
            'error': None,
        })
        print(f"‚úì {lat['median_ms']:.0f}ms")
    except Exception as e:
        # Record the failure with the same keys as a success row and keep going.
        librarian_results.append({
            'question': entry['question'],
            'ground_truth': entry['answer'],
            'prediction': f'[ERROR] {e}',
            'latency_median_ms': None,
            'latency_min_ms': None,
            'latency_max_ms': None,
            'retrieval_ms': None,
            'generation_ms': None,
            'error': str(e),
        })
        print(f"‚úó {e}")

print(f"\n‚úì Librarian complete: {sum(1 for r in librarian_results if r['error'] is None)}/{len(sample)} succeeded")

## Step 4: ROUGE-L Scoring

ROUGE-L measures the longest common subsequence between the predicted answer
and the ground truth. Higher F1 = better textual overlap.

In [None]:
print("üìä Computing ROUGE-L scores...\n")

# Score every successful answer. Rows that errored or were skipped get None so
# they are left out of the averages below instead of counting as zero.
for results_list in (intern_results, librarian_results):
    for r in results_list:
        if r['error'] is None:
            rouge = compute_rouge_l(r['prediction'], r['ground_truth'])
            r['rouge_l_precision'] = rouge['precision']
            r['rouge_l_recall'] = rouge['recall']
            r['rouge_l_f1'] = rouge['fmeasure']
        else:
            r['rouge_l_precision'] = None
            r['rouge_l_recall'] = None
            r['rouge_l_f1'] = None

# Display ROUGE-L results side by side (both result lists are in `sample` order)
rouge_rows = []
for i, entry in enumerate(sample):
    question = entry['question']
    rouge_rows.append({
        'Q#': i + 1,
        # Only mark truncation when the question was actually cut.
        'Question': question[:60] + ('...' if len(question) > 60 else ''),
        'Intern ROUGE-L F1': intern_results[i].get('rouge_l_f1'),
        'Librarian ROUGE-L F1': librarian_results[i].get('rouge_l_f1'),
    })

rouge_df = pd.DataFrame(rouge_rows)
print(rouge_df.to_string(index=False))

# Averages over the questions that produced a score
intern_f1s = [r['rouge_l_f1'] for r in intern_results if r['rouge_l_f1'] is not None]
lib_f1s = [r['rouge_l_f1'] for r in librarian_results if r['rouge_l_f1'] is not None]

print("\n--- ROUGE-L F1 Averages ---")
if intern_f1s:
    print(f"  Intern    : {sum(intern_f1s)/len(intern_f1s):.4f} (n={len(intern_f1s)})")
else:
    print("  Intern    : N/A (skipped)")
# Guard the Librarian too: if every call failed the list is empty and the
# unguarded division would raise ZeroDivisionError.
if lib_f1s:
    print(f"  Librarian : {sum(lib_f1s)/len(lib_f1s):.4f} (n={len(lib_f1s)})")
else:
    print("  Librarian : N/A (no successful answers)")

## Step 5: LLM-as-a-Judge (Faithfulness & Accuracy)

We use **o3-mini** (a reasoning model) to score each answer on:
- **Faithfulness** (1-5): Does the answer only contain information supported by the ground truth?
- **Accuracy** (1-5): How factually correct and complete is it?

In [None]:
print(f"‚öñÔ∏è  Running LLM-as-a-Judge ({judge_model})...\n")

# Judge every answer that was actually produced; failed or skipped rows get
# None scores so the aggregate helpers ignore them.
for results_list, name in [(intern_results, 'Intern'), (librarian_results, 'Librarian')]:
    print(f"--- Judging {name} answers ---")
    for i, r in enumerate(results_list):
        if r['error'] is not None:
            r['judge_faithfulness'] = None
            r['judge_accuracy'] = None
            r['judge_reasoning'] = 'Skipped (answer not available)'
            print(f"  [{i+1}] SKIPPED")
            continue

        print(f"  [{i+1}/{len(results_list)}] Judging...", end=" ")
        try:
            scores = llm_judge_score(
                question=r['question'],
                ground_truth=r['ground_truth'],
                prediction=r['prediction'],
                config=config,
            )
            faithfulness = scores['faithfulness']
            accuracy = scores['accuracy']
            reasoning = scores['reasoning']
        except Exception as e:
            # A single failed judge call must not abort the loop and leave the
            # remaining rows unscored; record it and move on, as the
            # Intern/Librarian loops do for their own failures.
            r['judge_faithfulness'] = None
            r['judge_accuracy'] = None
            r['judge_reasoning'] = f'Judge failed: {e}'
            print(f"FAILED: {e}")
            continue

        r['judge_faithfulness'] = faithfulness
        r['judge_accuracy'] = accuracy
        r['judge_reasoning'] = reasoning
        print(f"Faith={faithfulness} Acc={accuracy}")
    print()

print("‚úì Judging complete")

## Step 6: Results Table & Visualization

A comprehensive comparison of both systems across all metrics.

In [None]:
# Build per-question results table (both result lists are in `sample` order)
detail_rows = []
for i, entry in enumerate(sample):
    ir = intern_results[i]
    lr = librarian_results[i]
    question = entry['question']
    detail_rows.append({
        'Q#': i + 1,
        # Same fallback the sampling cell uses when grouping by category.
        'Category': entry.get('category', 'unknown'),
        # Only mark truncation when the question was actually cut.
        'Question': question[:50] + ('...' if len(question) > 50 else ''),
        # Intern metrics
        'Intern ROUGE-L': ir.get('rouge_l_f1'),
        'Intern Faith.': ir.get('judge_faithfulness'),
        'Intern Acc.': ir.get('judge_accuracy'),
        'Intern Latency (ms)': ir.get('latency_median_ms'),
        # Librarian metrics
        'Lib. ROUGE-L': lr.get('rouge_l_f1'),
        'Lib. Faith.': lr.get('judge_faithfulness'),
        'Lib. Acc.': lr.get('judge_accuracy'),
        'Lib. Latency (ms)': lr.get('latency_median_ms'),
    })

detail_df = pd.DataFrame(detail_rows)
print("üìã Per-Question Results")
print("=" * 120)
detail_df

In [None]:
# Aggregate summary
def safe_mean(values):
    """Return the mean of the non-None items in `values`, rounded to 4 decimals.

    Returns None when there is nothing left to average.
    """
    present = [item for item in values if item is not None]
    if not present:
        return None
    return round(sum(present) / len(present), 4)

def safe_median(values):
    """Return the median of the non-None items in `values`, rounded to 1 decimal.

    Returns None when there is nothing left to take a median of.
    """
    import statistics
    present = list(filter(lambda item: item is not None, values))
    if present:
        return round(statistics.median(present), 1)
    return None

def _ok_fraction(results):
    """Format 'succeeded/total' for a result list (success means error is None)."""
    ok = sum(1 for r in results if r['error'] is None)
    return f"{ok}/{len(results)}"


# Row labels; the two columns below follow this order.
metric_labels = [
    'ROUGE-L F1 (mean)',
    'Faithfulness (mean, 1-5)',
    'Accuracy (mean, 1-5)',
    'Latency ‚Äì median (ms)',
    'Latency ‚Äì min (ms)',
    'Latency ‚Äì max (ms)',
    'Success Rate',
]

intern_column = [
    safe_mean([r['rouge_l_f1'] for r in intern_results]),
    safe_mean([r.get('judge_faithfulness') for r in intern_results]),
    safe_mean([r.get('judge_accuracy') for r in intern_results]),
    safe_median([r['latency_median_ms'] for r in intern_results]),
    # Min/max are only looked up when the Intern actually ran.
    safe_median([r['latency_min_ms'] for r in intern_results]) if intern_available else None,
    safe_median([r['latency_max_ms'] for r in intern_results]) if intern_available else None,
    _ok_fraction(intern_results),
]

librarian_column = [
    safe_mean([r['rouge_l_f1'] for r in librarian_results]),
    safe_mean([r.get('judge_faithfulness') for r in librarian_results]),
    safe_mean([r.get('judge_accuracy') for r in librarian_results]),
    safe_median([r['latency_median_ms'] for r in librarian_results]),
    # Fall back to the per-question median when a row has no min/max value.
    safe_median([r.get('latency_min_ms') or r.get('latency_median_ms') for r in librarian_results]),
    safe_median([r.get('latency_max_ms') or r.get('latency_median_ms') for r in librarian_results]),
    _ok_fraction(librarian_results),
]

summary = {
    'Metric': metric_labels,
    'The Intern': intern_column,
    'The Librarian': librarian_column,
}

summary_df = pd.DataFrame(summary)
banner = "=" * 70
print("\n" + banner)
print("üìä AGGREGATE RESULTS: The Intern vs The Librarian")
print(banner)
summary_df

In [None]:
# Visualization: three side-by-side panels, one grouped bar pair per question.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

bar_width = 0.35
colors_intern = '#4A90D9'
colors_librarian = '#E67E22'
x = range(1, len(sample) + 1)


def _series(results, key):
    """Per-question values of `key`; a missing or None value is drawn as 0."""
    return [r.get(key) or 0 for r in results]


intern_rouge = _series(intern_results, 'rouge_l_f1')
lib_rouge = _series(librarian_results, 'rouge_l_f1')
intern_faith = _series(intern_results, 'judge_faithfulness')
lib_faith = _series(librarian_results, 'judge_faithfulness')
intern_acc = _series(intern_results, 'judge_accuracy')
lib_acc = _series(librarian_results, 'judge_accuracy')

# (intern values, librarian values, y-label, title, y-axis upper limit)
panels = [
    (intern_rouge, lib_rouge, 'ROUGE-L F1', 'ROUGE-L F1 per Question', 1),
    (intern_faith, lib_faith, 'Score (1-5)', 'Faithfulness (LLM Judge)', 5.5),
    (intern_acc, lib_acc, 'Score (1-5)', 'Accuracy (LLM Judge)', 5.5),
]

# Bar centres: Intern sits left of the question tick, Librarian right of it.
left_positions = [q - bar_width/2 for q in x]
right_positions = [q + bar_width/2 for q in x]

for ax, (intern_vals, lib_vals, y_label, panel_title, y_top) in zip(axes, panels):
    ax.bar(left_positions, intern_vals, bar_width, label='Intern', color=colors_intern, alpha=0.85)
    ax.bar(right_positions, lib_vals, bar_width, label='Librarian', color=colors_librarian, alpha=0.85)
    ax.set_xlabel('Question #')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.legend()
    ax.set_ylim(0, y_top)

plt.tight_layout()
plt.suptitle('The Intern vs The Librarian ‚Äî Per-Question Comparison', y=1.03, fontsize=14, fontweight='bold')
plt.show()

In [None]:
# Aggregate bar chart: one panel per metric, Intern vs Librarian means.
fig, axes = plt.subplots(1, 3, figsize=(14, 4))


def _mean_pair(key):
    """(Intern mean, Librarian mean) for `key`; an unavailable mean becomes 0."""
    return (
        safe_mean([r.get(key) for r in intern_results]) or 0,
        safe_mean([r.get(key) for r in librarian_results]) or 0,
    )


metrics_agg = {
    'ROUGE-L F1': _mean_pair('rouge_l_f1'),
    'Faithfulness': _mean_pair('judge_faithfulness'),
    'Accuracy': _mean_pair('judge_accuracy'),
}

for ax, metric in zip(axes, metrics_agg):
    intern_val, lib_val = metrics_agg[metric]
    heights = [intern_val, lib_val]
    bars = ax.bar(
        ['Intern', 'Librarian'],
        heights,
        color=[colors_intern, colors_librarian],
        alpha=0.85,
        edgecolor='white',
    )
    ax.set_title(metric, fontsize=12, fontweight='bold')
    ax.set_ylabel('Score')
    for bar, val in zip(bars, heights):
        # Three decimals for values below 1, two otherwise.
        value_label = f'{val:.3f}' if val < 1 else f'{val:.2f}'
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
                value_label,
                ha='center', va='bottom', fontweight='bold', fontsize=11)
    # ROUGE-L gets a 0-1.1 axis; the judge panels get 0-5.5.
    ax.set_ylim(0, 1.1 if metric == 'ROUGE-L F1' else 5.5)

plt.tight_layout()
plt.suptitle('Aggregate Comparison', y=1.03, fontsize=14, fontweight='bold')
plt.show()

## Step 7: Bonus ‚Äî Cost Analysis

Estimate monthly cloud cost for both strategies at scale:
- **500 daily users** x **10 queries each** = **150,000 queries/month**

| Strategy | Infrastructure | API Cost |
|----------|---------------|----------|
| Intern | AWS g4dn.xlarge (24/7) | None (self-hosted) |
| Librarian | Lightweight instance + Weaviate | OpenAI gpt-4o-mini API |

In [None]:
# Compute the monthly cost projection from the config and print it as a report.
cost = estimate_monthly_cost(config)
rule = "=" * 60

print("üí∞ Monthly Cost Estimation")
print(rule)
print(f"Scale: {cost['queries_per_month']:,} queries/month")
scale_cfg = config['cost_analysis']
print(f"       ({scale_cfg['users_per_day']} users √ó "
      f"{scale_cfg['queries_per_user']} queries √ó "
      f"{scale_cfg['days_per_month']} days)")

# Self-hosted side: instance pricing and the resulting monthly total.
print("\n--- The Intern (Self-Hosted) ---")
ic = cost['intern']
print(f"  Instance     : {ic['instance_type']}")
print(f"  Hourly cost  : ${ic['hourly_cost_usd']:.3f}")
print(f"  Hours/month  : {ic['hours_per_month']}")
print(f"  Compute cost : ${ic['compute_cost_usd']:,.2f}")
print(f"  API cost     : ${ic['api_cost_usd']:.2f}")
print(f"  ‚ñ∫ TOTAL      : ${ic['total_monthly_usd']:,.2f}/month")

# API-based side: token volumes, API spend and supporting compute.
print("\n--- The Librarian (API-Based RAG) ---")
lc = cost['librarian']
print(f"  LLM model    : {lc['llm_model']}")
print(f"  Input tokens  : {lc['input_tokens_total']:,} (${lc['api_input_cost_usd']:.2f})")
print(f"  Output tokens : {lc['output_tokens_total']:,} (${lc['api_output_cost_usd']:.2f})")
print(f"  API total     : ${lc['api_total_usd']:.2f}")
print(f"  Compute       : {lc['compute_instance']} ‚Üí ${lc['compute_cost_usd']:,.2f}")
print(f"  ‚ñ∫ TOTAL       : ${lc['total_monthly_usd']:,.2f}/month")

# Verdict and disclaimer, both taken from the estimator's summary.
verdict = cost['summary']
print("\n" + rule)
print(f"üí° Cheaper option: {verdict['cheaper']} "
      f"(saves ~{verdict['savings_pct']}%)")
print(f"\n‚ö†Ô∏è  {verdict['disclaimer']}")

In [None]:
# Cost comparison chart: stacked compute + API cost per strategy.
fig, ax = plt.subplots(figsize=(8, 5))

strategies = ['The Intern\n(Self-Hosted)', 'The Librarian\n(API + RAG)']

# Stacked bar: compute + API
compute_costs = [cost['intern']['compute_cost_usd'], cost['librarian']['compute_cost_usd']]
api_costs = [cost['intern']['api_cost_usd'], cost['librarian']['api_total_usd']]
totals = [cost['intern']['total_monthly_usd'], cost['librarian']['total_monthly_usd']]

bars1 = ax.bar(strategies, compute_costs, label='Compute (GPU/CPU)', color=colors_intern, alpha=0.85)
bars2 = ax.bar(strategies, api_costs, bottom=compute_costs, label='API Cost', color=colors_librarian, alpha=0.85)

# Label each stack with its total. The gap is given in points rather than
# dollars so it looks the same whatever the magnitude of the costs.
for bar, total in zip(bars1, totals):
    ax.annotate(
        f'${total:,.0f}',
        xy=(bar.get_x() + bar.get_width()/2, total),
        xytext=(0, 4),
        textcoords='offset points',
        ha='center', va='bottom', fontweight='bold', fontsize=12,
    )

ax.set_ylabel('Monthly Cost (USD)')
# Take the volume from the estimate instead of hard-coding "150K", so the title
# stays correct when the cost_analysis settings change.
ax.set_title(f"Monthly Cost at Scale ({cost['queries_per_month']:,} queries/month)", fontweight='bold')
ax.legend()
plt.tight_layout()
plt.show()

## Summary

### What We Evaluated

| Component | Implementation |
|-----------|---------------|
| **Golden Test Set** | 20 stratified-sampled questions from 1,008 total |
| **System A – The Intern** | Fine-tuned Llama-3-8B with LoRA adapters (Part 2) |
| **System B – The Librarian** | Hybrid RAG (Dense+BM25 → RRF → Rerank) + gpt-4o-mini (Part 3) |
| **ROUGE-L** | Textual overlap with ground truth (precision, recall, F1) |
| **LLM-as-a-Judge** | o3-mini scoring Faithfulness & Accuracy (1-5 scale) |
| **Latency** | End-to-end response time in milliseconds |
| **Cost Analysis** | Monthly projection for 500 users × 10 queries/day |

### Key Takeaways

1. **Accuracy vs Cost Trade-off**: The Librarian (RAG + gpt-4o-mini) typically
   achieves higher accuracy on factual questions because it retrieves fresh context
   from the document. The Intern relies on what it learned during fine-tuning.

2. **Faithfulness**: RAG-based systems tend to score higher on faithfulness because
   the answer is grounded in retrieved passages. Fine-tuned models may hallucinate
   when the question falls outside their training distribution.

3. **Latency**: The Intern runs on a local GPU and is typically faster per query
   (no network round-trip). The Librarian adds retrieval + API latency.

4. **Cost at Scale**: Self-hosting (Intern) has a fixed compute cost regardless
   of query volume. The Librarian's API cost scales linearly but may still be
   cheaper at moderate volumes due to low per-token pricing of gpt-4o-mini.

### Artifacts

- `src/evaluation/` — reusable evaluation modules (metrics, cost analysis)
- `config/config.yaml` ‚Äî evaluation parameters under `evaluation.*` and `cost_analysis.*`