# DeepResearch-Bench Evaluation Pipeline

This notebook implements the complete evaluation pipeline for DeepResearch-Bench, including:
1. Data Preparation (Download/Mock)
2. Agent & Evaluator Setup (real LLM, or a mock fallback)
3. Full Benchmark Execution
4. Results Analysis

In [None]:
%load_ext autoreload
%autoreload 2

import sys
import os
import json
import numpy as np

# Add backend/src to path
sys.path.append(os.path.abspath("../backend/src"))

from evaluation.data import download_benchmark_data
from evaluation.bench import BenchmarkEvaluator
from agent.deep_search_agent import DeepSearchAgent
from langchain_google_genai import ChatGoogleGenerativeAI

## 1. Data Preparation

In [None]:
# Location of the benchmark data, relative to the notebook's working directory.
# The same path is handed to BenchmarkEvaluator in the setup cell below.
data_dir = "../data/benchmark"
print(f"Setting up benchmark data in {data_dir}...")
# Populate data_dir via the project helper. Per the notebook outline this is a
# "download or mock" step; the details live in evaluation.data, not here.
download_benchmark_data(data_dir)

## 2. Set Up Agent & Evaluator

In [None]:
# Initialize the LLM client used by the agent, falling back to an offline mock
# when the real Gemini client cannot be constructed.
class MockLLM:
    """Offline stand-in for the LLM client that returns canned responses.

    The reply is selected purely by substring checks on ``str(prompt)``:

    * contains ``"JSON"`` and ``"claims"`` -> a fenced JSON block with two claims
    * contains ``"JSON"`` only            -> a fenced JSON "verified" verdict
    * otherwise                           -> the plain text ``"Evaluated Answer"``

    NOTE(review): replies are plain ``str`` objects, whereas LangChain chat
    models return message objects from ``invoke`` -- confirm DeepSearchAgent
    accepts both shapes.
    """

    def invoke(self, prompt):
        """Return the canned reply matching the keywords found in ``prompt``."""
        text = str(prompt)  # stringify once; prompt may be a non-str object
        if "JSON" in text:
            if "claims" in text:
                return '```json\n{"claims": ["Claim 1", "Claim 2"]}\n```'
            return '```json\n{"verified": true, "confidence": 0.9, "reasoning": "good"}\n```'
        return "Evaluated Answer"

    def generate(self, prompt):
        """Alias of :meth:`invoke` for callers that use ``generate``."""
        return self.invoke(prompt)


try:
    llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0)
except Exception as exc:
    # Deliberate best-effort fallback so the notebook still runs offline, but
    # report *why* the real client failed: otherwise a missing API key or
    # package silently turns the whole benchmark into a run on canned replies.
    print(f"Could not initialize ChatGoogleGenerativeAI ({type(exc).__name__}: {exc})")
    print("Using Mock LLM for benchmark setup")
    llm = MockLLM()

# Build the agent under evaluation around whichever LLM client was selected
# above (the real Gemini client or the MockLLM fallback).
agent = DeepSearchAgent(llm_client=llm)

# Initialize the evaluator with the agent and the benchmark data directory
# prepared in the data-preparation cell.
evaluator = BenchmarkEvaluator(agent, data_dir=data_dir)

## 3. Run Benchmark

We will run the benchmark on the available data (mock or real).

In [None]:
# Path handed to the evaluator as its output file.
results_path = "../results/benchmark_run.json"
# Create the target directory up front: the benchmark is long-running, and a
# missing "../results" folder should not be able to fail the save at the end.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

print("Starting Evaluation...")
# Run the full benchmark; the returned value is inspected in the analysis cell.
results = evaluator.evaluate_full_benchmark(output_file=results_path)

print("Evaluation Finished.")

## 4. Results Analysis

In [None]:
# Show the scores returned by the benchmark run, or say so when there are none.
if not results:
    # Covers None as well as an empty container.
    print("No results to analyze.")
else:
    print("Final Scores:")
    scores_json = json.dumps(results, indent=2)
    print(scores_json)