In [None]:
# --- COLAB SETUP START ---
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    # 1. Clone the repository
    !git clone https://github.com/google-gemini/gemini-fullstack-langgraph-quickstart
    %cd gemini-fullstack-langgraph-quickstart/backend

    # 2. Prepare Environment (Resolving Conflicts)
    import sys
    print("Uninstalling conflicting pre-installed packages...")
    !pip uninstall -y google-ai-generativelanguage tensorflow grpcio-status

    # Pre-install PyTorch Nightly if Python 3.13+ is detected
    if sys.version_info >= (3, 13):
        print("Detected Python 3.13+. Installing PyTorch Nightly...")
        !pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu

    # 3. Install Backend
    !pip install .

    # 4. Set API Key
    import os
    from google.colab import userdata
    try:
        os.environ["GEMINI_API_KEY"] = userdata.get('GEMINI_API_KEY')
    except:
        print("Please enter your Gemini API Key:")
        os.environ["GEMINI_API_KEY"] = input()
# --- COLAB SETUP END ---

In [None]:
# --- MODEL CONFIGURATION ---
# @title Select Gemini Model
# @markdown Choose the model strategy to use.
# @markdown *Note: Experimental models (2.0/2.5) may require billing/quota enablement.*

MODEL_STRATEGY = "Free Tier (Gemini 1.5 Flash)" # @param ["Free Tier (Gemini 1.5 Flash)", "Experimental (Gemini 2.0 Flash)", "Flash Lite (Gemini 2.0 Flash-Lite)", "Gemini 2.5 Flash (Experimental)", "Gemini 2.5 Pro (Experimental)"]

import os

# Map each selectable strategy label to (model name, status message).
MODEL_OPTIONS = {
    "Free Tier (Gemini 1.5 Flash)": (
        "gemini-1.5-flash",
        "✅ Using Free Tier Model: gemini-1.5-flash",
    ),
    "Experimental (Gemini 2.0 Flash)": (
        "gemini-2.0-flash-exp",
        "⚠️ Using Experimental Model: gemini-2.0-flash-exp (Quota Required)",
    ),
    "Flash Lite (Gemini 2.0 Flash-Lite)": (
        "gemini-2.0-flash-lite-preview-02-05",
        "⚠️ Using Preview Model: gemini-2.0-flash-lite-preview-02-05 (Quota Required)",
    ),
    "Gemini 2.5 Flash (Experimental)": (
        "gemini-2.5-flash",
        "🔥 Using Gemini 2.5 Flash: gemini-2.5-flash (Quota Required)",
    ),
    "Gemini 2.5 Pro (Experimental)": (
        "gemini-2.5-pro",
        "🔥 Using Gemini 2.5 Pro: gemini-2.5-pro (Quota Required)",
    ),
}

# Fail with a readable message instead of leaving SELECTED_MODEL unbound
# when the form value does not match any known option.
if MODEL_STRATEGY not in MODEL_OPTIONS:
    raise ValueError(
        f"Unknown MODEL_STRATEGY {MODEL_STRATEGY!r}; expected one of: "
        + ", ".join(repr(option) for option in MODEL_OPTIONS)
    )

SELECTED_MODEL, status_message = MODEL_OPTIONS[MODEL_STRATEGY]
print(status_message)

# Set Environment Variables to override defaults
for env_var in ("QUERY_GENERATOR_MODEL", "REFLECTION_MODEL", "ANSWER_MODEL", "TOOLS_MODEL"):
    os.environ[env_var] = SELECTED_MODEL

# DeepResearch-Bench Evaluation Pipeline

This notebook implements the complete evaluation pipeline for DeepResearch-Bench, including:
1. Data Preparation (Download/Mock)
2. Agent & Evaluator Setup
3. Full Benchmark Execution
4. Results Analysis

In [None]:
%load_ext autoreload
%autoreload 2

import sys
import os
import json
import numpy as np

# Add backend/src to path
sys.path.append(os.path.abspath("../backend/src"))

from evaluation.data import download_benchmark_data
from evaluation.bench import BenchmarkEvaluator
from agent.deep_search_agent import DeepSearchAgent
from langchain_google_genai import ChatGoogleGenerativeAI

## 1. Data Preparation

In [None]:
# Location of the benchmark files, relative to the current working directory.
data_dir = "../data/benchmark"

# Announce the target directory, then hand it to the project's download helper.
print("Setting up benchmark data in " + data_dir + "...")
download_benchmark_data(data_dir)

## 2. Setup Agent & Evaluator

In [None]:
# Initialize Agent
class MockLLM:
    """Offline stand-in used when the real LLM client cannot be created.

    Returns canned strings so the rest of the pipeline can still be exercised.
    NOTE(review): these are plain strings; confirm the agent/evaluator accept
    that in place of whatever the real client returns.
    """

    def invoke(self, prompt):
        """Return a canned reply chosen by keywords found in ``str(prompt)``."""
        prompt_text = str(prompt)
        # Return generic JSON for verifications
        if "JSON" in prompt_text:
            if "claims" in prompt_text:
                return '```json\n{"claims": ["Claim 1", "Claim 2"]}\n```'
            return '```json\n{"verified": true, "confidence": 0.9, "reasoning": "good"}\n```'
        return "Evaluated Answer"

    def generate(self, prompt):
        """Alias for :meth:`invoke`."""
        return self.invoke(prompt)


try:
    model_name = os.environ.get("ANSWER_MODEL", "gemini-1.5-flash")
    print(f"Initializing LLM with model: {model_name}")
    llm = ChatGoogleGenerativeAI(model=model_name, temperature=0)
except Exception as exc:
    # Report why the real client failed so a missing key or misconfiguration
    # is not silently mistaken for a genuine benchmark run.
    print(f"LLM initialization failed ({type(exc).__name__}: {exc})")
    print("Using Mock LLM for benchmark setup")
    llm = MockLLM()

# Build the search agent around whichever LLM client was selected (real or mock).
agent = DeepSearchAgent(llm_client=llm)

# Initialize Evaluator
# It receives the agent plus the benchmark data directory set in the data-preparation cell.
evaluator = BenchmarkEvaluator(agent, data_dir=data_dir)

## 3. Run Benchmark

We will run the benchmark on the available data (mock or real).

In [None]:
# Path passed to the evaluator as its output target.
benchmark_output_path = "../results/benchmark_run.json"

# Launch the evaluator's full-benchmark run, bracketed by progress messages.
print("Starting Evaluation...")
results = evaluator.evaluate_full_benchmark(output_file=benchmark_output_path)

print("Evaluation Finished.")

## 4. Results Analysis

In [None]:
# Pretty-print the scores as indented JSON, or say so when nothing came back.
if not results:
    print("No results to analyze.")
else:
    print("Final Scores:")
    formatted_scores = json.dumps(results, indent=2)
    print(formatted_scores)