In [None]:
# Universal Setup for Backend Environment
import sys
import os
import subprocess
from pathlib import Path

def setup_environment():
    """Make the backend package importable from this notebook.

    Steps:
      1. Locate the ``backend`` directory: the current working directory if it
         is named ``backend``, otherwise a ``backend`` directory that is a child
         or a sibling of the current working directory. If none is found the
         current working directory is used as-is.
      2. Append ``<backend>/src`` (when present) and ``<backend>`` to
         ``sys.path`` if they are not already listed.
      3. Try ``import agent``. If that fails and the backend directory looks
         like an installable project, install it in editable mode with pip.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        subprocess.CalledProcessError: if the editable ``pip install`` is
            attempted and fails.
    """
    import importlib

    # Get the backend directory. If we are in 'backend', it is cwd.
    cwd = Path.cwd()
    backend_dir = cwd
    if cwd.name != 'backend':
        # Search for backend next to, or directly below, the working directory.
        for candidate in (cwd / 'backend', cwd.parent / 'backend'):
            if candidate.is_dir():
                backend_dir = candidate
                break

    # Add src to path if it exists (for 'from agent import ...' style)
    src_dir = backend_dir / 'src'
    if src_dir.exists() and str(src_dir) not in sys.path:
        sys.path.append(str(src_dir))
        print(f"  [OK] Added {src_dir} to sys.path")

    if str(backend_dir) not in sys.path:
        sys.path.append(str(backend_dir))

    # Verify backend/agent can be imported
    try:
        import agent  # noqa: F401  (imported only to probe availability)
    except ImportError:
        pass
    else:
        print("  [OK] Agent module found and imported.")
        return

    # Only attempt an editable install when there is a project to install.
    # Without this guard pip fails (and this cell raises) whenever the
    # repository has not been checked out yet, e.g. on a fresh hosted runtime
    # where a later cell performs the clone.
    installable = any(
        (backend_dir / marker).is_file() for marker in ("pyproject.toml", "setup.py")
    )
    if not installable:
        print(f"  [!] Agent module not found and no installable project in {backend_dir}; skipping install.")
        return

    print("  [!] Agent module not found. Installing dependencies...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", str(backend_dir)])
    # Let the running kernel see the freshly installed package without a restart.
    importlib.invalidate_caches()
    print("  [OK] Backend installed in editable mode.")

setup_environment()

In [None]:
# --- COLAB SETUP START ---
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    # 1. Clone the repository
    !rm -rf gemini-fullstack-langgraph-quickstart
!git clone https://github.com/MasumRab/gemini-fullstack-langgraph-quickstart
    %cd gemini-fullstack-langgraph-quickstart/backend

    # 2. Prepare Environment (Resolving Conflicts)
    import sys
    print("Uninstalling conflicting pre-installed packages...")
    !pip uninstall -y google-ai-generativelanguage tensorflow grpcio-status

    # Pre-install PyTorch Nightly if Python 3.13+ is detected
    if sys.version_info >= (3, 13):
        print("Detected Python 3.13+. Installing PyTorch Nightly...")
        !pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu

    # 3. Install Backend
    !pip install .

    # 4. Set API Key
    import os
    from google.colab import userdata
    try:
        os.environ["GEMINI_API_KEY"] = userdata.get('GEMINI_API_KEY')
    except:
        print("Please enter your Gemini API Key:")
        os.environ["GEMINI_API_KEY"] = input()
# --- COLAB SETUP END ---

In [None]:
# --- MODEL CONFIGURATION ---
# @title Select Gemini Model
# @markdown Choose the Gemini model to use. Only Gemini 2.5 models are currently accessible via the API.

MODEL_STRATEGY = "Gemini 2.5 Flash (Recommended)" # @param ["Gemini 2.5 Flash (Recommended)", "Gemini 2.5 Flash-Lite (Fastest)", "Gemini 2.5 Pro (Best Quality)"]

import os

# Translate the human-readable form choice into a model ID.
# Note: Gemini 1.5 and 2.0 models are deprecated/not accessible via this API.
# Any unrecognised choice falls back to the Flash model.
SELECTED_MODEL = {
    "Gemini 2.5 Flash (Recommended)": "gemini-2.5-flash",
    "Gemini 2.5 Flash-Lite (Fastest)": "gemini-2.5-flash-lite",
    "Gemini 2.5 Pro (Best Quality)": "gemini-2.5-pro",
}.get(MODEL_STRATEGY, "gemini-2.5-flash")

print(f"Selected Model: {SELECTED_MODEL}")
print(f"Strategy: {MODEL_STRATEGY}")

# Point every model-related environment variable at the chosen model so it
# overrides the defaults.
os.environ.update(
    dict.fromkeys(
        ("QUERY_GENERATOR_MODEL", "REFLECTION_MODEL", "ANSWER_MODEL", "TOOLS_MODEL"),
        SELECTED_MODEL,
    )
)

In [None]:
# --- MODEL VERIFICATION (Optional) ---
# @title Verify Model Configuration
# @markdown Run this cell to verify that your API key is configured correctly and the selected model is available.

import os
import subprocess
import sys

# Check if API key is set (an empty value is treated the same as a missing one)
if not os.environ.get("GEMINI_API_KEY"):
    print("  [!] GEMINI_API_KEY not found in environment variables!")
    print("   Please set it before proceeding:")
    print("   export GEMINI_API_KEY='your-api-key-here'")
else:
    try:
        from google import genai

        # Initialize the client
        client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

        # Test the selected model with a tiny prompt.
        # SELECTED_MODEL is defined by the model configuration cell.
        print(f"  [*] Testing model: {SELECTED_MODEL}")
        response = client.models.generate_content(
            model=SELECTED_MODEL,
            contents="Explain how AI works in a few words"
        )

        print("  [OK] Model verification successful!")
        print(f"   Model: {SELECTED_MODEL}")
        # response.text can be None when the reply carries no text parts;
        # guard the slice so a successful call is not reported as a failure.
        print(f"   Response: {(response.text or '')[:100]}...")

    except ImportError:
        print("  [!] google-genai package not installed!")
        print("   Installing now...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "google-genai"])
        print("  [OK] Installed! Please re-run this cell.")

    except Exception as e:
        # Best-effort diagnostics: report the error and the usual suspects
        # instead of aborting the notebook.
        print(f"  [X] Model verification failed: {e}")
        print("   This could mean:")
        print("   - Invalid API key")
        print(f"   - Model '{SELECTED_MODEL}' not available in your region")
        print("   - Quota/billing issues (for experimental models)")
        print("   - Network connectivity issues")

# DeepResearch-Bench Evaluation Pipeline

This notebook implements the complete evaluation pipeline for DeepResearch-Bench, including:
1. Data Preparation (Download/Mock)
2. Metric Verification
3. Full Benchmark Execution
4. Results Analysis

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
import os
import json
import numpy as np

# Add backend/src to path
# The path is relative to the kernel's current working directory, and it must
# be added before the project imports below are attempted.
sys.path.append(os.path.abspath("../backend/src"))

# Project modules (presumably located under backend/src -- verify against the
# repository layout) followed by the LangChain Gemini chat model wrapper.
from evaluation.data import download_benchmark_data
from evaluation.bench import BenchmarkEvaluator
from agent.deep_search_agent import DeepSearchAgent
from langchain_google_genai import ChatGoogleGenerativeAI

## 1. Data Preparation

In [None]:
# Directory handed to download_benchmark_data below and reused later when the
# evaluator is created. The path is relative to the kernel's working directory.
data_dir = "../data/benchmark"
print(f"Setting up benchmark data in {data_dir}...")
# NOTE(review): per the notebook overview this step may produce downloaded or
# mock data -- confirm against evaluation.data.
download_benchmark_data(data_dir)

## 2. Setup Agent & Evaluator

In [None]:
# Initialize Agent
try:
    model_name = os.environ.get("ANSWER_MODEL", "gemini-2.5-flash")
    print(f"Initializing LLM with model: {model_name}")
    llm_kwargs = {"model": model_name, "temperature": 0}
    # This notebook stores the key under GEMINI_API_KEY, while
    # ChatGoogleGenerativeAI documents GOOGLE_API_KEY as its default
    # environment variable. Pass the key explicitly so the real model is used
    # whenever a key has been configured.
    gemini_api_key = os.environ.get("GEMINI_API_KEY")
    if gemini_api_key:
        llm_kwargs["api_key"] = gemini_api_key
    llm = ChatGoogleGenerativeAI(**llm_kwargs)
except Exception as exc:
    print("Using Mock LLM for benchmark setup")
    # Say why the real model was not used: results produced with the mock are
    # canned responses, not real evaluations.
    print(f"  [!] Real LLM unavailable ({type(exc).__name__}: {exc})")

    class MockLLM:
        """Offline stand-in that returns fixed responses instead of calling a model."""

        def invoke(self, prompt):
            """Return a canned reply chosen by keywords found in ``str(prompt)``."""
            # Return generic JSON for verifications
            if "JSON" in str(prompt):
                if "claims" in str(prompt):
                    return '```json\n{"claims": ["Claim 1", "Claim 2"]}\n```'
                return '```json\n{"verified": true, "confidence": 0.9, "reasoning": "good"}\n```'
            return "Evaluated Answer"

        def generate(self, prompt):
            """Alias for :meth:`invoke`."""
            return self.invoke(prompt)

    llm = MockLLM()

agent = DeepSearchAgent(llm_client=llm)

# Initialize Evaluator
evaluator = BenchmarkEvaluator(agent, data_dir=data_dir)

## 3. Run Benchmark

We will run the benchmark on the available data (mock or real).

In [None]:
print("Starting Evaluation...")

# Make sure the results directory exists before the run starts, so the output
# file cannot fail on a missing directory once the evaluation has finished.
# NOTE(review): the evaluator may already create it; this is harmless if so.
benchmark_output_file = "../results/benchmark_run.json"
os.makedirs(os.path.dirname(benchmark_output_file), exist_ok=True)

results = evaluator.evaluate_full_benchmark(
    output_file=benchmark_output_file
)

print("Evaluation Finished.")

## 4. Results Analysis

In [None]:
# Show the benchmark output as indented JSON, or say so when the run
# produced nothing (None or an empty container).
if not results:
    print("No results to analyze.")
else:
    print("Final Scores:")
    print(json.dumps(results, indent=2))