In [None]:
# Universal Setup for Backend Environment
import sys
import os
import subprocess
from pathlib import Path

def setup_environment():
    """Make the backend package importable, installing it (and Tavily) if needed.

    Steps performed:

    1. Look for a ``backend`` directory, first next to the current working
       directory (``../backend``, the layout when running from a notebooks
       folder) and then inside it (``./backend``, when running from the
       repository root).
    2. Append that directory to ``sys.path`` if it exists and is not already
       listed.
    3. Try ``import src``; on ``ImportError`` install the backend in editable
       mode with pip.
    4. Try ``import tavily``; on ``ImportError`` install ``tavily-python``.

    Raises:
        FileNotFoundError: ``src`` cannot be imported and no ``backend``
            directory was found to install it from.
        subprocess.CalledProcessError: a pip invocation exited non-zero.
    """
    # Function-local import, matching the local imports used further down.
    import importlib

    working_dir = Path.cwd()
    candidates = (working_dir.parent / 'backend', working_dir / 'backend')
    # Only accept a directory that really exists; a missing path must not be
    # added to sys.path or handed to pip.
    backend_dir = next((path for path in candidates if path.is_dir()), None)

    if backend_dir is not None and str(backend_dir) not in sys.path:
        sys.path.append(str(backend_dir))

    # Verify backend can be imported
    try:
        import src  # noqa: F401 - imported only to probe availability
        print("✅ Backend module found and imported.")
    except ImportError as exc:
        if backend_dir is None:
            searched = ", ".join(str(path) for path in candidates)
            raise FileNotFoundError(
                f"Backend package 'src' is not importable and no backend directory was found (looked in: {searched})."
            ) from exc
        print("❌ Backend module not found. Installing dependencies...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", str(backend_dir)])
        # Packages installed while the interpreter is running are not visible
        # to the import system's cached directory listings until invalidated.
        importlib.invalidate_caches()
        print("✅ Backend installed in editable mode.")

    # Check for Tavily and install if missing (optional/adapter specific)
    try:
        import tavily  # noqa: F401 - imported only to probe availability
    except ImportError:
        print("Installing tavily-python...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "tavily-python"])
        importlib.invalidate_caches()

# Run the setup right away: cells further down import from the `src` package,
# which this call is meant to make importable.
setup_environment()

# Search Tool Comparison

This notebook compares the performance of different search providers (Google, Bing, Brave, DuckDuckGo, Tavily) based on citations, highlights, and result quality across different domains.

In [None]:
import html
import os

import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, HTML

from src.search.router import SearchRouter, SearchProviderType
from src.config.app_config import config

# Optional: set provider API keys here if they are not already present in the
# environment. Uncomment the lines you need and substitute real values;
# never commit real keys together with the notebook.
# os.environ["TAVILY_API_KEY"] = "..."
# os.environ["BING_API_KEY"] = "..."
# os.environ["GEMINI_API_KEY"] = "..."
# os.environ["BRAVE_API_KEY"] = "..."

# Initialize the router. This notebook-level `router` is the object that
# compare_search_results() calls `search` on.
router = SearchRouter()

# Show the keys of the router's `providers` mapping as a quick check of what
# was registered (presumably this depends on which API keys are configured;
# confirm against SearchRouter).
print("Registered Providers:", list(router.providers.keys()))

In [None]:
def compare_search_results(queries, providers=None, max_results=3, search_router=None):
    """Run every query against every provider and tabulate what comes back.

    Args:
        queries: Iterable of dicts, each with a ``'domain'`` and a ``'query'``
            key.
        providers: Provider names to query; one forced search (no fallback) is
            issued per (query, provider) pair. Defaults to the ``value`` of
            every ``SearchProviderType`` member.
        max_results: Passed through to ``search`` as ``max_results``.
        search_router: Object exposing
            ``search(query, max_results=..., provider_name=..., attempt_fallback=...)``
            whose results carry ``title``, ``url`` and ``content`` attributes.
            Defaults to the notebook-level ``router``.

    Returns:
        pandas.DataFrame with the columns Domain, Query, Provider, Rank, Title,
        URL, Snippet, Has Highlight and Has Citation. There is one row per
        returned result (Rank starts at 1). A provider call that raises is
        recorded as one row with Rank ``-1``, Title ``"ERROR"`` and the
        exception text in Snippet. The columns are present even when there
        are no rows.
    """
    columns = [
        "Domain", "Query", "Provider", "Rank", "Title",
        "URL", "Snippet", "Has Highlight", "Has Citation",
    ]

    if providers is None:
        providers = [p.value for p in SearchProviderType]

    results_data = []

    for query_info in queries:
        domain = query_info['domain']
        query = query_info['query']
        print(f"Processing Query: {query} ({domain})...")

        for provider_name in providers:
            try:
                # Resolved inside the try so that a missing global `router`
                # is reported as an error row, exactly like any other failure.
                active_router = router if search_router is None else search_router

                # Force specific provider
                results = active_router.search(
                    query,
                    max_results=max_results,
                    provider_name=provider_name,
                    attempt_fallback=False,
                )

                for rank, res in enumerate(results, start=1):
                    # A result without content must not abort the provider's
                    # remaining results; treat it as an empty snippet.
                    content = res.content or ""

                    # Highlight detection: Check for bold tags
                    has_highlight = "<b>" in content or "<strong>" in content

                    # Citation check: Does it have a clear source URL and title?
                    has_citation = bool(res.url and res.title)

                    results_data.append({
                        "Domain": domain,
                        "Query": query,
                        "Provider": provider_name,
                        "Rank": rank,
                        "Title": res.title,
                        "URL": res.url,
                        "Snippet": content[:200] + "..." if len(content) > 200 else content,
                        "Has Highlight": has_highlight,
                        "Has Citation": has_citation
                    })
            except Exception as e:
                # Deliberately broad: one failing provider (missing key, network
                # error, ...) must not stop the comparison of the others.
                print(f"Error with {provider_name}: {e}")
                results_data.append({
                    "Domain": domain,
                    "Query": query,
                    "Provider": provider_name,
                    "Rank": -1,
                    "Title": "ERROR",
                    "URL": "",
                    "Snippet": str(e),
                    "Has Highlight": False,
                    "Has Citation": False
                })

    # Passing the column list keeps the schema stable for an empty result set.
    return pd.DataFrame(results_data, columns=columns)

In [None]:
# Define Test Queries
# Each entry pairs a domain label with the query text sent to every provider.
queries = [
    {"domain": "Healthcare", "query": "latest diabetes type 2 treatments 2024"},
    {"domain": "Tech", "query": "python 3.13 new features gil"},
    {"domain": "News", "query": "current events in space exploration October 2024"},
    {"domain": "Legal", "query": "GDPR requirements for AI companies"}
]

# Run Comparison
df = compare_search_results(queries)

# Display Results (one row per returned result; failed provider calls appear
# as rows with Rank == -1 and Title == "ERROR")
display(HTML("<h2>Search Results Comparison</h2>"))
display(df)

# Visualization: per-provider count of valid results and of results whose
# snippet contains highlight markup.
if not df.empty:
    # Count valid results per provider (Rank == -1 marks an error row)
    valid_counts = df[df['Rank'] != -1].groupby('Provider')['URL'].count()

    # Highlight presence
    highlight_counts = df[df['Has Highlight'].astype(bool)].groupby('Provider')['URL'].count()

    chart_specs = (
        (valid_counts, 'skyblue', 'Valid Results Returned per Provider'),
        (highlight_counts, 'orange', 'Results with Highlights/Bolding'),
    )
    for counts, bar_color, chart_title in chart_specs:
        # An empty series (e.g. every provider call failed, or no snippet had
        # highlight markup) cannot be drawn as a bar chart, so skip it.
        if counts.empty:
            print(f"No data to plot for: {chart_title}")
            continue

        plt.figure(figsize=(10, 5))
        counts.plot(kind='bar', color=bar_color)
        plt.title(chart_title)
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.show()

In [None]:
# Detailed View of Highlights (HTML Rendering)
# Shows the top-ranked result of each provider for the "Tech" query so the
# snippets can be compared side by side.
print("Detailed HTML Snippets (First result for Tech Query):")
if not df.empty:
    tech_results = df[(df['Domain'] == 'Tech') & (df['Rank'] == 1)]

    for _, row in tech_results.iterrows():
        # Provider, title and URL are plain text that originates from external
        # search results: escape them so stray markup or quotes cannot break
        # the surrounding HTML (the URL sits inside a quoted attribute).
        provider_text = html.escape(str(row['Provider']))
        title_text = html.escape(str(row['Title']))
        url_text = html.escape(str(row['URL']), quote=True)

        # NOTE(review): the snippet is rendered as raw HTML on purpose, so that
        # provider highlight markup (<b>/<strong>) is visible. It is untrusted
        # remote content; sanitize it before sharing rendered output.
        snippet_html = row['Snippet']

        display(HTML(
            f"<h3>{provider_text}</h3>"
            f"<p><b>Title:</b> {title_text}</p>"
            f"<p><b>URL:</b> <a href='{url_text}'>{url_text}</a></p>"
            f"<div style='border:1px solid #ccc; padding:10px; background:#f9f9f9'>{snippet_html}</div>"
        ))