In [None]:
# nb22_duckduckgo_search.ipynb
# Goals: DuckDuckGo API + rate limiting + error handling + safe search

# Cell 1: Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook)
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "../ai_warehouse/cache")

# Each (environment variable, directory) pair is exported and its directory
# created, so all of these caches live under the shared root.
for k, v in (
    ("HF_HOME", f"{AI_CACHE_ROOT}/hf"),
    ("TRANSFORMERS_CACHE", f"{AI_CACHE_ROOT}/hf/transformers"),
    ("HF_DATASETS_CACHE", f"{AI_CACHE_ROOT}/hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", f"{AI_CACHE_ROOT}/hf/hub"),
    ("TORCH_HOME", f"{AI_CACHE_ROOT}/torch"),
):
    os.environ[k] = v
    pathlib.Path(v).mkdir(parents=True, exist_ok=True)

# One-line status: resolved cache root and whether CUDA is usable
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# Cell 2: Dependencies & Imports
import time
import json
from typing import List, Dict, Optional, Union
from dataclasses import dataclass
from datetime import datetime, timedelta
import requests
from urllib.parse import quote_plus
import threading
from collections import defaultdict

# Install if needed
try:
    from duckduckgo_search import DDGS
except ImportError:
    print("Installing duckduckgo-search...")
    import importlib
    import subprocess
    import sys

    # Invoke pip through the kernel's own interpreter: a bare `pip` on PATH can
    # belong to a different environment than the one this notebook imports from.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "duckduckgo-search>=3.8.0"],
        check=True,
    )
    # The package appeared after the interpreter started; refresh the import
    # system's finder caches so the retry below can see it.
    importlib.invalidate_caches()
    from duckduckgo_search import DDGS

In [None]:
# Cell 3: Rate Limiter Class
@dataclass
class RateLimit:
    """Rate-limit policy: at most ``max_requests`` calls per ``window_seconds``."""

    max_requests: int = 10  # per window
    window_seconds: int = 60  # 1 minute window


class RateLimiter:
    """Per-user sliding-window rate limiter.

    Timestamps of accepted requests are kept per ``user_id``; every read or
    write of that state happens while holding ``self.lock``.

    Note: timestamps come from ``datetime.now()`` (wall clock), so a system
    clock adjustment shifts the window accordingly.
    """

    def __init__(self, rate_limit: RateLimit):
        self.rate_limit = rate_limit
        self.requests = defaultdict(list)  # {user_id: [timestamps]}
        self.lock = threading.Lock()

    def _prune(self, user_id: str, now: datetime) -> list:
        """Drop expired timestamps for ``user_id`` and return the live ones.

        The caller must already hold ``self.lock``. Buckets that end up empty
        are removed so that idle users do not accumulate in ``self.requests``.
        """
        cutoff = now - timedelta(seconds=self.rate_limit.window_seconds)
        # .get() avoids the defaultdict side effect of creating a bucket on lookup
        active = [stamp for stamp in self.requests.get(user_id, ()) if stamp > cutoff]
        if active:
            self.requests[user_id] = active
        else:
            self.requests.pop(user_id, None)
        return active

    def is_allowed(self, user_id: str = "default") -> bool:
        """Check if request is allowed under rate limit.

        An allowed request is recorded immediately, so calling this method
        consumes one slot of the user's budget.

        Returns:
            True when the request fits in the current window, False otherwise.
        """
        with self.lock:
            now = datetime.now()
            active = self._prune(user_id, now)

            if len(active) >= self.rate_limit.max_requests:
                return False

            # defaultdict recreates the bucket if pruning removed it
            self.requests[user_id].append(now)
            return True

    def wait_time(self, user_id: str = "default") -> float:
        """Get seconds to wait before next request allowed.

        Returns 0.0 when the user is still under the limit (or has no recorded
        requests); otherwise the time until the oldest in-window request
        expires and frees a slot.
        """
        with self.lock:
            now = datetime.now()
            active = self._prune(user_id, now)

            # Under the limit: a request would be accepted right now
            if not active or len(active) < self.rate_limit.max_requests:
                return 0.0

            wait_until = min(active) + timedelta(
                seconds=self.rate_limit.window_seconds
            )
            return max(0.0, (wait_until - now).total_seconds())

In [None]:
# Cell 4: DuckDuckGo Search Tool
class DDGSearchTool:
    """DuckDuckGo text search with per-user rate limiting and error handling.

    Search-time failures are reported through the returned dict
    (``success=False`` plus an ``error`` message) instead of being raised.
    """

    # Public safe-search levels mapped onto the values passed to DDGS.text().
    # The library's levels are "on" / "moderate" / "off", so "strict" must be
    # translated to "on" to actually request strict filtering.
    _SAFESEARCH_LEVELS = {
        "off": "off",
        "moderate": "moderate",
        "strict": "on",
        "on": "on",
    }

    def __init__(self, rate_limit: Optional["RateLimit"] = None, timeout: int = 10):
        """Create the rate limiter and the underlying DDGS client.

        Args:
            rate_limit: Policy to enforce; defaults to 10 requests per 60 s.
            timeout: Timeout in seconds handed to the DDGS client.
        """
        self.rate_limiter = RateLimiter(
            rate_limit or RateLimit(max_requests=10, window_seconds=60)
        )
        self.timeout = timeout
        self.ddgs = DDGS(timeout=self.timeout)

    def search(
        self,
        query: str,
        max_results: int = 5,
        safe_search: str = "moderate",  # off, moderate, strict
        region: str = "wt-wt",  # worldwide
        user_id: str = "default",
    ) -> Dict:
        """
        Search DuckDuckGo with rate limiting and error handling

        Args:
            query: Search query string (trimmed and cut to 500 characters)
            max_results: Maximum number of results (clamped to 1-20)
            safe_search: Safe search level ("off", "moderate", "strict");
                unknown values fall back to "moderate"
            region: Search region code
            user_id: User identifier for rate limiting

        Returns:
            Dict that always has 'success' and 'results'. On success it also
            has 'query', 'count', 'elapsed_seconds' and 'timestamp'; on failure
            it has 'error', plus 'rate_limited' and 'wait_time' when the
            request was rejected by the rate limiter.
        """
        # Input validation
        if not query or not query.strip():
            return {"success": False, "error": "Empty query", "results": []}

        query = query.strip()[:500]  # Limit query length
        max_results = min(max(1, max_results), 20)  # Clamp 1-20
        ddg_safesearch = self._SAFESEARCH_LEVELS.get(
            str(safe_search).lower(), "moderate"
        )

        # Rate limiting check
        if not self.rate_limiter.is_allowed(user_id):
            wait_time = self.rate_limiter.wait_time(user_id)
            return {
                "success": False,
                "error": f"Rate limited. Wait {wait_time:.1f} seconds",
                "rate_limited": True,
                "wait_time": wait_time,
                "results": [],
            }

        try:
            print(f"[DDG] Searching: '{query[:50]}...' (max_results={max_results})")
            start_time = time.time()

            # Perform search
            raw_results = self.ddgs.text(
                keywords=query,
                max_results=max_results,
                safesearch=ddg_safesearch,
                region=region,
            )
            results = list(raw_results or [])

            elapsed = time.time() - start_time

            # Format results; `or ""` guards against fields that are present
            # but None, which would otherwise make the slicing below raise.
            formatted_results = []
            for rank, result in enumerate(results, start=1):
                href = result.get("href") or ""
                formatted_results.append(
                    {
                        "rank": rank,
                        "title": (result.get("title") or "")[:200],  # Limit title length
                        "body": (result.get("body") or "")[:500],  # Limit snippet length
                        "href": href,
                        "hostname": self._extract_hostname(href),
                    }
                )

            return {
                "success": True,
                "results": formatted_results,
                "query": query,
                "count": len(formatted_results),
                "elapsed_seconds": round(elapsed, 2),
                "timestamp": datetime.now().isoformat(),
            }

        except Exception as e:
            # Deliberately broad: any client/network failure degrades to an
            # error result instead of propagating to the caller.
            error_msg = f"Search failed: {str(e)}"
            print(f"[DDG Error] {error_msg}")
            return {"success": False, "error": error_msg, "results": [], "query": query}

    def _extract_hostname(self, url: str) -> str:
        """Extract hostname (the URL's netloc) for display; "" if unavailable."""
        from urllib.parse import urlparse

        if not url or not isinstance(url, str):
            return ""
        try:
            return urlparse(url).netloc
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. unbalanced IPv6 brackets)
            return ""

    def search_with_retry(self, query: str, max_retries: int = 2, **kwargs) -> Dict:
        """Search with automatic retry on rate limits.

        Only rate-limited responses are retried; successes and other failures
        are returned immediately. Negative ``max_retries`` is treated as 0, so
        at least one attempt is always made.
        """
        attempts = max(0, max_retries) + 1
        result: Dict = {}

        for attempt in range(attempts):
            result = self.search(query, **kwargs)

            if result["success"] or not result.get("rate_limited", False):
                return result

            if attempt < attempts - 1:
                wait_time = result.get("wait_time", 1.0)
                print(
                    f"[DDG] Rate limited, waiting {wait_time:.1f}s before retry {attempt + 1}/{max_retries}"
                )
                time.sleep(wait_time + 0.1)  # Small buffer

        return result

In [None]:
# Cell 5: JSON Schema for Function Calling
# Function-calling schema for the web search tool. The bounds declared here
# (500-character query, 1-20 results, three safe-search levels) are the same
# ones validate_search_args() enforces.
DDG_SEARCH_SCHEMA = dict(
    name="web_search",
    description="Search the web using DuckDuckGo",
    parameters=dict(
        type="object",
        properties=dict(
            query=dict(
                type="string",
                description="Search query string",
                maxLength=500,
            ),
            max_results=dict(
                type="integer",
                description="Maximum number of results to return",
                minimum=1,
                maximum=20,
                default=5,
            ),
            safe_search=dict(
                type="string",
                enum=["off", "moderate", "strict"],
                description="Safe search filtering level",
                default="moderate",
            ),
        ),
        required=["query"],
    ),
)


def validate_search_args(args: Dict) -> Dict:
    """Validate and sanitize search arguments.

    Args:
        args: Raw tool-call arguments; only 'query', 'max_results' and
            'safe_search' are read, any other keys are ignored.

    Returns:
        Dict with exactly 'query' (stripped, at most 500 characters),
        'max_results' (int clamped to 1-20, default 5) and 'safe_search'
        ('off', 'moderate' or 'strict', default 'moderate').

    Raises:
        ValueError: If 'query' is missing, not a string, or blank.
    """
    validated = {}

    # Required: query
    query = args.get("query")
    if query is None:
        raise ValueError("Query is required and cannot be empty")
    if not isinstance(query, str):
        # Tool calls come from untrusted/LLM output; report a type problem as
        # a validation error instead of an AttributeError on .strip()
        raise ValueError("Query must be a string")
    query = query.strip()
    if not query:
        raise ValueError("Query is required and cannot be empty")
    validated["query"] = query[:500]  # Limit length

    # Optional: max_results -- coerce anything int-like, else use the default
    try:
        max_results = int(args.get("max_results", 5))
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric strings, NaN and infinity all land here
        max_results = 5
    validated["max_results"] = min(max(1, max_results), 20)

    # Optional: safe_search (case-insensitive; unknown values -> "moderate")
    safe_search = args.get("safe_search", "moderate")
    if isinstance(safe_search, str):
        safe_search = safe_search.strip().lower()
    if safe_search not in ("off", "moderate", "strict"):
        safe_search = "moderate"
    validated["safe_search"] = safe_search

    return validated


In [None]:
# Cell 6: Tool Registry Integration
# Global tool registry keyed by tool name. Each entry bundles the
# function-calling schema, the argument validator and the callable handler.
TOOL_REGISTRY = {
    "web_search": dict(
        schema=DDG_SEARCH_SCHEMA,
        validator=validate_search_args,
        handler=None,  # Will be set below
    )
}


def register_ddg_tool(search_tool: DDGSearchTool):
    """Bind ``search_tool`` to the "web_search" entry of TOOL_REGISTRY.

    The installed handler takes the raw argument dict of a tool call, runs it
    through validate_search_args() and forwards the cleaned values to
    ``search_tool.search``.

    Returns:
        The handler function that was stored in the registry.
    """

    def handler(args: Dict) -> Dict:
        # Validation errors (ValueError) propagate to the caller unchanged
        clean_args = validate_search_args(args)
        return search_tool.search(**clean_args)

    registry_entry = TOOL_REGISTRY["web_search"]
    registry_entry["handler"] = handler
    return handler

In [None]:
# Cell 7: Safety & Content Filtering
class SearchSafetyFilter:
    """Keyword- and domain-based safety rules for queries and search results.

    Keyword rules are plain case-insensitive substring checks, so a keyword
    such as "hack" also matches "hackathon". Domain rules match a listed
    domain and any of its subdomains, never arbitrary substrings.
    """

    def __init__(self):
        # Basic keyword blocklist (extend as needed)
        self.blocked_keywords = {
            "adult",
            "explicit",
            "nsfw",
            "porn",
            "xxx",
            "violence",
            "illegal",
            "hack",
            "exploit",
            "malware",
        }

        # Allowed domains whitelist (optional, empty means all allowed)
        self.allowed_domains = set()  # e.g., {"wikipedia.org", "github.com"}

        # Blocked domains
        self.blocked_domains = {
            "malicioussite.com",
            "badactor.net",  # Add known bad domains
        }

    @staticmethod
    def _normalize_host(hostname) -> str:
        """Lower-case a netloc-style host and strip userinfo, port and trailing dot."""
        host = (hostname or "").lower()
        host = host.rsplit("@", 1)[-1]  # drop "user:password@"
        if not host.startswith("["):  # leave bracketed IPv6 literals untouched
            host = host.split(":", 1)[0]  # drop ":port"
        return host.rstrip(".")

    @staticmethod
    def _host_in_domains(host: str, domains) -> bool:
        """True if ``host`` equals one of ``domains`` or is a subdomain of it."""
        for domain in domains:
            domain = domain.lower().strip(".")
            if domain and (host == domain or host.endswith("." + domain)):
                return True
        return False

    def is_query_safe(self, query: str) -> tuple[bool, str]:
        """Check if search query is safe.

        Returns:
            ``(True, "")`` when no blocked keyword occurs in the query,
            otherwise ``(False, reason)``. When several keywords match, the
            alphabetically first one is reported so the message is stable.
        """
        query_lower = query.lower()

        # sorted(): set iteration order varies between runs
        for keyword in sorted(self.blocked_keywords):
            if keyword in query_lower:
                return False, f"Query contains blocked keyword: {keyword}"

        return True, ""

    def filter_results(self, results: List[Dict]) -> List[Dict]:
        """Filter search results based on safety rules.

        A result is dropped when its hostname is on the blocklist, when an
        allowlist is configured and the hostname is not on it, or when its
        title/body contains a blocked keyword. Kept results are returned
        unmodified and in their original order.
        """
        filtered = []

        for result in results:
            host = self._normalize_host(result.get("hostname"))

            # Check blocked domains
            if self._host_in_domains(host, self.blocked_domains):
                continue

            # Check allowed domains (if whitelist is set)
            if self.allowed_domains and not self._host_in_domains(
                host, self.allowed_domains
            ):
                continue

            # Check content for blocked keywords (basic); missing or None
            # fields count as empty text
            title_body = (
                (result.get("title") or "") + " " + (result.get("body") or "")
            ).lower()
            if any(keyword in title_body for keyword in self.blocked_keywords):
                continue

            filtered.append(result)

        return filtered

In [None]:
# Cell 8: Complete Search Tool with Safety
class SafeDDGSearch:
    """DDGSearchTool wrapped with an optional SearchSafetyFilter.

    Attributes:
        search_tool: The underlying DDGSearchTool that performs the search.
        safety_filter: A SearchSafetyFilter, or None when safety is disabled.
    """

    def __init__(
        self, rate_limit: Optional["RateLimit"] = None, enable_safety: bool = True
    ):
        self.search_tool = DDGSearchTool(rate_limit)
        self.safety_filter = SearchSafetyFilter() if enable_safety else None

    def search(self, query: str, **kwargs) -> Dict:
        """Safe search with filtering.

        The query is checked first; a blocked query returns an error result
        without calling the underlying search tool. Successful results are then
        passed through the filter.

        Args:
            query: Search query string.
            **kwargs: Forwarded unchanged to ``self.search_tool.search``.

        Returns:
            The underlying tool's result dict. When the filter removed entries,
            'filtered_count' holds how many were dropped, and 'count' is
            updated so it always equals ``len(result["results"])``.
        """
        # Safety check on query
        if self.safety_filter:
            is_safe, reason = self.safety_filter.is_query_safe(query)
            if not is_safe:
                return {
                    "success": False,
                    "error": f"Query blocked by safety filter: {reason}",
                    "results": [],
                }

        # Perform search
        result = self.search_tool.search(query, **kwargs)

        # Filter results if search succeeded
        if result["success"] and self.safety_filter:
            original_count = len(result["results"])
            result["results"] = self.safety_filter.filter_results(result["results"])
            filtered_count = len(result["results"])

            if filtered_count < original_count:
                result["filtered_count"] = original_count - filtered_count
                # Keep 'count' in sync with the list actually returned
                result["count"] = filtered_count

        return result

In [None]:
# Cell 9: Smoke Test
def smoke_test():
    """Test DuckDuckGo search functionality.

    Runs four checks and prints their outcome: a basic search, the rate
    limiter, the query safety filter and the tool-registry handler. The
    searches go to the live DuckDuckGo service.
    """
    print("=== DDG Search Smoke Test ===")

    # Create search tool with relaxed rate limits for testing
    rate_limit = RateLimit(max_requests=5, window_seconds=10)
    search_tool = SafeDDGSearch(rate_limit=rate_limit, enable_safety=True)

    # Test 1: Basic search
    print("\n1. Basic search test:")
    result = search_tool.search("Python programming tutorial", max_results=3)
    print(f"Success: {result['success']}")
    if result["success"]:
        print(f"Found {result['count']} results in {result['elapsed_seconds']}s")
        for i, res in enumerate(result["results"][:2]):
            print(f"  [{i+1}] {res['title'][:60]}...")
            print(f"      {res['hostname']}")
    else:
        print(f"Error: {result['error']}")

    # Test 2: Rate limiting
    print("\n2. Rate limiting test:")
    # Use a dedicated tool whose budget (2) is smaller than the number of
    # requests sent (3), so the last request is guaranteed to be rejected.
    # With the 5-request budget above, three requests would never hit the limit.
    limited_tool = SafeDDGSearch(
        rate_limit=RateLimit(max_requests=2, window_seconds=10), enable_safety=True
    )
    for i in range(3):
        result = limited_tool.search(f"test query {i}", max_results=1)
        if result.get("rate_limited"):
            print(
                f"  Request {i+1}: Rate limited (wait {result.get('wait_time', 0):.1f}s)"
            )
            break
        else:
            print(f"  Request {i+1}: {'Success' if result['success'] else 'Failed'}")

    # Test 3: Safety filter
    print("\n3. Safety filter test:")
    unsafe_query = "how to hack websites"
    result = search_tool.search(unsafe_query, max_results=1)
    print(f"Unsafe query blocked: {not result['success']}")
    if not result["success"]:
        print(f"  Reason: {result['error']}")

    # Test 4: Tool registry
    print("\n4. Tool registry test:")
    # Registers the raw DDGSearchTool, i.e. the handler bypasses the safety filter
    register_ddg_tool(search_tool.search_tool)
    handler = TOOL_REGISTRY["web_search"]["handler"]
    if handler:
        test_args = {"query": "machine learning", "max_results": 2}
        result = handler(test_args)
        print(f"Registry handler: {'Success' if result['success'] else 'Failed'}")

    print("\n=== Smoke Test Complete ===")


# Run smoke test
smoke_test()

In [None]:
# Cell 10: Example Usage Patterns
def example_usage():
    """Demonstrate different usage patterns.

    Shows a plain filtered search, a search with retry on the underlying
    tool, and a function-calling style invocation through TOOL_REGISTRY.
    The searches go to the live DuckDuckGo service.
    """
    print("=== Usage Examples ===")

    # Setup
    search_tool = SafeDDGSearch(
        rate_limit=RateLimit(max_requests=10, window_seconds=60), enable_safety=True
    )

    # Example 1: Simple search
    print("\n1. Simple search:")
    result = search_tool.search("what is RAG in AI", max_results=3)
    if result["success"]:
        print(f"Query: {result['query']}")
        # Report what was returned and how many entries the safety filter removed
        print(
            f"Results: {len(result['results'])} returned, "
            f"{result.get('filtered_count', 0)} filtered out"
        )
        for res in result["results"][:1]:  # Show first result
            print(f"  Title: {res['title']}")
            print(f"  Snippet: {res['body'][:100]}...")
            print(f"  URL: {res['href']}")

    # Example 2: Search with retry
    print("\n2. Search with retry:")
    result = search_tool.search_tool.search_with_retry(
        "Transformer attention mechanism", max_results=2, max_retries=1
    )
    print(f"Retry search: {'Success' if result['success'] else 'Failed'}")

    # Example 3: Function calling format
    print("\n3. Function calling format:")
    function_call = {
        "tool": "web_search",
        "args": {
            "query": "Chinese text chunking strategies",
            "max_results": 3,
            "safe_search": "moderate",
        },
    }

    try:
        validated_args = validate_search_args(function_call["args"])
        print(f"Validated args: {validated_args}")

        # Simulate tool execution
        handler = TOOL_REGISTRY["web_search"]["handler"]
        if handler is None:
            # No handler registered yet (e.g. the smoke-test cell was skipped):
            # register one here so this example does not depend on hidden state.
            handler = register_ddg_tool(search_tool.search_tool)
        result = handler(function_call["args"])
        print(f"Tool execution: {'Success' if result['success'] else 'Failed'}")
    except Exception as e:
        print(f"Validation error: {e}")

    print("\n=== Examples Complete ===")


# Run examples
example_usage()

In [None]:
# Cell 11: Performance & Monitoring
def performance_benchmark():
    """Benchmark search performance.

    Issues five fixed queries against a fresh DDGSearchTool, timing each call,
    then prints the success count, total result count, mean latency and
    success rate. The searches go to the live DuckDuckGo service.
    """
    print("=== Performance Benchmark ===")

    tool = DDGSearchTool(rate_limit=RateLimit(max_requests=20, window_seconds=60))

    queries = [
        "Python machine learning",
        "RAG retrieval augmented generation",
        "Chinese NLP preprocessing",
        "FAISS vector database",
        "transformer attention mechanism",
    ]

    # Running totals across all queries
    total_time = 0
    successful_searches = 0
    total_results = 0

    for number, query in enumerate(queries, start=1):
        print(f"\nQuery {number}: {query}")

        started = time.time()
        outcome = tool.search(query, max_results=3)
        elapsed = time.time() - started

        if not outcome["success"]:
            print(f"  ✗ Failed: {outcome['error']}")
        else:
            successful_searches += 1
            total_results += outcome["count"]
            print(f"  ✓ Success: {outcome['count']} results in {elapsed:.2f}s")

        # Failed searches still count towards the average latency
        total_time += elapsed

        # Small delay to avoid rate limiting
        time.sleep(0.5)

    query_count = len(queries)
    print("\n=== Benchmark Results ===")
    print(f"Successful searches: {successful_searches}/{query_count}")
    print(f"Total results: {total_results}")
    print(f"Average time per search: {total_time/query_count:.2f}s")
    print(f"Success rate: {successful_searches/query_count*100:.1f}%")


# Uncomment to run benchmark (may hit rate limits)
# performance_benchmark()

In [None]:
# Cell 12: What We Built & Key Takeaways
# Closing recap printed as this cell's output: the components built in this
# notebook, the core concepts, pitfalls, use cases and follow-up notebooks.
print(
    """
=== What We Built ===
1. ✅ RateLimiter: Thread-safe rate limiting with configurable windows
2. ✅ DDGSearchTool: Core search with error handling and timeouts
3. ✅ Safety filtering: Query validation and result filtering
4. ✅ Tool registry: JSON schema validation for function calling
5. ✅ Retry logic: Automatic retry on rate limit hits
6. ✅ Performance monitoring: Timing and success rate tracking

=== Core Concepts ===
• Rate limiting: Essential for API stability and avoiding blocks
• Error handling: Graceful degradation when searches fail
• Safety filtering: Prevent misuse with query/result validation
• Function calling: Structured tool interface for LLM integration
• Monitoring: Track performance and success rates

=== Pitfalls & Solutions ===
❌ Rate limit violations → ✅ Implement proper rate limiting with retries
❌ Unsafe search queries → ✅ Content filtering and validation
❌ Network timeouts → ✅ Configurable timeouts and error handling
❌ Malformed tool calls → ✅ JSON schema validation with type checking
❌ Memory leaks in threading → ✅ Proper cleanup in RateLimiter

=== When to Use This ===
• Real-time information retrieval for AI agents
• Research tasks requiring current web data
• Fact-checking and information verification
• Content discovery and topic exploration
• As a component in ReAct/tool-calling workflows

=== Next Steps ===
• Integrate with content extraction (trafilatura) → nb23
• Add to ReAct agent workflow → nb25
• Implement result caching for repeated queries
• Add more sophisticated safety rules
• Support for different search types (news, images, etc.)
"""
)