In [None]:
import os
import json
import http.client
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
from enum import Enum
import re
from datetime import datetime, timedelta

In [1]:
class ConfidenceLevel(Enum):
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"

@dataclass
class QueryAnalysis:
    confidence: ConfidenceLevel
    reasoning: str
    requires_web_search: bool
    search_query: Optional[str] = None

class WebBrowsingAgent:
    def __init__(self, serper_api_key: str):
        self.serper_api_key = serper_api_key
        self.knowledge_cutoff = datetime(2024, 4, 1)  # Adjust based on your model

        # Keywords that indicate need for current information
        self.current_info_keywords = [
            'current', 'latest', 'recent', 'now', 'today', 'this year',
            'price', 'stock', 'weather', 'news', 'trending'
        ]

        # Topics that change frequently
        self.dynamic_topics = [
            'stock market', 'cryptocurrency', 'weather', 'news', 'politics',
            'sports scores', 'exchange rates', 'technology releases'
        ]

    def analyze_query_confidence(self, query: str) -> QueryAnalysis:
        """
        Analyze the query to determine confidence level and need for web search
        """
        query_lower = query.lower()

        # Check for current information indicators
        has_current_keywords = any(keyword in query_lower for keyword in self.current_info_keywords)

        # Check for dynamic topics
        has_dynamic_topics = any(topic in query_lower for topic in self.dynamic_topics)

        # Check for specific dates after knowledge cutoff
        date_pattern = r'\b(202[4-9]|20[3-9]\d)\b'
        has_recent_dates = bool(re.search(date_pattern, query))

        # Check for specific people, companies, or events that might be recent
        proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', query)
        has_specific_entities = len(proper_nouns) > 0

        # Determine confidence level
        if has_current_keywords or has_recent_dates:
            confidence = ConfidenceLevel.LOW
            reasoning = "Query contains current/recent information indicators"
            requires_search = True
        elif has_dynamic_topics:
            confidence = ConfidenceLevel.LOW
            reasoning = "Query involves rapidly changing topics"
            requires_search = True
        elif has_specific_entities and self._is_potentially_unknown_entity(query):
            confidence = ConfidenceLevel.MEDIUM
            reasoning = "Query contains specific entities that may not be in training data"
            requires_search = True
        elif self._is_factual_query(query):
            confidence = ConfidenceLevel.HIGH
            reasoning = "Query involves well-established factual information"
            requires_search = False
        else:
            confidence = ConfidenceLevel.MEDIUM
            reasoning = "Uncertain about completeness of available information"
            requires_search = True

        search_query = self._generate_search_query(query) if requires_search else None

        return QueryAnalysis(
            confidence=confidence,
            reasoning=reasoning,
            requires_web_search=requires_search,
            search_query=search_query
        )

    def _is_potentially_unknown_entity(self, query: str) -> bool:
        """Check if query contains entities that might not be well-known"""
        # Simple heuristic - could be improved with NER
        unknown_indicators = [
            'who is', 'what is', 'tell me about', 'information about'
        ]
        return any(indicator in query.lower() for indicator in unknown_indicators)

    def _is_factual_query(self, query: str) -> bool:
        """Check if query is about well-established facts"""
        factual_indicators = [
            'capital of', 'formula for', 'definition of', 'how to calculate',
            'what does', 'explain', 'difference between'
        ]
        return any(indicator in query.lower() for indicator in factual_indicators)

    def _generate_search_query(self, query: str) -> str:
        """Generate an optimized search query"""
        # Remove question words and optimize for search
        stop_words = ['what', 'who', 'when', 'where', 'why', 'how', 'is', 'are', 'the', 'a', 'an']
        words = query.lower().split()
        filtered_words = [word for word in words if word not in stop_words]
        return ' '.join(filtered_words[:6])  # Limit to 6 words for better search results

    def search_web(self, query: str, country_code: str = "us") -> Dict:
        """Search the web using Serper API"""
        try:
            conn = http.client.HTTPSConnection("google.serper.dev")
            payload = json.dumps({
                "q": query,
                "gl": country_code,
                "num": 5  # Limit results
            })
            headers = {
                'X-API-KEY': self.serper_api_key,
                'Content-Type': 'application/json'
            }

            conn.request("POST", "/search", payload, headers)
            res = conn.getresponse()
            data = res.read()
            conn.close()

            return json.loads(data.decode("utf-8"))
        except Exception as e:
            return {"error": f"Search failed: {str(e)}"}

    def process_search_results(self, search_results: Dict) -> str:
        """Process and summarize search results"""
        if "error" in search_results:
            return f"Search error: {search_results['error']}"

        if "organic" not in search_results:
            return "No search results found."

        summary = "Based on web search results:\n\n"

        for i, result in enumerate(search_results["organic"][:3], 1):
            title = result.get("title", "No title")
            snippet = result.get("snippet", "No description")
            link = result.get("link", "")

            summary += f"{i}. **{title}**\n"
            summary += f"   {snippet}\n"
            summary += f"   Source: {link}\n\n"

        return summary

    def answer_query(self, query: str, base_knowledge_answer: str = None) -> Dict:
        """
        Main method to answer a query, deciding whether to use web search
        """
        analysis = self.analyze_query_confidence(query)

        response = {
            "query": query,
            "confidence_analysis": {
                "level": analysis.confidence.value,
                "reasoning": analysis.reasoning,
                "web_search_used": analysis.requires_web_search
            }
        }

        if not analysis.requires_web_search:
            # High confidence - use base knowledge
            response["answer"] = base_knowledge_answer or "Based on my knowledge: [Provide answer here]"
            response["sources"] = ["Internal knowledge base"]
        else:
            # Low/Medium confidence - search the web
            search_results = self.search_web(analysis.search_query)
            web_summary = self.process_search_results(search_results)

            if base_knowledge_answer:
                response["answer"] = f"{base_knowledge_answer}\n\n{web_summary}"
            else:
                response["answer"] = web_summary

            response["sources"] = ["Web search results"]
            response["search_query_used"] = analysis.search_query

        return response

# Example usage and testing
def main():
    # Initialize agent (you'll need to set your API key)
    agent = WebBrowsingAgent(serper_api_key=os.environ.get("SERPER_API_KEY", "your_api_key_here"))

    # Test queries with different confidence levels
    test_queries = [
        "What is the capital of France?",  # High confidence
        "Current price of Bitcoin",        # Low confidence - current info
        "Who is Ojasw Kant?",             # Medium confidence - specific person
        "Latest news about AI",           # Low confidence - current info
        "How to calculate compound interest", # High confidence - factual
        "Weather in New York today"       # Low confidence - current info
    ]

    print("=== Web Browsing Agent Test ===\n")

    for query in test_queries:
        print(f"Query: {query}")

        # Analyze confidence
        analysis = agent.analyze_query_confidence(query)
        print(f"Confidence: {analysis.confidence.value}")
        print(f"Reasoning: {analysis.reasoning}")
        print(f"Web search needed: {analysis.requires_web_search}")

        if analysis.requires_web_search:
            print(f"Search query: {analysis.search_query}")

        print("-" * 50)

if __name__ == "__main__":
    main()

=== Web Browsing Agent Test ===

Query: What is the capital of France?
Confidence: medium
Reasoning: Query contains specific entities that may not be in training data
Web search needed: True
Search query: capital of france?
--------------------------------------------------
Query: Current price of Bitcoin
Confidence: low
Reasoning: Query contains current/recent information indicators
Web search needed: True
Search query: current price of bitcoin
--------------------------------------------------
Query: Who is Ojasw Kant?
Confidence: medium
Reasoning: Query contains specific entities that may not be in training data
Web search needed: True
Search query: ojasw kant?
--------------------------------------------------
Query: Latest news about AI
Confidence: low
Reasoning: Query contains current/recent information indicators
Web search needed: True
Search query: latest news about ai
--------------------------------------------------
Query: How to calculate compound interest
Confidence: high