<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Masonry_AI_Researcher_Mohd_Kaif.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
# Install required libraries
!pip install -qU langchain langchain-groq langchain-community crawl4ai requests beautifulsoup4 googlesearch-python nest-asyncio tavily-python langgraph pyOpenSSL==24.2.1

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.4/58.4 kB 4.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 140.3/140.3 kB 6.9 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 147.7/147.7 kB 8.7 MB/s eta 0:00:00
[?25h

In [21]:
import os
import asyncio
import json
from typing import List, Dict, Any, TypedDict, Annotated
from datetime import datetime
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.tools import tool
from langgraph.graph import StateGraph, END
from langchain_core.messages import HumanMessage, AIMessage
from crawl4ai import AsyncWebCrawler
from bs4 import BeautifulSoup
import requests
import nest_asyncio
from langchain_core.prompts import MessagesPlaceholder

# Apply nest_asyncio for compatibility
nest_asyncio.apply()

# Set API keys (replace with your actual keys)
os.environ["GROQ_API_KEY"] = "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"
os.environ["TAVILY_API_KEY"] = "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"
os.environ["SERPER_API_KEY"] = "95c2797a69b167639c98ab054e8597d752c6fe6d"

In [22]:
# Define Agent State
class AgentState(TypedDict):
    query: str
    session_id: str
    query_analysis: Dict[str, Any]
    search_results: List[Dict[str, str]]
    crawled_contents: List[Dict[str, Any]]
    analyzed_contents: List[Dict[str, Any]]
    final_report: Dict[str, Any]
    chat_history: List[Annotated[HumanMessage, AIMessage]]

# Initialize Groq LLMs
def init_llms() -> Dict[str, ChatGroq]:
    """Initialize Groq LLMs for summarization and analysis."""
    return {
        'summary': ChatGroq(model_name="llama3-70b-8192", temperature=0.7),
        'analysis': ChatGroq(model_name="llama3-70b-8192", temperature=0.5)  # Replaced deprecated mixtral-8x7b-32768
    }

# Initialize HTTP session
def init_session():
    """Initialize a requests session with default headers."""
    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0)"})
    return session

In [23]:
# Define Tools
@tool
def search_google(query: str) -> List[Dict[str, str]]:
    """Perform a Google search using googlesearch-python."""
    print(f"Searching Google for: {query}")
    try:
        from googlesearch import search
        results = [{'url': url, 'title': '', 'snippet': ''} for url in search(query, num_results=5)]
        return results
    except Exception as e:
        print(f"Google search error: {str(e)}")
        return []

@tool
def search_tavily(query: str) -> List[Dict[str, str]]:
    """Perform a search using Tavily API."""
    print(f"Searching Tavily for: {query}")
    try:
        from tavily import TavilyClient
        tavily = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])
        results = tavily.search(query, max_results=5)
        return [{'url': r['url'], 'title': r['title'], 'snippet': r['content']} for r in results['results']]
    except Exception as e:
        print(f"Tavily search error: {str(e)}")
        return []

@tool
def search_serper(query: str) -> List[Dict[str, str]]:
    """Perform a search using Google Serper API."""
    print(f"Searching Serper for: {query}")
    try:
        url = "https://google.serper.dev/search"
        headers = {
            "X-API-KEY": os.environ["SERPER_API_KEY"],
            "Content-Type": "application/json"
        }
        payload = {"q": query, "num": 5}
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()
        results = response.json().get('organic', [])
        return [{'url': r['link'], 'title': r.get('title', ''), 'snippet': r.get('snippet', '')} for r in results]
    except Exception as e:
        print(f"Serper search error: {str(e)}")
        return []

@tool
async def crawl_url(url: str) -> Dict[str, Any]:
    """Crawl a URL using Crawl4AI with advanced features."""
    print(f"Crawling URL: {url}")
    try:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=url,
                params={
                    "return_format": "markdown",
                    "proxy_enabled": False,
                    "request_timeout": 30,
                    "limit": 1,
                    "css_selector": "article, main, .content",
                    "js_execution": True,
                    "extract_hidden": True
                }
            )
            if result.success:
                return {
                    'url': url,
                    'title': result.title or url,
                    'content': result.markdown[:2000],
                    'success': True
                }
    except Exception as e:
        print(f"Error crawling {url}: {str(e)}")
    return {'url': url, 'title': '', 'content': '', 'success': False}

@tool
async def extract_content_fallback(url: str) -> Dict[str, Any]:
    """Fallback content extraction using BeautifulSoup."""
    print(f"Extracting content (fallback) from: {url}")
    session = init_session()
    try:
        response = await asyncio.get_event_loop().run_in_executor(None, lambda: session.get(url, timeout=10))
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text(separator=' ', strip=True)
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)[:2000]
        title = soup.title.string if soup.title else url
        return {'url': url, 'title': title, 'content': text, 'success': True}
    except Exception as e:
        print(f"Error extracting content from {url}: {str(e)}")
        return {'url': url, 'title': '', 'content': '', 'success': False}

In [24]:
# Agent Functions
def query_analysis(state: AgentState) -> AgentState:
    """Analyze the query to extract search terms and context."""
    llm = init_llms()['analysis']
    prompt = ChatPromptTemplate.from_template(
        "Analyze the query: '{query}'.\n"
        "Return a JSON object with:\n"
        "- 'search_terms': Main keywords for search.\n"
        "- 'context': Brief context or intent of the query.\n"
        "- 'complexity': 'simple' or 'complex' based on query structure.\n"
        "- 'info_type': 'facts', 'news', 'opinions', or 'mixed'."
    )
    chain = prompt | llm | JsonOutputParser()
    try:
        result = chain.invoke({"query": state['query']})
        print(f"Query analysis: {result}")
        state['query_analysis'] = result
    except Exception as e:
        print(f"Query analysis error: {str(e)}")
        state['query_analysis'] = {
            'search_terms': state['query'],
            'context': 'General query',
            'complexity': 'simple',
            'info_type': 'mixed'
        }
    return state

def search(state: AgentState) -> AgentState:
    """Execute web searches using multiple tools."""
    llm = init_llms()['analysis']
    tools = [search_google, search_tavily, search_serper]

    # Define a ReAct-compatible prompt
    prompt = ChatPromptTemplate.from_template(
        "You are a search agent. Answer the following query using the provided tools.\n\n"
        "Query: {input}\n\n"
        "Tools available: {tool_names}\n\n"
        "Tool descriptions:\n{tools}\n\n"
        "Use the tools to find relevant URLs for the query. Return a list of URLs with titles and snippets.\n\n"
        "Agent scratchpad:\n{agent_scratchpad}"
    )

    # Create the ReAct agent
    agent = create_react_agent(llm, tools, prompt)
    executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

    try:
        # Let the agent decide which tools to use
        result = executor.invoke({"input": state['query_analysis']['search_terms']})
        search_results = result.get('output', []) if isinstance(result, dict) else []

        # Ensure search_results is a list of dictionaries
        if isinstance(search_results, list) and all(isinstance(r, dict) for r in search_results):
            state['search_results'] = search_results
        else:
            state['search_results'] = [{'url': r, 'title': '', 'snippet': ''} for r in search_results]

        # Deduplicate URLs
        seen_urls = set()
        state['search_results'] = [r for r in state['search_results'] if r['url'] not in seen_urls and not seen_urls.add(r['url'])]
        print(f"Search results: {len(state['search_results'])} unique URLs found")
    except Exception as e:
        print(f"Search error: {str(e)}")
        state['search_results'] = []
    return state

async def crawl(state: AgentState) -> AgentState:
    """Crawl URLs to extract content."""
    contents = await asyncio.gather(*[crawl_url.invoke(r['url']) for r in state['search_results'][:5]], return_exceptions=True)
    for i, content in enumerate(contents):
        if isinstance(content, Exception) or not content['success']:
            print(f"Retrying crawl for {state['search_results'][i]['url']} with fallback")
            contents[i] = await extract_content_fallback.invoke(state['search_results'][i]['url'])
    state['crawled_contents'] = contents
    print(f"Crawled {len([c for c in contents if c['success']])} URLs successfully")
    return state

def analyze_content(state: AgentState) -> AgentState:
    """Analyze crawled content for relevance and key information."""
    llm = init_llms()['summary']
    prompt = ChatPromptTemplate.from_template(
        "Given the query: '{query}' and the following content:\n"
        "{content}\n\n"
        "Return a JSON object with:\n"
        "- 'url': Source URL.\n"
        "- 'summary': Brief summary (50-100 words).\n"
        "- 'relevance': Score from 0 to 1 (1 being most relevant).\n"
        "- 'key_points': List of 2-3 key points."
    )
    chain = prompt | llm | JsonOutputParser()
    results = []
    for content in state['crawled_contents']:
        if content['success']:
            try:
                result = chain.invoke({"query": state['query'], "content": content['content'][:1000]})
                result['url'] = content['url']
                results.append(result)
                print(f"Analyzed content from {content['url']}: {result['summary']}")
            except Exception as e:
                print(f"Error analyzing content from {content['url']}: {str(e)}")
    state['analyzed_contents'] = results
    return state

async def synthesize(state: AgentState) -> AgentState:
    """Synthesize analyzed content into a final report."""
    llm = init_llms()['summary']
    prompt = ChatPromptTemplate.from_template(
        "Query: {query}\n"
        "Analyzed Content: {content}\n\n"
        "Synthesize the information into a JSON report with:\n"
        "- 'query': Original query.\n"
        "- 'timestamp': Current timestamp.\n"
        "- 'summary': Comprehensive summary (150-200 words).\n"
        "- 'sources': List of URLs and their summaries.\n"
        "- 'confidence': Overall confidence score (0 to 1).\n"
        "- 'recommendations': List of 1-2 follow-up actions."
    )
    chain = prompt | llm | JsonOutputParser()
    content_str = json.dumps(state['analyzed_contents'], indent=2)
    try:
        report = chain.invoke({"query": state['query'], "content": content_str})
        report['timestamp'] = datetime.utcnow().isoformat()
        print(f"Synthesized report for query: {state['query']}")
        state['final_report'] = report
    except Exception as e:
        print(f"Synthesis error: {str(e)}")
        state['final_report'] = {
            'query': state['query'],
            'timestamp': datetime.utcnow().isoformat(),
            'summary': 'Error synthesizing information.',
            'sources': [],
            'confidence': 0.0,
            'recommendations': []
        }
    return state

In [25]:
# LangGraph Workflow
def build_graph() -> StateGraph:
    """Build the LangGraph workflow."""
    graph = StateGraph(AgentState)

    # Add nodes with unique names
    graph.add_node("query_analyzer", query_analysis)
    graph.add_node("searcher", search)
    graph.add_node("crawler", crawl)
    graph.add_node("content_analyzer", analyze_content)
    graph.add_node("synthesizer", synthesize)

    # Define edges
    graph.set_entry_point("query_analyzer")
    graph.add_edge("query_analyzer", "searcher")
    graph.add_edge("searcher", "crawler")
    graph.add_edge("crawler", "content_analyzer")
    graph.add_edge("content_analyzer", "synthesizer")
    graph.add_edge("synthesizer", END)

    return graph.compile()

# Save Report
def save_report(report: Dict[str, Any], filename: str = "research_report.json"):
    """Save the research report to a JSON file."""
    try:
        with open(filename, 'w') as f:
            json.dump(report, f, indent=2)
        print(f"Report saved to {filename}")
    except Exception as e:
        print(f"Error saving report to {filename}: {str(e)}")

In [26]:
# Main Research Function
async def research(query: str, session_id: str = "default") -> Dict[str, Any]:
    """Execute the research pipeline."""
    print(f"Starting research for query: {query}")
    graph = build_graph()
    state = AgentState(
        query=query,
        session_id=session_id,
        query_analysis={},
        search_results=[],
        crawled_contents=[],
        analyzed_contents=[],
        final_report={},
        chat_history=[]
    )
    final_state = await graph.ainvoke(state)
    return final_state['final_report']

# Test the Agent
async def test_research_agent():
    """Test the research pipeline with sample queries."""
    test_queries = [
        "What are the latest advancements in quantum computing as of 2025?",
        "Compare the economic impacts of AI adoption in the US vs. China."
    ]
    for query in test_queries:
        print(f"\nTesting query: {query}")
        result = await research(query, session_id=f"test_{query.replace(' ', '_')}")
        print(json.dumps(result, indent=2))
        save_report(result, f"research_report_{query.replace(' ', '_')}.json")

# Run tests
if __name__ == "__main__":
    asyncio.run(test_research_agent())


Testing query: What are the latest advancements in quantum computing as of 2025?
Starting research for query: What are the latest advancements in quantum computing as of 2025?
Query analysis: {'search_terms': ['latest advancements', 'quantum computing', '2025'], 'context': 'The user is looking for recent developments and updates in the field of quantum computing as of the year 2025.', 'complexity': 'simple', 'info_type': 'facts'}


> Entering new AgentExecutor chain...
Search error: An output parsing error occurred. In order to pass this error back to the agent and have it try again, pass `handle_parsing_errors=True` to the AgentExecutor. This is the error: Could not parse LLM output: `To answer the query, I'll use a combination of the available tools to find relevant URLs. Here's my approach:

```python
def search_agent(query):
    results = []
    
    # Search Google using googlesearch-python
    google_results = search_google(' '.join(query))
    for result in google_results:
    