# Deep Search Agent

In [None]:
!pip install -qqq crawl4ai
!crawl4ai-setup
!pip install -qqq langchain langchain-community google-ai-generativelanguage==0.6.15
!pip install -qqq langchain-google-genai
!pip install -qqq googlesearch-python mistralai PyPDF2

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/292.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.8/292.8 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.0/278.0 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [None]:
import os
import json
import base64
import requests
from tqdm.auto import tqdm
from mistralai import Mistral
from PyPDF2 import PdfReader, PdfWriter

from pydantic import BaseModel, Field
from typing import List, Tuple

from googlesearch import search

from langchain_core.tools import tool
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage, AIMessage, HumanMessage, ToolMessage

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

In [None]:
# API KEYS
# Both keys are read from the Colab `userdata` store under the names given
# below, so this cell only runs where `google.colab` is importable.
import os
from google.colab import userdata

# Passed as `api_key` to every ChatGoogleGenerativeAI instance in this file.
GEMINI_API_KEY = userdata.get('GOOGLE_API_MLWH')
# Passed as `api_key` to the Mistral client used for PDF OCR.
MISTRAL_API_KEY = userdata.get('MISTRAL_API_KEY')

## Helper functions

In [None]:
def generate_plan(query: str) -> dict:
    """Ask Gemini to turn a user query into a structured research plan.

    Args:
        query: The user's research question; sent verbatim as the user message.

    Returns:
        The structured model response dumped to a dict with the keys
        ``query_breakdown`` (str), ``sub_questions`` (list of str) and
        ``search_queries`` (list of str).
    """
    llm = ChatGoogleGenerativeAI(
        model="models/gemini-2.0-flash",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        api_key=GEMINI_API_KEY,
    )

    planning_system_prompt = """
You are an expert Research Analyst and Strategic Planner. Your mission is to deconstruct a user's query into a comprehensive and logical research plan. This plan will be the foundation for generating a thorough, well-supported report.

You must meticulously follow the instructions for each field of the output.

**1. Task: `query_breakdown`**

Your first task is to deeply analyze the user's query. Do not just summarize it. Your breakdown must identify the following components:
* **Core Intent:** What is the user's ultimate goal? What are they trying to achieve or understand?
* **Key Entities & Concepts:** Identify the main subjects, organizations, people, technologies, or abstract concepts at the heart of the query.
* **Scope & Constraints:** Define the boundaries of the query. Is it limited by time (e.g., "in the last 5 years"), geography (e.g., "in Europe"), or other factors? If not specified, note that the scope is broad.

**2. Task: `sub_questions`**

Based on your `query_breakdown`, create a list of sub-questions that form a logical pathway to a complete answer. The questions must build upon each other, progressing from foundational knowledge to specific, analytical details.

* **Structure:** Start with foundational questions (e.g., "What is [concept]?", "What is the history of [event]?"), then move to core details (e.g., "How does [entity A] work?", "What are the key factors driving [trend]?"), and conclude with analytical or comparative questions (e.g., "What are the long-term impacts of [event]?", "How does [option A] compare to [option B]?").
* **Clarity:** Each question should be clear, concise, and answerable.
* **Requirement:** Generate at least 3, but preferably 4-5, sub-questions to ensure thoroughness.

**3. Task: `search_queries`**

For each sub-question you generated, create a web search query. It is crucial that you **do not simply rephrase the sub-questions**. Instead, craft queries a human expert would use to get the best possible results from a search engine like Google.

* **Optimization:** Employ search operator best practices. Think about keywords, phrases in quotes for exact matches, and adding context words.
* **Query-Crafting Techniques:**
    * **Keyword Queries:** `[entity] benefits disadvantages`
    * **Statistical Queries:** `[topic] statistics 2024` or `growth rate of [industry]`
    * **Comparative Queries:** `[product A] vs [product B] review`
    * **Process Queries:** `how to implement [strategy]` or `[technology] working principle`
    * **Authoritative Source Queries:** `[topic] site:.gov` or `[medical condition] site:who.int`
* Use the technique most suitable for the topic at hand, in case of uncertainty, just convert the sub-question to a web search query.
    """

    # Shape the model's answer is constrained to via structured output.
    class PlanningOutput(BaseModel):
        query_breakdown: str = Field(..., description="breakdown of the user query")
        sub_questions: list[str] = Field(..., description="list of sub-questions to answer")
        search_queries: list[str] = Field(..., description="list of web search queries to run")

    structured_llm = llm.with_structured_output(PlanningOutput)
    conversation = [("system", planning_system_prompt), ("user", query)]

    return structured_llm.invoke(conversation).model_dump()

In [None]:
# Try the planner on a sample query; `res` holds the plan dict it returns.
res = generate_plan("How the retail industry has changed in the past 3 years")

In [None]:
res

{'query_breakdown': "The user wants to understand the changes that have occurred in the retail industry over the past three years. The core intent is to get an overview of the transformations within the retail sector. Key entities include the 'retail industry' and the timeframe is 'past 3 years'. The scope is broad, covering all aspects of the retail industry without specific geographic or sub-sector limitations.",
 'sub_questions': ['What were the major trends affecting the retail industry in the past 3 years?',
  'How did the COVID-19 pandemic impact the retail industry in the past 3 years?',
  'What has been the growth rate of e-commerce in the retail sector over the past 3 years?',
  'How have supply chain disruptions affected the retail industry in the past 3 years?',
  'What new technologies have been adopted by the retail industry in the past 3 years?'],
 'search_queries': ['"retail industry trends" 2021-2024',
  '"impact of COVID-19 on retail" 2021-2024',
  '"e-commerce growth"

In [None]:
def web_search(search_queries: list[str], num_results: int = 5) -> list[list[tuple[str, str]]]:
    """Run each query through Google search and keep only the usable URLs.

    Args:
        search_queries: Search strings to run, one Google search per entry.
        num_results: Number of results requested from Google per query.

    Returns:
        One list per input query, in the same order as ``search_queries``.
        Each inner list holds ``(url_type, url)`` tuples for the URLs that
        ``should_skip_url`` did not reject. A query whose search request
        fails still gets an entry (possibly empty) so the result stays
        positionally aligned with the queries.
    """
    results = []
    for search_query in search_queries:
        query_related_urls = []
        try:
            # The URLs are consumed inside the `try` so that a request error
            # raised by the search itself is handled per query.
            for url in search(search_query, num_results=num_results, timeout=5, unique=True):
                verdict, url_type = should_skip_url(url)
                if not verdict:
                    query_related_urls.append((url_type, url))
        except requests.exceptions.RequestException as e:
            # One failed search must not abort the remaining queries; keep
            # whatever URLs were already collected for this one.
            print(f"[ERROR] Search failed for query: {search_query}\n\tDetails: {e}")
        results.append(query_related_urls)
    return results

In [None]:
def should_skip_url(url: str) -> Tuple[bool, str]:
    """Decide whether a search-result URL is worth fetching.

    The URL is rejected when it is not HTTPS, when its ``Content-Type``
    starts with one of the non-textual prefixes below, or when the GET
    request fails. A failing HEAD request alone is not a reason to skip:
    the GET that follows makes the final decision.

    Args:
        url: The URL to probe.

    Returns:
        ``(skip, kind)``. When ``skip`` is True, ``kind`` is ``'non-https'``,
        ``'error'`` or the matched MIME prefix (trailing ``/`` stripped).
        When ``skip`` is False, ``kind`` is ``'pdf'`` or ``'html'``.
    """
    if not url.lower().startswith('https://'):
        print(f"[INFO] Skipping URL (non-https): {url}")
        return True, 'non-https'

    NON_TEXTUAL_MIME_TYPES = (
        'application/zip',
        'application/x-rar-compressed',
        'application/octet-stream',
        'image/',
        'video/',
        'audio/',
        'font/',
    )

    def _non_textual_type(content_type):
        """Return the matched non-textual prefix (without '/'), or None."""
        for mime_type in NON_TEXTUAL_MIME_TYPES:
            if content_type.startswith(mime_type):
                return mime_type.strip('/')
        return None

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Cheap pre-check. Some servers reject HEAD while serving GET, so a
        # HEAD failure only skips the pre-check instead of the URL.
        try:
            head_response = requests.head(url, allow_redirects=True, timeout=5, headers=headers)
            head_response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"[INFO] HEAD request failed, falling back to GET: {url}\n\tDetails: {e}")
        else:
            content_type = head_response.headers.get('Content-Type', '').lower()
            detected_type = _non_textual_type(content_type)
            if detected_type is not None:
                print(f"[INFO] Skipping URL (non-textual content type: {detected_type}): {url}")
                return True, detected_type

        # stream=True: only the headers are needed, the body is never read.
        with requests.get(url, allow_redirects=True, timeout=10, headers=headers, stream=True) as get_response:
            get_response.raise_for_status()
            final_content_type = get_response.headers.get('Content-Type', '').lower()

            detected_type = _non_textual_type(final_content_type)
            if detected_type is not None:
                print(f"[INFO] Skipping URL (non-textual content type on GET: {detected_type}): {url}")
                return True, detected_type

            if 'application/pdf' in final_content_type:
                print(f"[SUCCESS] URL content is valid (pdf): {url}")
                return False, 'pdf'

            print(f"[SUCCESS] URL content is valid (html/text): {url}")
            return False, 'html'

    except requests.exceptions.RequestException as e:
        print(f"[ERROR] Skipping URL due to request error: {url}\n\tDetails: {e}")
        return True, 'error'
    except Exception as e:
        # Best-effort filter: any unexpected failure just drops this URL.
        print(f"[ERROR] Skipping URL due to an unexpected error: {url}\n\tDetails: {e}")
        return True, 'error'

In [None]:
async def crawl4ai_func(url):
    """Crawl a single page with crawl4ai and return its content as markdown.

    Args:
        url: Address of the HTML page to crawl.

    Returns:
        The markdown produced for the page, or an empty string when the
        crawl fails or yields no markdown. An empty string (rather than
        ``None``) lets callers take ``len()`` of the result or slice it.
    """
    # Markdown generator configured to drop links and images.
    md_generator = DefaultMarkdownGenerator(
        options={
            "ignore_links": True,
            "skip_internal_links": True,
            "ignore_images": True
        }
    )

    # Per-run crawler settings; the cache is bypassed on every call.
    crawl_config = CrawlerRunConfig(
        markdown_generator=md_generator,
        scraping_strategy=LXMLWebScrapingStrategy(),
        cache_mode=CacheMode.BYPASS
    )

    browser_cfg = BrowserConfig(headless=True)

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(
            url=url,
            config=crawl_config
        )

        if not result.success:
            print("Error:", result.error_message)
            return ""

        if result.extracted_content:
            data = json.loads(result.extracted_content)
            print("Extracted items:", data)

        # NOTE(review): the arun() result is indexed like a container here;
        # confirm this matches the installed crawl4ai version.
        return result[0].markdown or ""

In [None]:
def download_pdf(pdf_url, file_name="original", timeout=30):
    """Download a PDF and store it as ``<file_name>.pdf``.

    Args:
        pdf_url: Address of the PDF to download.
        file_name: Output path without the ``.pdf`` suffix.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        The path of the file that was written.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an error status. Without the
            status check an HTML error page would be saved as a ".pdf".
    """
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()

    original_pdf_path = f"{file_name}.pdf"
    with open(original_pdf_path, "wb") as f:
        f.write(response.content)
    return original_pdf_path

In [None]:
def extract_pages(input_path, output_path, page_count=3):
    """Write the first ``page_count`` pages of a PDF to a new file.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the shortened PDF is written to.
        page_count: Maximum number of leading pages to keep; fewer are
            written when the source document is shorter.
    """
    source_pdf = PdfReader(input_path)
    trimmed_pdf = PdfWriter()

    pages_to_copy = min(page_count, len(source_pdf.pages))
    page_index = 0
    while page_index < pages_to_copy:
        trimmed_pdf.add_page(source_pdf.pages[page_index])
        page_index += 1

    with open(output_path, "wb") as out_file:
        trimmed_pdf.write(out_file)

In [None]:
def encode_pdf(pdf_path):
    """Read a file and return its content as a base64 text string.

    Args:
        pdf_path: Path of the file to encode.

    Returns:
        The base64-encoded content decoded as UTF-8 text, or ``None`` when
        the file is missing or any other error occurs (a message is printed
        in both cases).
    """
    encoded = None
    try:
        with open(pdf_path, "rb") as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes).decode('utf-8')
    except FileNotFoundError:
        print(f"Error: The file {pdf_path} was not found.")
    except Exception as err:
        # Any other failure is reported and mapped to None as well.
        print(f"Error: {err}")
    return encoded

In [None]:
def pdf_ocr(url, page_count=5):
    """Download a PDF, keep its first pages, and run Mistral OCR on them.

    Args:
        url: Address of the PDF to process.
        page_count: Number of leading pages sent to the OCR model.

    Returns:
        The response object returned by ``client.ocr.process``.

    Raises:
        RuntimeError: If the shortened PDF could not be read and encoded.
    """
    import tempfile

    # Work inside a throw-away directory so repeated calls neither leave
    # files behind nor overwrite each other's "original"/"shortened" PDFs.
    with tempfile.TemporaryDirectory() as work_dir:
        original_pdf_path = download_pdf(url, file_name=os.path.join(work_dir, "original"))

        shortened_pdf_path = os.path.join(work_dir, "shortened.pdf")
        extract_pages(original_pdf_path, shortened_pdf_path, page_count=page_count)

        base64_pdf = encode_pdf(shortened_pdf_path)

    # encode_pdf reports failures by returning None; without this check the
    # text "None" would be sent to the OCR service as the document payload.
    if base64_pdf is None:
        raise RuntimeError(f"Could not encode the downloaded PDF for OCR: {url}")

    client = Mistral(api_key=MISTRAL_API_KEY)

    # The PDF is passed inline as a base64 data URL.
    ocr_response = client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{base64_pdf}"
        },
        include_image_base64=True
    )
    return ocr_response

In [None]:
def clean_scraped_data_llm(sub_question, scraped_data, chunk_size=20000, max_chunks=6):
  """Extract the passages of a scraped text that are relevant to a question.

  The text is cut into consecutive pieces of ``chunk_size`` characters; each
  piece is sent to Gemini together with the question, and the passages the
  model returns are collected.

  Args:
    sub_question: The question the extracted passages must relate to.
    scraped_data: Raw scraped text. ``None`` or an empty string yields an
      empty result without calling the model.
    chunk_size: Size of each piece in characters (not tokens); must be
      positive.
    max_chunks: Maximum number of pieces processed; text beyond
      ``chunk_size * max_chunks`` characters is ignored.

  Returns:
    The relevant passages joined with ``"\\n---\\n"``, or an empty string when
    nothing relevant was found.
  """
  # A failed scrape may hand over None/empty text: nothing to clean.
  if not scraped_data:
    return ""

  model = ChatGoogleGenerativeAI(
      model="models/gemini-2.0-flash", #"models/gemini-2.5-flash-preview-05-20"
      temperature=0,
      max_tokens=None,
      timeout=None,
      max_retries=2,
      api_key=GEMINI_API_KEY
    )

  clean_data_system_prompt = """
You are an expert Information Retrieval Assistant. Your task is to extract all relevant text chunks from a scraped text based on a user query. You must follow a precise, multi-step process to ensure maximum relevance and recall.

What you will get as input:
1. `query`: The user's original question or topic.
2. `scraped_text`: A large body of text.

Your Job (Multi-Step Process):

**Step 1: Query Analysis and Expansion**
* Analyze the user's `query` to understand its core intent.
* Generate a list of at least 5-10 related keywords, synonyms, and alternative phrases. For example, if the query is about "cost of living," you might expand it to include "housing prices," "rent," "grocery bills," "inflation," "expenses," etc. This expanded set will be used for retrieval.

**Step 2: Text Segmentation (Chunking)**
* Segment the `scraped_text` into logical chunks. The ideal chunk is a single paragraph. If the text has no clear paragraphs, split it by sentences. Do not use arbitrary fixed-size chunks. Create a list of these text chunks.

**Step 3: Initial Retrieval (Candidate Generation)**
* Go through your list of text chunks from Step 2.
* Identify an initial set of "candidate chunks." A chunk is a candidate if it contains ANY of the original query keywords OR any of the expanded keywords/phrases from Step 1.
* This step should be broad; the goal is to capture everything that could possibly be relevant.

**Step 4: Relevance Re-ranking and Filtering**
* For each "candidate chunk" you identified in Step 3, perform a fine-grained relevance analysis.
* Evaluate how directly the chunk answers or relates to the user's ORIGINAL `query`.
* Assign a relevance score: 'High', 'Medium', or 'Low'.
* Only keep the chunks that you score as 'High' or 'Medium' relevance.

**Step 5: Final Output Generation**
* Create a final list containing only the 'High' and 'Medium' relevance chunks you filtered in Step 4.
* Crucially, the text in this final list must be the **original, unaltered text** from the chunks.
* Return this final list of chunks.

Important:
* In case of no relevant information return empty list.
"""

  class ChunksOutput(BaseModel):
    chunks_list: List[str] = Field(..., description="List of relevant text chunks to the query, in case of no relevant information return empty list")

  data_cleaning_model = model.with_structured_output(ChunksOutput)

  # Walk the text in fixed-size windows, stopping at the cap so a very long
  # page cannot trigger an unbounded number of model calls.
  processed_length = min(len(scraped_data), chunk_size * max_chunks)
  chunks_list = []
  for start in range(0, processed_length, chunk_size):
    messages = [
        ("system", clean_data_system_prompt),
        ("user", f"Query:\n{sub_question}\nScraped text:\n{scraped_data[start:start + chunk_size]}")
    ]
    response = data_cleaning_model.invoke(messages)
    chunks_list.extend(response.chunks_list)

  return "\n---\n".join(chunks_list)

In [None]:
async def fetch_and_clean(url_info, sub_question):
    """Fetch one URL and return the parts of it relevant to a sub-question.

    Args:
        url_info: ``(url_type, url)`` pair. ``'html'`` pages are crawled;
            every other type is treated as a PDF and sent through OCR.
        sub_question: The question used to filter the fetched text.

    Returns:
        The cleaned, relevant text, or an empty string when nothing could
        be fetched from the URL.
    """
    url_type, url = url_info[0], url_info[1]

    if url_type == 'html':
        scraped_data = await crawl4ai_func(url)
    else:
        ocr_response = pdf_ocr(url)
        scraped_data = "\n".join(page.markdown for page in ocr_response.pages[:10])

    # A failed crawl can yield None/empty text; skip the LLM call instead of
    # handing it a value it cannot measure or slice.
    if not scraped_data:
        return ""

    return clean_scraped_data_llm(sub_question, scraped_data)

In [None]:
def judging_collected_data(gathered_info, sub_question, query):
  """Ask Gemini whether the collected chunks suffice to answer a question.

  Args:
    gathered_info: List of text chunks collected so far; empty entries are
      ignored.
    sub_question: The question the chunks are supposed to answer.
    query: The web search query that was used to collect the chunks.

  Returns:
    A dict with the single key ``sufficient`` (bool).
  """
  model = ChatGoogleGenerativeAI(
      model="models/gemini-2.0-flash",
      temperature=0,
      max_tokens=None,
      timeout=None,
      max_retries=2,
      api_key=GEMINI_API_KEY
    )
  judgement_system_prompt = """
You are a judgment-oriented assistant whose sole role is to evaluate whether provided data chunks are sufficient to answer a given question thoroughly. Follow these rules:

1. INPUT FORMAT
   • The user will supply:
     – A “question” (a natural-language query).
     - A “web search query” (query used to perform web search to collect data)
     – A list of “data_chunks”, where each chunk is a fragment of collected information. These chunks are not guaranteed to be coherent narrative—treat them as separate pieces of evidence.
   • Example input (not part of your prompt):
     Question:
     What are the demographic trends of electric vehicle adoption in urban areas over the last five years?
     Web search query:
     electric vehicle adoption in urban areas
     Data Chunks:
     Data from City A’s transportation survey (2020): 12% EV penetration ...
     ---
     Academic paper excerpt: In 2019, urban EV buyers skewed younger ...
     ---
     News article (2024) about tax incentives affecting EV sales ...

2. PROCESS
   • Treat the “data_chunks” as discrete units; do NOT assume they connect seamlessly.
   • For each chunk, extract the key facts, dates, and context.
   • Evaluate coverage: timeline span, geographic scope, demographic variables, methodologies, sample sizes, and relevance to the question.
   • Do not attempt to “answer” the question yourself using outside knowledge; only judge sufficiency of the provided chunks.

3. Output
  - State whether the data is sufficient (True or False)
  """
  class JudgementOutput(BaseModel):
    sufficient: bool = Field(..., description="Whether the collected data is sufficient or not")

  judgement_model = model.with_structured_output(JudgementOutput)

  # Put the delimiter on a line of its own, as in the prompt's example and
  # as clean_scraped_data_llm does; '---\n' alone would glue it to the end
  # of the previous chunk. Empty chunks carry no evidence and are dropped.
  gathered_info_concat = '\n---\n'.join(chunk for chunk in gathered_info if chunk)
  messages = [
      ("system", judgement_system_prompt),
      ("user", f"Question:\n{sub_question}\nWeb search query:\n{query}\nData Chunks:\n{gathered_info_concat}")
  ]

  response = judgement_model.invoke(messages)
  judgment = response.model_dump()

  return judgment

In [None]:
def sub_report_generation(sub_question, accepted_gathered_data):
  """Generate a cited sub-report that answers one sub-question.

  Args:
    sub_question: The question the report must answer.
    accepted_gathered_data: Pair ``(urls, chunks)``. ``urls`` holds one
      entry per source, either a plain URL string or a ``(url_type, url)``
      tuple as produced by ``web_search``; ``chunks`` holds the cleaned text
      gathered from the source at the same position.

  Returns:
    The report text returned by the model.
  """
  model = ChatGoogleGenerativeAI(
      model="models/gemini-2.5-flash-preview-05-20",
      temperature=0,
      max_tokens=None,
      timeout=None,
      max_retries=2,
      api_key=GEMINI_API_KEY
    )

  # NOTE(review): the prompt asks for "no more than three" paragraphs in one
  # place and "do not exceed five" in another — confirm the intended limit.
  sub_report_system_prompt = """
You are a detailed-report generator that receives:
  • A “question” (a natural-language query).
  • A collection of “data_chunks” grouped by references (e.g., “[Ref: Source A url] Chunk 1 \n\n\n[Ref: Source B url] Chunk 2....), containing gathered information relevant to the question.

YOUR TASK:
  1. Read and extract key facts from each data_chunk, noting its reference.
  2. Generate a comprehensive report that answers the question in no more than three main paragraphs.
     – Each paragraph should flow logically: e.g., context/setup, analysis/details, and conclusion/insight.
     – Use the provided data_chunks as the primary source of evidence.
     – You may incorporate your own external knowledge only if you are certain of its correctness; otherwise, rely solely on the chunks.
  3. In the report, whenever you use specific information from a chunk, include its reference in brackets immediately after the fact (e.g., “According to the 2022 survey, 45% of respondents… [Source B](url)”).
  4. Don't add any extra information from your own knowledge, unless you're absolutely certain that it is correct.
  5. Capture as many relevant details as possible: dates, figures, definitions, context, and any qualifying conditions.

RESPONSE FORMAT:
  • Each time you reference a chunk, use its exact reference tag with the url (e.g., “[Source A](url)”) immediately after the cited information.
  • Do not exceed five main paragraphs. Maintain coherent prose.

TONE & STYLE:
  • Formal, precise, and objective.
  • Prioritize clarity and completeness.
  • Avoid speculation—only state what is directly supported by references or by verifiably correct knowledge.
"""

  referenced_chunks = []
  for source, chunks in zip(*accepted_gathered_data):
    # Sources arrive as (url_type, url) tuples; formatting the tuple itself
    # would put "('html', 'https://…')" into the reference tag.
    url = source[1] if isinstance(source, (tuple, list)) else source
    # A source that yielded no text must not be offered as a citable reference.
    if not chunks:
      continue
    referenced_chunks.append(f"[Ref: {url}]\n {chunks}")

  data_chunks_with_ref = "\n\n\n".join(referenced_chunks)
  user_prompt = f"Question:\n{sub_question}\nData Chunks:\n{data_chunks_with_ref}"

  messages = [
          ("system", sub_report_system_prompt),
          ("user", user_prompt)
      ]
  response = model.invoke(messages)
  sub_report = response.content

  return sub_report


In [None]:
def final_report_generation(query, query_breakdown, sub_questions, sub_reports):
  """Merge the per-question sub-reports into one final report.

  Args:
    query: The original user query.
    query_breakdown: Short analysis of the query.
    sub_questions: The sub-questions, in the same order as ``sub_reports``.
    sub_reports: Report text written for each sub-question.

  Returns:
    The content of the model's response, i.e. the final report.
  """
  llm = ChatGoogleGenerativeAI(
      model="models/gemini-2.5-flash-preview-05-20",
      temperature=0,
      max_tokens=None,
      timeout=None,
      max_retries=2,
      api_key=GEMINI_API_KEY,
  )

  final_report_system_prompt = """
You are a final-report generator that receives:
  • The original user query that we're trying to answer thoroughly.
  • A brief breakdown of the original user query, to better understand it.
  • A list of miticoulously crafted “sub_reports”. Each sub_report consists of:
    – A “sub_question” it was intended to answer, related to the original user query.
    – The complete text of the sub_report, which already includes detailed analysis and citations in the form different sources (e.g., “[Source X]”)

YOUR TASK:
  1. Read and understand each sub_report in the context of the question it addressed.
  2. Create one unified, coherent final report that synthesizes all sub_reports into a single narrative. The final report must:
     – Introduce the overarching subject by summarizing how the individual questions connect.
     – Provide a clear plan or outline at the beginning, listing major sections that correspond to thematic groupings of the sub_reports.
     – Maintain and preserve all existing references from each sub_report, but this time use wikipedia citation style (numbered references that link to footnotes list of references links)
     - The final report has to be in Markdown format, so use the proper notation for lists and references.
     – If you incorporate any additional external facts, only do so if you are certain of their correctness.
     – Integrate findings so that the final document reads as a cohesive, logically flowing report rather than a sequence of disconnected summaries.
     – Ensure the final report is long, thorough, and richly detailed, fully addressing the combined scope of all sub_reports and answering the user query and all the sub_questions.

TONE & STYLE:
  • Formal, authoritative, and objective, with an engaging title.
  • Write using a combination of complete paragraphs and bullet points, as you see fit.
  • Prioritize clarity: Suppose the reader doesn't know anything about the query, unless it is stated otherwise by the user in the query.
  • Do not speculate beyond what is supported by sub_report citations or verifiably correct LLM knowledge.

IMPORTANT:
  • Only use content from the provided sub_reports (and, if necessary, verifiable knowledge).
  • In case of contradictory information between sub_reports, either pick the one that has a more reliable source,
  or state that there are two opinions on the topic and the user should investigate it further manually.
  • Accurately preserve all reference tags from sub_reports.
  • The final report must be one cohesive document, not a simple concatenation of sections.
  • Ensure the report is sufficiently long and detailed to cover all combined questions.
"""

  # Each sub-question is paired with the report at the same position.
  paired_reports = zip(sub_questions, sub_reports)
  sub_reports_concat = "\n".join(
      f"### Sub_question: {question}\nSub_report: {report}"
      for question, report in paired_reports
  )
  user_prompt = (
      f"Original user query:\n{query}\n"
      f"Query breakdown:\n{query_breakdown}\n"
      f"{sub_reports_concat}"
  )

  conversation = [
      ("system", final_report_system_prompt),
      ("user", user_prompt),
  ]
  return llm.invoke(conversation).content

## Agent code

In [None]:
# End-to-end run of the deep-search pipeline for one hard-coded query.
# NOTE: this cell uses `await` at top level, so it needs an executor that
# allows that (presumably the notebook this file was exported from).
user_query = "Compile a research report on how the retail industry has changed in the past 3 years"
plan = generate_plan(user_query)
query_breakdown, sub_questions, search_queries = plan["query_breakdown"], plan["sub_questions"], plan["search_queries"]
print("plan generated!")

# One list of (url_type, url) candidates per search query.
search_results = web_search(search_queries, num_results=15)
print("search results generated!")

# Data collection phase
# Sub-questions and search results are paired by position (zip stops at the
# shorter list). Only the first candidate URL of each query is fetched here.
gathered_info = []
for sq, urls in zip(sub_questions, search_results):
    query_data = []
    if urls:
        query_data.append(await fetch_and_clean(urls[0], sq))
    gathered_info.append(query_data)
print("data collected!")

# Data validation and supplementation
accepted_data = []
for sq, query, data_list, urls in zip(sub_questions, search_queries, gathered_info, search_results):
    # Empty data is insufficient by definition; the judge is only consulted
    # when at least some text was gathered.
    sufficient = bool(''.join(data_list)) and judging_collected_data(data_list, sq, query)["sufficient"]
    # Printed once, for the initial judgment only (not after each retry below).
    print(f"Sub-question: {sq}\nJudgment: {sufficient}")

    # Keep fetching the next candidate URL until the judge is satisfied or
    # the candidates for this query run out.
    idx = 1
    while not sufficient and idx < len(urls):
        data_list.append(await fetch_and_clean(urls[idx], sq))
        sufficient = bool(''.join(data_list)) and judging_collected_data(data_list, sq, query)["sufficient"]
        idx += 1

    # Stored as (all candidate urls, gathered texts); data_list[i] came from urls[i].
    accepted_data.append((urls, data_list))
print("data accepted!")

# Report generation
# One sub-report per sub-question, then a single report synthesized from them.
sub_reports = [sub_report_generation(sq, data) for sq, data in zip(sub_questions, accepted_data)]
print("sub reports generated!")

final_report = final_report_generation(user_query, query_breakdown, sub_questions, sub_reports)
print("final report generated!")

plan generated!
[SUCCESS] URL content is valid (html/text): https://www2.deloitte.com/us/en/pages/consumer-business/articles/retail-distribution-industry-outlook.html
[SUCCESS] URL content is valid (html/text): https://www.abiresearch.com/blog/retail-trends-2024
[SUCCESS] URL content is valid (html/text): https://www.mastercardservices.com/en/industries/retail/insights/retail-industry-trends-2024
[ERROR] Skipping URL due to request error: https://pos.toasttab.com/blog/on-the-line/retail-trends?srsltid=AfmBOoq3mZDWdKkZ1mIeIZUVseIAgaoNvFf3QjvDKlUY4sBFxhRrYwgM
	Details: 403 Client Error: Forbidden for url: https://pos.toasttab.com/blog/on-the-line/retail-trends?srsltid=AfmBOoq3mZDWdKkZ1mIeIZUVseIAgaoNvFf3QjvDKlUY4sBFxhRrYwgM
[SUCCESS] URL content is valid (html/text): https://www2.deloitte.com/us/en/insights/industry/retail-distribution/retail-distribution-industry-outlook.html
[SUCCESS] URL content is valid (html/text): https://www.westrock.com/blog/7-biggest-retail-trends-of-2024
[ERROR

data collected!
Sub-question: What were the major trends and events that shaped the retail industry between 2021 and 2024?
Judgment: False


Sub-question: How has e-commerce impacted traditional brick-and-mortar retail in the past 3 years?
Judgment: False


Sub-question: What technological advancements have been most transformative in the retail sector during this period?
Judgment: False


Sub-question: How have consumer behaviors and preferences evolved, and how has the retail industry adapted?
Judgment: False


Sub-question: What are the key challenges and opportunities facing the retail industry as of 2024?
Judgment: False


data accepted!
sub reports generated!
final report generated!


## Agent as a langchain tool

In [None]:
# Argument schema of the deep-search tool: one required free-text query.
class DeepSearchToolInput(BaseModel):
    user_query: str = Field(..., description="User query to perform deep search on")

# LangChain tool wrapping the whole deep-search pipeline: plan the research,
# search the web, gather and validate data per sub-question, then write the
# sub-reports and the final report. Returns the final report text.
@tool("deep-search-tool",
      description="Performs an in-depth web-based search and returns a detailed report about the user's query.",
      args_schema=DeepSearchToolInput
      )
async def run_deepsearch(user_query: str) -> str:
    plan = generate_plan(user_query)
    query_breakdown, sub_questions, search_queries = plan["query_breakdown"], plan["sub_questions"], plan["search_queries"]
    print("plan generated!")

    # One list of (url_type, url) candidates per search query.
    search_results = web_search(search_queries, num_results=15)
    print("search results generated!")

    # Data collection phase: fetch only the first candidate URL of each query.
    gathered_info = []
    for sq, urls in zip(sub_questions, search_results):
        query_data = []
        if urls:
            query_data.append(await fetch_and_clean(urls[0], sq))
        gathered_info.append(query_data)
    print("data collected!")

    # Data validation and supplementation: keep fetching further candidates
    # until the judge deems the data sufficient or the candidates run out.
    accepted_data = []
    for sq, query, data_list, urls in zip(sub_questions, search_queries, gathered_info, search_results):
        sufficient = bool(''.join(data_list)) and judging_collected_data(data_list, sq, query)["sufficient"]
        print(f"Sub-question: {sq}\nJudgment: {sufficient}")

        idx = 1
        while not sufficient and idx < len(urls):
            data_list.append(await fetch_and_clean(urls[idx], sq))
            sufficient = bool(''.join(data_list)) and judging_collected_data(data_list, sq, query)["sufficient"]
            idx += 1

        accepted_data.append((urls, data_list))
    print("data accepted!")

    # Report generation
    sub_reports = [sub_report_generation(sq, data) for sq, data in zip(sub_questions, accepted_data)]
    print("sub reports generated!")

    final_report = final_report_generation(user_query, query_breakdown, sub_questions, sub_reports)
    print("final report generated!")

    # Return the generated report itself; previously it was overwritten with
    # a placeholder string, so the tool's caller never received the report.
    return final_report