<a href="https://colab.research.google.com/github/HighestPotential/HuggingFaceCourseAgents/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Key Functionality**

**Web Search :**
Leverages DuckDuckGoSearchTool to dynamically fetch real-time search results for user queries.

**Result Grouping :**
Aggregates raw search results into large, manageable chunks (e.g., 1,000 characters per group) to create structured Document objects for downstream processing.


**Text Splitting :**
Splits grouped documents into smaller, overlapping segments (e.g., 200 characters with a 10-character overlap) to optimize context retention and retrieval efficiency.



**BM25 Retrieval :**
Uses the BM25 algorithm to rank and retrieve the top 5 most relevant text chunks based on query similarity, ensuring precise information filtering.



**Output :**
Returns results as clean, formatted snippets (e.g., "Relevant Content 1: ...") for easy interpretation and integration into workflows.



In [1]:
pip install smolagents -U

Collecting smolagents
  Downloading smolagents-1.9.2-py3-none-any.whl.metadata (14 kB)
Collecting pandas>=2.2.3 (from smolagents)
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting markdownify>=0.14.1 (from smolagents)
  Downloading markdownify-1.0.0-py3-none-any.whl.metadata (9.1 kB)
Collecting duckduckgo-search>=6.3.7 (from smolagents)
  Downloading duckduckgo_search-7.5.0-py3-none-any.whl.metadata (17 kB)
Collecting python-dotenv (from smolagents)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting primp>=0.14.0 (from duckduckgo-search>=6.3.7->smolagents)
  Downloading primp-0.14.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading smolagents-1.9.2-py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.8/1

In [2]:
from huggingface_hub import notebook_login

# Render the interactive Hugging Face login widget in the notebook.
# NOTE(review): presumably the credentials entered here are what HfApiModel()
# relies on in the agent cell further down — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
!pip install langchain-community rank_bm25

Collecting langchain-community
  Downloading langchain_community-0.3.18-py3-none-any.whl.metadata (2.4 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->datacla

In [10]:
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from smolagents import Tool
from langchain_community.retrievers import BM25Retriever
from smolagents import CodeAgent, HfApiModel, VisitWebpageTool, DuckDuckGoSearchTool

In [37]:
class LawExtractionTool(Tool):
    """Web-search tool that filters DuckDuckGo results with BM25.

    Pipeline executed by :meth:`forward`:

    1. Run the query through ``DuckDuckGoSearchTool``.
    2. Cut the returned text into groups of ``GROUP_SIZE`` characters and wrap
       each group in a ``Document``.
    3. Split the documents on blank lines into chunks of about ``CHUNK_SIZE``
       characters with ``CHUNK_OVERLAP`` characters of overlap.
    4. Rank the chunks against the query with BM25 and keep the best ``TOP_K``.
    5. Return the kept chunks as one formatted string.
    """

    name = "LawExtractionSearchTool"
    # NOTE(review): the trailing "use 'def forward' for that" is presumably shown
    # to the agent's model as-is and may confuse it; consider rewording. Left
    # unchanged here so the tool's advertised contract stays identical.
    description = "Performs web search and uses BM25 to retrieve relevant results for a query, use 'def forward' for that"
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform.",
        }
    }
    output_type = "string"

    # Tunables (previously hard-coded inside forward()).
    GROUP_SIZE = 1000    # characters of raw search text per Document
    CHUNK_SIZE = 200     # target characters per chunk after splitting
    CHUNK_OVERLAP = 10   # characters shared between neighbouring chunks
    TOP_K = 5            # number of chunks returned to the caller

    def forward(self, query: str) -> str:
        """Search the web for ``query`` and return the most relevant snippets.

        Args:
            query: The search query.

        Returns:
            The top ``TOP_K`` chunks, each prefixed with
            ``"Relevant Content <n>:"`` and separated by blank lines, or
            ``"No relevant content found."`` when the search text yields no
            chunks to rank.

        Raises:
            AssertionError: If ``query`` is not a string.
        """
        assert isinstance(query, str), "Your search query must be a string"
        search_results = DuckDuckGoSearchTool().forward(query)

        # The search tool hands back one string. Slice it directly: the earlier
        # `" ".join(slice)` iterated the slice character by character and put a
        # space between every character, which wrecked the BM25 tokens and the
        # "\n\n" separators the splitter below depends on.
        grouped_results = [
            search_results[start:start + self.GROUP_SIZE]
            for start in range(0, len(search_results), self.GROUP_SIZE)
        ]

        # Skip whitespace-only groups so they cannot become empty documents.
        docs = [
            Document(page_content=group)
            for group in grouped_results
            if group.strip()
        ]

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.CHUNK_SIZE,
            chunk_overlap=self.CHUNK_OVERLAP,
            separators=["\n\n"],
        )
        split_docs = text_splitter.split_documents(docs)

        # A BM25 index cannot be built over an empty corpus; answer gracefully.
        if not split_docs:
            return "No relevant content found."

        retriever = BM25Retriever.from_documents(split_docs, k=self.TOP_K)
        retrieved_docs = retriever.invoke(query)

        return "\n\n".join(
            f"Relevant Content {i+1}:\n{doc.page_content}"
            for i, doc in enumerate(retrieved_docs)
        )


In [38]:
# Build a CodeAgent whose only tool is the BM25-filtered web search defined above.
# HfApiModel() is constructed with its defaults; max_steps=5 is passed to cap the run.
agent = CodeAgent(tools=[LawExtractionTool()], model=HfApiModel(), max_steps=5)

# Typo fix: "visia" -> "visa". The prompt is handed to the agent verbatim, so a
# misspelled keyword can end up in the search query the tool runs.
response = agent.run(
    "How long are you allowed to be in the USA without a visa if you are a Canadian citizen?"
)