In [13]:
import os
from langchain.tools import BaseTool
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent, AgentType
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_community.tools import DuckDuckGoSearchRun
from langchain_community.document_loaders import WebBaseLoader
from pydantic import BaseModel, Field
from typing import Type
from langchain.schema import SystemMessage

# Initialize LLM
llm = ChatOpenAI(model_name="gpt-4o-mini-2024-07-18", temperature=0.1)

# Define Argument Schema for Tools
class SearchToolArgsSchema(BaseModel):
    query: str = Field(description="The query you will search for")

class URLArgsSchema(BaseModel):
    url: str = Field(description="The URL of the webpage to scrape")

class SaveFileArgsSchema(BaseModel):
    content: str = Field(description="The research content to save into a .txt file")

# 📌 Wikipedia Search Tool
class WikiSearchTool(BaseTool):
    name: str = "WikiSearchTool"
    description: str = "Search Wikipedia for relevant information and extract the text from the page."
    
    args_schema: Type[SearchToolArgsSchema] = SearchToolArgsSchema

    def _run(self, query: str):
        wikipedia_api = WikipediaAPIWrapper()
        wiki_search = WikipediaQueryRun(api_wrapper=wikipedia_api)
        result = wiki_search.run(query)
        return result

# 📌 DuckDuckGo Search Tool
class DuckDuckGoSearchTool(BaseTool):
    name: str = "DuckDuckGoSearchTool"
    description: str = "Use DuckDuckGo to search the web for relevant links."
    
    args_schema: Type[SearchToolArgsSchema] = SearchToolArgsSchema

    def _run(self, query: str):
        ddg_search = DuckDuckGoSearchRun()
        result = ddg_search.run(query)
        return result

# 📌 Web Scraping Tool
class WebScraperTool(BaseTool):
    name: str = "WebScraperTool"
    description: str = "Extract the text content of a given webpage."

    args_schema: Type[URLArgsSchema] = URLArgsSchema

    def _run(self, url: str):
        try:
            loader = WebBaseLoader(url)
            documents = loader.load()
            extracted_texts = "\n\n".join([doc.page_content for doc in documents])
            return extracted_texts[:2000]  # Limit output to 2000 characters
        except Exception as e:
            return f"Error scraping website: {str(e)}"

# 📌 Save Research to File Tool
class SaveResearchTool(BaseTool):
    name: str = "SaveResearchTool"
    description: str = "Save the research content into a .txt file."

    args_schema: Type[SaveFileArgsSchema] = SaveFileArgsSchema

    def _run(self, content: str):
        try:
            with open("research.txt", "w", encoding="utf-8") as f:
                f.write(content)
            return "Research saved to research.txt"
        except Exception as e:
            return f"Error saving file: {str(e)}"

# 📌 Initialize Agent
agent = initialize_agent(
    llm=llm,
    verbose=True,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    handle_parsing_errors=True,
    tools=[
        DuckDuckGoSearchTool(),
        WikiSearchTool(),
        WebScraperTool(),
        SaveResearchTool()
    ],
    agent_kwargs={
        "system_message": SystemMessage(
            content="""
            You are an AI research assistant that gathers information using multiple sources.

            1. Randomly choose either DuckDuckGoSearchTool or WikiSearchTool to start the research.
            2. If Wikipedia was used last, use DuckDuckGo next. If DuckDuckGo was used last, use Wikipedia next.
            3. Keep track of which search tool was last used and make sure to switch for the next query.
            4. If DuckDuckGo provides a website link, extract the website content using WebScraperTool.
            5. Ensure that no search tool is used twice in a row unless absolutely necessary.
            6. Summarize the research findings after both sources have been queried.
            7. If enough useful information has been collected, finalize by saving the research into a `.txt` file.

            **IMPORTANT:** If Wikipedia was used in the last search, always choose DuckDuckGo next. If DuckDuckGo was used last, always choose Wikipedia next. If unsure, start with DuckDuckGo.
            """
        )
    }
)

# 📌 Run the agent with the required query
query = "Research about the XZ backdoor"
result = agent.invoke(query)

# Print the result
print(result)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to gather information about the XZ backdoor, which is likely a type of malware or security vulnerability. I will start by searching for relevant information on the web to understand its characteristics, impact, and any notable incidents associated with it. 

Action: DuckDuckGoSearchTool  
Action Input: "XZ backdoor malware"  [0m
Observation: [36;1m[1;3mOn Friday, a lone Microsoft developer rocked the world when he revealed a backdoor had been intentionally planted in XZ Utils, an open source data compression utility available on almost all ... Part 3: XZ backdoor. Hook analysis. On March 29, 2024, a single message on the Openwall OSS-security mailing list marked an important discovery for the information security, open source and Linux communities: the discovery of a malicious backdoor in XZ. XZ is a compression utility integrated into many popular distributions of Linux. Immediately downgrade your version of xz to 

In [None]:
from langchain_community.tools import DuckDuckGoSearchRun

search = DuckDuckGoSearchRun()
search.run("Previous president Obama's first name")

In [None]:
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper

wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
wikipedia.run("murakami haruki")


In [None]:
import requests
from langchain_community.document_loaders import WebBaseLoader
from bs4 import BeautifulSoup

# def parse_page(soup):
#     header = soup.find("header")
#     footer = soup.find("footer")
#     if header:
#         header.decompose()
#     if footer:
#         footer.decompose()
#     return (
#         str(soup.get_text())
#         .replace("\n", " ")
#     )

loader = WebBaseLoader("https://www.espn.com/")
data = loader.load()