In [1]:
import asyncio 
import bs4 

import nest_asyncio
import pandas as pd
from langchain.docstore.document import Document
from langchain_experimental.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent 
from langchain_experimental.autonomous_agents import AutoGPT 
from langchain_openai import ChatOpenAI 

In [2]:
nest_asyncio.apply()

In [3]:
llm = ChatOpenAI(model_name = "gpt-4o-mini", temperature = 1.0)

#### Tools Setup for AutoGPT 

In [4]:
import os 
from contextlib import contextmanager
from typing import Optional 

from langchain.agents import tool 
from langchain_community.tools.file_management.read import ReadFileTool
from langchain_community.tools.file_management.write import WriteFileTool 

ROOT_DIR = "./data/" 

@contextmanager 
def push_to_dir(new_dir): 
    """Context manager for changing the current working directory""" 
    prev_dir = os.getcwd() 
    os.chdir(new_dir) 
    try: 
        yield 
    finally: 
        os.chdir(prev_dir)

In [5]:
new_dir = push_to_dir(ROOT_DIR)

In [6]:
print(os.getcwd())

/home/mpariaszevski/essay_project/Essay_Project/AutoGPT_Project


In [7]:
@tool
def process_csv(
    csv_file_path: str, instructions: str, output_path: Optional[str] = None
) -> str:
    """Process a CSV by with pandas in a limited REPL.\
 Only use this after writing data to disk as a csv file.\
 Any figures must be saved to disk to be viewed by the human.\
 Instructions should be written in natural language, not code. Assume the dataframe is already loaded."""
    with push_to_dir(ROOT_DIR):
        try:
            df = pd.read_csv(csv_file_path)
        except Exception as e:
            return f"Error: {e}"
        agent = create_pandas_dataframe_agent(llm, df, max_iterations=30, verbose=True)
        if output_path is not None:
            instructions += f" Save output to disk at {output_path}"
        try:
            result = agent.run(instructions)
            return result
        except Exception as e:
            return f"Error: {e}"

In [8]:
async def async_load_playwright(url: str) -> str:
    """Load the specified URLs using Playwright and parse using BeautifulSoup."""
    from bs4 import BeautifulSoup
    from playwright.async_api import async_playwright

    results = ""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)

            page_source = await page.content()
            soup = BeautifulSoup(page_source, "html.parser")

            for script in soup(["script", "style"]):
                script.extract()

            text = soup.get_text()
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            results = "\n".join(chunk for chunk in chunks if chunk)
        except Exception as e:
            results = f"Error: {e}"
        await browser.close()
    return results


def run_async(coro):
    event_loop = asyncio.get_event_loop()
    return event_loop.run_until_complete(coro)


@tool
def browse_web_page(url: str) -> str:
    """Verbose way to scrape a whole webpage. Likely to cause issues parsing."""
    return run_async(async_load_playwright(url))

In [9]:
from langchain.chains.qa_with_sources.loading import (
    BaseCombineDocumentsChain,
    load_qa_with_sources_chain,
)
from langchain.tools import BaseTool, DuckDuckGoSearchRun
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pydantic import Field


def _get_text_splitter():
    return RecursiveCharacterTextSplitter(
        # Set a really small chunk size, just to show.
        chunk_size=500,
        chunk_overlap=20,
        length_function=len,
    )


class WebpageQATool(BaseTool):
    name = "query_webpage"
    description = (
        "Browse a webpage and retrieve the information relevant to the question."
    )
    text_splitter: RecursiveCharacterTextSplitter = Field(
        default_factory=_get_text_splitter
    )
    qa_chain: BaseCombineDocumentsChain

    def _run(self, url: str, question: str) -> str:
        """Useful for browsing websites and scraping the text information."""
        result = browse_web_page.run(url)
        docs = [Document(page_content=result, metadata={"source": url})]
        web_docs = self.text_splitter.split_documents(docs)
        results = []
        # TODO: Handle this with a MapReduceChain
        for i in range(0, len(web_docs), 4):
            input_docs = web_docs[i : i + 4]
            window_result = self.qa_chain(
                {"input_documents": input_docs, "question": question},
                return_only_outputs=True,
            )
            results.append(f"Response from window {i} - {window_result}")
        results_docs = [
            Document(page_content="\n".join(results), metadata={"source": url})
        ]
        return self.qa_chain(
            {"input_documents": results_docs, "question": question},
            return_only_outputs=True,
        )

    async def _arun(self, url: str, question: str) -> str:
        raise NotImplementedError

In [10]:
query_website_tool = WebpageQATool(qa_chain=load_qa_with_sources_chain(llm))

See also the following migration guides for replacements based on `chain_type`:
stuff: https://python.langchain.com/v0.2/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/v0.2/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_rerank_docs_chain

  query_website_tool = WebpageQATool(qa_chain=load_qa_with_sources_chain(llm))


In [11]:
# Memory
import faiss
from langchain.docstore import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings()
embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
vectorstore = FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {})

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [12]:
web_search = DuckDuckGoSearchRun()

In [13]:
tools = [
    web_search,
    WriteFileTool(root_dir="./data"),
    ReadFileTool(root_dir="./data"),
    process_csv,
    query_website_tool,
    # HumanInputRun(), # Activate if you want the permit asking for help from the human
]

In [14]:
agent = AutoGPT.from_llm_and_tools(
    ai_name="Tom",
    ai_role="Assistant",
    tools=tools,
    llm=llm,
    memory=vectorstore.as_retriever(search_kwargs={"k": 8}),
    # human_in_the_loop=True, # Set to True if you want to add feedback at each step.
)
# agent.chain.verbose = True

In [15]:
# agent.run(
#     [
#         """What were the winning boston marathon times for the past 5 years (ending in 2022)? 
#         Generate a table of the year, name, country of origin, and times.
#         Display this table to me."""
#     ]
# )

In [16]:
agent.run(
    [
        """What are the highest market cap stocks in NASDAQ. 
        Generate and display a table of the stock tickers, current market cap, and amount of public shares."""
    ]
)

{
    "thoughts": {
        "text": "I need to gather information about the highest market cap stocks in NASDAQ.",
        "reasoning": "The best approach to find this information is to perform a search query to obtain the latest data.",
        "plan": "- Execute a search query for NASDAQ highest market cap stocks\n- Gather the results\n- Structure data into a table format and save it",
        "criticism": "I should ensure the search query is specific enough to bring up the most relevant results.",
        "speak": "I will search for the highest market cap stocks in NASDAQ now."
    },
    "command": {
        "name": "duckduckgo_search",
        "args": {
            "query": "highest market cap stocks in NASDAQ"
        }
    }
}
{
    "thoughts": {
        "text": "I need to extract the relevant information about the top NASDAQ companies and format it into a table.",
        "reasoning": "I have received the search results, and now I need to organize this data into a structured fo

"Completed all objectives regarding NASDAQ's highest market cap stocks."