In [30]:
# Install necessary packages
# pypdf is pinned to 4.0.1; every other package is upgraded to its latest release (-U),
# so the versions actually installed depend on when this cell is run.
%pip install -qU pypdf==4.0.1 langchain_community
# NOTE(review): presumably required by the DuckDuckGoSearchResults tool used later — confirm.
# Unlike the other two lines this one omits -q, so pip prints its full install log.
%pip install -U duckduckgo-search
%pip install -qU langchain-openai

1655.92s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.


1662.12s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.


1668.31s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.


In [64]:
# Import required libraries
import os
from getpass import getpass

from langchain_community.document_loaders import PyPDFLoader
from langchain.agents import initialize_agent, AgentType
from langchain.tools import DuckDuckGoSearchResults, StructuredTool  # Import StructuredTool
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_core.tools import Tool
from pydantic import BaseModel, Field  # Import Pydantic BaseModel

In [65]:
# Point a PDF loader at the 10-K filing and read it into `docs`
file_path = "../../data/414759-1-_5_Nike-NPS-Combo_Form-10-K_WR.pdf"
loader = PyPDFLoader(file_path=file_path)
docs = loader.load()

In [67]:
# Display basic information about the document:
# number of loaded entries, the first 100 characters of the first entry, and its metadata
print(len(docs))
print(docs[0].page_content[0:100])
print(docs[0].metadata)

# Set up the OpenAI API key.
# Never store the secret in the notebook: reuse a key that is already exported in the
# environment (under either variable name used below) and only prompt when none is set.
# The previous hard-coded empty string overwrote a valid, already-exported key.
api_key = (
    os.environ.get("OPENAI_API_KEY")
    or os.environ.get("API_KEY_OPENAI")
    or getpass("OpenAI API key: ")
)
# Both variable names are populated, exactly as before, so either lookup keeps working.
os.environ["OPENAI_API_KEY"] = api_key
os.environ["API_KEY_OPENAI"] = api_key

106
FORM 10-K FORM 10-K
{'source': '../../data/414759-1-_5_Nike-NPS-Combo_Form-10-K_WR.pdf', 'page': 0}


In [68]:
# Chat model instance; it is handed to both the RetrievalQA chain and the agent below
llm = ChatOpenAI(
    model="gpt-4",
)

In [69]:
# Chunk the loaded pages (1000-character chunks, 200-character overlap),
# embed the chunks into an in-memory vector store, and expose it as a retriever
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)
vectorstore = InMemoryVectorStore.from_documents(
    embedding=OpenAIEmbeddings(),
    documents=splits,
)
retriever = vectorstore.as_retriever()

In [70]:
# Step 3: Define the revenue growth calculation tool with input schema
# Define the input schema using Pydantic
# This model is passed as `args_schema` to the "Revenue Growth Calculator" tool below;
# its field names match the parameters of calculate_revenue_growth.
class RevenueGrowthInput(BaseModel):
    # Both fields are required (`...` means "no default") and typed as float.
    # Revenue of the later year being compared.
    revenue_new: float = Field(..., description="Revenue in the new year")
    # Revenue of the earlier (baseline) year; used as the divisor in the growth formula.
    revenue_old: float = Field(..., description="Revenue in the old year")

# Define the revenue growth calculation function
def calculate_revenue_growth(revenue_new: float, revenue_old: float) -> float:
    """Calculate the percentage revenue growth from the old year to the new year.

    The result is ``(revenue_new - revenue_old) / revenue_old * 100``: positive
    when revenue increased, negative when it decreased.

    Args:
        revenue_new: Revenue in the new (later) year.
        revenue_old: Revenue in the old (baseline) year. Must be non-zero.

    Returns:
        The growth as a percentage (e.g. ``50.0`` for a 50% increase).

    Raises:
        ZeroDivisionError: If ``revenue_old`` is zero, because growth relative
            to a zero baseline is undefined.
    """
    # Fail with an actionable message instead of the bare "division by zero";
    # the exception type is unchanged so existing handlers keep working.
    if revenue_old == 0:
        raise ZeroDivisionError(
            "revenue_old must be non-zero: growth relative to zero revenue is undefined"
        )
    return ((revenue_new - revenue_old) / revenue_old) * 100

# Wrap the growth function as a StructuredTool so the agent can call it with two named arguments
calculate_growth_tool = StructuredTool.from_function(
    name="Revenue Growth Calculator",
    func=calculate_revenue_growth,
    args_schema=RevenueGrowthInput,
    description=(
        "Calculates the revenue growth percentage between two years. Requires "
        "'revenue_new' and 'revenue_old' as inputs."
    ),
)

# DuckDuckGo search tool; its description tells the agent to use it only as a fallback to the document
web_search_tool = DuckDuckGoSearchResults(
    description=(
        "Use this tool to search the web for financial information such as "
        "revenue figures if they are not found in the document."
    ),
    name="Web Search",
)


# Step 1: Build a RetrievalQA chain that answers questions over the PDF retriever
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",  # other chain types such as 'map_reduce' can be selected here
)

# Step 2: Expose the RetrievalQA chain to the agent as a single-input tool
qa_tool = Tool(
    func=qa_chain.run,
    name="Document QA",
    description=(
        "Use this tool to find financial information, such as revenue "
        "figures, from the company's PDF document."
    ),
)

In [71]:
# Step 4: Collect every tool the agent may call (calculator, web search, document QA)
tools = [
    calculate_growth_tool,
    web_search_tool,
    qa_tool,
]

In [72]:
# Step 5: Define the system message to guide the agent
system_message = """
You are an assistant that helps answer financial questions about a company.

When given a question, you should:

1. Determine if any information is missing that is needed to answer the question.

2. If information is missing, first try to find it using the 'Document QA' tool.

3. If the information is not found in the document, then try to find it using the 'Web Search' tool.

4. Once you have all the necessary information, use the appropriate tool to compute or provide the answer.

Use the tools as needed, and be sure to provide the final answer to the user.
"""

# Step 6: Initialize the agent with the custom system message
# The structured-chat agent builds its prompt from `prefix`, the tool list, the format
# instructions and `suffix`. It has no `system_message` parameter, so passing the text
# under that key left the guidance out of the prompt entirely. Supplying it as `prefix`
# injects it; because `prefix` replaces the default one, the sentence introducing the
# tool list is appended here so the tool descriptions that follow still read correctly.
agent_kwargs = {
    "prefix": f"{system_message.strip()}\n\nYou have access to the following tools:"
}

agent_chain = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,  # print the agent's thought/action/observation trace
    agent_kwargs=agent_kwargs
)

In [73]:
# Step 7: Pose a question that contains no revenue figures, so the agent must look them up
input_query = "What is the revenue growth between 2023 and 2022?"

# Step 8: Execute the agent on the question and print its final answer
results = agent_chain.run(input_query)
print(results)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThe user didn't provide specific revenue figures for 2022 and 2023, so I will need to find this information first. I will use the Document QA tool to search the document for the 2022 and 2023 revenue figures.
Action:
```
{
  "action": "Document QA",
  "action_input": "What is the revenue for 2022 and 2023?"
}
```[0m
Observation: [38;5;200m[1;3mThe revenue for 2022 was $46,710 million and for 2023 it was $51,217 million.[0m
Thought:[32;1m[1;3mNow that I have the revenue figures for both 2022 and 2023, I can calculate the revenue growth between these two years. I will use the Revenue Growth Calculator tool for this. The revenue for 2022 (the old revenue) is $46,710 million and for 2023 (the new revenue) is $51,217 million.
Action:
```
{
  "action": "Revenue Growth Calculator",
  "action_input": {"revenue_new": 51217, "revenue_old": 46710}
}
```
[0m
Observation: [36;1m[1;3m9.648897452365661[0m
Thought:[32;1m[1;3mThe 