In [1]:
# Install necessary packages (ensure these are installed in your environment)
# %pip install -qU pypdf==4.0.1 langchain_community
# %pip install -U duckduckgo-search
# %pip install -qU langchain-openai
# %pip install faiss-cpu
# %pip install langchain==0.3.3


In [2]:
# %pip install -U sentence-transformers
# %pip install -U torch  # For CPU
# If you have a GPU and want to leverage it, install the appropriate PyTorch version:
# pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117


In [3]:

import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.agents import initialize_agent, AgentType
from langchain.tools import DuckDuckGoSearchResults, StructuredTool, Tool
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from pydantic import BaseModel, Field
from langchain.llms import OpenAI


In [4]:

import requests
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
from pydantic import BaseModel, Field
from langchain.agents import initialize_agent, AgentType
from langchain.tools import DuckDuckGoSearchResults, StructuredTool, Tool
from langchain.chat_models import ChatOpenAI 

In [5]:
from langchain.embeddings import HuggingFaceEmbeddings

In [6]:

import langchain
print(langchain.__version__)


0.3.3


In [7]:
from langchain_community.llms import Ollama

In [8]:
%pip install "psycopg[binary,pool]"

Note: you may need to restart the kernel to use updated packages.


In [9]:
from langchain_postgres.vectorstores import PGVector

connection_string = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"
# db = PGVector.from_documents(documents, model, connection=connection)

In [10]:
# Securely set your OpenAI API key as an environment variable before running the script
# Example (in your terminal):
# export OPENAI_API_KEY="your-api-key"

import os
api_key = ""
os.environ["OPENAI_API_KEY"] = api_key
os.environ["API_KEY_OPENAI"] = api_key


In [11]:
# List of company names and their corresponding PDF file paths

companies = {
    "Nike": "../../data/Nike-Annual-Report.pdf",
    "Lyft": "../../data/Lyft-Annual-Report-2021.pdf",
    "Uber": "../../data/Uber-Annual-Report-2021.pdf"
}


In [12]:
# Initialize dictionaries to hold data for each company

company_docs = {}
company_vectorstores = {}
company_retrievers = {}
company_qa_tools = {}


In [13]:
# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Create embeddings and choose llm

# llm = Ollama(model="llama3", base_url="http://localhost:11434/")
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

embeddings = OpenAIEmbeddings(api_key = os.environ["OPENAI_API_KEY"])
llm = ChatOpenAI(api_key = os.environ["OPENAI_API_KEY"], model='gpt-4')  # Ensure you have access to GPT-4


  embeddings = OpenAIEmbeddings(api_key = os.environ["OPENAI_API_KEY"])
  llm = ChatOpenAI(api_key = os.environ["OPENAI_API_KEY"], model='gpt-4')  # Ensure you have access to GPT-4


In [27]:
# Process each company's PDF
for company, file_path in companies.items():
    # Load the PDF document
    loader = PyPDFLoader(file_path)
    docs = loader.load()
    
    # Split the document into chunks
    splits = text_splitter.split_documents(docs)
    
    # FAISS vector Store
    vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)

    # (Optional) Save the FAISS index to disk
    os.makedirs("./faiss_indexes", exist_ok=True)
    vectorstore.save_local(f"./faiss_indexes/{company.lower()}_documents")
    
    # Create a PGVector vector store for the company's documents
    # vectorstore = PGVector.from_documents(
    # documents=splits,
    # embedding=embeddings,  # Correct parameter name
    # connection=connection_string,
    # collection_name=f"{company.lower()}_documents"  # Separate table for each company
    # )
    
    # Create a retriever for the vector store
    retriever = vectorstore.as_retriever()
    
    # Store in dictionaries
    company_docs[company] = docs
    company_vectorstores[company] = vectorstore
    company_retrievers[company] = retriever


In [15]:
from langchain.tools import StructuredTool

# Define the QAInput model
class QAInput(BaseModel):
    query: str = Field(..., description="The query to ask the company's documents.")

# Define a function that extracts the query from QAInput and runs the QA chain
def qa_run(input: QAInput, **kwargs) -> str:
    return qa_chain.run(input.query)

from functools import partial

qa_tools = []

# Define a generic QA run function that accepts a chain
def run_qa(chain: RetrievalQA, query: str) -> str:
    return chain.run(query)

for company, retriever in company_retrievers.items():
    # Initialize the RetrievalQA chain for the company
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # Options: 'stuff', 'map_reduce', etc.
        retriever=retriever
    )
    
    # Use functools.partial to bind the current qa_chain to run_qa
    bound_qa_run = partial(run_qa, chain=qa_chain)
    
    # Create the StructuredTool for the company
    qa_tool = StructuredTool.from_function(
        func=bound_qa_run,
        name=f"{company} QA",
        description=(
            f"Use this tool to answer questions about {company}'s information, "
            "especially financial information."
        ),
        args_schema=QAInput  # Ensure the args_schema matches the function signature
    )
    
    qa_tools.append(qa_tool)

In [16]:

# Define the Revenue Growth Calculator Tool
class RevenueGrowthInput(BaseModel):
    revenue_new: float = Field(..., description="Revenue in the new year")
    revenue_old: float = Field(..., description="Revenue in the old year")

def calculate_revenue_growth(revenue_new: float, revenue_old: float) -> float:
    """Calculates revenue growth from the old year to the new year."""
    return ((revenue_new - revenue_old) / revenue_old) * 100

calculate_growth_tool = StructuredTool.from_function(
    func=calculate_revenue_growth,
    name="Revenue Growth Calculator",
    description=(
        "Calculates the revenue growth percentage between two years. "
        "Requires 'revenue_new' and 'revenue_old' as inputs."
    ),
    args_schema=RevenueGrowthInput
)

# Define the DuckDuckGo search tool
web_search_tool = DuckDuckGoSearchResults(
    name="Web Search",
    description=(
        "Use this tool to search the web for financial information such as revenue figures "
        "if they are not found in the documents."
    )
)


In [17]:
# List of company names for routing
company_names = list(companies.keys())

# Create embeddings for routing (reuse the existing embeddings)
routing_embeddings = embeddings  # Using the same embedding model

# Create FAISS vector store for routing
routing_vectorstore = FAISS.from_texts(
    texts=list(companies.keys()),
    embedding=routing_embeddings
)

# (Optional) Save the routing FAISS index to disk
routing_vectorstore.save_local("./faiss_indexes/routing")

# Create a PGVector vector store for routing
# routing_vectorstore = PGVector.from_texts(
#     texts=company_names,
#     embedding=embeddings,  # Correct parameter name
#     connection=connection_string,
#     collection_name="routing"  # Correct parameter name
# )

# Define a function to get the most similar company based on query
def get_most_similar_company(query: str) -> Optional[str]:
    similar_docs = routing_vectorstore.similarity_search(query, k=1)
    if similar_docs:
        return similar_docs[0].page_content
    else:
        return None

# Define the semantic routing function
def semantic_router(query: str) -> str:
    company = get_most_similar_company(query)
    if company and company in companies:
        return company
    else:
        return "Web Search"  # Fallback to web search if no company matches

# Define the Semantic Router Tool
router_tool = Tool(
    name="Semantic Router",
    func=semantic_router,
    description=(
        "Determines which company's knowledge base to use for answering the question. "
        "Returns the company name (Nike, Lyft, Uber) or 'Web Search' if no match is found."
    )
)


In [18]:

# Update the tools list to include the router and individual QA tools
tools = [router_tool] + qa_tools + [calculate_growth_tool, web_search_tool]


In [19]:

# Define the system message
# %%
# Define the system message with clear instructions and examples
system_message = """
You are an assistant that helps answer financial questions about companies.

When given a question, you should:

1. Use the 'Semantic Router' tool to determine which company's information is relevant to the question (Nike, Lyft, or Uber).
2. Use the appropriate '{company} QA' tool to find information from the selected company's documents.
3. If information is missing, use the 'Web Search' tool to find it.
4. Once you have all the necessary information, if you need to do some calculation regarding revenue growth, use the 'Revenue Growth Calculator' tool to compute or provide the answer.

**Important:** When deciding to use a tool, output your response in the following JSON format **exactly**:

```json
{
  "action": "Tool Name",
  "action_input": "Input for the tool"
}
"""


In [20]:

agent_kwargs = {
    "system_message": system_message
}


In [21]:
from langchain.memory import ConversationBufferWindowMemory

memory = ConversationBufferWindowMemory(memory_key="chat_history",return_messages=True,k=7)

# Initialize the agent with the updated tools and system message
agent_chain = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    agent_kwargs=agent_kwargs,
    memory = memory
)


  memory = ConversationBufferWindowMemory(memory_key="chat_history",return_messages=True,k=7)
  agent_chain = initialize_agent(


In [22]:

# Test queries
queries = [
    "What is the revenue growth between 2022 and 2023 for Lyft?",
    "Summarize Uber's financial performance in 2022.",
    "What are Nike's net profits for the last quarter?"
]

for query in queries:
    print(f"\nQuery: {query}")
    results = agent_chain.run(query)
    print(f"Answer: {results}")

# Additional test queries
additional_queries = [
    "Summarize the Lyft document. What info does it mention?",
    "What is Nike's net profit after the end of May 31, 2023?"
]

for query in additional_queries:
    print(f"\nQuery: {query}")
    results = agent_chain.run(query)
    print(f"Answer: {results}")


  results = agent_chain.run(query)



Query: What is the revenue growth between 2022 and 2023 for Lyft?


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find Lyft's revenue for 2022 and 2023 to calculate the revenue growth. Therefore, I will use the Lyft QA tool to find this information.

Action:
```
{
  "action": "Lyft QA",
  "action_input": {"query": "What was Lyft's revenue in 2022?"}
}
```[0m
Observation: [38;5;200m[1;3mThe text doesn't provide information about Lyft's revenue in 2022.[0m
Thought:[32;1m[1;3mSince I couldn't find the revenue for 2022 from the Lyft knowledge base, I will try to find this information using a web search.

Action:
```
{
  "action": "Web Search",
  "action_input": {"query": "Lyft revenue 2022"}
}
``` 
[0m
Observation: [38;5;200m[1;3msnippet: Lyft Media: Launched in-app video ads in Q4 with strong results in terms of views and click-throughs. Lyft Media revenue in Q4 2023 surpassed the level achieved in all of 2022. We are working closely with partners

In [26]:
results = agent_chain.run("What queries I asked you before?")
print(f"Answer: {results}")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: The assistant doesn't have the ability to recall past queries, as each conversation is treated independently without access to past conversations. 
Action:
```
{
  "action": "Final Answer",
  "action_input": "I'm sorry, but I am not able to recall previous queries. Each conversation is handled independently and does not have access to past conversations."
}
```[0m

[1m> Finished chain.[0m
Answer: I'm sorry, but I am not able to recall previous queries. Each conversation is handled independently and does not have access to past conversations.


In [None]:
## Appendix

In [23]:
# import os
# import openai
# from dotenv import load_dotenv, find_dotenv

# _ = load_dotenv(find_dotenv())

# from openai import OpenAI
# print(f'API key: {os.environ.get("OPENAI_API_KEY", "No API Key found")}')
# client = OpenAI(
#     # This is the default and can be omitted
#     api_key=os.environ.get("OPENAI_API_KEY", "No API Key found"),
# )


In [24]:

# # Test queries
# queries = [
#     "What is the revenue growth between 2022 and 2023 for Lyft?",
#     "Summarize Uber's financial performance in 2022.",
#     "What are Nike's net profits for the last quarter?"
# ]

# for query in queries:
#     print(f"\nQuery: {query}")
#     results = agent_chain.run(query)
#     print(f"Answer: {results}")

# # Additional test queries
# additional_queries = [
#     "Summarize the Lyft document. What info does it mention?",
#     "What is Nike's net profit after the end of May 31, 2023?"
# ]

# for query in additional_queries:
#     print(f"\nQuery: {query}")
#     results = agent_chain.run(query)
#     print(f"Answer: {results}")


In [25]:
# from langchain_community.chat_message_histories import (
#     PostgresChatMessageHistory,
# )
# import uuid
# session_id = str(uuid.uuid4())

# history = PostgresChatMessageHistory(
#     connection_string='postgresql://langchain:langchain@localhost:6024/langchain',
#     session_id=session_id,
# )
# history
# from langchain.memory.vectorstore import VectorStoreRetrieverMemory

# # Initialize PGVector for memory
# memory_vectorstore = PGVector.from_documents(
#     # documents=[],  # Initialize with empty documents
#     embedding=embeddings,  # Correct parameter name
#     connection=connection_string,
#     collection_name="memory"  # Correct parameter name
# )

# # Initialize VectorStoreRetrieverMemory
# memory = VectorStoreRetrieverMemory(
#     retriever=memory_vectorstore.as_retriever(),
#     memory_key="chat_history",
#     return_docs=False  # Set to True if you want to return documents in memory
# )