In [1]:
# Install necessary packages (ensure these are installed in your environment)
# %pip install -qU pypdf==4.0.1 langchain_community
# %pip install -U duckduckgo-search
# %pip install -qU langchain-openai
# %pip install faiss-cpu
# %pip install langchain==0.3.3


In [2]:
# %pip install -U sentence-transformers
# %pip install -U torch  # For CPU
# If you have a GPU and want to leverage it, install the appropriate PyTorch version:
# pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117


In [3]:

import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.agents import initialize_agent, AgentType
from langchain.tools import DuckDuckGoSearchResults, StructuredTool, Tool
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from pydantic import BaseModel, Field
from langchain.llms import OpenAI


In [4]:

import requests
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
from pydantic import BaseModel, Field
from langchain.agents import initialize_agent, AgentType
from langchain.tools import DuckDuckGoSearchResults, StructuredTool, Tool
from langchain.chat_models import ChatOpenAI 

In [5]:
from langchain.embeddings import HuggingFaceEmbeddings

In [6]:

import langchain
print(langchain.__version__)


0.3.3


In [7]:
from langchain_community.llms import Ollama

In [8]:
# Define Ollama Configuration and LLM Wrapper
class OllamaConfig(BaseModel):
    url: str = Field(default="http://localhost:11434")  # Default Ollama API URL
    model: str = Field(default="llama3")  # Replace with your model name

class OllamaLLM(LLM):
    config: OllamaConfig

    @property
    def _llm_type(self):
        return "ollama"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        api_endpoint = f"{self.config.url}/v1/llm/{self.config.model}/completions"

        headers = {
            "Content-Type": "application/json",
            # Add authorization headers if required
        }

        payload = {
            "prompt": prompt,
            "max_tokens": 1000,  # Adjust as needed
            "stop": stop,
            # Include other parameters as per Ollama's API
        }

        response = requests.post(api_endpoint, headers=headers, json=payload)

        if response.status_code != 200:
            raise ValueError(f"Error {response.status_code}: {response.text}")

        data = response.json()
        return data.get("text", "")

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return self.config.dict()

    @property
    def _default_params(self):
        return {}

# Initialize your custom Ollama LLM
ollama_config = OllamaConfig(
    url="http://localhost:11434",  # Replace with your Ollama server URL if different
    model="llama3"  # Ensure this matches the model you've pulled
)


In [9]:

# Securely set your OpenAI API key as an environment variable before running the script
# Example (in your terminal):
# export OPENAI_API_KEY="your-api-key"

import os
api_key = ""
os.environ["OPENAI_API_KEY"] = api_key
os.environ["API_KEY_OPENAI"] = api_key


In [10]:

# List of company names and their corresponding PDF file paths
companies = {
    "Nike": "../../data/Nike-Annual-Report.pdf",
    "Lyft": "../../data/Lyft-Annual-Report-2021.pdf",
    "Uber": "../../data/Uber-Annual-Report-2021.pdf"
}


In [11]:

# Initialize dictionaries to hold data for each company
company_docs = {}
company_vectorstores = {}
company_retrievers = {}
company_qa_tools = {}


In [12]:

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Create embeddings
# embeddings = OpenAIEmbeddings()
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Process each company's PDF
for company, file_path in companies.items():
    # Load the PDF document
    loader = PyPDFLoader(file_path)
    docs = loader.load()
    
    # Split the document into chunks
    splits = text_splitter.split_documents(docs)
    
    # Create a vector store for the company's documents using FAISS
    vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)
    
    # Create a retriever for the vector store
    retriever = vectorstore.as_retriever()
    
    # Store in dictionaries
    company_docs[company] = docs
    company_vectorstores[company] = vectorstore
    company_retrievers[company] = retriever


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange
  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [32]:

# llm = ChatOpenAI()  # Ensure you have access to GPT-4
llm = Ollama(model="llama3", base_url="http://localhost:11434/")


In [33]:
from langchain.tools import StructuredTool

# Define the QAInput model
class QAInput(BaseModel):
    query: str = Field(..., description="The query to ask the company's documents.")

# Define a function that extracts the query from QAInput and runs the QA chain
def qa_run(input: QAInput, **kwargs) -> str:
    return qa_chain.run(input.query)

# Create StructuredTool instances for each company
qa_tools = []
for company, retriever in company_retrievers.items():
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # You can choose 'stuff', 'map_reduce', etc.
        retriever=retriever
    )
    
    qa_tool = StructuredTool.from_function(
        func=qa_run,  # Use the wrapper function
        name=f"{company} QA",
        description=f"Use this tool to answer questions about {company}'s information, especially financial information.",
        args_schema=QAInput  # Use the concrete Pydantic model
    )
    qa_tools.append(qa_tool)


In [34]:

# Define the Revenue Growth Calculator Tool
class RevenueGrowthInput(BaseModel):
    revenue_new: float = Field(..., description="Revenue in the new year")
    revenue_old: float = Field(..., description="Revenue in the old year")

def calculate_revenue_growth(revenue_new: float, revenue_old: float) -> float:
    """Calculates revenue growth from the old year to the new year."""
    return ((revenue_new - revenue_old) / revenue_old) * 100

calculate_growth_tool = StructuredTool.from_function(
    func=calculate_revenue_growth,
    name="Revenue Growth Calculator",
    description=(
        "Calculates the revenue growth percentage between two years. "
        "Requires 'revenue_new' and 'revenue_old' as inputs."
    ),
    args_schema=RevenueGrowthInput
)

# Define the DuckDuckGo search tool
web_search_tool = DuckDuckGoSearchResults(
    name="Web Search",
    description=(
        "Use this tool to search the web for financial information such as revenue figures "
        "if they are not found in the documents."
    )
)


In [35]:
# List of company names for routing
company_names = list(companies.keys())

# Create a vector store for routing using FAISS
routing_vectorstore = FAISS.from_texts(texts=company_names, embedding=embeddings)

# Define a function to get the most similar company based on query
def get_most_similar_company(query: str) -> str:
    similar_docs = routing_vectorstore.similarity_search(query, k=1)
    if similar_docs:
        return similar_docs[0].page_content
    else:
        return None

# Define the semantic routing function
def semantic_router(query: str) -> str:
    company = get_most_similar_company(query)
    if company and company in companies:
        return company
    else:
        return "Web Search"  # Fallback to web search if no company matches

# Define the Semantic Router Tool
router_tool = Tool(
    name="Semantic Router",
    func=semantic_router,
    description=(
        "Determines which company's knowledge base to use for answering the question. "
        "Returns the company name (Nike, Lyft, Uber) or 'Web Search' if no match is found."
    )
)


In [36]:

# Update the tools list to include the router and individual QA tools
tools = [router_tool] + qa_tools + [calculate_growth_tool, web_search_tool]


In [37]:

# Define the system message
# %%
# Define the system message with clear instructions and examples
system_message = """
You are an assistant that helps answer financial questions about companies.

When given a question, you should:

1. Use the 'Semantic Router' tool to determine which company's information is relevant to the question (Nike, Lyft, or Uber).
2. Use the appropriate '{company} QA' tool to find information from the selected company's documents.
3. If information is missing, use the 'Web Search' tool to find it.
4. Once you have all the necessary information, if you need to do some calculation regarding revenue growth, use the 'Revenue Growth Calculator' tool to compute or provide the answer.

**Important:** When deciding to use a tool, output your response in the following JSON format **exactly**:

```json
{
  "action": "Tool Name",
  "action_input": "Input for the tool"
}
"""

In [38]:

agent_kwargs = {
    "system_message": system_message
}


In [39]:

# Initialize the agent with the updated tools and system message
agent_chain = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    agent_kwargs=agent_kwargs
)

# Test queries
queries = [
    "What is the revenue growth between 2022 and 2023 for Lyft?",
    "Summarize Uber's financial performance in 2022.",
    "What are Nike's net profits for the last quarter?"
]

for query in queries:
    print(f"\nQuery: {query}")
    results = agent_chain.run(query)
    print(f"Answer: {results}")

# Additional test queries
additional_queries = [
    "Summarize the Lyft document. What info does it mention?",
    "What is Nike's net profit after the end of May 31, 2023?"
]

for query in additional_queries:
    print(f"\nQuery: {query}")
    results = agent_chain.run(query)
    print(f"Answer: {results}")



Query: What is the revenue growth between 2022 and 2023 for Lyft?


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Since the question is about Lyft's revenue growth, I should use the correct tool to answer this.

Action:
{
  "action": "Revenue Growth Calculator",
  "action_input": {
    "revenue_new": 2023,
    "revenue_old": 2022
  }
}

[0m

[1m> Finished chain.[0m
Answer: Thought: Since the question is about Lyft's revenue growth, I should use the correct tool to answer this.

Action:
{
  "action": "Revenue Growth Calculator",
  "action_input": {
    "revenue_new": 2023,
    "revenue_old": 2022
  }
}



Query: Summarize Uber's financial performance in 2022.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought:
The human wants to know about Uber's financial performance in 2022. I will use the Uber QA tool to answer this question.

Action:
{
  "action": "Uber QA",
  "action_input": {
    "query": {
      "description": "Summarize Uber's financia

ValidationError: 1 validation error for QAInput
query
  Input should be a valid string [type=string_type, input_value={'description': "What are...uery', 'type': 'string'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/string_type