In [1]:
# Install necessary packages (ensure these are installed in your environment)
# %pip install -qU pypdf==4.0.1 langchain_community
# %pip install -U duckduckgo-search
# %pip install -qU langchain-openai
# %pip install faiss-cpu
# %pip install langchain==0.3.3


In [2]:
# %pip install -U sentence-transformers
# %pip install -U torch  # For CPU
# If you have a GPU and want to leverage it, install the appropriate PyTorch version:
# pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117


In [3]:

import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.agents import initialize_agent, AgentType
from langchain.tools import DuckDuckGoSearchResults, StructuredTool, Tool
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from pydantic import BaseModel, Field
from langchain.llms import OpenAI


In [4]:

import requests
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
from functools import partial
from pydantic import BaseModel, Field
from langchain.agents import initialize_agent, AgentType
from langchain.tools import DuckDuckGoSearchResults, StructuredTool, Tool
from langchain.chat_models import ChatOpenAI 

In [5]:
from langchain.embeddings import HuggingFaceEmbeddings

In [6]:

import langchain
print(langchain.__version__)


0.3.3


In [7]:
from langchain_community.llms import Ollama

In [8]:
%pip install "psycopg[binary,pool]"

Note: you may need to restart the kernel to use updated packages.


In [9]:
from langchain_postgres.vectorstores import PGVector

connection_string = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"
# db = PGVector.from_documents(documents, model, connection=connection)

In [10]:
# Securely set your OpenAI API key as an environment variable before running the script
# Example (in your terminal):
# export OPENAI_API_KEY="your-api-key"

import os
api_key = ""
os.environ["OPENAI_API_KEY"] = api_key
os.environ["API_KEY_OPENAI"] = api_key


In [11]:
# Define the directory to store FAISS indexes
FAISS_INDEX_DIR = "./faiss_indexes"

# Create the directory if it doesn't exist
os.makedirs(FAISS_INDEX_DIR, exist_ok=True)

In [12]:
def load_or_create_faiss_index_from_documents(
    year: str,
    file_path: str,
    embeddings: OpenAIEmbeddings,
    text_splitter: RecursiveCharacterTextSplitter,
    create_anyway: bool = False
) -> FAISS:
    """
    Loads a FAISS index for a year if it exists; otherwise, creates and saves a new one.
    """
    index_path = os.path.join(FAISS_INDEX_DIR, f"{year.lower()}_documents")
    
    if not create_anyway and os.path.exists(index_path):
        print(f"Loading existing FAISS index for {year} from {index_path}")
        vectorstore = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
    else:
        print(f"Creating FAISS index for {year}")
        # Load and split the documents
        loader = PyPDFLoader(file_path)
        docs = loader.load()
        splits = text_splitter.split_documents(docs)
    
        # Create FAISS vector store
        vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)
        
        # Save the FAISS index to disk
        vectorstore.save_local(index_path)
        print(f"Saved FAISS index for {year} to {index_path}")
    
    return vectorstore, splits

def load_or_create_faiss_index_for_routing(
    years: List[str],
    embeddings: OpenAIEmbeddings,
    create_anyway: bool = False
) -> FAISS:
    routing_index_path = os.path.join(FAISS_INDEX_DIR, "routing")
    
    if not create_anyway and os.path.exists(routing_index_path):
        print(f"Loading existing FAISS routing index from {routing_index_path}")
        routing_vectorstore = FAISS.load_local(
            routing_index_path, embeddings, allow_dangerous_deserialization=True
        )
    else:
        print("Creating FAISS routing index")
        routing_vectorstore = FAISS.from_texts(texts=years, embedding=embeddings)
        routing_vectorstore.save_local(routing_index_path)
        print(f"Saved FAISS routing index to {routing_index_path}")
    
    return routing_vectorstore


In [13]:
# List of year names and their corresponding PDF file paths

years = {
    "UNL-Agriculture-2021": "../../data/unl/2021research-results.pdf",
    "UNL-Agriculture-2022": "../../data/unl/2022research-results.pdf",
    "UNL-Agriculture-2023": "../../data/unl/2023research-results.pdf"
}


In [14]:
# Initialize dictionaries to hold data for each year

year_docs = {}
year_vectorstores = {}
year_retrievers = {}
year_qa_tools = {}


In [15]:
# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Create embeddings and choose llm

# llm = Ollama(model="llama3", base_url="http://localhost:11434/")
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

embeddings = OpenAIEmbeddings(api_key = os.environ["OPENAI_API_KEY"])
llm = ChatOpenAI(api_key = os.environ["OPENAI_API_KEY"], model='gpt-4')  # Ensure you have access to GPT-4


  embeddings = OpenAIEmbeddings(api_key = os.environ["OPENAI_API_KEY"])
  llm = ChatOpenAI(api_key = os.environ["OPENAI_API_KEY"], model='gpt-4')  # Ensure you have access to GPT-4


In [17]:
# Process each year's PDF
# Process each year's PDF
for year, file_path in years.items():
    vectorstore, splits = load_or_create_faiss_index_from_documents(
        year=year,
        file_path=file_path,
        embeddings=embeddings,
        text_splitter=text_splitter,
        create_anyway=True
    )
    
    # Create a retriever for the vector store
    retriever = vectorstore.as_retriever()
    
    # Create a PGVector vector store for the year's documents
    # vectorstore = PGVector.from_documents(
    # documents=splits,
    # embedding=embeddings,  # Correct parameter name
    # connection=connection_string,
    # collection_name=f"{year.lower()}_documents"  # Separate table for each year
    # )
    
    # Create a retriever for the vector store
    retriever = vectorstore.as_retriever()
    
    # Store in dictionaries
    year_vectorstores[year] = vectorstore
    year_retrievers[year] = retriever
    year_docs[year] = splits


Creating FAISS index for UNL-Agriculture-2021


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


Saved FAISS index for UNL-Agriculture-2021 to ./faiss_indexes/unl-agriculture-2021_documents
Creating FAISS index for UNL-Agriculture-2022
Saved FAISS index for UNL-Agriculture-2022 to ./faiss_indexes/unl-agriculture-2022_documents
Creating FAISS index for UNL-Agriculture-2023
Saved FAISS index for UNL-Agriculture-2023 to ./faiss_indexes/unl-agriculture-2023_documents


In [18]:
# Define the Revenue Growth Calculator Tool
class RevenueGrowthInput(BaseModel):
    revenue_new: float = Field(..., description="Revenue in the new year")
    revenue_old: float = Field(..., description="Revenue in the old year")

def revenue_growth_calculator(revenue_new: float, revenue_old: float) -> float:
    """Calculates revenue growth from the old year to the new year."""
    try:
        growth = ((revenue_new - revenue_old) / revenue_old) * 100
        print(f"Calculated revenue growth: {growth}%")
        return growth
    except ZeroDivisionError:
        print("Division by zero encountered in revenue growth calculation.")
        return 0.0

revenue_growth_tool = StructuredTool.from_function(
    func=revenue_growth_calculator,
    name="Revenue Growth Calculator",
    description=(
        "Calculates the revenue growth percentage between two years. "
        "Requires the following inputs in a JSON object:\n"
        "- 'revenue_new': float - Revenue in the new year\n"
        "- 'revenue_old': float - Revenue in the old year"
    ),
    args_schema=RevenueGrowthInput
)

# Define the DuckDuckGo search tool
web_search_tool = DuckDuckGoSearchResults(
    name="Web Search",
    description=(
        "Use this tool to search the web for information not present in the given documents"
        # "if they are not found in the documents."
    )
)

In [20]:
# List of year names for routing
year_names = list(years.keys())

# Create embeddings for routing (reuse the existing embeddings)
routing_embeddings = embeddings  # Using the same embedding model

# Initialize FAISS vector store for routing using the utility function
routing_vectorstore = load_or_create_faiss_index_for_routing(
    years=list(years.keys()),
    embeddings=embeddings,
    create_anyway=True  # Force recreation of the routing index
)


# Define a function to get the most similar year based on query using FAISS
def get_most_similar_year(query: str) -> Optional[str]:
    try:
        similar_docs = routing_vectorstore.similarity_search(query, k=1)
        if similar_docs:
            year = similar_docs[0].page_content
            print(f"Most similar year for query '{query}': {year}")
            return year
        else:
            print(f"No similar year found for query '{query}'")
            return None
    except Exception as e:
        print(f"Error during similarity search: {e}")
        return None

# Define separate Pydantic models
class RouterInput(BaseModel):
    query: str = Field(..., description="The query to determine the relevant year.")

class QAInput(BaseModel):
    query: str = Field(..., description="The year for the QA tool.")

# Define the Semantic Router function
def semantic_router(query: str) -> str:
    year = get_most_similar_year(query)
    if year and year in years:
        return year
    else:
        return "No Matching Year"  # Or handle as per your requirement


# Define the Semantic Router Tool as a StructuredTool
router_tool = StructuredTool.from_function(
    func=semantic_router,  # Now accepts 'query' directly
    name="Semantic Router",
    description=(
        "Determines which year's knowledge base to use for answering the question. "
        "Requires 'query' as input and returns the year (2021, 2022, 2023) or 'Web Search' if no match is found."
    ),
    args_schema=RouterInput  # Corrected schema
)

# Define a generic QA run function
def run_qa(chain: RetrievalQA, query: str) -> str:
    try:
        response = chain.run(query)
        print(f"QA response: {response}")
        return response
    except Exception as e:
        logger.error(f"Error during QA run: {e}")
        return "I'm sorry, I encountered an error while processing your request."

# Create StructuredTool instances for each year
qa_tools = []
for year, retriever in year_retrievers.items():
    print(f"Setting up QA tool for {year}")
    
    # Initialize the RetrievalQA chain for the year
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # Options: 'stuff', 'map_reduce', etc.
        retriever=retriever
    )
    
    # Bind the current qa_chain to the run_qa function using partial
    bound_qa_run = partial(run_qa, chain=qa_chain)
    
    # Create the StructuredTool for the year
    qa_tool = StructuredTool.from_function(
        func=bound_qa_run,
        name=f"{year} QA",
        description=(
            f"Use this tool to answer questions about {year}'s information, "
            "especially agricultural information."
        ),
        args_schema=QAInput  # Use the QAInput schema
    )
    
    qa_tools.append(qa_tool)
    print(f"QA tool for {year} created")

Creating FAISS routing index
Saved FAISS routing index to ./faiss_indexes/routing
Setting up QA tool for UNL-Agriculture-2021
QA tool for UNL-Agriculture-2021 created
Setting up QA tool for UNL-Agriculture-2022
QA tool for UNL-Agriculture-2022 created
Setting up QA tool for UNL-Agriculture-2023
QA tool for UNL-Agriculture-2023 created


In [21]:

# Update the tools list to include the router and individual QA tools
tools = [router_tool] + qa_tools  # Remove web_search_tool if not used


In [22]:

# Define the system message
# %%
# Define the system message with clear instructions and examples
system_message = """
You are an assistant that helps answer agricultural questions based on UNL research reports.

When given a question, you should:

1. Use the 'Semantic Router' tool to determine which year's information is relevant to the question (UNL-Agriculture-2021, UNL-Agriculture-2022, or UNL-Agriculture-2023).
2. Use the appropriate '{year} QA' tool to find information from the selected year's documents.
3. When answering, consider any previous relevant queries asked by the user and mention if they are helpful.
"""


# **Important:** When deciding to use a tool, output your response in the following JSON format **exactly**:

# ```json
# {
#   "action": "Tool Name",
#   "action_input": {
#     // Required fields for the tool
#   }
# }


In [23]:

agent_kwargs = {
    "system_message": system_message
}


In [24]:
from langchain.memory import ConversationBufferWindowMemory

memory = ConversationBufferWindowMemory(memory_key="chat_history",return_messages=True,k=7)

# Initialize the agent with the updated tools and system message
agent_chain = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    agent_kwargs=agent_kwargs,
    memory = memory
)


  memory = ConversationBufferWindowMemory(memory_key="chat_history",return_messages=True,k=7)
  agent_chain = initialize_agent(


In [25]:
# You can print or log the current state of memory
print(memory.load_memory_variables({}))  # This will show the conversation history


{'chat_history': []}


In [26]:
# Test if the memory is working by asking multiple queries
results = agent_chain.run("Tell me about year 2023 agriculture?")
print(f"Answer: {results}")

  results = agent_chain.run("Tell me about year 2023 agriculture?")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to determine which knowledge base year to use to answer this question. In this case, it seems likely that the Semantic Router will return '2023' as the query clearly mentions '2023'. However, let's use the Semantic Router to confirm.
Action:
```
{
  "action": "Semantic Router",
  "action_input": {
    "query": "Tell me about year 2023 agriculture?"
  }
}
```[0mMost similar year for query 'Tell me about year 2023 agriculture?': UNL-Agriculture-2023

Observation: [36;1m[1;3mUNL-Agriculture-2023[0m
Thought:[32;1m[1;3mAs expected, the Semantic Router tool returned 'UNL-Agriculture-2023'. This suggests that I should use the 'UNL-Agriculture-2023 QA' tool to answer the question about agriculture in 2023.
Action:
```
{
  "action": "UNL-Agriculture-2023 QA",
  "action_input": {
    "query": "Tell me about year 2023 agriculture?"
  }
}
```[0mQA response: In the year 2023, the final year of a research project was