In [1]:
# Install necessary packages (ensure these are installed in your environment)
# %pip install -qU pypdf==4.0.1 langchain_community
# %pip install -U duckduckgo-search
# %pip install -qU langchain-openai
# %pip install faiss-cpu
# %pip install langchain==0.3.3
# %pip install textstat


In [2]:
# %pip install -U sentence-transformers
# %pip install -U torch  # For CPU
# If you have a GPU and want to leverage it, install the appropriate PyTorch version:
# pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117


In [3]:

import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.agents import initialize_agent, AgentType
from langchain.tools import DuckDuckGoSearchResults, StructuredTool, Tool
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from pydantic import BaseModel, Field
from langchain.llms import OpenAI


In [4]:

import requests
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
from functools import partial
from pydantic import BaseModel, Field
from langchain.agents import initialize_agent, AgentType
from langchain.tools import DuckDuckGoSearchResults, StructuredTool, Tool
from langchain.chat_models import ChatOpenAI 
from textstat import textstat

In [5]:
from langchain.embeddings import HuggingFaceEmbeddings

In [6]:

import langchain
print(langchain.__version__)


0.3.3


In [7]:
from langchain_community.llms import Ollama

In [8]:
from langchain_postgres.vectorstores import PGVector

connection_string = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"
# db = PGVector.from_documents(documents, model, connection=connection)

In [None]:
# Securely set your OpenAI API key as an environment variable before running the script
# Example (in your terminal):
# export OPENAI_API_KEY="your-api-key"

import os
api_key =""
os.environ["OPENAI_API_KEY"] = api_key
os.environ["API_KEY_OPENAI"] = api_key


In [10]:
# Define the directory to store FAISS indexes
FAISS_INDEX_DIR = "./faiss_indexes"

# Create the directory if it doesn't exist
os.makedirs(FAISS_INDEX_DIR, exist_ok=True)

In [11]:
def load_or_create_faiss_index_from_documents(
    year: str,
    file_path: str,
    embeddings: OpenAIEmbeddings,
    text_splitter: RecursiveCharacterTextSplitter,
    create_anyway: bool = False
) -> FAISS:
    """
    Loads a FAISS index for a year if it exists; otherwise, creates and saves a new one.
    """
    index_path = os.path.join(FAISS_INDEX_DIR, f"{year.lower()}_documents")
    
    if not create_anyway and os.path.exists(index_path):
        print(f"Loading existing FAISS index for {year} from {index_path}")
        vectorstore = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
    else:
        print(f"Creating FAISS index for {year}")
        # Load and split the documents
        loader = PyPDFLoader(file_path)
        docs = loader.load()
        splits = text_splitter.split_documents(docs)
    
        # Create FAISS vector store
        vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)
        
        # Save the FAISS index to disk
        vectorstore.save_local(index_path)
        print(f"Saved FAISS index for {year} to {index_path}")
    
    return vectorstore, splits

def load_or_create_faiss_index_for_routing(
    years: List[str],
    embeddings: OpenAIEmbeddings,
    create_anyway: bool = False
) -> FAISS:
    routing_index_path = os.path.join(FAISS_INDEX_DIR, "routing")
    
    if not create_anyway and os.path.exists(routing_index_path):
        print(f"Loading existing FAISS routing index from {routing_index_path}")
        routing_vectorstore = FAISS.load_local(
            routing_index_path, embeddings, allow_dangerous_deserialization=True
        )
    else:
        print("Creating FAISS routing index")
        routing_vectorstore = FAISS.from_texts(texts=years, embedding=embeddings)
        routing_vectorstore.save_local(routing_index_path)
        print(f"Saved FAISS routing index to {routing_index_path}")
    
    return routing_vectorstore


In [12]:
# List of year names and their corresponding PDF file paths

# Hardcoded years dictionary
years = {
    "UNL-Agriculture-2013": "../../data/unl/2013research-results.pdf",
    "UNL-Agriculture-2014": "../../data/unl/2014research-results.pdf",
    "UNL-Agriculture-2015": "../../data/unl/2015research-results.pdf",
    "UNL-Agriculture-2016": "../../data/unl/2016research-results.pdf",
    "UNL-Agriculture-2017": "../../data/unl/2017research-results.pdf",
    "UNL-Agriculture-2018": "../../data/unl/2018research-results.pdf",
    "UNL-Agriculture-2019": "../../data/unl/2019research-results.pdf",
    "UNL-Agriculture-2020": "../../data/unl/2020research-results.pdf",
    "UNL-Agriculture-2021": "../../data/unl/2021research-results.pdf",
    "UNL-Agriculture-2022": "../../data/unl/2022research-results.pdf",
    "UNL-Agriculture-2023": "../../data/unl/2023research-results.pdf",
}



In [13]:
def initialize_year_data(
    years_dict: dict,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    recreate_indexes: bool = True
):
    """
    Initializes data for each year's PDF:
      - Loads/splits documents
      - Creates (or loads) a FAISS vectorstore
      - Creates retrievers
      - Stores all in dictionaries and returns them
    
    Args:
        years_dict (dict): A mapping of year-string -> PDF path.
        chunk_size (int): Chunk size for splitting text.
        chunk_overlap (int): Overlap for splitting text.
        recreate_indexes (bool): Whether to force-create new FAISS indexes.

    Returns:
        A dictionary with:
          {
            "year_docs": {year: [Doc1, Doc2, ...], ...},
            "year_vectorstores": {year: FAISS_store, ...},
            "year_retrievers": {year: VectorStoreRetriever, ...},
            "year_qa_tools": {year: SomeQAToolOrChain, ...}
          }
    """
    import os
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.chat_models import ChatOpenAI
    
    # You could parameterize the API key or read from environment, etc.
    api_key = os.environ.get("OPENAI_API_KEY")
    embeddings = OpenAIEmbeddings(api_key=api_key)
    llm = ChatOpenAI(api_key=api_key, model='gpt-4')

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    year_docs = {}
    year_vectorstores = {}
    year_retrievers = {}
    year_qa_tools = {}  # only if you need to store QA chains or something similar

    for year, file_path in years_dict.items():
        # load_or_create_faiss_index_from_documents is presumably your own utility 
        # that returns (vectorstore, splits)
        vectorstore, splits = load_or_create_faiss_index_from_documents(
            year=year,
            file_path=file_path,
            embeddings=embeddings,
            text_splitter=text_splitter,
            create_anyway=recreate_indexes
        )
        
        retriever = vectorstore.as_retriever()

        # Store them in the dictionaries
        year_vectorstores[year] = vectorstore
        year_retrievers[year] = retriever
        year_docs[year] = splits

        # If you need to do anything else (e.g., building a QA chain):
        # qa_chain = RetrievalQA.from_chain_type(
        #    llm=llm,
        #    chain_type="stuff",
        #    retriever=retriever
        # )
        # year_qa_tools[year] = qa_chain  # or store partial run_qa, etc.
    
    return {
        "year_docs": year_docs,
        "year_vectorstores": year_vectorstores,
        "year_retrievers": year_retrievers,
        "year_qa_tools": year_qa_tools
    }


In [25]:
embeddings = OpenAIEmbeddings(api_key=api_key)
llm = ChatOpenAI(api_key=api_key, model='gpt-4')

In [20]:
a = initialize_year_data(years)

Creating FAISS index for UNL-Agriculture-2013
Saved FAISS index for UNL-Agriculture-2013 to ./faiss_indexes/unl-agriculture-2013_documents
Creating FAISS index for UNL-Agriculture-2014
Saved FAISS index for UNL-Agriculture-2014 to ./faiss_indexes/unl-agriculture-2014_documents
Creating FAISS index for UNL-Agriculture-2015
Saved FAISS index for UNL-Agriculture-2015 to ./faiss_indexes/unl-agriculture-2015_documents
Creating FAISS index for UNL-Agriculture-2016
Saved FAISS index for UNL-Agriculture-2016 to ./faiss_indexes/unl-agriculture-2016_documents
Creating FAISS index for UNL-Agriculture-2017
Saved FAISS index for UNL-Agriculture-2017 to ./faiss_indexes/unl-agriculture-2017_documents
Creating FAISS index for UNL-Agriculture-2018
Saved FAISS index for UNL-Agriculture-2018 to ./faiss_indexes/unl-agriculture-2018_documents
Creating FAISS index for UNL-Agriculture-2019
Saved FAISS index for UNL-Agriculture-2019 to ./faiss_indexes/unl-agriculture-2019_documents
Creating FAISS index for UN

In [23]:
year_docs = a['year_docs']
year_vectorstores = a['year_vectorstores']
year_retrievers = a['year_retrievers']
year_qa_tools = a['year_qa_tools']

In [26]:
# List of year names for routing
year_names = list(years.keys())

# Create embeddings for routing (reuse the existing embeddings)
routing_embeddings = embeddings  # Using the same embedding model

# Initialize FAISS vector store for routing using the utility function
routing_vectorstore = load_or_create_faiss_index_for_routing(
    years=list(years.keys()),
    embeddings=embeddings,
    create_anyway=True  # Force recreation of the routing index
)


# Define a function to get the most similar year based on query using FAISS
def get_most_similar_year(query: str) -> Optional[str]:
    try:
        similar_docs = routing_vectorstore.similarity_search(query, k=1)
        if similar_docs:
            year = similar_docs[0].page_content
            print(f"Most similar year for query '{query}': {year}")
            return year
        else:
            print(f"No similar year found for query '{query}'")
            return None
    except Exception as e:
        print(f"Error during similarity search: {e}")
        return None

# Define separate Pydantic models
class RouterInput(BaseModel):
    query: str = Field(..., description="The query to determine the relevant year.")

class QAInput(BaseModel):
    query: str = Field(..., description="The year for the QA tool.")

# Define the Semantic Router function
def semantic_router(query: str) -> str:
    year = get_most_similar_year(query)
    if year and year in years:
        return year
    else:
        return "No Matching Year"  # Or handle as per your requirement


# Define the Semantic Router Tool as a StructuredTool
router_tool = StructuredTool.from_function(
    func=semantic_router,  # Now accepts 'query' directly
    name="Semantic Router",
    description=(
        "Determines which year's knowledge base to use for answering the question. "
        "Requires 'query' as input and returns the year (UNL-Agriculture-2013 to UNL-Agriculture-2023) or 'No Matching Year' if no match is found."
    ),
    args_schema=RouterInput  # Corrected schema
)


# Define a generic QA run function
def run_qa(chain: RetrievalQA, query: str) -> str:
    try:
        response = chain.run(query)
        fk_grade = textstat.flesch_kincaid_grade(response)
        print(f"Flesch-Kincaid Grade: {fk_grade}")
        return f"{response}\n\nFlesch-Kincaid Grade: {fk_grade}"
    except Exception as e:
        print(f"Error during QA run: {e}")
        return "I'm sorry, I encountered an error while processing your request."


# Create StructuredTool instances for each year
qa_tools = []
for year, retriever in year_retrievers.items():
    print(f"Setting up QA tool for {year}")
    
    # Initialize the RetrievalQA chain for the year
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # Options: 'stuff', 'map_reduce', etc.
        retriever=retriever
    )
    
    # Bind the current qa_chain to the run_qa function using partial
    bound_qa_run = partial(run_qa, chain=qa_chain)
    
    # Create the StructuredTool for the year
    qa_tool = StructuredTool.from_function(
        func=bound_qa_run,
        name=f"{year} QA",
        description=(
            f"Use this tool to answer questions about {year}'s information, "
            "especially agricultural information."
        ),
        args_schema=QAInput  # Use the QAInput schema
    )
    
    qa_tools.append(qa_tool)
    print(f"QA tool for {year} created")

Creating FAISS routing index
Saved FAISS routing index to ./faiss_indexes/routing
Setting up QA tool for UNL-Agriculture-2013
QA tool for UNL-Agriculture-2013 created
Setting up QA tool for UNL-Agriculture-2014
QA tool for UNL-Agriculture-2014 created
Setting up QA tool for UNL-Agriculture-2015
QA tool for UNL-Agriculture-2015 created
Setting up QA tool for UNL-Agriculture-2016
QA tool for UNL-Agriculture-2016 created
Setting up QA tool for UNL-Agriculture-2017
QA tool for UNL-Agriculture-2017 created
Setting up QA tool for UNL-Agriculture-2018
QA tool for UNL-Agriculture-2018 created
Setting up QA tool for UNL-Agriculture-2019
QA tool for UNL-Agriculture-2019 created
Setting up QA tool for UNL-Agriculture-2020
QA tool for UNL-Agriculture-2020 created
Setting up QA tool for UNL-Agriculture-2021
QA tool for UNL-Agriculture-2021 created
Setting up QA tool for UNL-Agriculture-2022
QA tool for UNL-Agriculture-2022 created
Setting up QA tool for UNL-Agriculture-2023
QA tool for UNL-Agricult

In [27]:

# Update the tools list to include the router and individual QA tools
tools = [router_tool] + qa_tools  # Remove web_search_tool if not used


In [28]:

# Define the system message
# %%
# Define the system message with clear instructions and examples
system_message = """
You are an assistant that helps answer agricultural questions based on UNL research reports.

When given a question, you should:

1. Use the 'Semantic Router' tool to determine which year's information is relevant to the question (UNL-Agriculture-2013 to UNL-Agriculture-2023).
2. Use the appropriate '{year} QA' tool to find information from the selected year's documents.
3. When answering, consider any previous relevant queries asked by the user and mention if they are helpful.
"""



# **Important:** When deciding to use a tool, output your response in the following JSON format **exactly**:

# ```json
# {
#   "action": "Tool Name",
#   "action_input": {
#     // Required fields for the tool
#   }
# }


In [29]:

agent_kwargs = {
    "system_message": system_message
}


In [30]:
from langchain.memory import ConversationBufferWindowMemory

memory = ConversationBufferWindowMemory(memory_key="chat_history",return_messages=True,k=7)

# Initialize the agent with the updated tools and system message
agent_chain = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    agent_kwargs=agent_kwargs,
    memory = memory
)


  memory = ConversationBufferWindowMemory(memory_key="chat_history",return_messages=True,k=7)
  agent_chain = initialize_agent(


In [31]:
# You can print or log the current state of memory
print(memory.load_memory_variables({}))  # This will show the conversation history


{'chat_history': []}


In [32]:
# Test if the memory is working by asking multiple queries
results = agent_chain.run("Tell me about year 2023 agriculture?")
print(f"Answer: {results}")

  results = agent_chain.run("Tell me about year 2023 agriculture?")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: The user is asking about agriculture in the year 2023. I can use the Semantic Router tool to determine the relevant knowledge base for this query.

Action:
```
{
  "action": "Semantic Router",
  "action_input": {
    "query": "Tell me about year 2023 agriculture?"
  }
}
```[0mMost similar year for query 'Tell me about year 2023 agriculture?': UNL-Agriculture-2023

Observation: [36;1m[1;3mUNL-Agriculture-2023[0m
Thought:[32;1m[1;3mThe Semantic Router tool has determined that the relevant knowledge base for this query is UNL-Agriculture-2023. I should use the UNL-Agriculture-2023 QA tool to answer questions about agriculture in the year 2023.

Action:
```
{
  "action": "UNL-Agriculture-2023 QA",
  "action_input": {
    "query": "Tell me about year 2023 agriculture?"
  }
}
```[0mFlesch-Kincaid Grade: 6.7

Observation: [38;5;200m[1;3mIn the year 2023, the final year of a research project, there were some changes

In [33]:
# Test queries
queries = [
    "Tell me about the agricultural advancements in 2015.",
    "What were the key research findings in 2020?",
    "Provide an overview of the 2018 UNL agriculture report.",
    "How did agriculture perform in 2023 compared to 2013?"
]

for query in queries:
    print(f"\nQuery: {query}")
    results = agent_chain.run(query)
    print(f"Answer: {results}")



Query: Tell me about the agricultural advancements in 2015.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: The user is asking about agricultural advancements in 2015. I should use the Semantic Router tool to confirm that the UNL-Agriculture-2015 knowledge base is the correct one to use for this query.
Action:
```
{
  "action": "Semantic Router",
  "action_input": {
    "query": "Tell me about the agricultural advancements in 2015."
  }
}
```[0mMost similar year for query 'Tell me about the agricultural advancements in 2015.': UNL-Agriculture-2015

Observation: [36;1m[1;3mUNL-Agriculture-2015[0m
Thought:[32;1m[1;3mThe Semantic Router confirmed that the UNL-Agriculture-2015 knowledge base is the best match for the query. Now, I will use the UNL-Agriculture-2015 QA tool to find the information about agricultural advancements in 2015.
Action:
```
{
  "action": "UNL-Agriculture-2015 QA",
  "action_input": {
    "query": "Tell me about the agricultural advanceme

In [35]:
%pip freeze > requirements_agwise.txt

Note: you may need to restart the kernel to use updated packages.
