In [None]:
# Installations
!pip install langchain | tail -n 1
!pip install langchain openai transformers chromadb | tail -n 1
!pip install pypdf | tail -n 1
!pip install sentence-transformers | tail -n 1
!pip install "langchain-chroma>=0.1.2" | tail -n 1
!pip install -U langchain-community | tail -n 1
!pip install openai transformers chromadb pypdf sentence-transformers "langchain-chroma>=0.1.2" -U langchain-community | tail -n 1

Successfully installed openai-1.54.4 sentence-transformers-3.3.0


In [None]:
# Imports
import os
import time
import uuid

import langchain as lc
from google.colab import userdata
from langchain.agents import initialize_agent, Tool
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.exceptions import OutputParserException

In [None]:
# Set up OpenAI API Key
# The key is read from the Colab secret named 'openai_key' and exported as an
# environment variable; it is never bound to a notebook variable.
os.environ.update(OPENAI_API_KEY=userdata.get('openai_key'))

In [None]:
# Load and Process Documents
# PDFs to index, in the order they are loaded.
document_paths = [
    "/content/indiana15_ceds_2023.pdf",
    "/content/eda_ceds_guidelines_2023.pdf",
]

# Flatten the per-file results of PyPDFLoader(...).load() into one list.
documents = [
    loaded_doc
    for pdf_path in document_paths
    for loaded_doc in PyPDFLoader(pdf_path).load()
]

In [None]:
# Improved Text Chunking and Vectorization
# Tunable parameters, named so they are easy to find and adjust.
CHUNK_SIZE = 1500      # increased chunk size for better context
CHUNK_OVERLAP = 200    # characters shared between consecutive chunks
EMBEDDING_MODEL_NAME = "sentence-transformers/all-distilroberta-v1"

# Use the explicitly imported splitter class: reaching it through
# `lc.text_splitter` only works when some other import has already loaded
# that submodule as a side effect.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(documents)

# Fail here with a clear message instead of building an empty index.
if not chunks:
    raise ValueError(
        "No text chunks were produced from the loaded documents; "
        "check that the PDFs exist and contain extractable text."
    )

embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
vector_store = Chroma.from_documents(chunks, embedding_model)


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-distilroberta-v1")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Initialize OpenAI LLM
# Chat model shared by the retrieval chain and the agent.
LLM_MODEL_NAME = "gpt-3.5-turbo"
llm = ChatOpenAI(model=LLM_MODEL_NAME)

  llm = ChatOpenAI(model="gpt-3.5-turbo")


In [None]:
# Set Up Retrieval-QA Chain for RAG
# Default retriever backed by `vector_store`.
retriever = vector_store.as_retriever()

def dynamic_document_retrieval(query, context_doc=None):
    """Answer ``query`` with a RetrievalQA chain over the indexed chunks.

    Args:
        query: Question passed to the chain under the ``"query"`` key.
        context_doc: Optional text. When given, only chunks whose
            ``page_content`` contains it (case-insensitively) are searched.

    Returns:
        The dict produced by ``RetrievalQA.invoke`` (the chain is built with
        ``return_source_documents=True``), or
        ``{"output": "Context document not found."}`` when ``context_doc``
        matches no chunk.
    """
    filtered_vector_store = None

    if context_doc:
        needle = context_doc.lower()
        filtered_chunks = [
            chunk for chunk in chunks if needle in chunk.page_content.lower()
        ]
        if not filtered_chunks:
            return {"output": "Context document not found."}

        # Give the temporary index its own collection. Without an explicit
        # name, in-memory Chroma stores are expected to share one default
        # collection, so the filtered chunks would be appended to the main
        # index instead of forming an isolated, filtered one.
        filtered_vector_store = Chroma.from_documents(
            filtered_chunks,
            embedding_model,
            collection_name=f"filtered-{uuid.uuid4().hex}",
        )
        # Local variable on purpose: rebinding the module-level `retriever`
        # here would leave hidden state behind for later cells.
        active_retriever = filtered_vector_store.as_retriever()
    else:
        active_retriever = vector_store.as_retriever()

    try:
        rag_chain = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=active_retriever,
            return_source_documents=True,
        )
        return rag_chain.invoke({"query": query})
    finally:
        # Drop the per-call collection so repeated calls do not accumulate
        # temporary indexes in memory.
        if filtered_vector_store is not None:
            filtered_vector_store.delete_collection()

# Expose the retrieval function to the agent as a tool named
# "document_retriever".
retrieval_tool = Tool(
    func=dynamic_document_retrieval,
    name="document_retriever",
    description="Retrieves information from selected documents on demand",
)

In [None]:
# Improved Agent Prompt for SWOT Analysis with Document-Specific Clues
# The template text is kept in its own constant so the PromptTemplate
# construction below stays short. It expects two variables: `input` and
# `agent_scratchpad`.
SWOT_AGENT_TEMPLATE = """
    You are an economic development analysis assistant specializing in SWOT (Strengths, Weaknesses, Opportunities, Threats) analysis.
    You have access to multiple documents in the CEDS database. Extract relevant information and format your output accordingly.

    Look for keywords such as "advantage," "strength," "challenge," "opportunity," "risk," or "threat" to identify each section of the SWOT analysis.

    Follow this format:
    - **Strengths**: Provide specific strengths of the region with examples.
    - **Weaknesses**: Provide specific weaknesses of the region with examples.
    - **Opportunities**: Provide specific opportunities for the region with examples.
    - **Threats**: Provide specific threats to the region with examples.

    Provide each section as a separate list item with clear, concise points.

    If a section is missing, respond with "No relevant information found" for that part.

    Thought: {agent_scratchpad}
    Query: {input}
    """

agent_prompt = PromptTemplate(
    template=SWOT_AGENT_TEMPLATE,
    input_variables=["input", "agent_scratchpad"],
)

In [None]:
# Initialize the Agent
tools = [retrieval_tool]

# `initialize_agent` has no `prompt` parameter: extra keyword arguments are
# forwarded to the executor, so a `prompt=` argument never reaches the agent
# and the stock ReAct prompt is used instead. Agent-level customisation goes
# through `agent_kwargs`. Only the instruction part of the SWOT template is
# used as the prefix (everything before its trailing "Thought:" line); the
# tool list, format instructions and the `{input}` / `{agent_scratchpad}`
# slots are then appended by the agent's own default prompt pieces.
swot_prefix = agent_prompt.template.partition("Thought:")[0].strip()

agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent="zero-shot-react-description",
    verbose=True,
    agent_kwargs={"prefix": swot_prefix},
    handle_parsing_errors=True,
)

# Define Agentic Query Function with Retry Mechanism
# Added fallback query to improve analysis

def agentic_query(query, context_doc=None, retries=3):
    """Run ``query`` through the agent, retrying on errors or empty answers.

    Args:
        query: The question sent to the agent as its ``"input"``.
        context_doc: Accepted for backward compatibility; it is not
            forwarded to the agent by this function.
        retries: Maximum number of agent invocations.

    Returns:
        The agent's ``'output'`` string as soon as one does not contain
        "No relevant information found"; a non-dict response is returned
        unchanged. If every attempt is incomplete, the most recent answer
        that has content beyond that marker is returned; otherwise a fixed
        failure message.
    """
    no_info_marker = "No relevant information found"
    retry_hint = "Reattempting to extract SWOT information with a focus on specific keywords like advantage, challenge, and risk."

    input_data = {
        "input": query,
        "agent_scratchpad": "",
    }
    # Most recent answer that was incomplete but still carried real content.
    partial_output = None

    for attempt in range(retries):
        try:
            time.sleep(1)  # Delay to respect API rate limit
            response = agent.invoke(input_data)
            if not isinstance(response, dict):
                return response

            output = response.get('output', 'No relevant information found.')
            if no_info_marker not in output:
                return output

            # The marker can appear for a single SWOT section while the
            # others are filled in; keep such answers instead of losing them.
            if output.strip().rstrip(".") != no_info_marker:
                partial_output = output

            print(f"Attempt {attempt + 1} - No relevant information found. Retrying with adjusted query...")
            # Keep the original question in the retry so the agent still
            # knows what is being asked; the hint is appended to it.
            input_data["input"] = f"{query}\n\n{retry_hint}"
        except OutputParserException as e:
            print(f"Attempt {attempt + 1} - Error processing query: {str(e)}")
        except Exception as e:
            # Deliberate best-effort: report the failure and try again.
            print(f"Attempt {attempt + 1} - Unexpected error: {str(e)}")

    if partial_output is not None:
        return partial_output
    return "Exceeded maximum retry attempts without successful completion."

  agent = initialize_agent(


In [None]:
# Ask the agent for the SWOT analysis and keep the answer in `result`.
swot_query = "Provide a detailed SWOT analysis based on the content in the Indiana 15 CEDS document."
result = agentic_query(swot_query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to retrieve the Indiana 15 CEDS document to analyze its content for a SWOT analysis.
Action: document_retriever
Action Input: "Indiana 15 CEDS"[0m
Observation: [36;1m[1;3m{'query': 'Indiana 15 CEDS', 'result': "The Indiana 15 CEDS stands for the Comprehensive Economic Development Strategy for the Indiana 15 region. It is a strategic plan that outlines the region's economic development goals and priorities for the next five years. The CEDS is informed by the region's vision statement, which includes aspirations for economic growth, tourism, quality of life, and opportunities for residents. The plan focuses on areas like population growth, housing, entrepreneurship, business development, education, income levels, and community revenues. The Indiana 15 RPC (Regional Planning Commission) plays a key role in coordinating and implementing this strategy.", 'source_documents': [Document(metadata={'page': 3, 'source': '/cont