In [None]:
# Installations
!pip install langchain | tail -n 1
!pip install langchain llama-cpp-python transformers chromadb | tail -n 1
!pip install langchain_mistralai | tail -n 1
!pip install pypdf | tail -n 1
!pip install langchain-text-splitters | tail -n 1
!pip install sentence-transformers| tail -n 1
!pip install "langchain-chroma>=0.1.2" | tail -n 1
!pip install -U langchain-community | tail -n 1
!pip install gspread oauth2client | tail -n 1
!pip install --upgrade google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client oauth2client gspread | tail -n 1



In [None]:
import os
import time
import json
import logging
from datetime import datetime
from typing import List, Dict, Tuple
from collections import defaultdict
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

from google.colab import userdata, auth
from google.auth import default
import gspread
from oauth2client.client import GoogleCredentials

import langchain as lc
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA, GraphQAChain
from langchain.prompts import PromptTemplate
from langchain.agents import create_react_agent, Tool, AgentExecutor
from langchain.graphs import NetworkxEntityGraph
from langchain.graphs.networkx_graph import KnowledgeTriple
from langchain.schema import Document
from langchain_core.exceptions import OutputParserException
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.document_loaders import PyPDFLoader
from langchain_mistralai.chat_models import ChatMistralAI

  from tqdm.autonotebook import tqdm, trange


# **Chunk Documents and Create Vector Store**

In [None]:
# Configure the root logger at INFO level; the helpers below report failures via logging.error.
logging.basicConfig(level=logging.INFO)

def load_and_split_documents(document_paths: List[str], chunk_size: int = 1000, chunk_overlap: int = 100):
    """Load PDF files and split their contents into overlapping chunks.

    Args:
        document_paths: Paths of the PDF files to load with ``PyPDFLoader``.
        chunk_size: ``chunk_size`` forwarded to the text splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the text splitter.

    Returns:
        The list returned by ``split_documents`` for every document that
        loaded successfully. A file that fails to load is logged and skipped,
        so the result may be partial or empty.
    """
    documents = []
    for path in document_paths:
        try:
            documents.extend(PyPDFLoader(path).load())
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            logging.error(f"Error loading document {path}: {e}")

    # Use the splitter class imported at the top of the notebook. Reaching it
    # through `lc.text_splitter` only works when that submodule happens to have
    # been imported already as a side effect of another import.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(documents)

def initialize_vector_store(chunks: List[Document], embedding_model: HuggingFaceEmbeddings):
    """Build a Chroma vector store from ``chunks`` using ``embedding_model``.

    Returns:
        The Chroma store, or ``None`` when construction raises (the error is
        logged rather than propagated).
    """
    try:
        store = Chroma.from_documents(chunks, embedding_model)
    except Exception as err:
        logging.error(f"Error initializing vector store: {err}")
        store = None
    return store

In [None]:
# Embedding model shared by the vector store and the knowledge graph built later.
# NOTE(review): the saved output under this cell mentions a different model name
# ('all-MiniLM-L6-v2'), so that output predates this code -- re-run to refresh it.
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-distilroberta-v1')

# Source PDFs; assumed to have been uploaded to /content beforehand -- TODO confirm.
document_paths = [
    "/content/indiana15_ceds_2023.pdf",
    "/content/eda_ceds_guidelines_2023.pdf"
]

# Files that fail to load are logged and skipped, so `chunks` may be partial or empty.
chunks = load_and_split_documents(document_paths)
# `vector_store` is None when Chroma initialization fails (the error is only logged).
vector_store = initialize_vector_store(chunks, embedding_model)

  embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# **Initialize LLM and Build Document Retrieval Tool**

In [None]:
# Mistral chat model; the API key is read from the Colab secret named 'mistral_api'.
llm = ChatMistralAI(api_key=userdata.get('mistral_api'), model_name="mistral-large-latest")

In [None]:
# Retrieval QA chain for RAG
# Default retriever over the full vector store. `vector_store` is None when
# initialization failed earlier, in which case this line raises AttributeError.
retriever = vector_store.as_retriever()

def dynamic_document_retrieval(query, context_doc=None):
    """Answer ``query`` with a RetrievalQA chain over the indexed chunks.

    Args:
        query: Question text passed to the chain under the ``"query"`` key.
        context_doc: Optional text. When given, only chunks whose content
            contains it (case-insensitively) are indexed and searched.

    Returns:
        The mapping produced by the RetrievalQA chain (built with
        ``return_source_documents=True``) on success. When no chunk matches
        ``context_doc`` or any exception is raised, a ``{"output": <message>}``
        dict is returned instead and the error is logged.
    """
    try:
        if context_doc:
            needle = context_doc.lower()
            filtered_chunks = [chunk for chunk in chunks if needle in chunk.page_content.lower()]
            if not filtered_chunks:
                return {"output": "Context document not found."}
            # NOTE(review): a fresh Chroma store is built on every filtered call;
            # confirm it does not share a default collection with `vector_store`.
            active_retriever = Chroma.from_documents(filtered_chunks, embedding_model).as_retriever()
        else:
            active_retriever = vector_store.as_retriever()

        # The retriever stays local so that a call no longer rebinds the
        # module-level `retriever` as a hidden side effect.
        rag_chain = RetrievalQA.from_chain_type(llm=llm, retriever=active_retriever, return_source_documents=True)
        return rag_chain.invoke({"query": query})
    except Exception as e:
        logging.error(f"Error in dynamic document retrieval: {e}")
        return {"output": "Error processing query."}

# Expose dynamic_document_retrieval to the agent as a tool named "document_retriever".
retrieval_tool = Tool(
    name="document_retriever",
    func=dynamic_document_retrieval,
    description="Retrieves information from selected documents on demand"
)

# **Knowledge Graph**

In [None]:
# Use the GPU when one is available and fall back to CPU otherwise;
# require_gpu() would abort the notebook on a CPU-only runtime.
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

def extract_entities(text: str):
    """Map each named-entity string spaCy finds in ``text`` to its label.

    If the same entity text occurs more than once, the label of the last
    occurrence is kept, because later keys overwrite earlier ones.
    """
    entities = {}
    for span in nlp(text).ents:
        entities[span.text] = span.label_
    return entities

# Function to calculate cosine similarity between two vectors
def get_cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors as a single value."""
    # Each vector is wrapped as a one-row matrix, so the result has one entry.
    similarity_matrix = cosine_similarity([vec1], [vec2])
    return similarity_matrix[0][0]

def query_graph(G: nx.Graph, node_id):
    """Return ``{neighbor: edge weight}`` for every neighbor of ``node_id``.

    Note: a later definition of ``query_graph`` in this same cell replaces
    this one at runtime.
    """
    return {nbr: G[node_id][nbr]["weight"] for nbr in list(G.neighbors(node_id))}

# Function to build the knowledge graph from chunks
def build_knowledge_graph_from_chunks(chunks, embedding_model, similarity_threshold: float = 0.7) -> nx.Graph:
    """Build a chunk-similarity graph.

    One node is created per chunk, keyed by the chunk's index in ``chunks``
    and carrying ``text``, ``entities`` (from ``extract_entities``) and
    ``embedding`` attributes. An undirected edge with a ``weight`` attribute
    joins every pair of chunks whose embedding cosine similarity is strictly
    greater than ``similarity_threshold``.

    Args:
        chunks: Sequence of objects exposing ``page_content``.
        embedding_model: Object exposing ``embed_documents(list_of_texts)``.
        similarity_threshold: Minimum (exclusive) cosine similarity for an
            edge. Defaults to 0.7, the value that was previously hard-coded.

    Returns:
        The populated ``nx.Graph``.
    """
    G = nx.Graph()

    # Create embeddings for each chunk text content using embed_documents
    chunk_texts = [chunk.page_content for chunk in chunks]
    chunk_embeddings = embedding_model.embed_documents(chunk_texts)

    # Add nodes to the graph
    for idx, chunk in enumerate(chunks):
        G.add_node(
            idx,
            text=chunk.page_content,
            entities=extract_entities(chunk.page_content),
            embedding=chunk_embeddings[idx],
        )

    # Add edges based on similarity between chunks. The full pairwise matrix is
    # computed in one call instead of one sklearn call per pair. The guard keeps
    # the empty / single-chunk case (no possible pairs) from reaching sklearn.
    if len(chunks) > 1:
        similarity_matrix = cosine_similarity(chunk_embeddings)
        # Upper triangle without the diagonal, in row-major order: the same
        # (i, j) pairs with i < j, in the same order, as a nested loop.
        rows, cols = np.triu_indices(len(chunks), k=1)
        pair_scores = similarity_matrix[rows, cols]
        keep = pair_scores > similarity_threshold
        for i, j, score in zip(rows[keep].tolist(), cols[keep].tolist(), pair_scores[keep].tolist()):
            G.add_edge(i, j, weight=score)

    return G

# Extract entities from the query
def extract_entities_from_query(query: str):
    """Map each named-entity string spaCy finds in ``query`` to its label."""
    parsed_query = nlp(query)
    found = {}
    for span in parsed_query.ents:
        found[span.text] = span.label_
    return found

# Query the graph for relationships of a specific node
def query_graph(G: nx.Graph, node_id):
    """Return ``{neighbor: edge weight}`` for every neighbor of ``node_id``.

    Args:
        G: Graph whose edges carry a ``"weight"`` attribute.
        node_id: Node to inspect. May be ``None`` or an id that is not in the
            graph, e.g. when a preceding node search found no match.

    Returns:
        A dict mapping each neighbor id to the weight of the connecting edge.
        The dict is empty when ``node_id`` is ``None``, is not in ``G``, or
        has no neighbors.
    """
    # A failed node search yields None; answer "no relationships" instead of
    # letting the graph lookup raise.
    if node_id is None or node_id not in G:
        return {}
    return {neighbor: G[node_id][neighbor]["weight"] for neighbor in G.neighbors(node_id)}

# Search the knowledge graph for a node that matches query entities
def search_node_in_graph(query, G: nx.Graph):
    """Return the id of the first node matching an entity from ``query``.

    A node matches when any entity text extracted from the query is a
    substring of any of the node's stored entity texts (case-sensitive).
    Entity labels are not compared. Returns ``None`` when nothing matches.
    """
    wanted_texts = list(extract_entities_from_query(query))
    for node_id, attrs in G.nodes(data=True):
        if any(
            wanted in node_entity
            for wanted in wanted_texts
            for node_entity in attrs['entities'].keys()
        ):
            return node_id
    return None

# Get related chunks from the graph
def get_related_chunks(query, G: nx.Graph, chunks):
    """Collect the chunks adjacent to the graph node that matches ``query``.

    Returns:
        A list of ``(chunk text, similarity)`` tuples, one per neighbor of the
        matched node, or the string ``"No related chunks found."`` when no
        node matches the query.
    """
    node_id = search_node_in_graph(query, G)
    if node_id is None:
        return "No related chunks found."

    related = []
    for neighbor, similarity in query_graph(G, node_id).items():
        related.append((chunks[neighbor].page_content, similarity))
    return related

# Build the knowledge graph
kg = build_knowledge_graph_from_chunks(chunks, embedding_model)

# Tool for querying the knowledge graph
# Finds the first node matching an entity in the query and returns that node's
# neighbors mapped to their edge weights (chunk indices, not chunk text).
# NOTE(review): search_node_in_graph returns None when nothing matches; confirm
# query_graph tolerates a None node id before relying on this tool.
kg_tool = Tool(
    name="knowledge_graph",
    func=lambda query: query_graph(kg, search_node_in_graph(query, kg)),
    description="Query the CEDS knowledge graph for relationships and patterns"
)

# **Agent Prompt**

In [None]:
# Prompt handed to create_react_agent when the agent is built.
# NOTE(review): "context_doc" is declared in input_variables but never referenced
# in the template, and the template contains no Action / Action Input /
# Observation format section -- confirm this matches what the ReAct agent's
# output parser expects.
agent_prompt = PromptTemplate(
    input_variables=["query", "context_doc", "tools", "agent_scratchpad", "max_iterations", "tool_names"],
    template="""
    You are an assistant specializing in economic development insights, focused on providing actionable and strategic recommendations based on regional data from the CEDS database. You have access to {max_iterations} opportunities to refine your final response.

    You have access to the following {tools}:
    - Knowledge Graph Tool: Use this to extract and identify relationships between entities such as economic indicators, strategies, regions, stakeholders, and resources. This will help you identify patterns, trends, and relevant metrics.
    - Document Retrieval Tool: Use this to extract context and validate findings from the CEDS documents. Summarize retrieved content into concise and actionable insights.

    Approach: Utilize the {tool_names}
    - **Step 1**: Use the Knowledge Graph Tool to extract relationships between key entities. Identify economic indicators, strategies, or regional data relevant to the query.
    - **Step 2**: Use the Document Retrieval Tool to extract context from CEDS documents. Summarize retrieved content into a concise overview, emphasizing key insights.
    - **Step 3**: Synthesize the information from both tools to provide strategic insights and a well-structured response that aligns with the response format.

    Response Format: Your response must be limited to 500 characters and structured as follows:
    - **Year of Analysis**: Extracted or inferred from document metadata.
    - **Dataset(s) Used**: Specify datasets used (e.g., CEDS regional data, economic indicators, etc.).
    - **Time Period Covered**: Indicate the data's start and end years (e.g., 2018-2023).
    - **Geographic Scope**: Specify the region analyzed, such as Pike County, Indiana, with state details.
    - **Comparison Regions**: Include neighboring counties, similar economic regions, or benchmarks.
    - **County Population Size**: Categorize counties as small, medium, or large.
    - **Urban/Rural Distinction**: Classify areas as urban or rural.

    - **Strategic Insights**: Provide a concise, actionable summary of the analysis, focusing on trends, issues, or opportunities.
    - **Context**: Summarize the relevant document context, such as CEDS sections, key strategies, or indicators.

    Thought Process:
    - Always prioritize summarization when extracting information from documents.
    - Ensure the final response highlights strategic recommendations in a concise format.

    Thought: {agent_scratchpad}
    Query: {query}
    """
)


# **Combine Tools and Create Agent Executor**

In [None]:
# Both tools are offered to the ReAct agent.
tools = [retrieval_tool, kg_tool]
agent = create_react_agent(tools=tools, llm=llm, prompt=agent_prompt)

# Executor settings: verbose tracing, parsing-error handling enabled, and at
# most 10 iterations per run.
# NOTE(review): agentic_query, defined later in the notebook, calls the tools
# directly and does not go through this executor.
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,
    handle_parsing_errors=True,
    max_iterations=10
)

In [None]:
# Introduce a delay function with invoke for handling rate limits
def query_with_retry(query_func, query, retries=3, delay=1):
    """Call ``query_func(query)``, retrying with exponential backoff on rate limits.

    An exception whose message contains ``'429'`` is treated as a rate-limit
    error: the call is retried after ``delay`` seconds and the delay doubles
    after every wait. Any other exception is printed and ends the attempts
    immediately.

    Args:
        query_func: Callable taking the query as its only argument.
        query: Value passed to ``query_func``.
        retries: Maximum number of calls to make.
        delay: Seconds to wait before the first retry.

    Returns:
        Whatever ``query_func`` returns on the first successful call, or the
        string ``"Failed to process the query after multiple attempts."`` when
        every attempt fails (or ``retries`` is not positive).
    """
    for attempt in range(1, retries + 1):
        try:
            return query_func(query)
        except Exception as e:
            if '429' not in str(e):
                print(f"Unexpected error: {str(e)}")
                break
            if attempt == retries:
                # No attempt left: do not announce or wait for a retry that
                # will never happen.
                break
            print(f"Rate limit exceeded. Retrying in {delay} seconds...")
            time.sleep(delay)
            delay *= 2
    return "Failed to process the query after multiple attempts."

def agentic_query(query, kg, chunks, context_doc=None):
    """Answer ``query`` by combining knowledge-graph context with document retrieval.

    Steps:
        1. Look up chunks related to the query in the knowledge graph.
        2. Summarize what was found (each chunk is truncated to 300 characters).
        3. Send the query plus that context to the document retrieval tool.
        4. Format the three sections into one string.

    Args:
        query: User question.
        kg: Knowledge graph passed to ``get_related_chunks``.
        chunks: Chunk list the graph node ids index into.
        context_doc: Optional filter text forwarded to the retrieval tool.

    Returns:
        A formatted string with "Relevance Summary", "Extracted Context" and
        "Actionable Insights" sections.
    """
    # Step 1: Extract related chunks from the knowledge graph (KG)
    related_chunks = get_related_chunks(query, kg, chunks)

    # Step 2: Construct Relevance Summary
    if related_chunks == "No related chunks found.":
        relevance_summary = "No relevant context was extracted from the knowledge graph."
        extracted_context = "No context was retrieved from the knowledge graph. The query was processed with limited insights."
    else:
        relevance_summary = f"Found {len(related_chunks)} relevant context chunk(s) from the knowledge graph."
        extracted_context = "\n".join(
            f"Chunk {i+1} (Similarity: {similarity}):\n{chunk[:300]}..."
            for i, (chunk, similarity) in enumerate(related_chunks)
        )

    # Step 3: Use document retrieval to extract additional context.
    # context_doc is forwarded so the caller's filter is honored; it was
    # previously accepted but never used.
    retrieval_input = f"{query}\n\nContext from knowledge graph:\n{extracted_context}"
    retrieval_response = retrieval_tool.func(retrieval_input, context_doc)

    # The retrieval tool returns a mapping: the chain's answer is under
    # "result", while its own error/not-found messages are under "output".
    # Use the text only, so the whole dict (including source documents) is
    # not dumped into the report.
    if isinstance(retrieval_response, dict):
        answer = (
            retrieval_response.get("result")
            or retrieval_response.get("output")
            or str(retrieval_response)
        )
    else:
        answer = str(retrieval_response)

    # Step 4: Construct Actionable Insights
    actionable_insights = f"In summary: {answer}"

    # Step 5: Format and Return Output
    output = (
        f"1. **Relevance Summary:**\n{relevance_summary}\n\n"
        f"2. **Extracted Context:**\n{extracted_context}\n\n"
        f"3. **Actionable Insights:**\n{actionable_insights}"
    )
    return output



# **Testing**

In [None]:
class CEDSAgentTester:
    """Run canned CEDS queries through an agent function and log scored results.

    Results are appended to the "Test Results" worksheet of a Google Sheet and
    summarized in its "Performance Summary" worksheet.
    """

    def __init__(self, agent_function, kg, chunks, sheet_name: str = "CEDS_Agent_Test_Results"):
        """
        Initialize the CEDSAgentTester with the agent function, knowledge graph, and chunks.

        ``agent_function`` is called as ``agent_function(query, kg, chunks)``.
        Construction connects to Google Sheets immediately and re-raises any
        error from that setup.
        """
        self.agent_function = agent_function
        self.kg = kg
        self.chunks = chunks
        self.sheet_name = sheet_name
        self.setup_google_sheets()

    def setup_google_sheets(self):
        """
        Set up connection to Google Sheets with proper authentication

        Opens the spreadsheet named ``self.sheet_name`` (creating it when it
        does not exist), makes sure a "Test Results" worksheet with a header
        row exists, and appends a "TEST_INIT" row to verify write access.
        Any failure is printed and re-raised.
        """
        try:
            # Authenticate Colab
            auth.authenticate_user()

            # Get Google Sheets credentials
            creds, _ = default()
            self.gc = gspread.authorize(creds)

            # Create or open spreadsheet. Only "not found" triggers creation;
            # other failures (permissions, API errors) propagate to the outer
            # handler instead of being masked by a bare except.
            try:
                self.sheet = self.gc.open(self.sheet_name)
            except gspread.exceptions.SpreadsheetNotFound:
                self.sheet = self.gc.create(self.sheet_name)

            # Setup main worksheet
            try:
                self.worksheet = self.sheet.worksheet("Test Results")
            except gspread.exceptions.WorksheetNotFound:
                self.worksheet = self.sheet.add_worksheet("Test Results", 1000, 8)  # 8 columns for all your fields
                headers = [
                    "Date", "Test ID", "Category", "Agent Prompt", "User Query", "Output", "Evaluation Score", "Notes"
                ]
                self.worksheet.insert_row(headers, 1)

            print(f"Successfully connected to sheet: {self.sheet_name}")

            # Test the connection with a sample write
            test_row = [
                datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "TEST_INIT", "Connection Test", "Default Prompt", "Test Query", "Test Response", 5, "Connection test successful"
            ]
            self.worksheet.append_row(test_row)
            print("Successfully verified connection with test write")

        except Exception as e:
            print(f"Error setting up Google Sheets: {str(e)}")
            raise  # Re-raise with the original traceback so the constructor fails loudly

    def generate_test_cases(self) -> List[Dict]:
        """Generate a comprehensive set of test cases"""
        return [
            {
                "category": "Economy",
                "queries": [
                    "Economy: How have key economic indicators (GDP, job growth, industry diversity) changed over time in Indiana Region 15?"
                ]
            },
            # You can uncomment and add more categories and queries if needed
        ]

    def evaluate_response(self, response: str, category: str) -> tuple:
        """
        Evaluate the quality of the agent's response
        Returns score (1-5) and notes

        The score is 1 plus one point for each heuristic that passes:
        more than 50 words, mentions a word of the category, more than two
        periods, and contains at least one digit.
        """
        metrics = {
            "length": len(response.split()) > 50,
            "relevance": any(word in response.lower() for word in category.lower().split()),
            "structure": response.count('.') > 2,
            "specificity": any(char.isdigit() for char in response)
        }

        score = sum(metrics.values()) + 1  # Base score of 1 plus metrics
        notes = []

        if not metrics["length"]:
            notes.append("Response too brief")
        if not metrics["relevance"]:
            notes.append("Low relevance to category")
        if not metrics["structure"]:
            notes.append("Poor response structure")
        if not metrics["specificity"]:
            notes.append("Lacks specific details")

        return score, "; ".join(notes) if notes else "Good response"

    def run_tests(self, num_iterations: int = 1):
        """
        Run the test suite multiple times and log results

        Parameters:
        num_iterations: Number of times to run each test case

        Returns:
        List of result dicts, one per query that completed. A query whose
        agent call, scoring or sheet write raises is reported with print and
        omitted, so the list may be shorter than the number of queries.
        """
        test_cases = self.generate_test_cases()
        results = []

        for iteration in range(num_iterations):
            for test_case in test_cases:
                category = test_case["category"]
                for query in test_case["queries"]:
                    try:
                        response = self.agent_function(query, self.kg, self.chunks)

                        score, notes = self.evaluate_response(response, category)

                        result = {
                            "Date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            "Test ID": f"TEST_{iteration}_{category.replace(' ', '')}",
                            "Category": category,
                            "Agent Prompt": "Default Agent Prompt",
                            "User Query": query,
                            "Output": response,
                            "Evaluation Score": score,
                            "Notes": notes
                        }

                        results.append(result)

                        self.worksheet.append_row([
                            result["Date"],
                            result["Test ID"],
                            result["Category"],
                            result["Agent Prompt"],
                            result["User Query"],
                            result["Output"],
                            result["Evaluation Score"],
                            result["Notes"]
                        ])

                    except Exception as e:
                        # Best effort: one failing query must not stop the suite.
                        print(f"Error running test for query '{query}': {str(e)}")

        return results

    def generate_performance_report(self, results: List[Dict]) -> Dict:
        """Generate a summary report of test results

        Builds aggregate statistics from ``results`` (as returned by
        ``run_tests``) and writes them to the "Performance Summary" worksheet,
        which is cleared first when it already exists.

        An empty ``results`` list yields a zero-count report with a NaN
        average instead of raising.
        """
        if results:
            df = pd.DataFrame(results)
            report = {
                "total_tests": len(results),
                "average_score": df["Evaluation Score"].mean(),
                "score_distribution": df["Evaluation Score"].value_counts().to_dict(),
                "performance_by_category": df.groupby("Category")["Evaluation Score"].mean().to_dict(),
                "failed_tests": df[df["Evaluation Score"] < 3][["Category", "User Query", "Notes"]].to_dict('records'),
                "top_performing_tests": df[df["Evaluation Score"] >= 4][["Category", "User Query", "Notes"]].to_dict('records')
            }
        else:
            # run_tests drops failing queries, so an all-failed run arrives
            # here as []. A frame built from it has no columns to index.
            report = {
                "total_tests": 0,
                "average_score": float("nan"),
                "score_distribution": {},
                "performance_by_category": {},
                "failed_tests": [],
                "top_performing_tests": []
            }

        # Create performance summary worksheet
        try:
            summary_sheet = self.sheet.worksheet("Performance Summary")
            summary_sheet.clear()
        except gspread.exceptions.WorksheetNotFound:
            summary_sheet = self.sheet.add_worksheet("Performance Summary", 1000, 3)

        # Write summary to sheet
        summary_data = [
            ["Metric", "Value", "Details"],
            ["Total Tests", report["total_tests"], ""],
            ["Average Score", f"{report['average_score']:.2f}", ""],
            ["Score Distribution", "", json.dumps(report["score_distribution"])],
            ["Performance by Category", "", json.dumps(report["performance_by_category"])],
            ["Number of Failed Tests", len(report["failed_tests"]), ""],
            ["Number of High Performing Tests", len(report["top_performing_tests"]), ""]
        ]

        summary_sheet.update(values=summary_data, range_name='A1:C' + str(len(summary_data)))

        return report

In [None]:
# Constructing the tester authenticates with Google and appends a "TEST_INIT" row to the sheet.
tester = CEDSAgentTester(agentic_query, kg, chunks)
# Runs every generated test case once; each completed query is appended as one row.
results = tester.run_tests(num_iterations=1)
# Aggregates the scores and writes them to the "Performance Summary" worksheet.
report = tester.generate_performance_report(results)

Successfully connected to sheet: CEDS_Agent_Test_Results
Successfully verified connection with test write


In [None]:
# Summary Metrics
# Headline numbers taken from the report dict built by generate_performance_report.
print("\nTesting Summary:")
print(f"Total tests run: {report['total_tests']}")
print(f"Average score: {report['average_score']:.2f}")

print("\n" + "="*50 + "\n")

# Test Results
# Prints the full formatted agent output of every completed test.
print("Test Results:")
for result in results:
    output = result['Output']
    print(output)


Testing Summary:
Total tests run: 1
Average score: 5.00


Test Results:
1. **Relevance Summary:**
Found 17 relevant context chunk(s) from the knowledge graph.

2. **Extracted Context:**
Chunk 1 (Similarity: 0.7329357583405699):
Wha t is the Indiana 15 Comprehensive Economic Development Stra tegy (CEDS)? 
Quality of place, recreational assets, strong schools, low 
crime, and friendly neighbors - these are some of the 
characteristics that were used by Indiana 15 residents 
to describe what they love about their region. Indi...
Chunk 2 (Similarity: 0.7757243505622292):
6 |  INDIANA 15 RPC - 2023 5-YEAR CEDS 
How are we aligning regional planning efforts?  
In 2021, the Indiana 15 RPC launched a regional resiliency and recovery 
planning process intended to help the Indiana 15 region recover from 
the economic effects of the COVID-19 pandemic and become more 
resil...
Chunk 3 (Similarity: 0.7463742701407654):
communities within the region can use speciﬁ c programs and projects 
to contin