In [None]:
!pip install langchain | tail -n 1
!pip install langchain-community | tail -n 1
!pip install faiss-cpu | tail -n 1
!pip install networkx | tail -n 1
!pip install openai | tail -n 1
!pip install matplotlib | tail -n 1
!pip install langchain_mistralai | tail -n 1
!pip install pdfplumber | tail -n 1
!pip install sentence-transformers | tail -n 1
!pip install transformers | tail -n 1
!pip install langchain-huggingface | tail -n 1
!pip install datasets | tail -n 1
!pip install bert-score | tail -n 1



In [None]:
import logging
import os
import datetime
from typing import List, Dict

# Data handling and visualization
import matplotlib.pyplot as plt
import networkx as nx
import pdfplumber

# NLP and ML
import spacy
import torch

# LangChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain_mistralai.chat_models import ChatMistralAI
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEmbeddings

# Google Colab and API integration
from google.colab import userdata, auth
import gspread
from google.auth import default
from googleapiclient.discovery import build

In [None]:
# Configure logging
# Root logger emits INFO and above; each record carries timestamp, level and message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load NLP model for entity extraction
# prefer_gpu() is called before the model is loaded; its return value is ignored here,
# so the code proceeds the same way whether or not a GPU was activated.
spacy.prefer_gpu()
# Module-level pipeline shared by extract_entities() below.
nlp = spacy.load("en_core_web_sm")

def extract_entities(text: str) -> Dict[str, List[str]]:
    """
    Extract entities from text and categorize them, including custom categories.

    The text is run through the module-level spaCy pipeline ``nlp``. Named
    entities are bucketed by label:

    * ``ORG`` / ``GPE`` / ``LOC``   -> ``"region"``
    * ``PERSON``                    -> ``"stakeholder"``
    * ``EVENT`` / ``WORK_OF_ART``   -> ``"strategy"``
    * anything else                 -> ``"indicator"``

    Independently of its label, any entity whose text contains "County" or
    "Parish" is also recorded under ``"counties"``. Sentences containing a
    strength or weakness keyword (case-insensitive substring match) are stored
    whole under ``"Strengths"`` / ``"Weaknesses"``; one sentence may land in both.

    Args:
        text: Raw text to analyse.

    Returns:
        A dict with the seven category keys above, each mapping to a list of
        unique strings in first-seen order.
    """
    doc = nlp(text)
    entities: Dict[str, List[str]] = {
        "strategy": [],
        "indicator": [],
        "region": [],
        "stakeholder": [],
        "counties": [],
        "Strengths": [],
        "Weaknesses": [],
    }

    for ent in doc.ents:
        label = ent.label_
        if label in ("ORG", "GPE", "LOC"):
            entities["region"].append(ent.text)
        elif label == "PERSON":
            entities["stakeholder"].append(ent.text)
        elif label in ("EVENT", "WORK_OF_ART"):
            entities["strategy"].append(ent.text)
        else:
            entities["indicator"].append(ent.text)

        # County/parish tagging is additive: the entity keeps its label bucket too.
        if "County" in ent.text or "Parish" in ent.text:
            entities["counties"].append(ent.text)

    strengths_keywords = ("strength", "advantage", "opportunity")
    weaknesses_keywords = ("weakness", "challenge", "threat", "risk")

    for sentence in doc.sents:
        sentence_text = sentence.text.lower()
        if any(keyword in sentence_text for keyword in strengths_keywords):
            entities["Strengths"].append(sentence.text.strip())
        if any(keyword in sentence_text for keyword in weaknesses_keywords):
            entities["Weaknesses"].append(sentence.text.strip())

    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is reproducible across runs (set() ordering of strings is not).
    return {key: list(dict.fromkeys(values)) for key, values in entities.items()}

def load_and_chunk_documents(file_paths: List[str]) -> List[Document]:
    """
    Load and chunk multiple documents into manageable text chunks.

    Each path is opened with pdfplumber, the text of all pages is concatenated,
    and the result is split into chunks of 1000 characters with 200 characters
    of overlap. Every chunk carries ``{"source": file_path}`` as metadata.

    A file that cannot be opened, or that yields no readable text, is logged
    as an error and skipped; processing continues with the remaining files.

    Args:
        file_paths: Paths of the PDF files to load.

    Returns:
        The chunks of all successfully processed files, in input order. Empty
        if no file could be processed.
    """
    all_documents = []
    # The splitter configuration is the same for every file, so build it once.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    for file_path in file_paths:
        try:
            with pdfplumber.open(file_path) as pdf:
                # Extract each page exactly once; extraction is the expensive step.
                page_texts = [page.extract_text() for page in pdf.pages]
            # Join pages with a newline so the last word of one page does not
            # fuse with the first word of the next.
            text = "\n".join(page_text for page_text in page_texts if page_text)
            if not text.strip():
                raise ValueError(f"The PDF {file_path} contains no readable text.")

            chunks = splitter.create_documents([text], metadatas=[{"source": file_path}])
            all_documents.extend(chunks)
            logging.info(f"Document '{os.path.basename(file_path)}' split into {len(chunks)} chunks.")
        except Exception as e:
            # Best-effort per file: report the failure and move on to the next path.
            logging.error(f"Failed to process PDF {file_path}: {e}")
    return all_documents

def vectorize_and_store(documents: List[Document]) -> FAISS:
    """
    Vectorize documents and store in a FAISS vector store.

    Documents are embedded with the
    ``sentence-transformers/all-MiniLM-L12-v2`` model and the resulting index
    is also written to the local directory ``faiss_index``.

    Args:
        documents: Chunks to embed. Must not be empty.

    Returns:
        The populated FAISS vector store.

    Raises:
        ValueError: If ``documents`` is empty.
        Exception: Any error raised while embedding or saving is logged and
            re-raised unchanged.
    """
    try:
        # Fail with an explicit message instead of an opaque error from inside
        # the index builder when nothing was loaded upstream.
        if not documents:
            raise ValueError("No documents to vectorize; check that the input files were loaded.")
        model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")
        vector_store = FAISS.from_documents(documents, model)
        vector_store.save_local("faiss_index")
        logging.info("Documents vectorized and saved in FAISS.")
        return vector_store
    except Exception as e:
        logging.error(f"Failed to vectorize documents: {e}")
        raise

def generate_knowledge_graph(documents: List[Document]) -> nx.DiGraph:
    """
    Build a linear document graph: one node per chunk, chained in input order.

    Node names are ``"Doc 1"``, ``"Doc 2"``, ... and each node stores the first
    100 characters of its chunk (followed by ``"..."``) in the ``content``
    attribute. Every node after the first gets an edge from its predecessor.

    Args:
        documents: Chunks to represent as nodes.

    Returns:
        The directed graph; empty when ``documents`` is empty.
    """
    kg = nx.DiGraph()
    previous_label = None
    for position, document in enumerate(documents, start=1):
        label = f"Doc {position}"
        preview = document.page_content[:100] + "..."
        kg.add_node(label, content=preview)
        # Link consecutive chunks so the original reading order is recoverable.
        if previous_label is not None:
            kg.add_edge(previous_label, label)
        previous_label = label
    logging.info("Knowledge graph created.")
    return kg

def enhance_knowledge_graph(graph: nx.DiGraph, documents: List[Document]):
    """
    Attach entity nodes to the document nodes of an existing graph.

    For the i-th document, entities are extracted with ``extract_entities`` and
    an edge ``"Doc i" -> "<category>: <entity>"`` is added for each one. Entity
    nodes are created on first use with a ``type`` attribute holding their
    category. The graph is modified in place.

    Args:
        graph: Graph to extend; document nodes are addressed as ``"Doc <n>"``.
        documents: Chunks in the same order used to number the document nodes.

    Returns:
        The same ``graph`` object, for call chaining.
    """
    for position, document in enumerate(documents, start=1):
        source_label = f"Doc {position}"
        extracted = extract_entities(document.page_content)
        for category in extracted:
            for name in set(extracted[category]):
                target_label = f"{category}: {name}"
                # Create the entity node only once so its attributes stay as first set.
                if not graph.has_node(target_label):
                    graph.add_node(target_label, type=category)
                graph.add_edge(source_label, target_label)
    logging.info("Knowledge graph enhanced with entity-based relationships.")
    return graph

def visualize_enhanced_knowledge_graph(graph: nx.DiGraph):
    """
    Visualize the enhanced knowledge graph with entity nodes.

    Nodes carrying a ``type`` attribute are drawn as entity nodes (light
    green); all others are drawn as document nodes (light blue). The figure is
    written to ``enhanced_knowledge_graph.png`` at 300 dpi.

    Args:
        graph: Graph to draw.
    """
    plt.figure(figsize=(15, 10))
    pos = nx.spring_layout(graph)

    # Partition nodes in a single pass; the previous list-membership test made
    # this quadratic in the number of nodes.
    entity_nodes = []
    doc_nodes = []
    for node, attrs in graph.nodes(data=True):
        if "type" in attrs:
            entity_nodes.append(node)
        else:
            doc_nodes.append(node)

    nx.draw_networkx_nodes(graph, pos, nodelist=doc_nodes, node_color="lightblue", node_size=3000)
    nx.draw_networkx_nodes(graph, pos, nodelist=entity_nodes, node_color="lightgreen", node_size=2000)
    nx.draw_networkx_edges(graph, pos, arrows=True)
    nx.draw_networkx_labels(graph, pos, font_size=8, font_weight="bold")
    plt.title("Enhanced Document Knowledge Graph")
    plt.tight_layout()
    plt.savefig("enhanced_knowledge_graph.png", dpi=300)
    logging.info("Enhanced knowledge graph visualization saved.")

def query_with_knowledge_graph(graph: nx.DiGraph, query: str, retriever, qa_chain):
    """
    Use the knowledge graph to improve context retrieval for QA.

    Entities are extracted from the query, matched against entity nodes
    (``"<category>: <entity>"``), and the content previews of the documents
    that link directly to those nodes are joined into a context string that is
    passed to ``qa_chain`` together with the query.

    Args:
        graph: Knowledge graph with ``"Doc <n>"`` nodes pointing at entity nodes.
        query: The user's question.
        retriever: Not used by this function; kept for interface compatibility.
        qa_chain: Object exposing ``invoke(dict)`` and returning a mapping with
            a ``"result"`` key.

    Returns:
        The value stored under ``"result"`` in the chain's output.
    """
    query_entities = extract_entities(query)
    relevant_nodes = set()
    for category, entity_list in query_entities.items():
        for entity in entity_list:
            entity_node = f"{category}: {entity}"
            if entity_node in graph:
                # Direct predecessors only. Document nodes are chained
                # ("Doc 1" -> "Doc 2" -> ...), so a transitive ancestor search
                # would also return every document preceding the one that
                # actually mentions the entity.
                relevant_nodes.update(graph.predecessors(entity_node))

    # Sort by (length, name) so "Doc 2" precedes "Doc 10"; this keeps the
    # context order stable instead of depending on set iteration order.
    doc_nodes = sorted(
        (node for node in relevant_nodes if node.startswith("Doc")),
        key=lambda name: (len(name), name),
    )
    relevant_docs = [
        graph.nodes[node]["content"]
        for node in doc_nodes
        if "content" in graph.nodes[node]  # skip doc nodes created implicitly by an edge
    ]

    context = " ".join(relevant_docs)
    # NOTE(review): if qa_chain is a RetrievalQA chain it presumably builds its
    # own context from its retriever, in which case the "context" key below
    # would be ignored - confirm against the chain's expected input keys.
    result = qa_chain.invoke({"query": query, "context": context})
    return result["result"]

In [None]:
# LLM Evaluation Framework Class
class CEDSAgentTester:
    """
    Heuristic evaluator for agent responses that logs results to Google Sheets.

    On construction the tester authenticates, opens (or creates) the
    spreadsheet named ``sheet_name`` and ensures a "Test Results" worksheet
    with a header row exists. ``kg``, ``chunks`` and ``llm`` are stored on the
    instance; the methods defined here do not read them.
    """

    def __init__(self, kg, chunks, llm, sheet_name: str = "CEDS_Agent_Test_Results"):
        """
        Initialize the CEDSAgentTester with the knowledge graph, chunks, and LLM for synthesis.

        Args:
            kg: Knowledge graph object (stored as ``self.kg``).
            chunks: Document chunks (stored as ``self.chunks``).
            llm: Language model handle (stored as ``self.llm``).
            sheet_name: Title of the Google spreadsheet to open or create.

        Raises:
            Exception: Propagated from ``setup_google_sheets`` on failure.
        """
        self.kg = kg
        self.chunks = chunks
        self.llm = llm
        self.sheet_name = sheet_name
        self.setup_google_sheets()

    def setup_google_sheets(self):
        """
        Set up connection to Google Sheets with proper authentication.

        Sets ``self.gc``, ``self.sheet`` and ``self.worksheet``. The
        spreadsheet and the "Test Results" worksheet are created only when
        they are reported as not found; any other failure (authentication,
        permissions, API errors) is logged and re-raised.

        Raises:
            Exception: Whatever the authentication or Sheets calls raise.
        """
        try:
            # Authenticate Colab
            auth.authenticate_user()

            # Get Google Sheets credentials
            creds, _ = default()
            self.gc = gspread.authorize(creds)

            # Create or open spreadsheet. Only "not found" triggers creation,
            # so a permission or network error cannot silently spawn a
            # duplicate sheet.
            try:
                self.sheet = self.gc.open(self.sheet_name)
            except gspread.exceptions.SpreadsheetNotFound:
                self.sheet = self.gc.create(self.sheet_name)

            # Setup main worksheet; headers are written only on first creation.
            try:
                self.worksheet = self.sheet.worksheet("Test Results")
            except gspread.exceptions.WorksheetNotFound:
                self.worksheet = self.sheet.add_worksheet("Test Results", 1000, 8)
                headers = [
                    "Date", "Test ID", "Category", "Agent Prompt", "User Query",
                    "Output", "Evaluation Score", "Notes",
                ]
                self.worksheet.insert_row(headers, 1)

            logging.info(f"Successfully connected to sheet: {self.sheet_name}")

        except Exception as e:
            logging.error(f"Error setting up Google Sheets: {str(e)}")
            # Bare raise keeps the original traceback intact.
            raise

    def evaluate_response(self, response: str, category: str) -> tuple:
        """
        Evaluate the quality of the agent's response.

        Four boolean heuristics are checked, each worth one point on top of a
        base score of 1:

        * length: at least 50 whitespace-separated words
        * relevance: any word of ``category`` occurs (case-insensitively, as a
          substring) in the response
        * structure: at least two ``.`` characters
        * specificity: at least one digit

        Args:
            response: The text to score.
            category: Category label whose words are searched for in the response.

        Returns:
            ``(score, notes)`` where ``score`` is an int from 1 to 5 and
            ``notes`` is a "; "-joined list of failed checks, or
            "Good response" when all checks pass.
        """
        lowered = response.lower()
        metrics = {
            "length": len(response.split()) >= 50,
            "relevance": any(word in lowered for word in category.lower().split()),
            "structure": response.count('.') >= 2,
            "specificity": any(char.isdigit() for char in response),
        }

        score = sum(metrics.values()) + 1  # Base score of 1 plus metrics (max score 5)

        # Message for each failed check, in the fixed order of the metrics dict.
        failure_notes = {
            "length": "Response too brief",
            "relevance": "Low relevance to category",
            "structure": "Poor response structure",
            "specificity": "Lacks specific details",
        }
        notes = [failure_notes[name] for name, passed in metrics.items() if not passed]

        return score, "; ".join(notes) if notes else "Good response"

    def log_evaluation(self, query, response, score, notes, category: str = "General"):
        """
        Log evaluation results to Google Sheets.

        Appends one row to the "Test Results" worksheet. Failures are logged
        and swallowed so that a Sheets outage does not interrupt the caller.

        Args:
            query: The user's question.
            response: The agent's answer.
            score: Evaluation score to record.
            notes: Evaluation notes to record.
            category: Value for the "Category" column; defaults to "General".
        """
        try:
            # One timestamp for both fields so Date and Test ID always agree.
            now = datetime.datetime.now()
            result = {
                "Date": now.strftime("%Y-%m-%d %H:%M:%S"),
                "Test ID": f"Test_{now.strftime('%Y%m%d%H%M%S')}",
                "Category": category,
                "Agent Prompt": "LLM Response",
                "User Query": query,
                "Output": response,
                "Evaluation Score": score,
                "Notes": notes,
            }

            # Append results to Google Sheets (column order matches the header row).
            self.worksheet.append_row([
                result["Date"], result["Test ID"], result["Category"],
                result["Agent Prompt"], result["User Query"], result["Output"],
                result["Evaluation Score"], result["Notes"],
            ])

            logging.info(f"Logged evaluation for query: {query}")
        except Exception as e:
            logging.error(f"Failed to log evaluation: {str(e)}")

In [None]:
def main():
    """
    Main function to orchestrate the pipeline.

    Steps: load and chunk the configured PDFs, embed them into a FAISS store,
    build and enrich the knowledge graph, set up the retrieval QA chain and
    the Google Sheets tester, then answer questions interactively until the
    user types "exit". Each answer is scored and logged.

    Any exception is logged with its traceback and ends the run; nothing is
    re-raised.
    """
    try:
        file_paths = ['/content/indiana15_ceds_2023.pdf', '/content/tbrcp_ceds_2022.pdf']
        file_paths = [path.strip() for path in file_paths if path.strip()]

        logging.info("Loading and chunking documents...")
        documents = load_and_chunk_documents(file_paths)
        if not documents:
            # Nothing to index: stop here with a clear message rather than
            # failing later inside the vector store.
            logging.error("No documents could be loaded; aborting pipeline.")
            return

        logging.info("Vectorizing documents...")
        vector_store = vectorize_and_store(documents)

        logging.info("Generating and visualizing knowledge graph...")
        knowledge_graph = generate_knowledge_graph(documents)
        knowledge_graph = enhance_knowledge_graph(knowledge_graph, documents)
        # Visualization is currently disabled; uncomment to render the graph to PNG.
        # visualize_enhanced_knowledge_graph(knowledge_graph)

        logging.info("Initializing QA...")
        retriever = vector_store.as_retriever(search_kwargs={"k": 3})
        llm = ChatMistralAI(api_key=userdata.get('mistral_api'), model_name="mistral-large-latest")
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=retriever,
            chain_type="stuff",
            return_source_documents=False
        )

        # Initialize the CEDSAgentTester for evaluation
        tester = CEDSAgentTester(knowledge_graph, documents, llm)

        while True:
            # Strip so that " exit " still quits and blank input is not sent to the LLM.
            query = input("Ask a question (type 'exit' to quit): ").strip()
            if not query:
                continue
            if query.lower() == "exit":
                logging.info("Exiting...")
                break
            response = query_with_knowledge_graph(knowledge_graph, query, retriever, qa_chain)
            print("\nResponse:", response, "\n")

            # Evaluate and log response
            score, notes = tester.evaluate_response(response, "General")
            tester.log_evaluation(query, response, score, notes)

    except Exception as e:
        # Top-level boundary: record the traceback so failures can be diagnosed.
        logging.exception(f"Error in pipeline: {e}")

In [None]:
# Entry point: run the interactive pipeline only when this module is executed
# directly (not when it is imported).
if __name__ == "__main__":
    main()

Ask a question (type 'exit' to quit): Compare the highlights of tampa vs indiana 15

Response: Based on the provided context, here are some highlights comparing the Tampa Bay region and the Indiana 15 region:

**Tampa Bay Region:**
- **Economic Data (2020):**
  - Average wages: $58,413
  - Employment growth rate: 3.9%
  - Population growth rate: 6.3%
- **Industry Concentration (2022 Location Quotients):**
  - Apparel manufacturing: 1.93
  - Insurance carriers and related activities: 1.83
  - Telecommunications: 1.72
- **Economic Development:**
  - Focus on attracting tourists and enhancing arts.

**Indiana 15 Region:**
- **Economic Challenges:**
  - Lack of childcare options.
  - Limited access to broadband.
  - Changes in business operations.
  - Lack of support services for mental health.
- **Regional Planning:**
  - Launched a regional resiliency and recovery planning process in 2021.
  - Kicked off the CEDS process in November 2022 with a phase focused on information gathering.

**