In [57]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
import os
from langchain_nomic import NomicEmbeddings
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
import networkx as nx
import fitz
from typing import List
import numpy as np
from knowledge_graph import KnowledgeGraph
from query_engine import QueryEngine
from langchain.document_loaders import PyPDFLoader
from concurrent.futures import ThreadPoolExecutor, as_completed
import tqdm as tqdm
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.output_parsers import OutputFixingParser
from pydantic import BaseModel, Field
import spacy
from spacy.cli import download

# Define the Concepts class
class Concepts(BaseModel):
    concepts_list: List[str] = Field(description="List of concepts")

load_dotenv()  # Load environment variables from .env file

api_key = os.getenv("NOMIC_API_KEY")

path = "/home/name-1/AI-Agent/RAG_Project/RAG_Project/data/Understanding_Climate_Change.pdf"
loader = PyPDFLoader(path)
documents = loader.load()
documents = documents[:10]
api_key = os.getenv("NOMIC_API_KEY")
# DocumentProcessor's process_documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
splits = text_splitter.split_documents(documents)
vector_store = FAISS.from_documents(splits, embeddings)
graph = nx.Graph()

knowledge_graph = KnowledgeGraph()
llm = ChatOpenAI(
    model="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
    base_url="http://10.2.125.37:1234/v1",
    api_key="lm-studio"
)

for i, split in enumerate(splits):
    graph.add_node(i, content=split.page_content)

texts = [split.page_content for split in splits]
create_embedding = embeddings.embed_documents(texts)

concept_cache = {}

def _load_spacy_model():
    try:
        return spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading spaCy model...")
        download("en_core_web_sm")
        return spacy.load("en_core_web_sm")

nlp = _load_spacy_model()

content = split.page_content
# print(content)
doc = nlp(content)
# print(doc)
named_entities = [ent.text for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE", "WORK_OF_ART"]]
# print(named_entities)
concept_extraction_prompt = PromptTemplate(
            input_variables=["text"],
            template="Extract key concepts (excluding named entities) from the following text:\n\n{text}\n\nKey concepts:"
        )


concept_chain = concept_extraction_prompt | llm
general_concepts = concept_chain.invoke({"text": content}).content
# llm.invoke(content)
# general_concepts = content_runnable.invoke({"text": content})
general_concepts

'Here are the key concepts (excluding named entities) extracted from the text:\n\n1. Climate action\n2. Urbanization\n3. Population density\n4. Economic activities\n5. Sustainability\n6. Transportation systems\n7. Green building\n8. Infrastructure resilience\n9. Community engagement\n10. Participatory planning'

In [56]:
from pydantic import BaseModel, Field, validator
from typing import List, Optional
import re

class AIMessage(BaseModel):
    content: str
    additional_kwargs: Optional[dict]
    response_metadata: Optional[dict]
    id: str
    usage_metadata: Optional[dict]

    key_concepts: List[str] = Field(default_factory=list)

    @validator('key_concepts', pre=True, always=True)
    def extract_key_concepts(cls, v, values):
        content = values.get('content', '')
        # Regex to find the list after the introductory text
        match = re.search(r"Here are the key concepts.*?:\s*(\d\..*)", content, re.DOTALL)
        if match:
            # Extract the list of concepts and clean up the formatting
            list_text = match.group(1)
            concepts = [item.strip().split('. ', 1)[-1] for item in list_text.split('\n') if '. ' in item]
            return concepts
        return []

# Example usage
message_data = {
    'content': 'Here are the key concepts (excluding named entities) extracted from the text:\n\n1. Sustainability\n2. Climate action\n3. Population density\n4. Economic activities\n5. Transportation systems\n6. Green building standards\n7. Infrastructure resilience\n8. Community engagement\n9. Participatory planning\n10. Conservation\n11. Pollution controls',
    'additional_kwargs': {'refusal': None},
    'response_metadata': {
        'token_usage': {'completion_tokens': 69, 'prompt_tokens': 138, 'total_tokens': 207},
        'model_name': 'lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf',
        'system_fingerprint': None,
        'finish_reason': 'stop',
        'logprobs': None
    },
    'id': 'run-7ca6460b-ed26-4a18-8d3a-d0db3be20e0f-0',
    'usage_metadata': {'input_tokens': 138, 'output_tokens': 69, 'total_tokens': 207}
}

ai_message = AIMessage(**message_data)
print(ai_message.key_concepts)


['Sustainability', 'Climate action', 'Population density', 'Economic activities', 'Transportation systems', 'Green building standards', 'Infrastructure resilience', 'Community engagement', 'Participatory planning', 'Conservation', 'Pollution controls']


/tmp/ipykernel_4678/440199384.py:14: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/
  @validator('key_concepts', pre=True, always=True)


In [60]:
content = split.page_content
# print(content)
doc = nlp(content)
# print(doc)
named_entities = [ent.text for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE", "WORK_OF_ART"]]
# print(named_entities)
concept_extraction_prompt = PromptTemplate(
    input_variables=["text"],
    template="Extract key concepts (excluding named entities) from the following text and return as JSON:\n\n{text}\n\nKey concepts:"
)
llm_in = llm.with_structured_output(Concepts)
llm_x = llm_in.invoke(content)
# concept_chain = concept_extraction_prompt | llm.with_structured_output(Concepts)
# general_concepts = concept_chain.invoke({"text": content}).concepts_list

In [62]:
from pydantic import BaseModel
import json

class KeyConceptsOutput(BaseModel):
    key_concepts: list[str]

    @classmethod
    def parse_response(cls, response_text: str):
        # Attempt to parse as JSON
        try:
            data = json.loads(response_text)
            return cls(**data)
        except json.JSONDecodeError:
            # Handle cases where the model did not output valid JSON
            return None


class LMStudioModelWrapper:
    def __init__(self, model):
        self.model = model

    def with_structured_output(self, input_text: str):
        # Create a prompt that requests structured output
        structured_prompt = f"""
        Extract the key concepts from the following text and return them in a structured JSON format:
        {{
            "key_concepts": ["Concept 1", "Concept 2", "Concept 3", ...]
        }}

        Text:
        {input_text}
        """

        # Get the response from the model
        response = self.model.invoke({"text": structured_prompt})

        # Parse the response using the Pydantic model
        return KeyConceptsOutput.parse_response(response)

# Usage
lm_model = LMStudioModelWrapper(llm)
result = lm_model.with_structured_output("Your input text here")

if result:
    print(result.key_concepts)
else:
    print("Failed to parse structured output.")


None


In [58]:
from langchain_core.pydantic_v1 import BaseModel, Field


class Joke(BaseModel):
    setup: str = Field(description="The setup of the joke")
    punchline: str = Field(description="The punchline to the joke")


# model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
structured_llm = llm.with_structured_output(Joke)
structured_llm.invoke("Tell me a joke about cats")

In [17]:

def _extract_concepts_and_entities(content, llm):
    if content in concept_cache:
        return concept_cache[content]
    
    doc = nlp(content)
    named_entities = [ent.text for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE", "WORK_OF_ART"]]
    
    concept_extraction_prompt = PromptTemplate(
        input_variables=["text"],
        template="Extract key concepts (excluding named entities) from the following text:\n\n{text}\n\nKey concepts:"
    )
    concept_chain = concept_extraction_prompt | llm.with_structured_output(Concepts)
    general_concepts = concept_chain.invoke({"text": content}).concepts_list
    
    all_concepts = list(set(named_entities + general_concepts))
    concept_cache[content] = all_concepts
    return all_concepts

with ThreadPoolExecutor() as executor:
    future_to_node = {executor.submit(_extract_concepts_and_entities, split.page_content, llm): i 
                     for i, split in enumerate(splits)}

In [64]:
# model = ChatOpenAI(temperature=0.0)
input_string = "What is your name?"

llm.invoke(input_string)

AttributeError: 'LMStudioModelWrapper' object has no attribute 'invoke'

In [59]:
def _extract_concepts_and_entities(content, llm):
    """
    Extracts concepts and named entities from the content using spaCy and a large language model.
    
    Args:
    - content (str): The content from which to extract concepts and entities.
    - llm: An instance of a large language model.
    
    Returns:
    - list: A list of extracted concepts and entities.
    """
    if content in concept_cache:
        return concept_cache[content]
    
    # Extract named entities using spaCy
    doc = nlp(content)
    named_entities = [ent.text for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE", "WORK_OF_ART"]]
    
    # Extract general concepts using LLM
    concept_extraction_prompt = PromptTemplate(
        input_variables=["text"],
        template="Extract key concepts (excluding named entities) from the following text:\n\n{text}\n\nKey concepts:"
    )
    concept_chain = concept_extraction_prompt | llm.with_structured_output(Concepts)
    general_concepts = concept_chain.invoke({"text": content}).concepts_list
    
    # Combine named entities and general concepts
    all_concepts = list(set(named_entities + general_concepts))
    
    concept_cache[content] = all_concepts
    return all_concepts


with ThreadPoolExecutor() as executor:
    future_to_node = {executor.submit(_extract_concepts_and_entities, split.page_content, llm): i 
                        for i, split in enumerate(splits)}