In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
import os
from langchain_nomic import NomicEmbeddings
from dotenv import load_dotenv
# from openai.error import RateLimitError
from langchain_openai import ChatOpenAI
import networkx as nx
import fitz
from typing import List
import numpy as np
from knowledge_graph import KnowledgeGraph
from query_engine import QueryEngine
from langchain.document_loaders import  PyPDFLoader
from concurrent.futures import ThreadPoolExecutor, as_completed
import tqdm as tqdm
from langchain_core.prompts import PromptTemplate
# from typing import 
from langchain_core.pydantic_v1 import BaseModel, Field
import spacy
from spacy.cli import download

# Schema handed to `llm.with_structured_output(...)` for concept extraction:
# a single field holding the extracted concepts as a list of strings.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose -- pydantic appears to copy a class docstring into the schema
# description sent to the model; confirm before adding one.
class Concepts(BaseModel):
    concepts_list: List[str] = Field(description="List of concepts")


load_dotenv()  # Load environment variables from .env file

# NOTE(review): `api_key` is not referenced again in the visible cells and
# NomicEmbeddings is constructed without it -- presumably the Nomic client
# reads NOMIC_API_KEY from the environment itself; confirm.
api_key = os.getenv("NOMIC_API_KEY")

# Tunable settings, kept together so a reader can change them in one place.
DEFAULT_PDF_PATH = "/home/name-1/AI-Agent/RAG_Project/RAG_Project/data/Understanding_Climate_Change.pdf"
MAX_DOCUMENTS = 10   # only the first N loaded documents are processed
CHUNK_SIZE = 1000    # chunk_size passed to the text splitter
CHUNK_OVERLAP = 200  # chunk_overlap passed to the text splitter

# The source PDF can be relocated by setting RAG_PDF_PATH (environment or
# .env); without it the original machine-specific location is used.
path = os.getenv("RAG_PDF_PATH", DEFAULT_PDF_PATH)
loader = PyPDFLoader(path)
documents = loader.load()
documents = documents[:MAX_DOCUMENTS]

# DocumentProcessor's process_documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
splits = text_splitter.split_documents(documents)
vector_store = FAISS.from_documents(splits, embeddings)
graph = nx.Graph()

knowledge_graph = KnowledgeGraph()

# Chat model served through an OpenAI-compatible endpoint; the api_key value
# here is a fixed placeholder string, not a secret read from the environment.
llm = ChatOpenAI(
    model="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
    base_url="http://10.2.125.37:1234/v1",
    api_key="lm-studio",
)

In [31]:
# Smoke-test the endpoint with a single prompt. `invoke` is the supported
# entry point; calling the model object directly is deprecated in LangChain.
llm.invoke("what is the meaning of life?")

  llm("what is the meaning of life?")


AIMessage(content="The question of what the meaning of life is has been debated and explored by philosophers, theologians, scientists, and many others for centuries. There are various perspectives on this topic, and it's difficult to provide a single definitive answer.\n\nThat being said, here are some possible ways to approach the question:\n\n1. **Biological perspective**: From a purely biological standpoint, the meaning of life could be seen as survival and reproduction. Organisms exist to survive, find food, shelter, and mates, and reproduce to pass on their genes to the next generation.\n2. **Personal fulfillment**: Many people believe that the meaning of life is to pursue happiness, personal growth, and self-fulfillment. This might involve developing one's passions, building meaningful relationships, and achieving a sense of purpose and contentment.\n3. **Spiritual or religious perspective**: For many individuals, the meaning of life is connected to their spiritual or religious b

In [10]:
# Split the loaded documents into chunks and build a FAISS index over them.
# NOTE(review): these two statements repeat the last steps of the setup cell,
# so the chunks are embedded a second time -- confirm the re-run is intended.
splits = text_splitter.split_documents(documents)
vector_store = FAISS.from_documents(splits, embeddings)

In [11]:
# Add one graph node per chunk: the node id is the chunk's position in
# `splits`, and the chunk text is stored under the "content" node attribute.
# When `splits` is non-empty, `i` and `split` stay bound at module level after
# the loop (`split` is then the last chunk), so keep these loop names stable.
for i, split in enumerate(splits):
    graph.add_node(i, content=split.page_content)

In [12]:
# Gather the raw text of every chunk, then embed the whole list in one call.
texts = [chunk.page_content for chunk in splits]
create_embedding = embeddings.embed_documents(texts)

In [13]:
# Memo of extraction results, keyed by the exact chunk text.
concept_cache = {}


def _load_spacy_model():
    """
    Return the "en_core_web_sm" spaCy pipeline, fetching it first if missing.

    The model is loaded directly when it is already installed. If loading
    raises OSError, a notice is printed, the model is downloaded through
    spaCy's CLI helper and the load is attempted once more.

    Args:
    - None

    Returns:
    - spacy.Language: The loaded spaCy pipeline.
    """
    model_name = "en_core_web_sm"
    try:
        model = spacy.load(model_name)
    except OSError:
        print("Downloading spaCy model...")
        download(model_name)
        model = spacy.load(model_name)
    return model


# Shared pipeline instance used by the extraction code in later cells.
nlp = _load_spacy_model()

In [14]:
def _extract_concepts_and_entities(content, llm):
    """
    Extracts concepts and named entities from the content using spaCy and a large language model.

    Named entities come from the module-level spaCy pipeline `nlp`, restricted
    to the PERSON, ORG, GPE and WORK_OF_ART labels. General concepts come from
    the LLM through a structured-output chain built on the `Concepts` schema.
    Complete results are memoised in the module-level `concept_cache`, keyed
    by `content`.

    The structured-output chain can return None instead of a `Concepts`
    instance (the recorded runs in this notebook show exactly that). In that
    case only the named entities are returned, and the result is NOT cached,
    so a later call with the same content can try the LLM again.

    Args:
    - content (str): The content from which to extract concepts and entities.
    - llm: A chat model exposing `with_structured_output`.

    Returns:
    - list: De-duplicated named entities followed by general concepts, in
      order of first appearance.
    """
    if content in concept_cache:
        return concept_cache[content]

    # Extract named entities using spaCy
    doc = nlp(content)
    named_entities = [ent.text for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE", "WORK_OF_ART"]]

    # Extract general concepts using LLM
    concept_extraction_prompt = PromptTemplate(
        input_variables=["text"],
        template="Extract key concepts (excluding named entities) from the following text:\n\n{text}\n\nKey concepts:"
    )
    concept_chain = concept_extraction_prompt | llm.with_structured_output(Concepts)
    structured = concept_chain.invoke({"text": content})

    if structured is None:
        # No parseable structured reply: degrade to entities only rather than
        # failing on `.concepts_list`, and skip the cache so a retry is possible.
        return list(dict.fromkeys(named_entities))

    general_concepts = list(structured.concepts_list)

    # Combine named entities and general concepts. dict.fromkeys removes
    # duplicates while keeping a stable, reproducible order (a set would not).
    all_concepts = list(dict.fromkeys(named_entities + general_concepts))

    concept_cache[content] = all_concepts
    return all_concepts

In [18]:
# Pick the chunk used for the step-by-step walkthrough below. `splits` is
# indexed explicitly instead of reading the loop variable `split` left behind
# by the node-building loop; after that loop `split` is the last chunk, so
# this selects the same text without relying on leftover loop state.
content = splits[-1].page_content
# `all_concepts` is only bound here when this chunk is already in the cache.
if content in concept_cache:
    all_concepts = concept_cache[content]

In [19]:
# Run the spaCy pipeline on the sample chunk and keep the text of every
# entity whose label is one of the wanted types.
doc = nlp(content)
wanted_labels = ["PERSON", "ORG", "GPE", "WORK_OF_ART"]
named_entities = [entity.text for entity in doc.ents if entity.label_ in wanted_labels]

In [20]:
# Prompt for the LLM concept-extraction step: the chunk text is substituted
# for the single `{text}` placeholder, and the model is asked for key
# concepts other than named entities.
concept_extraction_prompt = PromptTemplate(
    input_variables=["text"],
    template="Extract key concepts (excluding named entities) from the following text:\n\n{text}\n\nKey concepts:"
)
# Bare expression: display the constructed template as the cell output.
concept_extraction_prompt

PromptTemplate(input_variables=['text'], template='Extract key concepts (excluding named entities) from the following text:\n\n{text}\n\nKey concepts:')

In [28]:
# Display the sample chunk text currently bound to `content`.
content

'caps, and pollution controls. Enforcement and compliance are critical for their  effectiveness.  \nLocal and Community Initiatives  \nUrban Climate Action  \nCities play a pivotal role in climate action due to their high population densities and \neconomic activities. Urban climate initiatives include sustainable transportation systems, \ngreen building standards, and climate -resilient infrastructure. Community eng agement and \nparticipatory planning are essential for successful implementation.  \nCommunity -Based Conservation'

In [21]:
# Pipe the prompt into the LLM wrapped for structured output on the
# `Concepts` schema, so one `invoke` call runs prompt formatting and the model.
concept_chain = concept_extraction_prompt | llm.with_structured_output(Concepts)
# Bare expression: display the composed chain as the cell output.
concept_chain

PromptTemplate(input_variables=['text'], template='Extract key concepts (excluding named entities) from the following text:\n\n{text}\n\nKey concepts:')
| RunnableBinding(bound=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x7e6ce25d7490>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x7e6ce3a75010>, root_client=<openai.OpenAI object at 0x7e6ce1a6f010>, root_async_client=<openai.AsyncOpenAI object at 0x7e6ce25d7790>, model_name='lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF', openai_api_key=SecretStr('**********'), openai_api_base='http://10.2.125.37:1234/v1', openai_proxy=''), kwargs={'tools': [{'type': 'function', 'function': {'name': 'Concepts', 'description': '', 'parameters': {'type': 'object', 'properties': {'concepts_list': {'description': 'List of concepts', 'type': 'array', 'items': {'type': 'string'}}}, 'required': ['concepts_list']}}}], 'parallel_tool_calls': False, 'tool_choice': {'type': 'function', 'function'

In [29]:
# Probe the chain with a short fixed sentence instead of a real chunk.
# The recorded output of this cell is `None`, i.e. the chain produced no
# structured result for this input; `print` is used so that None is visible.
general_concepts = concept_chain.invoke({"text": "how can i improve it"})
print(general_concepts)

None


In [33]:
# Self-contained reproduction of the structured-output call.
# NOTE(review): this cell rebinds names defined in earlier cells -- `Concepts`
# (now built on `pydantic.BaseModel` rather than the
# `langchain_core.pydantic_v1` one imported in the first cell), `llm`,
# `concept_extraction_prompt`, `concept_chain` and `general_concepts` -- so
# every cell executed after it sees these versions. Confirm that is intended.
from pydantic import BaseModel, Field
from typing import List
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# Define your Pydantic model
class Concepts(BaseModel):
    concepts_list: List[str] = Field(description="List of concepts")

# Initialize the LLM
llm = ChatOpenAI(
    model="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
    base_url="http://10.2.125.37:1234/v1",
    api_key="lm-studio"
)

# Create the prompt template
concept_extraction_prompt = PromptTemplate(
    input_variables=["text"],
    template="Extract key concepts (excluding named entities) from the following text:\n\n{text}\n\nKey concepts:"
)

# Create the chain
concept_chain = concept_extraction_prompt | llm.with_structured_output(Concepts)

# Invoke the chain
# The input and the result are both printed; the recorded run of this cell
# shows the chain returning None for this input.
input_data = {"text": "how can i improve it"}
print("Input to chain:", input_data)
general_concepts = concept_chain.invoke(input_data)
print("Output from chain:", general_concepts)

Input to chain: {'text': 'how can i improve it'}
Output from chain: None


In [30]:
# Run concept extraction for every chunk on a thread pool and attach each
# result to the matching graph node under the 'concepts' attribute.
with ThreadPoolExecutor() as executor:
    # Map each submitted future back to the node id (chunk index) it serves.
    future_to_node = {
        executor.submit(_extract_concepts_and_entities, split.page_content, llm): i
        for i, split in enumerate(splits)
    }

    # This notebook imports the tqdm *module* (`import tqdm as tqdm`), so the
    # progress-bar class has to be reached as `tqdm.tqdm`; calling the module
    # itself raises "TypeError: 'module' object is not callable".
    for future in tqdm.tqdm(as_completed(future_to_node), total=len(splits), desc="Extracting concepts and entities"):
        node = future_to_node[future]
        # `result()` re-raises any exception raised inside the worker call.
        concepts = future.result()
        graph.nodes[node]['concepts'] = concepts