In [67]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
import os
from langchain_nomic import NomicEmbeddings
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
import networkx as nx
import fitz
from typing import List
import numpy as np
from knowledge_graph import KnowledgeGraph
from query_engine import QueryEngine
from langchain.document_loaders import PyPDFLoader
from concurrent.futures import ThreadPoolExecutor, as_completed
import tqdm as tqdm
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.output_parsers import OutputFixingParser
from pydantic import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser
import spacy
from spacy.cli import download

class Concepts(BaseModel):
    concepts_list: List[str] = Field(description="List of concepts")

load_dotenv()  # Load environment variables from .env file

api_key = os.getenv("NOMIC_API_KEY")

path = "/home/name-1/AI-Agent/RAG_Project/RAG_Project/data/Understanding_Climate_Change.pdf"
loader = PyPDFLoader(path)
documents = loader.load()
documents = documents[:10]
api_key = os.getenv("NOMIC_API_KEY")
# DocumentProcessor's process_documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
splits = text_splitter.split_documents(documents)
vector_store = FAISS.from_documents(splits, embeddings)
graph = nx.Graph()

knowledge_graph = KnowledgeGraph()
llm = ChatOpenAI(
    model="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
    base_url="http://10.2.125.37:1234/v1",
    api_key="lm-studio"
)

for i, split in enumerate(splits):
    graph.add_node(i, content=split.page_content)

texts = [split.page_content for split in splits]
create_embedding = embeddings.embed_documents(texts)

concept_cache = {}

def _load_spacy_model():
    try:
        return spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading spaCy model...")
        download("en_core_web_sm")
        return spacy.load("en_core_web_sm")

nlp = _load_spacy_model()

content = split.page_content
# print(content)
doc = nlp(content)
# print(doc)
named_entities = [ent.text for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE", "WORK_OF_ART"]]
# print(named_entities)
concept_extraction_prompt = PromptTemplate(
            input_variables=["text"],
            template="Extract key concepts (excluding named entities) from the following text:\n\n{text}\n\nKey concepts:"
        )


concept_chain = concept_extraction_prompt | llm.with_structured_output(Concepts)
general_concepts = concept_chain.invoke({"text": content}).concepts_list
# llm.invoke(content)
# general_concepts = content_runnable.invoke({"text": content})


ValidationError: 1 validation error for Generation
text
  none is not an allowed value (type=type_error.none.not_allowed)

In [66]:
# model = ChatOpenAI(temperature=0.0)
input_string = "What is your name?"

llm.invoke(input_string)

AIMessage(content="I don't have a personal name, but I'm an AI assistant designed to provide information and help with tasks. You can refer to me as Assistant or AI if you like!", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 36, 'prompt_tokens': 38, 'total_tokens': 74}, 'model_name': 'lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-90542874-3e33-4781-acab-bf8e5be3ff34-0', usage_metadata={'input_tokens': 38, 'output_tokens': 36, 'total_tokens': 74})

In [59]:
def _extract_concepts_and_entities(content, llm):
    """
    Extracts concepts and named entities from the content using spaCy and a large language model.
    
    Args:
    - content (str): The content from which to extract concepts and entities.
    - llm: An instance of a large language model.
    
    Returns:
    - list: A list of extracted concepts and entities.
    """
    if content in concept_cache:
        return concept_cache[content]
    
    # Extract named entities using spaCy
    doc = nlp(content)
    named_entities = [ent.text for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE", "WORK_OF_ART"]]
    
    # Extract general concepts using LLM
    concept_extraction_prompt = PromptTemplate(
        input_variables=["text"],
        template="Extract key concepts (excluding named entities) from the following text:\n\n{text}\n\nKey concepts:"
    )
    concept_chain = concept_extraction_prompt | llm.with_structured_output(Concepts)
    general_concepts = concept_chain.invoke({"text": content}).concepts_list
    
    # Combine named entities and general concepts
    all_concepts = list(set(named_entities + general_concepts))
    
    concept_cache[content] = all_concepts
    return all_concepts


with ThreadPoolExecutor() as executor:
    future_to_node = {executor.submit(_extract_concepts_and_entities, split.page_content, llm): i 
                        for i, split in enumerate(splits)}