In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
import os
from langchain_nomic import NomicEmbeddings
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
import networkx as nx
import fitz
from typing import List
import numpy as np
from knowledge_graph import KnowledgeGraph
from query_engine import QueryEngine
from langchain.document_loaders import PyPDFLoader
from concurrent.futures import ThreadPoolExecutor, as_completed
import tqdm as tqdm
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.output_parsers import OutputFixingParser
from pydantic import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser
import spacy
from spacy.cli import download
# from langchain_ollama import ChatOllama
from langchain_community.llms import Ollama
import json
from pydantic import ValidationError

class Concepts(BaseModel):
    concepts_list: List[str] = Field(description="List of concepts")

load_dotenv()  # Load environment variables from .env file

api_key = os.getenv("NOMIC_API_KEY")

path = "E:/RAG_Project/data/Understanding_Climate_Change.pdf"
loader = PyPDFLoader(path)
documents = loader.load()
documents = documents[:10]
api_key = os.getenv("NOMIC_API_KEY")
# DocumentProcessor's process_documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
splits = text_splitter.split_documents(documents)
vector_store = FAISS.from_documents(splits, embeddings)
graph = nx.Graph()

knowledge_graph = KnowledgeGraph()
# llm = ChatOpenAI(
#     model="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
#     base_url="http://localhost:1234/v1",
#     api_key="lm-studio"
# )

# llm = ChatOpenAI(
#                  model="llama3.1",
#                 base_url = "http://localhost:11434/v1",
#                 api_key = 'ollama'
#             )

llm = ChatOpenAI(
                 model="llama3.1",
                base_url = "https://api.groq.com/openai/v1",
                api_key = 'ollama'
            )
for i, split in enumerate(splits):
    graph.add_node(i, content=split.page_content)

texts = [split.page_content for split in splits]
create_embedding = embeddings.embed_documents(texts)

concept_cache = {}

def _load_spacy_model():
    try:
        return spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading spaCy model...")
        download("en_core_web_sm")
        return spacy.load("en_core_web_sm")

nlp = _load_spacy_model()

content = split.page_content
# print(content)
doc = nlp(content)
# print(doc)
named_entities = [ent.text for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE", "WORK_OF_ART"]]
# print(named_entities)
concept_extraction_prompt = PromptTemplate(
            input_variables=["text"],
            template="Extract key concepts (excluding named entities) from the following text:\n\n{text}\n\nKey concepts:"
        )


# concept_chain = concept_extraction_prompt | llm.with_structured_output(Concepts)
# general_concepts = concept_chain.invoke({"text": content})

# # Check if the output is None
# if general_concepts is None:
#     print("The LLM did not return any concepts. Please check the model's response.")
# else:
#     try:
#         # Check if the LLM response is a stringified list
#         if isinstance(general_concepts.concepts_list, str):
#             concepts_list = json.loads(general_concepts.concepts_list)
#         else:
#             concepts_list = general_concepts.concepts_list

#         # Validate the concepts using the Concepts model
#         concepts = Concepts(concepts_list=concepts_list)
#         general_concepts = concepts.concepts_list
#         print(general_concepts)

#     except (json.JSONDecodeError, ValidationError, AttributeError) as e:
#         print(f"Error parsing concepts: {e}")


# llm.invoke(content)
# general_concepts = content_runnable.invoke({"text": content})


In [13]:
from langchain.schema import BaseMessage
# Debugging: Run the LLM prompt and get the raw output
raw_output = llm.invoke(content)  # Pass the text directly as a string
# prompt_input = concept_extraction_prompt.format(text=content)
# Check if we got any output
if raw_output is None:
    print("The LLM did not return any concepts. Please check the model's response.")
else:
    print("Raw LLM Output:", raw_output)  # Print the raw output for debugging

    try:
        # Extract 'content' from the LLM response (raw_output)
        if isinstance(raw_output, BaseMessage):
            response_content = raw_output.content  # Only extract the actual text content
        else:
            raise ValueError("Invalid response format from the LLM.")

        # print("Extracted Text Content:", response_content)

        # Now, assume the 'response_content' contains concepts in textual form
        general_concepts = response_content.splitlines()  # Split the text into lines as a simple approach

        # Ensure the concepts list is valid
        concepts = Concepts(concepts_list=general_concepts)
        print("Validated Concepts:", concepts.concepts_list)

    except (json.JSONDecodeError, ValidationError, AttributeError, ValueError) as e:
        print(f"Error parsing concepts: {e}")


Raw LLM Output: content="It seems like you've got a collection of environmental-related topics there! Let me break down the key points you mentioned:\n\n### Climate Change Mitigation Strategies\n1. **Energy-Efficient Solutions**: This encompasses measures such as:\n   - Renewable energy resources.\n   - Energy conservation tools and technologies.\n\n2. **Greenhouse Gas Reduction**: Focuses on:\n   a. **Energy Transition**: Moving or transitioning to greener sources, like solar, wind, and others.\n   b. **Fuel Efficiency Enhancements** aimed at making vehicles consume less fuel, thereby reducing emissions.\n3. **Indoor Cooling Systems with Reduced Pollution**: Emphasizes energy-efficient solutions, possibly tied in the future to newer technology in this segment or phase of development. This would also extend to greenhouses & controlled indoor spaces as we adapt more into sustainable solutions with AI/IT (if applicable), although currently, this doesn’t play a crucial indirect impact on 

In [15]:
from langchain.schema import BaseMessage
# Debugging: Run the LLM prompt and get the raw output

nlp = _load_spacy_model()

content = split.page_content
# print(content)
doc = nlp(content)
# print(doc)
named_entities = [ent.text for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE", "WORK_OF_ART"]]
# print(named_entities)
concept_extraction_prompt = PromptTemplate(
            input_variables=["text"],
            template="Extract key concepts (excluding named entities) from the following text:\n\n{text}\n\nKey concepts:"
        )

prompt_input = concept_extraction_prompt.format(text=content)
raw_output = llm.invoke(prompt_input)  # Pass the text directly as a string

# Check if we got any output
if raw_output is None:
    print("The LLM did not return any concepts. Please check the model's response.")
else:
    print("Raw LLM Output:", raw_output)  # Print the raw output for debugging

    try:
        # Extract 'content' from the LLM response (raw_output)
        if isinstance(raw_output, BaseMessage):
            response_content = raw_output.content  # Only extract the actual text content
        else:
            raise ValueError("Invalid response format from the LLM.")

        # print("Extracted Text Content:", response_content)

        # Now, assume the 'response_content' contains concepts in textual form
        general_concepts = response_content.splitlines()  # Split the text into lines as a simple approach

        # Ensure the concepts list is valid
        concepts = Concepts(concepts_list=general_concepts)
        print("Validated Concepts:", concepts.concepts_list)

    except (json.JSONDecodeError, ValidationError, AttributeError, ValueError) as e:
        print(f"Error parsing concepts: {e}")


Raw LLM Output: content='Here are the key concepts extracted from the text (excluding named entities):\n\n1. **Regulation**\n2. **Protection**\n\n(mediated concepts implied by regulation)\n\n1. **Pollution prevention/management/mitigation**\n\n(among others related to environment impacts not exhaustively stated as pollutuents, likely e.g., carbon, water contaminants - though note non-specific word choice so no certainty here.) \n\n2. **Accountability/Enforcement**\n3. **Efficacy** \n4. * **Effective outcomes**\n\n(implied)' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 110, 'prompt_tokens': 116, 'total_tokens': 226}, 'model_name': 'llama3.1', 'system_fingerprint': 'fp_ollama', 'finish_reason': 'stop', 'logprobs': None} id='run-0a66490f-f079-4e61-b7a2-0f9142733243-0' usage_metadata={'input_tokens': 116, 'output_tokens': 110, 'total_tokens': 226}
Validated Concepts: ['Here are the key concepts extracted from the text (excluding named entities

In [19]:
def _extract_concepts_and_entities(content, llm):
    if content in concept_cache:
        return concept_cache[content]
    
    # Extract named entities using spaCy
    doc = nlp(content)
    named_entities = [ent.text for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE", "WORK_OF_ART"]]
    
    # Extract general concepts using LLM
    concept_extraction_prompt = PromptTemplate(
        input_variables=["text"],
        template="Extract key concepts (excluding named entities) from the following text:\n\n{text}\n\nKey concepts:"
    )
    # concept_chain = concept_extraction_prompt | llm.with_structured_output(Concepts)
    # general_concepts = concept_chain.invoke({"text": content}).concepts_list
    prompt_input = concept_extraction_prompt.format(text=content)
    raw_output = llm.invoke(prompt_input)  # Pass the text directly as a string

    # Check if we got any output
    if raw_output is None:
        print("The LLM did not return any concepts. Please check the model's response.")
    else:
        print("Raw LLM Output:", raw_output)  # Print the raw output for debugging

        try:
            # Extract 'content' from the LLM response (raw_output)
            if isinstance(raw_output, BaseMessage):
                response_content = raw_output.content  # Only extract the actual text content
            else:
                raise ValueError("Invalid response format from the LLM.")

            # print("Extracted Text Content:", response_content)

            # Now, assume the 'response_content' contains concepts in textual form
            general_concept = response_content.splitlines()  # Split the text into lines as a simple approach

            # Ensure the concepts list is valid
            concepts = Concepts(concepts_list=general_concept)
            general_concepts = concepts.concepts_list
            # print("Validated Concepts:", concepts.concepts_list)

        except (json.JSONDecodeError, ValidationError, AttributeError, ValueError) as e:
            print(f"Error parsing concepts: {e}")
            
            # Combine named entities and general concepts
        all_concepts = list(set(named_entities + general_concepts))
        
        concept_cache[content] = all_concepts
        return all_concepts


In [20]:
def _extract_concepts(splits, llm):
    """
    Extracts concepts for all document splits using multi-threading.
    
    Args:
    - splits (list): A list of document splits.
    - llm: An instance of a large language model.
    
    Returns:
    - None
    """
    with ThreadPoolExecutor() as executor:
        future_to_node = {executor.submit(_extract_concepts_and_entities, split.page_content, llm): i 
                            for i, split in enumerate(splits)}
        
        for future in tqdm(as_completed(future_to_node), total=len(splits), desc="Extracting concepts and entities"):
            node = future_to_node[future]
            concepts = future.result()
            graph.nodes[node]['concepts'] = concepts

In [21]:

from sklearn.metrics.pairwise import cosine_similarity

def _compute_similarities(self, embeddings):
    """
    Computes the cosine similarity matrix for the embeddings.
    
    Args:
    - embeddings (numpy.ndarray): An array of embeddings.
    
    Returns:
    - numpy.ndarray: A cosine similarity matrix for the embeddings.
    """
    return cosine_similarity(embeddings)


In [22]:
def _add_edges(embeddings):
    """
    Adds edges to the graph based on the similarity of embeddings and shared concepts.
    
    Args:
    - embeddings (numpy.ndarray): An array of embeddings for the document splits.
    
    Returns:
    - None
    """
    edges_threshold = 0.8
    similarity_matrix = _compute_similarities(embeddings)
    num_nodes = len(graph.nodes)
    
    for node1 in tqdm(range(num_nodes), desc="Adding edges"):
        for node2 in range(node1 + 1, num_nodes):
            similarity_score = similarity_matrix[node1][node2]
            if similarity_score > edges_threshold:
                shared_concepts = set(graph.nodes[node1]['concepts']) & set(graph.nodes[node2]['concepts'])
                edge_weight = _calculate_edge_weight(node1, node2, similarity_score, shared_concepts)
                graph.add_edge(node1, node2, weight=edge_weight, 
                                    similarity=similarity_score,
                                    shared_concepts=list(shared_concepts))

def _calculate_edge_weight(node1, node2, similarity_score, shared_concepts, alpha=0.7, beta=0.3):
    """
    Calculates the weight of an edge based on similarity score and shared concepts.
    
    Args:
    - node1 (int): The first node.
    - node2 (int): The second node.
    - similarity_score (float): The similarity score between the nodes.
    - shared_concepts (set): The set of shared concepts between the nodes.
    - alpha (float, optional): The weight of the similarity score. Default is 0.7.
    - beta (float, optional): The weight of the shared concepts. Default is 0.3.
    
    Returns:
    - float: The calculated weight of the edge.
    """
    max_possible_shared = min(len(graph.nodes[node1]['concepts']), len(graph.nodes[node2]['concepts']))
    normalized_shared_concepts = len(shared_concepts) / max_possible_shared if max_possible_shared > 0 else 0
    return alpha * similarity_score + beta * normalized_shared_concepts

In [23]:
for i, split in enumerate(splits):
    graph.add_node(i, content=split.page_content)

texts = [split.page_content for split in splits]
create_embedding = embeddings.embed_documents(texts)

# _add_nodes(splits)
# embeddings = _create_embeddings(splits, embedding_model)
_extract_concepts(splits, llm)
_add_edges(create_embedding)


# query_engine = QueryEngine(vector_store, knowledge_graph, llm)

Raw LLM Output: content='Here are the key concepts extracted from the text, excluding named entities:\n\n1. Sustainability\n2. Industrial practices\n3. Waste heat recovery\n4. Carbon sequestration\n5. Restoration of degraded lands\n6. Forest cover increase\n7. Biodiversity conservation\n8. Water management\n9. Community involvement\n10. Sustainable practices' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 71, 'prompt_tokens': 200, 'total_tokens': 271}, 'model_name': 'llama3.1', 'system_fingerprint': 'fp_ollama', 'finish_reason': 'stop', 'logprobs': None} id='run-22c075a8-1d5f-4950-9b21-26ed4e464e95-0' usage_metadata={'input_tokens': 200, 'output_tokens': 71, 'total_tokens': 271}
Raw LLM Output: content='Here are the key concepts extracted from the text (excluding named entities):\n\n* Sustainable agriculture\n* Greenhouse gas emissions\n* Mitigation strategies\n* Animal digestion and manure management\n* Anaerobic conditions\n* Food security

TypeError: 'module' object is not callable

In [14]:
all_concepts = list(set(named_entities + general_concepts))
all_concepts

['',
 '   - Energy conservation tools and technologies.',
 'The banking community is stepping through offering loans – on such condition - at reasonable interest so businesses & institutions may borrow the finances they so desperately needed by greenifying the urban climate conditions; promoting greener urban landscapes.',
 'Local and Community Initiatives  \nUrban Climate Action  \nCities',
 '2. **Greenhouse Gas Reduction**: Focuses on:',
 'Partnership initiatives are starting, where partnerships and collaborations will not only help but expedite progress to an entirely “climate neutral state.”',
 '   b. **Fuel Efficiency Enhancements** aimed at making vehicles consume less fuel, thereby reducing emissions.',
 '1. **Energy-Efficient Solutions**: This encompasses measures such as:',
 '4. **Caps and Pollution Controls**: Enforces limits (as mentioned – caps) along with standards or mechanisms against further environmental degradation, usually through legal mandates at state (locally adm

In [66]:
_extract_concepts_and_entities

AIMessage(content="I don't have a personal name, but I'm an AI assistant designed to provide information and help with tasks. You can refer to me as Assistant or AI if you like!", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 36, 'prompt_tokens': 38, 'total_tokens': 74}, 'model_name': 'lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-90542874-3e33-4781-acab-bf8e5be3ff34-0', usage_metadata={'input_tokens': 38, 'output_tokens': 36, 'total_tokens': 74})

In [59]:
def _extract_concepts_and_entities(content, llm):
    """
    Extracts concepts and named entities from the content using spaCy and a large language model.
    
    Args:
    - content (str): The content from which to extract concepts and entities.
    - llm: An instance of a large language model.
    
    Returns:
    - list: A list of extracted concepts and entities.
    """
    if content in concept_cache:
        return concept_cache[content]
    
    # Extract named entities using spaCy
    doc = nlp(content)
    named_entities = [ent.text for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE", "WORK_OF_ART"]]
    
    # Extract general concepts using LLM
    concept_extraction_prompt = PromptTemplate(
        input_variables=["text"],
        template="Extract key concepts (excluding named entities) from the following text:\n\n{text}\n\nKey concepts:"
    )
    concept_chain = concept_extraction_prompt | llm.with_structured_output(Concepts)
    general_concepts = concept_chain.invoke({"text": content}).concepts_list
    
    # Combine named entities and general concepts
    all_concepts = list(set(named_entities + general_concepts))
    
    concept_cache[content] = all_concepts
    return all_concepts


with ThreadPoolExecutor() as executor:
    future_to_node = {executor.submit(_extract_concepts_and_entities, split.page_content, llm): i 
                        for i, split in enumerate(splits)}