In [1]:
import os
import pandas as pd
import numpy as np
from owlready2 import get_ontology
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.runnables import RunnablePassthrough
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util


In [2]:
# Location of the ontology file on disk.
# Raw string: the Windows path is full of backslashes, which must not be
# interpreted as escape sequences (a non-raw literal only works by accident
# and triggers invalid-escape warnings on recent Python versions).
directory = r'D:\GitHub\Projetos\Mestrado\EnergyContext\ontology\imports'
filename = 'oec-extracted.owl'
file_path = os.path.join(directory, filename)

print("Loading ontology...")
# Load the ontology from the local file
onto = get_ontology("file://" + file_path).load()

# IRIs (unique identifiers) of the two properties and the individual used below
term_sent_by_property_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#termSentBy"
actor_coal_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#actorCoal"
actor_has_context_property_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#actorHasContext"

# Resolve the IRIs to ontology entities.
# NOTE(review): a lookup that matches nothing presumably yields None; the
# update cell checks all three for truthiness before using them.
term_sent_by_property = onto.search_one(iri=term_sent_by_property_iri)
actor_coal = onto.search_one(iri=actor_coal_iri)
actor_has_context_property = onto.search_one(iri=actor_has_context_property_iri)


Loading ontology...


In [3]:
# Load the PDF file.
# The path is a non-empty literal, so testing its truthiness can never fail;
# check that the file actually exists so a wrong path is reported clearly
# instead of surfacing as an error inside the loader.
local_path = r"D:\GitHub\Projetos\Mestrado\EnergyContext\pdf\appendixa_0.pdf"
if not os.path.isfile(local_path):
    raise FileNotFoundError(f"PDF file not found: {local_path}")

loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

# Split the PDF into chunks (7500 characters each, 100 characters of overlap)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
chunks = text_splitter.split_documents(data)

# Create the vector database using the Ollama embeddings model
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=True),
    collection_name="local-rag"
)


OllamaEmbeddings: 100%|██████████| 3/3 [00:07<00:00,  2.57s/it]


In [4]:
# Prompt used by MultiQueryRetriever to rewrite the user's question.
# The retriever splits the model output on newlines and runs each line as a
# separate search query, so the prompt must ask for alternative phrasings of
# the question (one per line) rather than for an answer. No context is
# available at this stage; answering happens later in the chain.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an expert in electrical engineering helping to search a technical document.
Generate three different versions of the following question so that relevant passages can be retrieved from a vector database.
Provide these alternative questions separated by newlines, and nothing else.
Original question: {question}""")

# Local chat model served by Ollama; used both for query rewriting and answering
local_model = "mistral"
llm = ChatOllama(model=local_model)

# Retriever that queries the vector store once per generated question variant
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(),
    llm,
    prompt=QUERY_PROMPT
)


In [5]:
# Layout of the answering prompt: the retrieved context comes first,
# followed by the question to be answered from it.
template = (
    "Answer the question based ONLY on the following context with ONLY one answer:\n"
    "{context}\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# Values fed into the prompt: "context" is produced by the retriever from the
# incoming question, while "question" is the incoming input passed through as-is.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}

# Full pipeline: gather inputs -> fill the prompt -> query the model -> plain string
chain = rag_inputs | prompt | llm | StrOutputParser()


In [18]:
# IRIs of the individuals whose meaning will be generated
individuals_to_update = []

# First pass: collect every individual whose termSentBy values include
# actorCoal. Only IRIs are gathered here; the ontology is updated in a
# separate pass below.
if term_sent_by_property and actor_coal and actor_has_context_property:
    for individual in onto.individuals():
        if term_sent_by_property in individual.get_properties():
            term_sent_by_values = getattr(individual, term_sent_by_property.python_name)
            if actor_coal in term_sent_by_values:
                individuals_to_update.append(individual.iri)
else:
    # Without these entities nothing can be selected; say so instead of
    # silently producing an empty run.
    print("Warning: required ontology entities were not found; nothing to update.")

# Second pass: ask the RAG chain "What is <term>?" for each collected
# individual and store the answer in its termMeaningString property.
with onto:
    for individual_iri in individuals_to_update:
        try:
            individual = onto.search_one(iri=individual_iri)
            if not individual:
                print(f"Warning: Individual with IRI {individual_iri} not found.")
                continue  # Skip to the next individual if not found

            # The property value may be a list of strings; flatten it to one term.
            term_lexicon_string_value = getattr(individual, "termLexiconString", None)
            if isinstance(term_lexicon_string_value, list):
                term_lexicon_string_value = " ".join(term_lexicon_string_value)

            # Covers both a missing attribute and an empty value.
            if not term_lexicon_string_value:
                print(f"termLexiconString is empty for {individual_iri}")
                continue

            lexicon_question = "What is " + term_lexicon_string_value + "?"
            # The model output tends to start with whitespace; strip it before storing.
            answer = chain.invoke(lexicon_question).strip()
            print("Question:", lexicon_question)
            print(term_lexicon_string_value, ":", answer)

            # Update the termMeaningString property; skip values already
            # present so that re-running the cell does not add duplicates.
            if answer and answer not in individual.termMeaningString:
                individual.termMeaningString.append(answer)
        except Exception as e:
            # Best effort: report the failure and continue with the next individual.
            print(f"Error processing individual {individual_iri}: {e}")

# Save the updated ontology back to the file it was loaded from.
# (A fallback that retried the very same call could never succeed where the
# first attempt failed, so the save is performed exactly once.)
onto.save(file_path)


OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.45s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


Question: What is net metering?
net metering :  Net metering is a policy or program that allows renewable energy system owners to receive credits for surplus electricity their systems produce but do not use, and which are fed back into the grid. These credits can be used to offset electricity costs when the system's output does not meet demand (e.g., during nighttime). It encourages the deployment of renewable energy technologies by allowing consumers to benefit financially from their own production. The specific details of net metering policies, such as the amount and duration of credits, can vary depending on the jurisdiction.


OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.68s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


Question: What is Storage?
Storage :  Storage in the context of renewable energy refers to technologies and systems designed to store excess electrical energy generated by renewable sources (such as solar, wind, or geothermal) for later use. This stored energy can help balance supply and demand, ensure a consistent power output during periods when generation may be low (e.g., at night or on calm days), and reduce the reliance on traditional fossil fuel-based power plants. Some common storage technologies include batteries, pumped hydroelectric storage, and thermal storage systems. The storage of renewable energy is an essential aspect of a sustainable and resilient electricity grid.
