In [1]:
import os
import pandas as pd
import numpy as np
from owlready2 import get_ontology
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.runnables import RunnablePassthrough
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

In [2]:

# Define file path-
directory = 'D:\GitHub\Projetos\Mestrado\EnergyContext\ontology\imports'
filename = 'oec-extracted.owl'
file_path = os.path.join(directory, filename)

print("Loading ontology...")
# Load the ontology
onto = get_ontology("file://" + file_path).load()

Loading ontology...


In [3]:
# Define the IRIs for the properties and the individual
term_sent_by_property_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#termSentBy"
actor_coal_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#actorCoal"
actor_has_context_property_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#actorHasContext"

# Search for the properties and the individual in the ontology
term_sent_by_property = onto.search_one(iri=term_sent_by_property_iri)
actor_coal = onto.search_one(iri=actor_coal_iri)
actor_has_context_property = onto.search_one(iri=actor_has_context_property_iri)

In [2]:
local_path = r"D:\GitHub\Projetos\Mestrado\EnergyContext\pdf\appendixa_0.pdf"
if local_path:
    loader = UnstructuredPDFLoader(file_path=local_path)
    data = loader.load()
else:
    raise FileNotFoundError("Upload a PDF file")

text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
chunks = text_splitter.split_documents(data)

vector_db = Chroma.from_documents(
    documents=chunks, 
    embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=True),
    collection_name="local-rag"
)

PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?

In [5]:
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""answer_prompt_template = You are an expert in electrial engineering.
    Using only the information provided in the context, choose the best answer, and only that, to the following question:
    Original question: {question}""")

In [6]:
local_model = "mistral"
llm = ChatOllama(model=local_model)
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(), 
    llm,
    prompt=QUERY_PROMPT
)

template = """Answer the question based ONLY on the following context with ONLY one answer:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [7]:
# Check if the properties and individual exist
if term_sent_by_property and actor_coal and actor_has_context_property:
    # List to store individuals with 'termSentBy' set to 'actorCoal'
    individuals_with_term_sent_by_actor_coal = []

    # Iterate through all individuals in the ontology
    for individual in onto.individuals():
        # Check if the individual has the property 'termSentBy'
        if term_sent_by_property in individual.get_properties():
            # Retrieve the values of 'termSentBy'
            term_sent_by_values = getattr(individual, term_sent_by_property.python_name)
            # Check if 'actorCoal' is in the values of 'termSentBy'
            if actor_coal in term_sent_by_values:
                individuals_with_term_sent_by_actor_coal.append(individual)

    #print("Individuals where 'termSentBy' is 'actorCoal':")
    for individual in individuals_with_term_sent_by_actor_coal:
        print(f"Individual: {individual}")

        # Check if the individual has the 'termLexiconString' data property
        if hasattr(individual, "termLexiconString"):
            # Retrieve the value of 'termLexiconString'
            term_lexicon_string_value = getattr(individual, "termLexiconString")
            
            # Check if the value is a list and convert it to a string
            if isinstance(term_lexicon_string_value, list):
                term_lexicon_string_value = " ".join(term_lexicon_string_value)
            
            # Check if the value is empty or not
            if term_lexicon_string_value:
                lexicon_question = "What is " +  term_lexicon_string_value + "?"
                answer = chain.invoke(lexicon_question)
                if hasattr(individual, "termMeaningString"):
                        # Retrieve the current value of 'termMeaningString'
                        term_meaning_string_value = getattr(individual, "termMeaningString")
                        
                        # Check if it's a list, if not make it a list
                        if not isinstance(term_meaning_string_value, list):
                            term_meaning_string_value = [term_meaning_string_value]
                        
                        # Append the new answer
                        
                        term_meaning_string_value.append(answer)

                        # Set the updated value back to 'termMeaningString'
                        setattr(individual, "termMeaningString", term_meaning_string_value)
                else:
                        print("  termMeaningString: Not found")
                # Print the answer
                print("Question:", lexicon_question)
                print(term_lexicon_string_value,":", answer)
                
            else:
                print("  termLexiconString is empty")
        else:
            print("  termLexiconString: Not found")

else:
    print("One or more of the properties or the individual could not be found in the ontology.")
onto.save()


Individual: D:\GitHub\Projetos\Mestrado\EnergyContext\ontology\imports\oec-extracted.ExchangedTermNetMetering


OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.46s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


Question: What is net metering?
net metering :  Net metering is a system that allows consumers who generate their own electricity from renewable energy sources (such as solar panels) to feed excess electricity back into the grid, effectively crediting them for the excess power they produce. This means that during periods when the consumer's on-site generation exceeds their electrical demand, the utility credit is applied to the customer's next bill in proportion to the amount of electricity fed onto the grid. The goal of net metering is to encourage the use of renewable energy by reducing the financial burden for consumers who want to invest in such systems.
Individual: D:\GitHub\Projetos\Mestrado\EnergyContext\ontology\imports\oec-extracted.ExchangedTermStorage


OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.69s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


Question: What is Storage?
Storage :  Storage in the context of renewable energy refers to technologies or systems that store electrical energy for later use, such as batteries, pumped hydroelectric storage, and thermal energy storage. The goal of storage is to manage the intermittent nature of renewable energy sources like solar and wind by storing excess energy produced during periods of low demand or high generation, and releasing it during periods of high demand or low generation. This helps ensure a stable electricity supply and improves the overall efficiency and reliability of the grid.


IndexError: list index out of range