In [1]:
import os
import pandas as pd
import numpy as np
from owlready2 import get_ontology
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.runnables import RunnablePassthrough
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util


In [2]:
# Location of the extracted OEC ontology on disk.
# Raw string: the Windows path contains backslash sequences such as "\G" and
# "\P" that are invalid escapes in a normal string literal (deprecated, and
# flagged with a warning by recent Python versions).
directory = r'D:\GitHub\Projetos\Mestrado\EnergyContext\ontology\imports'
filename = 'oec-extracted.owl'
file_path = os.path.join(directory, filename)

print("Loading ontology...")
# Load the ontology from the local file
onto = get_ontology("file://" + file_path).load()

# IRIs (unique identifiers) of the two properties and the individual used later
term_sent_by_property_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#termSentBy"
actor_coal_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#actorCoal"
actor_has_context_property_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#actorHasContext"

# Resolve the ontology elements from their IRIs
term_sent_by_property = onto.search_one(iri=term_sent_by_property_iri)
actor_coal = onto.search_one(iri=actor_coal_iri)
actor_has_context_property = onto.search_one(iri=actor_has_context_property_iri)

# Report lookups that found nothing, so a wrong IRI is visible here instead of
# showing up later as an update step that silently does no work.
missing_iris = [
    iri
    for iri, element in (
        (term_sent_by_property_iri, term_sent_by_property),
        (actor_coal_iri, actor_coal),
        (actor_has_context_property_iri, actor_has_context_property),
    )
    if element is None
]
if missing_iris:
    print("Warning: ontology elements not found for IRIs:", ", ".join(missing_iris))


Loading ontology...


In [3]:
# Load the PDF file
local_path = r"D:\GitHub\Projetos\Mestrado\EnergyContext\pdf\appendixa_0.pdf"
if not local_path:
    raise FileNotFoundError("Upload a PDF file")
if not os.path.isfile(local_path):
    # A non-empty path string is always truthy, so the existence of the file
    # itself has to be checked explicitly before handing it to the loader.
    raise FileNotFoundError(f"PDF file not found: {local_path}")

loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

# Split the loaded PDF documents into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
chunks = text_splitter.split_documents(data)

# Create the vector database using the Ollama Embeddings model
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=True),
    collection_name="local-rag"
)


OllamaEmbeddings: 100%|██████████| 3/3 [00:11<00:00,  3.79s/it]


In [4]:
# Prompt used by MultiQueryRetriever to rewrite the user question into several
# alternative search queries. The retriever treats each line of the LLM output
# as one query, so the prompt must ask for question variants (one per line),
# not for an answer to the question.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant with expertise in electrical engineering.
Your task is to generate five different versions of the given user question to retrieve
relevant documents from a vector database. By generating multiple perspectives on the
user question, your goal is to help the user overcome some of the limitations of
distance-based similarity search. Provide these alternative questions separated by newlines.
Original question: {question}""",
)

# Local chat model served by Ollama; it is used here for query generation
local_model = "mistral"
llm = ChatOllama(model=local_model)

# Retriever that runs every generated query against the vector store
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(),
    llm,
    prompt=QUERY_PROMPT
)


In [5]:
# Answer prompt: the model must rely solely on the supplied context and give
# a single answer to the question.
template = (
    "Answer the question based ONLY on the following context with ONLY one answer:\n"
    "{context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

# RAG pipeline: the incoming question is sent to the retriever to fill
# {context} and passed through unchanged to fill {question}; the rendered
# prompt goes to the language model and its reply is parsed to a plain string.
chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm | StrOutputParser()


In [6]:
# Paths. The enriched ontology is written to `default_save_path` so that the
# source ontology at `file_path` is never overwritten by this cell.
# This cell relies on `onto`, `term_sent_by_property`, `actor_coal`,
# `actor_has_context_property` and `chain` being defined by earlier cells.
default_save_path = r'D:\GitHub\Projetos\Mestrado\EnergyContext\ontology\imports\updated_oec-extracted.owl'
file_path = r'D:\GitHub\Projetos\Mestrado\EnergyContext\ontology\imports\oec-extracted.owl'


def _lexicon_text(raw_value):
    """Return a termLexiconString value as one whitespace-stripped string.

    Args:
        raw_value: None, a single value, or a list/tuple of values.

    Returns:
        "" for None; list/tuple items joined with single spaces; otherwise the
        string form of the value. The result is always stripped.
    """
    if raw_value is None:
        return ""
    if isinstance(raw_value, (list, tuple)):
        return " ".join(str(part) for part in raw_value).strip()
    return str(raw_value).strip()


# List to store individual IRIs to be updated
individuals_to_update = []

# First loop: collect the IRIs of individuals whose termSentBy values include
# actorCoal. The ontology is only read here; value normalisation is done by
# _lexicon_text() when the value is used, instead of writing a wrapped list
# back onto the individual.
if term_sent_by_property and actor_coal and actor_has_context_property:
    for individual in onto.individuals():
        if term_sent_by_property in individual.get_properties():
            term_sent_by_values = getattr(individual, term_sent_by_property.python_name)
            if actor_coal in term_sent_by_values:
                individuals_to_update.append(individual.iri)
else:
    print("Warning: required ontology properties/individual not found; nothing to update.")

# Second loop: ask the RAG chain for the meaning of each term and store it
with onto:
    for individual_iri in individuals_to_update:
        try:
            individual = onto.search_one(iri=individual_iri)
            if not individual:
                print(f"Warning: Individual with IRI {individual_iri} not found.")
                continue  # Skip to the next individual if not found

            if not hasattr(individual, "termLexiconString"):
                print(" termLexiconString: Not found")
                continue

            term_lexicon_string_value = _lexicon_text(individual.termLexiconString)
            if not term_lexicon_string_value:
                print(" termLexiconString is empty after joining")
                continue

            lexicon_question = "What is " + term_lexicon_string_value + "?"
            answer = chain.invoke(lexicon_question)
            print("Question:", lexicon_question)
            print(term_lexicon_string_value, ":", answer)

            # NOTE(review): presumably termMeaningString is a multi-valued data
            # property declared in the ontology; if it is not declared, this
            # fallback only creates a plain attribute - confirm it is persisted.
            if not hasattr(individual, 'termMeaningString'):
                individual.termMeaningString = []
            existing_meanings = getattr(individual, 'termMeaningString')

            # Only add the answer when it is not already recorded
            if answer not in existing_meanings:
                individual.termMeaningString.append(answer)
            else:
                print(f"Answer already exists for {term_lexicon_string_value}. Skipping.")
        except Exception as e:
            # Best effort: report the failure and continue with the next individual
            print(f"Error processing individual {individual_iri}: {e}")


# Save the updated ontology once, after all updates, to the dedicated output file
onto.save(file=default_save_path)


NameError: name 'term_sent_by_property' is not defined