In [2]:
import os
import pandas as pd
import numpy as np
from owlready2 import get_ontology
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

In [3]:

# Build the extractive question-answering pipeline. The checkpoint name suggests a
# DistilBERT model distilled on SQuAD — confirm against the model card.
question_answerer = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

# Load the Open-Orca/OpenOrca dataset through the `datasets` library.
dataset = load_dataset("Open-Orca/OpenOrca")

# Use the 'response' column of the 'train' split as the corpus of candidate contexts
# and wrap it in a single-column DataFrame so contexts can be looked up by position.
contexts = dataset["train"]["response"]
df = pd.DataFrame({'context': contexts})

# Build the retrieval index: fit a TF-IDF vocabulary (capped at 50,000 features) on
# every context and keep the resulting document-term matrix. No retrieval happens
# here; questions are projected into this space later with tfidf_vectorizer.transform.
# NOTE(review): this fits on the whole 'train' split, which may be slow and memory
# hungry — consider sampling or caching the matrix.
tfidf_vectorizer = TfidfVectorizer(max_features=50000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['context'])

# Keyword-based relevance check applied to retrieved contexts
def is_relevant_context(context, keywords):
    """Tell whether ``context`` mentions at least one of ``keywords``.

    The comparison is a case-insensitive substring test: both the keyword and
    the context are lower-cased before checking containment.

    Args:
        context: Text to inspect.
        keywords: Iterable of keyword strings to look for.

    Returns:
        True as soon as one keyword is found inside ``context``; False when no
        keyword matches (including when ``keywords`` is empty).
    """
    for term in keywords:
        if term.lower() in context.lower():
            return True
    return False

In [None]:

# Build the path to the local OWL file.
# NOTE(review): this is a hard-coded absolute Windows path, so the cell only runs on
# a machine with this exact directory layout — consider a configurable base directory.
directory = 'D:\\GitHub\\Projetos\\Mestrado\\RAG\\ontology\\imports'
filename = 'oec-extracted.owl'
file_path = os.path.join(directory, filename)

# Progress message, printed before the load call below.
print("Loading ontology...")
# Load the ontology from disk; the local path is turned into an IRI by prefixing it
# with "file://". The resulting `onto` object is queried by the cells that follow.
onto = get_ontology("file://" + file_path).load()

Loading ontology...


In [5]:
# Full IRIs of the entities this notebook works with, all under the
# 'oec-extracted' ontology namespace: the 'termSentBy' property, the 'actorCoal'
# individual and the 'actorHasContext' property.
term_sent_by_property_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#termSentBy"
actor_coal_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#actorCoal"
actor_has_context_property_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#actorHasContext"

# Resolve each IRI to its entity in the loaded ontology.
# search_one presumably returns None when nothing matches — verify against the
# owlready2 docs; the results are validated before they are used further down.
term_sent_by_property = onto.search_one(iri=term_sent_by_property_iri)
actor_coal = onto.search_one(iri=actor_coal_iri)
actor_has_context_property = onto.search_one(iri=actor_has_context_property_iri)

In [6]:
# Attach a QA-extracted definition to every term whose 'termSentBy' includes 'actorCoal'.
#
# Relies on names created by earlier cells: onto, term_sent_by_property, actor_coal,
# actor_has_context_property, tfidf_vectorizer, tfidf_matrix, df, question_answerer
# and is_relevant_context.
#
# For each matching individual the cell:
#   1. builds a definition question from its 'termLexiconString',
#   2. retrieves the most similar contexts via TF-IDF cosine similarity,
#   3. keeps those mentioning a domain keyword (best matches first, capped),
#   4. runs the extractive QA model over the concatenated contexts,
#   5. appends the extracted answer to the individual's 'termMeaningString'.

# A failed lookup is expected to yield None, so test for that explicitly instead of
# relying on the truthiness of ontology entities.
# 'actorHasContext' is only validated here; it is not used further in this cell.
required_entities = (term_sent_by_property, actor_coal, actor_has_context_property)
if all(entity is not None for entity in required_entities):
    # Retrieval settings, identical for every term (hoisted out of the loop).
    keywords = ["solar energy", "renewable energy", "photovoltaic"]
    # Minimum cosine similarity for a context to be considered at all (a permissive cut-off).
    threshold = 0.1
    # Upper bound on contexts handed to the QA model. Without a cap, every context above
    # the threshold was concatenated into one input, which makes inference impractically slow.
    max_contexts = 20

    # Individuals whose 'termSentBy' values contain 'actorCoal'
    individuals_with_term_sent_by_actor_coal = [
        individual
        for individual in onto.individuals()
        if term_sent_by_property in individual.get_properties()
        and actor_coal in getattr(individual, term_sent_by_property.python_name)
    ]

    for individual in individuals_with_term_sent_by_actor_coal:
        print(f"Individual: {individual}")

        # The term's surface form is required to phrase the question
        if not hasattr(individual, "termLexiconString"):
            print("  termLexiconString: Not found")
            continue

        term_lexicon_string_value = getattr(individual, "termLexiconString")
        # Multi-valued properties come back as a list; flatten to one string
        if isinstance(term_lexicon_string_value, list):
            term_lexicon_string_value = " ".join(term_lexicon_string_value)

        if not term_lexicon_string_value:
            print("  termLexiconString is empty")
            continue

        # Transform the question into the same vector space as the contexts.
        # ("renewable" was previously misspelled, so the word never matched the TF-IDF
        # vocabulary and the QA prompt itself was garbled.)
        question = f'What is the definition of {term_lexicon_string_value} in the context of renewable energy?'
        question_vector = tfidf_vectorizer.transform([question])

        # Compute cosine similarity between the question and all contexts
        cosine_similarities = cosine_similarity(question_vector, tfidf_matrix).flatten()

        # Keep contexts above the threshold, ordered from most to least similar so that
        # the cap below retains the best matches (stable sort keeps ties in corpus order).
        candidate_indices = np.where(cosine_similarities > threshold)[0]
        ranked_indices = candidate_indices[
            np.argsort(-cosine_similarities[candidate_indices], kind="stable")
        ]
        top_contexts = df.iloc[ranked_indices]['context'].drop_duplicates().tolist()

        # Filter for contexts that mention a domain keyword, then cap their number
        relevant_contexts = [
            context for context in top_contexts if is_relevant_context(context, keywords)
        ][:max_contexts]

        # Nothing to extract an answer from: skip rather than call the QA model
        # with an empty context.
        if not relevant_contexts:
            print("  No relevant context found")
            continue

        # Concatenate the top relevant contexts into a single string
        combined_contexts = " ".join(relevant_contexts)

        # Perform the question answering on the combined contexts
        result = question_answerer(question=question, context=combined_contexts)

        # Extract the answer span from the context using the returned character offsets
        answer_start = result['start']
        answer_end = result['end']
        answer = combined_contexts[answer_start:answer_end].strip()

        if hasattr(individual, "termMeaningString"):
            # Normalise the current value to a list: an unset single-valued property is
            # expected to read as None (must not be stored as a value), a scalar is
            # wrapped, and an existing list is reused as-is.
            term_meaning_string_value = getattr(individual, "termMeaningString")
            if term_meaning_string_value is None:
                term_meaning_string_value = []
            elif not isinstance(term_meaning_string_value, list):
                term_meaning_string_value = [term_meaning_string_value]

            # Append only new, non-empty answers so re-running the cell does not
            # accumulate duplicates.
            if answer and answer not in term_meaning_string_value:
                term_meaning_string_value.append(answer)

            # Set the updated value back to 'termMeaningString'
            setattr(individual, "termMeaningString", term_meaning_string_value)
        else:
            print("  termMeaningString: Not found")

        # Print the answer
        print("Answer:", answer)

else:
    print("One or more of the properties or the individual could not be found in the ontology.")





Individual: D:\GitHub\Projetos\Mestrado\RAG\ontology\imports\oec-extracted.termCarbon


KeyboardInterrupt: 