In [1]:
import os
from langchain_community.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_huggingface import llms
from langchain_huggingface.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForQuestionAnswering,RobertaForQuestionAnswering, pipeline, AutoModel
from owlready2 import get_ontology
from owlready2 import sync_reasoner_hermit
import traceback
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize

In [2]:
# Build the retrieval-QA chain: load the corpus, index it in FAISS, wrap a
# Hugging Face question-answering pipeline as the LLM, and combine both into
# `qa`. Each model artifact is loaded exactly once.

# --- Corpus -----------------------------------------------------------------
# The `context` column of the dataset becomes the page content of each document.
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)
data = loader.load()

# Split documents into overlapping chunks before embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = text_splitter.split_documents(data)

# --- Embeddings and vector index ---------------------------------------------
model_path = "sentence-transformers/all-MiniLM-l6-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name=model_path,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

db = FAISS.from_documents(docs, embeddings)

# --- Reader model -------------------------------------------------------------
# The tokenizer comes from `name`, the QA model weights from `model_name`.
# NOTE(review): `FacebookAI/roberta-base` is the base checkpoint, not one
# fine-tuned for question answering; presumably the QA head is freshly
# initialised here, so answers may be unreliable. Confirm, or switch to a
# QA-fine-tuned checkpoint.
model_name = "FacebookAI/roberta-base"
name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(name)
model = RobertaForQuestionAnswering.from_pretrained(model_name)

question_answerer = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    return_tensors='pt',
)

# NOTE(review): verify that HuggingFacePipeline accepts a "question-answering"
# pipeline; its documentation lists text-generation-style tasks only.
llm = HuggingFacePipeline(
    pipeline=question_answerer,
    model_kwargs={"temperature": 0.7},
)

# --- Retrieval chain ----------------------------------------------------------
# Retrieve the 8 nearest chunks per query and combine them with the "refine"
# chain type; source documents are not returned.
retriever = db.as_retriever(search_kwargs={"k": 8})

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="refine", retriever=retriever, return_source_documents=False)



In [None]:
import os
from owlready2 import get_ontology
import pandas as pd
from langchain.globals import set_verbose, set_debug
set_debug(True)
set_verbose(True)

# Purpose: for every ontology individual whose `termSentBy` includes
# `actorCoal`, build a "footprint" question from its `termLexiconString`,
# run it through the `qa` chain, and save the question/answer pairs to
# output.csv.

# Define file path
directory = 'D:\\GitHub\\Projetos\\Mestrado\\RAG\\ontology\\imports'
filename = 'oec-extracted.owl'
file_path = os.path.join(directory, filename)

print("Loading ontology...")
# Load the ontology
onto = get_ontology("file://" + file_path).load()

# Define the IRIs for the properties and the individual
term_sent_by_property_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#termSentBy"
actor_coal_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#actorCoal"
actor_has_context_property_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#actorHasContext"

# Search for the properties and the individual in the ontology
term_sent_by_property = onto.search_one(iri=term_sent_by_property_iri)
actor_coal = onto.search_one(iri=actor_coal_iri)
actor_has_context_property = onto.search_one(iri=actor_has_context_property_iri)

# Proceed only when all three entities were found.
if term_sent_by_property and actor_coal and actor_has_context_property:
    # Individuals that carry 'termSentBy' and list 'actorCoal' among its values.
    individuals_with_term_sent_by_actor_coal = [
        individual
        for individual in onto.individuals()
        if term_sent_by_property in individual.get_properties()
        and actor_coal in getattr(individual, term_sent_by_property.python_name)
    ]

    # Question/answer rows are collected in their own list. They must not be
    # appended to `data`, which holds the dataset documents loaded earlier.
    qa_records = []

    # Print the individuals with 'termSentBy' set to 'actorCoal'
    print("Individuals where 'termSentBy' is 'actorCoal':")
    for individual in individuals_with_term_sent_by_actor_coal:
        print(f"Individual: {individual}")

        # Check if the individual has the 'termLexiconString' data property
        if hasattr(individual, "termLexiconString"):
            # Retrieve the value of 'termLexiconString'
            term_lexicon_string_value = getattr(individual, "termLexiconString")

            # A list of values is flattened to one space-separated string;
            # str() keeps the join safe for non-string literals.
            if isinstance(term_lexicon_string_value, list):
                term_lexicon_string_value = " ".join(map(str, term_lexicon_string_value))

            # Skip individuals whose value is empty
            if term_lexicon_string_value:
                question = f'What is {term_lexicon_string_value} footprint'
                print(f"Generated question: {question}")

                # Ask the retrieval-QA chain and record the pair
                result = qa.run({"query": question})
                qa_records.append({'question': question, 'answer': result})
                print(f"Appended to data list: {question}")
            else:
                print("  termLexiconString is empty")
        else:
            print("  termLexiconString: Not found")

    # Build the DataFrame once from the collected rows; the explicit columns
    # keep the CSV header stable even when no question was generated.
    df = pd.DataFrame(qa_records, columns=['question', 'answer'])

    # Save results to CSV
    print("Saving results to CSV...")
    df.to_csv('output.csv', index=False)
    print("CSV file has been saved successfully.")

else:
    print("One or more of the properties or the individual could not be found in the ontology.")


In [None]:
# Purpose: ask the `qa` chain a "footprint" question for every ontology
# individual whose `termSentBy` includes `actorCoal`.

# Define the file path. A raw string keeps the Windows backslashes literal
# without relying on invalid escape sequences such as "\G" or "\P".
directory = r'D:\GitHub\Projetos\Mestrado\RAG\ontology\imports'
filename = 'oec-extracted.owl'
file_path = os.path.join(directory, filename)
# Load the ontology
onto = get_ontology("file://" + file_path).load()

# Define the IRIs for the properties and the individual
term_sent_by_property_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#termSentBy"
actor_coal_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#actorCoal"
actor_has_context_property_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#actorHasContext"

# Search for the properties and the individual in the ontology
term_sent_by_property = onto.search_one(iri=term_sent_by_property_iri)
actor_coal = onto.search_one(iri=actor_coal_iri)
actor_has_context_property = onto.search_one(iri=actor_has_context_property_iri)

# Proceed only when all three entities were found.
if term_sent_by_property and actor_coal and actor_has_context_property:
    # Individuals that carry 'termSentBy' and list 'actorCoal' among its values.
    individuals_with_term_sent_by_actor_coal = [
        individual
        for individual in onto.individuals()
        if term_sent_by_property in individual.get_properties()
        and actor_coal in getattr(individual, term_sent_by_property.python_name)
    ]

    # Print the individuals with 'termSentBy' set to 'actorCoal'
    print("Individuals where 'termSentBy' is 'actorCoal':")
    for individual in individuals_with_term_sent_by_actor_coal:
        print(f"Individual: {individual}")

        # Check if the individual has the 'termLexiconString' data property
        if hasattr(individual, "termLexiconString"):
            # Retrieve the value of 'termLexiconString'
            term_lexicon_string_value = getattr(individual, "termLexiconString")
            # Skip individuals whose value is empty
            if term_lexicon_string_value:
                print(f"  termLexiconString: {term_lexicon_string_value}")
                # A list of values is flattened to plain text so the question
                # does not contain a Python list repr such as "['coal']".
                if isinstance(term_lexicon_string_value, list):
                    term_text = " ".join(map(str, term_lexicon_string_value))
                else:
                    term_text = term_lexicon_string_value
                question = f'What is {term_text} footprint'
                # The answer is kept in `result`; this cell does not print it.
                result = qa.run({"query": question})
        else:
            print("  termLexiconString: Not found")


In [None]:
# Purpose: ask the `qa` chain a solar-energy-context question for every
# ontology individual whose `termSentBy` includes `actorCoal`, and print
# each answer.

# Define the file path. A raw string keeps the Windows backslashes literal
# without relying on invalid escape sequences such as "\G" or "\P".
directory = r'D:\GitHub\Projetos\Mestrado\RAG\ontology\imports'
filename = 'oec-extracted.owl'
file_path = os.path.join(directory, filename)
# Load the ontology
onto = get_ontology("file://" + file_path).load()

# Define the IRIs for the properties and the individual
term_sent_by_property_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#termSentBy"
actor_coal_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#actorCoal"
actor_has_context_property_iri = "http://www.semanticweb.org/matheus/ontologies/2023/10/oec-extracted#actorHasContext"

# Search for the properties and the individual in the ontology
term_sent_by_property = onto.search_one(iri=term_sent_by_property_iri)
actor_coal = onto.search_one(iri=actor_coal_iri)
actor_has_context_property = onto.search_one(iri=actor_has_context_property_iri)

# Proceed only when all three entities were found.
if term_sent_by_property and actor_coal and actor_has_context_property:
    # Individuals that carry 'termSentBy' and list 'actorCoal' among its values.
    individuals_with_term_sent_by_actor_coal = [
        individual
        for individual in onto.individuals()
        if term_sent_by_property in individual.get_properties()
        and actor_coal in getattr(individual, term_sent_by_property.python_name)
    ]

    # Print the individuals with 'termSentBy' set to 'actorCoal'
    print("Individuals where 'termSentBy' is 'actorCoal':")
    for individual in individuals_with_term_sent_by_actor_coal:
        print(f"Individual: {individual}")

        # Check if the individual has the 'termLexiconString' data property
        if hasattr(individual, "termLexiconString"):
            # Retrieve the value of 'termLexiconString'
            term_lexicon_string_value = getattr(individual, "termLexiconString")
            # Skip individuals whose value is empty
            if term_lexicon_string_value:
                print(f"  termLexiconString: {term_lexicon_string_value}")
                # A list of values is flattened to plain text so the question
                # does not contain a Python list repr such as "['coal']".
                if isinstance(term_lexicon_string_value, list):
                    term_text = " ".join(map(str, term_lexicon_string_value))
                else:
                    term_text = term_lexicon_string_value
                question = f'What is {term_text} in the context of solar energy'
                result = qa.run({"query": question})
                print(result)
        else:
            print("  termLexiconString: Not found")
