In [127]:
import pandas as pd
from rdflib import Graph, Namespace, RDF, RDFS, XSD, URIRef, Literal
import json
import numpy as np
import ast
import urllib.parse
import os

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Load the TBox (schema) from a Turtle file; the relative path is resolved
# against the kernel's current working directory.
tbox = Graph()
tbox.parse("tbox.ttl", format="turtle")  # or 'xml', 'n3', etc.

# Base namespace from which instance and property URIs are built in later cells.
EX = Namespace("http://example.org/research_paper/")

# Source table for the ABox.
# NOTE(review): the variable name is misspelled ("reseach"), but later cells
# refer to it by this exact name, so it must be renamed everywhere or nowhere.
reseach_paper = pd.read_csv("research_papers.csv")


In [128]:
# Everything declared as an rdfs:Class in the TBox.
classes = set(tbox.subjects(RDF.type, RDFS.Class))

# Every rdf:Property mapped to its (rdfs:domain, rdfs:range) pair; either
# element is None when the TBox does not state it.
properties = {
    prop: (tbox.value(prop, RDFS.domain), tbox.value(prop, RDFS.range))
    for prop in tbox.subjects(RDF.type, RDF.Property)
}

print("Classes in TBox:")
for c in classes:
    print(f"  {c}")

print("Properties in TBox and their domains/ranges:")
for p, (d, r) in properties.items():
    print(f"  {p} domain={d} range={r}")

print("Subclasses in TBox:")
for c in classes:
    subclasses = list(tbox.subjects(predicate=RDFS.subClassOf, object=c))
    # Classes without any declared subclass are not listed.
    if not subclasses:
        continue
    print(f"  {c} subclasses={subclasses}")

Classes in TBox:
  http://example.org/research_paper/Journal
  http://example.org/research_paper/Conference
  http://example.org/research_paper/Edition
  http://example.org/research_paper/Paper
  http://example.org/research_paper/Review
  http://example.org/research_paper/Reviewer
  http://example.org/research_paper/Volume
  http://example.org/research_paper/Topic
  http://example.org/research_paper/Owner
  http://example.org/research_paper/Author
  http://example.org/research_paper/Publication
Properties in TBox and their domains/ranges:
  http://example.org/research_paper/cites domain=http://example.org/research_paper/Paper range=http://example.org/research_paper/Paper
  http://example.org/research_paper/corresponds_to domain=http://example.org/research_paper/Paper range=http://example.org/research_paper/Owner
  http://example.org/research_paper/has_abstract domain=http://example.org/research_paper/Paper range=http://www.w3.org/2001/XMLSchema#string
  http://example.org/research_pape

In [129]:
# Empty graph that will receive the instance data (ABox); the prefixes are
# registered so the Turtle output uses ex:, rdf: and rdfs: instead of full URIs.
abox = Graph()
for prefix, ns in (("ex", EX), ("rdf", RDF), ("rdfs", RDFS)):
    abox.bind(prefix, ns)

In [130]:
def urlib_parse(value):
    """Percent-encode ``value`` so it can serve as the local part of a URI.

    No character is treated as safe (``safe=''``), so ``/`` is escaped too.

    Parameters
    ----------
    value : str, bytes, bytearray or any other non-missing scalar
        The text to encode. Values that are not text (for example an integer
        identifier read from a CSV column) are converted with ``str`` first.

    Returns
    -------
    str
        The percent-encoded text.

    Raises
    ------
    TypeError
        If ``value`` is ``None`` or a float NaN. A missing value must not be
        turned silently into a URI such as ``.../None`` or ``.../nan``.
    """
    if isinstance(value, (str, bytes, bytearray)):
        return urllib.parse.quote(value, safe='')
    # NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        raise TypeError(f"Cannot build a URI fragment from a missing value: {value!r}")
    return urllib.parse.quote(str(value), safe='')

In [131]:
def create_class_instance(graph, namespace, instance_label, class_name, instance_uri_value=None):
    """Add a typed and labelled individual to ``graph``.

    Two triples are added for the individual: an ``rdf:type`` triple pointing
    at ``namespace[class_name]`` and an ``rdfs:label`` triple holding
    ``instance_label`` as an ``xsd:string`` literal.

    The individual's URI is ``namespace`` plus the percent-encoded
    ``instance_uri_value``; when that argument is falsy (``None`` or empty)
    the percent-encoded ``instance_label`` is used instead.
    """
    uri_source = instance_uri_value or instance_label
    instance_uri = URIRef(namespace[urlib_parse(uri_source)])

    type_triple = (instance_uri, RDF.type, namespace[class_name])
    graph.add(type_triple)

    label_triple = (instance_uri, RDFS.label, Literal(instance_label, datatype=XSD.string))
    graph.add(label_triple)

In [132]:
def create_instance_label(graph, namespace, instance_label, class_name, instance_uri_value=None):
    """Attach an ``rdfs:label`` to an individual without asserting its type.

    Only one triple is added: ``<individual> rdfs:label "instance_label"``,
    with the label typed as ``xsd:string``. ``class_name`` is accepted so the
    signature matches ``create_class_instance`` but it is not used here.

    The individual's URI is ``namespace`` plus the percent-encoded
    ``instance_uri_value``; when that argument is falsy (``None`` or empty)
    the percent-encoded ``instance_label`` is used instead.
    """
    uri_source = instance_uri_value or instance_label
    instance_uri = URIRef(namespace[urlib_parse(uri_source)])

    label_triple = (instance_uri, RDFS.label, Literal(instance_label, datatype=XSD.string))
    graph.add(label_triple)

In [133]:
def create_property_instance(graph, namespace, subject_uri_value, property_name, object_value, is_literal=False):
    """Add the triple ``subject property object`` to ``graph``.

    The subject URI is ``namespace`` plus the percent-encoded
    ``subject_uri_value`` and the predicate is ``namespace[property_name]``.
    With ``is_literal=True`` the object is ``object_value`` as an
    ``xsd:string`` literal; otherwise it is a URI built from ``object_value``
    in the same way as the subject.
    """
    subject_uri = URIRef(namespace[urlib_parse(subject_uri_value)])
    predicate = namespace[property_name]

    obj = (
        Literal(object_value, datatype=XSD.string)
        if is_literal
        else URIRef(namespace[urlib_parse(object_value)])
    )
    graph.add((subject_uri, predicate, obj))

In [134]:
# Distinct non-null values of the 'author' column.
owners_set = set(reseach_paper['author'].dropna().unique())

# Union of all names found in the list literals stored in the 'reviewers' column.
reviewers_set = set()
for reviewers_list_str in reseach_paper['reviewers'].dropna():
    try:
        reviewers_list = ast.literal_eval(reviewers_list_str)
    except (ValueError, SyntaxError):
        # Handle cases where ast.literal_eval might fail if the string is not a valid list literal
        print(f"Warning: Could not parse reviewers list: {reviewers_list_str}")
    else:
        # Literals that parse to something other than a list are ignored.
        if isinstance(reviewers_list, list):
            reviewers_set.update(reviewers_list)

In [135]:
# Number of Review instances created so far. The ABox-generation loop derives
# the REVIEW_<n> identifiers from it and advances it after each paper.
global_review_id_counter = 0

In [None]:
def _parse_list_cell(cell):
    """Return the list encoded in a CSV cell, or ``[]`` when the cell is missing.

    Raises ``ValueError``/``SyntaxError`` (from ``ast.literal_eval``) when the
    text is not a valid literal, and ``ValueError`` when it is a valid literal
    of another kind (iterating a plain string would yield single characters).
    """
    if pd.isna(cell):
        return []
    parsed = ast.literal_eval(cell)
    if not isinstance(parsed, (list, tuple)):
        raise ValueError(f"expected a list literal, got {type(parsed).__name__}")
    return list(parsed)


print(f"Initial ABOX size: {len(abox)} triples.")

# Restart the numbering on every run of this cell so that re-executing it
# reproduces the same REVIEW_<n> URIs instead of minting new ones for the
# same reviews.
global_review_id_counter = 0

for _, row in reseach_paper.iterrows():
    # Only rows flagged as published papers are turned into triples.
    if pd.isna(row['published_paper']) or not row['published_paper']:
        continue

    # str() keeps numeric identifiers usable as URI fragments, like every
    # other identifier handled below.
    paper_id = str(row['id'])
    paper_title = row['title']
    # A missing abstract is stored as an empty string, not as the text "nan".
    raw_abstract = row.get('abstract', '')
    abstract = '' if pd.isna(raw_abstract) else str(raw_abstract)

    try:
        authors_list = _parse_list_cell(row['authors'])
        references_list = _parse_list_cell(row['references'])
        topics_list = _parse_list_cell(row['topic'])
        reviewers_list = _parse_list_cell(row['reviewers'])
        reviews_list = _parse_list_cell(row['reviews'])
    except (ValueError, SyntaxError) as e:
        print(f"Error parsing list data for paper {paper_id}: {e}. Skipping this paper.")
        continue

    # Reviewers and reviews are paired by position, so both lists must have
    # the same length. This is checked before any triple is written so that a
    # bad row is skipped as a whole and the remaining papers are still processed.
    if len(reviewers_list) != len(reviews_list):
        print(f"Warning: the length of reviewers and reviews are not equal for paper {paper_id}. Skipping this paper.")
        continue

    paper_owner_name = row['author']

    # A row describes either a journal publication or a conference publication.
    # NOTE: the volume/edition identifiers are percent-encoded here and encoded
    # again inside the create_* helpers; every use goes through the same path,
    # so the resulting URIs agree with each other.
    isJournal = pd.notna(row['journal'])
    if isJournal:
        journal_name = row['journal']
        journal_volume_id = urlib_parse(journal_name) + "_" + "volume_" + str(row['journal_volume'])
        journal_volume_label = str(row['journal_volume'])
        publication_year = str(row['journal_year'])
    else:
        conference_name = row['conference']
        conference_city = row['conference_city']
        publication_year = str(row['conference_year'])
        conference_edition_id = urlib_parse(conference_name) + "_" + "edition_" + str(row['conference_edition'])
        conference_edition_label = str(row['conference_edition'])

    # Define Class instances. Only labels are asserted: create_instance_label,
    # as defined in this notebook, does not use the class name it is given.
    create_instance_label(abox, EX, paper_id, "Paper")
    create_instance_label(abox, EX, paper_owner_name, "Owner")

    for author in authors_list:
        author_name = str(author)
        is_owner = author_name in owners_set
        is_reviewer = author_name in reviewers_set

        if is_owner:
            create_instance_label(abox, EX, author_name, "Owner")
        if is_reviewer:
            create_instance_label(abox, EX, author_name, "Reviewer")
        if not is_owner and not is_reviewer:
            create_instance_label(abox, EX, author_name, "Author")

    for reviewer in reviewers_list:
        create_instance_label(abox, EX, str(reviewer), "Reviewer")

    # One Review instance per review text, numbered globally across papers.
    current_paper_review_instance_ids = []
    for offset, review_text in enumerate(reviews_list, start=1):
        review_instance_id = f'REVIEW_{global_review_id_counter + offset}'
        current_paper_review_instance_ids.append(review_instance_id)
        create_instance_label(abox, EX, review_text, "Review", review_instance_id)

    for topic in topics_list:
        create_instance_label(abox, EX, str(topic), "Topic")

    for cited_paper in references_list:
        create_instance_label(abox, EX, str(cited_paper), "Paper")

    if isJournal:
        create_instance_label(abox, EX, journal_name, "Journal")
        create_instance_label(abox, EX, journal_volume_label, "Volume", journal_volume_id)
    else:
        create_instance_label(abox, EX, conference_name, "Conference")
        create_instance_label(abox, EX, conference_edition_label, "Edition", conference_edition_id)

    # Create Property instances
    create_property_instance(abox, EX, paper_id, "has_title", paper_title, is_literal=True)
    create_property_instance(abox, EX, paper_id, "has_abstract", abstract, is_literal=True)
    create_property_instance(abox, EX, paper_id, "corresponds_to", paper_owner_name)

    for topic in topics_list:
        create_property_instance(abox, EX, paper_id, "related_to", str(topic))

    for cited_paper_id in references_list:
        create_property_instance(abox, EX, paper_id, "cites", str(cited_paper_id))

    for author in authors_list:
        create_property_instance(abox, EX, paper_id, "written_by", str(author))

    # The i-th reviewer wrote the i-th review (lengths were checked above).
    for reviewer, review_instance_id in zip(reviewers_list, current_paper_review_instance_ids):
        reviewer_name = str(reviewer)
        create_property_instance(abox, EX, paper_id, "has_review", review_instance_id)
        create_property_instance(abox, EX, reviewer_name, "note", review_instance_id)

    global_review_id_counter += len(current_paper_review_instance_ids)

    if isJournal:
        create_property_instance(abox, EX, paper_id, "published_in", journal_name)
        create_property_instance(abox, EX, journal_name, "has_volume", journal_volume_id)
        create_property_instance(abox, EX, journal_volume_id, "has_date_volume", publication_year, is_literal=True)
    else:
        create_property_instance(abox, EX, paper_id, "published_in", conference_name)
        create_property_instance(abox, EX, conference_name, "has_edition", conference_edition_id)
        create_property_instance(abox, EX, conference_edition_id, "has_date_edition", publication_year, is_literal=True)
        create_property_instance(abox, EX, conference_edition_id, "has_location", conference_city, is_literal=True)


print(f"Finished ABOX generation. Total triples in abox: {len(abox)}")

Initial ABOX size: 0 triples.
Finished ABOX generation. Total triples in abox: 16312


In [137]:
# Use current working directory instead of script path
# (script_dir is reused by the later cell that writes tbox_abox.ttl).
script_dir = os.getcwd()  # Works in Jupyter
rdf_file_path = os.path.join(script_dir, "abox.ttl")

# Save the graph as Turtle. The reasoning cell re-reads it through the
# relative name 'abox.ttl', which resolves against this same working directory.
abox.serialize(destination=rdf_file_path, format="turtle")

print(f"RDF saved to: {rdf_file_path}")

RDF saved to: c:\Users\usuario\Desktop\FIB\Ciència de Dades\2n Quadrimestre\SDM\Project\Knowledge Graph\SDM_lab2\abox.ttl


In [138]:
# from owlrl import DeductiveClosure, OWLRL_Semantics

# g = Graph()
# g.parse("tbox.ttl", format="turtle")
# g.parse("abox.ttl", format="turtle")

# DeductiveClosure(OWLRL_Semantics).expand(g)
# print(len(g))

In [139]:
from owlrl import RDFSClosure

# Load the schema and the instance data into one graph; 'abox.ttl' is the file
# written by the save cell earlier in this notebook.
g1 = Graph()
g1.parse('tbox.ttl', format="turtle")
g1.parse('abox.ttl', format="turtle")

# RDFS_Semantics(graph, axioms, daxioms, rdfs)
# - graph -> The RDF Graph (TBOX + ABOX)
# - axioms -> Add axiomatic triples (e.g., that rdf:type is a property, or that rdfs:subClassOf has domain rdfs:Class)
# - daxioms -> Add datatype axioms (xsd:string rdf:type rdfs:Datatype)
# - rdfs -> RDFS reasoning rules
rdfs_reasoner = RDFSClosure.RDFS_Semantics(g1, axioms=False, daxioms=False, rdfs=True)
# Run the reasoner on g1; the count printed below therefore covers the
# explicit triples plus the inferred ones.
rdfs_reasoner.closure()
# NOTE(review): presumably pushes any still-buffered inferred triples into g1;
# confirm against the owlrl documentation whether closure() already does this.
rdfs_reasoner.flush_stored_triples()
print(f'Total number of triples (explicit + inference): {len(g1)}')

Total number of triples (explicit + inference): 22795


In [140]:
# diff1 = g  - g1
# for s,p,o in diff1:
#     print(s,p,o)

In [141]:
# Check Inference Constraint: print every triple of the reasoned graph whose
# subject is this individual, so the inferred rdf:type statements can be inspected.
author_uri = URIRef("http://example.org/research_paper/Warren%20J.%20Warwick")

for triple in g1.triples((author_uri, None, None)):
    print(*triple)


http://example.org/research_paper/Warren%20J.%20Warwick http://www.w3.org/2000/01/rdf-schema#label Warren J. Warwick
http://example.org/research_paper/Warren%20J.%20Warwick http://example.org/research_paper/note http://example.org/research_paper/REVIEW_454
http://example.org/research_paper/Warren%20J.%20Warwick http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://example.org/research_paper/Author
http://example.org/research_paper/Warren%20J.%20Warwick http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2000/01/rdf-schema#Resource
http://example.org/research_paper/Warren%20J.%20Warwick http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://example.org/research_paper/Reviewer


In [142]:
# Output file for the combined graph, next to abox.ttl in the working directory.
rdf_file_path = os.path.join(script_dir, "tbox_abox.ttl")

# Save g1, the graph that holds the TBox, the ABox and the RDFS-inferred
# triples. Serialising `abox` here would only write a second copy of abox.ttl.
# NOTE(review): if the file should contain asserted triples only, serialise
# the union of `tbox` and `abox` instead.
g1.serialize(destination=rdf_file_path, format="turtle")

print(f"RDF saved to: {rdf_file_path}")

RDF saved to: c:\Users\usuario\Desktop\FIB\Ciència de Dades\2n Quadrimestre\SDM\Project\Knowledge Graph\SDM_lab2\tbox_abox.ttl
