### 4. KG Building Notebook

##### In this notebook, we:
1. call the graph object.
2. define two namespaces and bind them to the graph:
    * RBL = Namespace("http://example.org/rebel/#")
    * WIKI = Namespace("https://www.wikidata.org/wiki/")
3. make entity triples and add them to the graph: 
    * (ent, RDF.type, RDFS.Resource)
    * (ent, RDF.type, RBL.Entity)
    * (ent, RDFS.label, Literal(key))
    * (ent, WIKI.id, Literal(value))
    * (ent, OWL.sameAs, wiki_ent)
4. make relation triples and add them to the graph:
    * (rel, RDF.type, RDF.Property)
    * (rel, RDF.type, RBL.Relation)
    * (rel, WIKI.type, WIKI.Property)
    * (rel, RDFS.label, Literal(key))
    * (rel, WIKI.id, Literal(value))
    * (rel, OWL.sameAs, wiki_rel)
5. add more semantics:
    * (RDF.type, OWL.equivalentProperty, WIKI.P31)
    * (RDFS.subClassOf, OWL.equivalentProperty, WIKI.P279)
    * (RDFS.subPropertyOf, OWL.equivalentProperty, WIKI.P1647)
    * (RDFS.member, OWL.equivalentProperty, WIKI.P361)
6. add the triples, both REBEL and Wikidata, to the graph
    * (URIRef(WIKI + triple[0]), URIRef(WIKI + triple[1]), URIRef(WIKI + triple[2]))
    * (URIRef(RBL + subj), URIRef(RBL + pred), URIRef(RBL + obj))
7. serialize the grap in ttl format and save it.

In [1]:
# we start with imports
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDFS, RDF, OWL
import json
import string

In [2]:
# Let's define some namespaces
RBL = Namespace("http://example.org/rebel/#")
WIKI = Namespace("https://www.wikidata.org/wiki/")

g = Graph() # create a graph object
g.bind("rebel", RBL,  override=True) # bind the rebel namespace to the graph
g.bind("wiki", WIKI, override=True) # bind the wiki namespace to the graph

In [3]:
# Let's rememeber how the library works and decide how to put into the correct format
ent = "The Beatles"
ent = ent.lower().replace(' ', '_')
ent = URIRef(RBL + ent)
print(type(ent))
print(ent)


<class 'rdflib.term.URIRef'>
http://example.org/rebel/#the_beatles


In [4]:
# let's shape the relation names
rel = 'instance of'.lower().split()
if len(rel) > 1:
    words = []
    for index, word in enumerate(rel):
        if (index % 2) == 0:
            word = word.lower()   
        else:
            word = word.capitalize()
                    
        words.append(word)
    rel = ''.join(words)
rel =  URIRef(RBL + rel)
print(rel)

http://example.org/rebel/#instanceOf


In [5]:
#First we get the entities, relations and the wikidata ids from the dictionaries.
#Then we create the triples and add them to the graph.

with open('data/rebel_wiki_entities_dict.json', 'r', encoding='utf-8') as f:
    entities = json.load(f)
print(len(entities))
#print(entities)

with open('data/rebel_wiki_relations_dict.json', 'r', encoding='utf-8') as f:
    relations = json.load(f)
print(len(relations))
#print(relations)


3738
157


In [6]:
# we create the entity triples: type: rebel:Entity, label: entity name, id: wikidata id, sameAs: wikidata uri
for key, value in entities.items():
    ent = key.translate(str.maketrans('', '', string.punctuation))
    ent = ent.lower().replace(' ', '_')
    ent = URIRef(RBL + ent)
    wiki_ent = URIRef(WIKI + value)
    #print(ent)
    g.add((ent, RDF.type, RDFS.Resource))
    g.add((ent, RDF.type, RBL.Entity))
    g.add((ent, RDFS.label, Literal(key)))
    g.add((ent, WIKI.id, Literal(value)))
    g.add((ent, OWL.sameAs, wiki_ent))

In [7]:
# we create the relation triples: type: rebel:Relation, label: relation name, id: wikidata id, sameAs: wikidata uri
for key, value in relations.items():
    #print(key)
    rel = key.translate(str.maketrans('', '', string.punctuation))
    rel = rel.lower().split()
    #print(rel)
    if len(rel) > 1:
        words = []
        for index, word in enumerate(rel):
            #print(index, word)
            if (index % 2) == 0:
                word = word.lower()   
            elif (index % 2) == 1:
                word = word.capitalize()         
            words.append(word)
        rel = ''.join(words)
    else:
        rel = rel[0]
    #print(rel)
    rel = URIRef(RBL + rel)
    wiki_rel = URIRef(WIKI + value)
    #print(rel)
    g.add((rel, RDF.type, RDF.Property))
    g.add((rel, RDF.type, RBL.Relation))
    g.add((rel, WIKI.type, WIKI.Property))
    g.add((rel, RDFS.label, Literal(key)))
    g.add((rel, WIKI.id, Literal(value)))
    g.add((rel, OWL.sameAs, wiki_rel))

In [8]:
#Let's add more semantics for the reasoners: e.g. equivalent properties

g.add((RDF.type, OWL.equivalentProperty, WIKI.P31))
g.add((RDFS.subClassOf, OWL.equivalentProperty, WIKI.P279))
g.add((RDFS.subPropertyOf, OWL.equivalentProperty, WIKI.P1647))
g.add((RDFS.member, OWL.equivalentProperty, WIKI.P361))

<Graph identifier=N761d051a453c467e8a917e3a8da23206 (<class 'rdflib.graph.Graph'>)>

In [9]:
# Let's add the triples to the graph
data = []
with open('data/preprocessed_data_with_REBEL_extracted_triples_plus_wikidata.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line.rstrip('None\n')))
print(len(data))


917


In [10]:
# We start with wikidata triples
for line in data:
    wiki_triples = line["Wikidata Triples"]
    for triple in wiki_triples:
        new_triples = (URIRef(WIKI + triple[0]), URIRef(WIKI + triple[1]), URIRef(WIKI + triple[2]))
        g.add(new_triples)

In [11]:
# Let's add the REBEL triples

for line in data:
    rebel_triples = line["Extracted Triples"]
    for triple in rebel_triples:
        subj = triple[0].translate(str.maketrans('', '', string.punctuation))
        subj = subj.lower().replace(' ', '_')
        pred = triple[1].translate(str.maketrans('', '', string.punctuation)).split()
        if len(pred) > 1:
            words = []
            for index, word in enumerate(pred):
                #print(index, word)
                if (index % 2) == 0:
                    word = word.lower()   
                elif (index % 2) == 1:
                    word = word.capitalize()         
                words.append(word)
            pred = ''.join(words)
        else:
            pred = pred[0]
        #print(pred)
        obj = triple[2].translate(str.maketrans('', '', string.punctuation))
        obj = obj.lower().replace(' ', '_')
        new_triples = (URIRef(RBL + subj), URIRef(RBL + pred), URIRef(RBL + obj))
        #print(new_triples)
        g.add(new_triples)

In [12]:
# let's have a look at the graph
print(g.serialize())

@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix rebel: <http://example.org/rebel/#> .
@prefix wiki: <https://www.wikidata.org/wiki/> .

rebel:100_meter_sprint_final a rebel:Entity,
        rdfs:Resource ;
    rdfs:label "100 meter sprint final" ;
    rebel:partOf rebel:1960_summer_olympics ;
    rebel:participant rebel:armin_hary ;
    rebel:pointIntime rebel:1960 ;
    rebel:winner rebel:armin_hary ;
    owl:sameAs wiki:no-wikiID ;
    wiki:id "no-wikiID" .

rebel:12_preexisting_formal_palestinian_refugee_camps a rebel:Entity,
        rdfs:Resource ;
    rdfs:label "12 pre-existing formal Palestinian refugee camps" ;
    rebel:country rebel:lebanon ;
    owl:sameAs wiki:no-wikiID ;
    wiki:id "no-wikiID" .

rebel:12_preexisting_formal_palestinian_refugee_camps_in_lebanon a rebel:Entity,
        rdfs:Resource ;
    rdfs:label "12 pre-existing formal Palestinian 

In [15]:
# let's save the graph
g.serialize(destination="KGs/extracted_graph_for_enexa_hackathon_LLM_REBEL.ttl", format="turtle")

<Graph identifier=N761d051a453c467e8a917e3a8da23206 (<class 'rdflib.graph.Graph'>)>