### 6. KG Building Notebook

##### In this notebook, we:
1. create the graph object.
2. define two namespaces and bind them to the graph:
    * KGL = Namespace("http://example.org/ibm-KnowGL/#")
    * WIKI = Namespace("https://www.wikidata.org/wiki/")
3. make entity triples and add them to the graph: 
    * (ent, RDF.type, RDFS.Resource)
    * (ent, RDF.type, KGL.Entity)
    * (ent, RDFS.label, Literal(key))
    * (ent, WIKI.id, Literal(value))
    * (ent, OWL.sameAs, wiki_ent)
4. make relation triples and add them to the graph:
    * (rel, RDF.type, RDF.Property)
    * (rel, RDF.type, KGL.Relation)
    * (rel, WIKI.type, WIKI.Property)
    * (rel, RDFS.label, Literal(key))
    * (rel, WIKI.id, Literal(value))
    * (rel, OWL.sameAs, wiki_rel)
5. make type triples and add them to the graph:
    * (typ, OWL.sameAs, wiki_type)
    * (typ, RDF.type, KGL.Type)
    * (typ, RDF.type, RDFS.Class)
    * (typ, WIKI.id, Literal(value))
    * (typ, RDFS.label, Literal(key))
6. add more semantics:
    * (RDF.type, OWL.equivalentProperty, WIKI.P31)
    * (RDFS.subClassOf, OWL.equivalentProperty, WIKI.P279)
    * (RDFS.subPropertyOf, OWL.equivalentProperty, WIKI.P1647)
    * (RDFS.member, OWL.equivalentProperty, WIKI.P361)
7. add the triples, both KnowGL and Wikidata, to the graph:
    * (URIRef(KGL + subj), URIRef(KGL + pred), URIRef(KGL + obj))
    * (URIRef(WIKI + triple[0]), URIRef(WIKI + triple[1]), URIRef(WIKI + triple[2]))
8. add the entity triples to the graph:
    * (URIRef(KGL + sbj), RDF.type, URIRef(KGL + new_obj))
    * (URIRef(KGL + sbj), RDFS.label, Literal(obj))
9. serialize the graph in Turtle (`.ttl`) format and save it.

In [1]:
# we start with imports
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDFS, RDF, OWL, XSD
import json
import string

In [2]:
# Namespaces used throughout the notebook: one for the KnowGL-derived terms
# and one for the Wikidata identifiers they are linked to.
KGL = Namespace("http://example.org/ibm-KnowGL/#")
WIKI = Namespace("https://www.wikidata.org/wiki/")

# Start from an empty graph and register both prefixes on it, so that the
# serialized output uses the short `knowGL:` / `wiki:` forms.
g = Graph()
for prefix, namespace in (("knowGL", KGL), ("wiki", WIKI)):
    g.bind(prefix, namespace, override=True)

In [3]:
# Quick reminder of how rdflib builds a URI, and of the entity naming
# convention used below: lower-case the label, replace spaces with
# underscores, then append the result to the KnowGL namespace.
ent = URIRef(KGL + "The Beatles".lower().replace(' ', '_'))
print(type(ent))
print(ent)


<class 'rdflib.term.URIRef'>
http://example.org/ibm-KnowGL/#the_beatles


In [4]:
# Relation naming convention: split the label into words and glue them back
# together with the case alternating by position -- words at even positions
# are lower-cased, words at odd positions are capitalized
# ('instance of' -> 'instanceOf').
rel = 'instance of'.lower().split()
if len(rel) > 1:
    rel = ''.join(
        word.capitalize() if position % 2 else word.lower()
        for position, word in enumerate(rel)
    )
rel = URIRef(KGL + rel)
print(rel)

http://example.org/ibm-KnowGL/#instanceOf


In [5]:
# Load the three lookup dictionaries (entities, relations, types) from disk
# and report how many entries each one has. The later cells read each
# dictionary as a mapping from a label to a Wikidata ID, with "no-wikiID"
# marking entries that have none.

def _load_json(path):
    """Read the UTF-8 encoded JSON file at `path` and return its content."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


entities = _load_json('data/knowGL_wiki_entities_dict.json')
print(f'Number of entities: {len(entities)}')

relations = _load_json('data/knowGL_wiki_relations_dict.json')
print(f'Number of relations: {len(relations)}')

types = _load_json('data/knowGL_wiki_types_dict.json')
print(f'Number of types: {len(types)}')


Number of entities: 3109
Number of relations: 193
Number of types: 1092


In [6]:

# Describe every entity that has a Wikidata ID. Each entity gets two rdf:type
# statements, its original label, its Wikidata ID as a plain literal, and an
# owl:sameAs link to the matching URI in the wiki namespace.
punctuation_remover = str.maketrans('', '', string.punctuation)

for label, wikidata_id in entities.items():
    if wikidata_id == "no-wikiID":
        continue  # entities without a Wikidata match are not described

    # Entity naming convention: no punctuation, lower case, underscores for spaces.
    slug = label.translate(punctuation_remover).strip().lower().replace(' ', '_')
    entity_uri = URIRef(KGL + slug)

    for predicate, obj in (
        (RDF.type, RDFS.Resource),
        (RDF.type, KGL.Entity),
        (RDFS.label, Literal(label)),
        (WIKI.id, Literal(wikidata_id)),
        (OWL.sameAs, URIRef(WIKI + wikidata_id)),
    ):
        g.add((entity_uri, predicate, obj))

In [7]:
# Describe every relation that has a Wikidata ID: two rdf:type statements, a
# wiki:type marker, the original label, the Wikidata ID as a plain literal,
# and an owl:sameAs link to the matching URI in the wiki namespace.
punctuation_remover = str.maketrans('', '', string.punctuation)

for label, wikidata_id in relations.items():
    if wikidata_id == "no-wikiID":
        continue  # relations without a Wikidata match are not described

    tokens = label.translate(punctuation_remover).strip().lower().split()
    if len(tokens) > 1:
        # Case alternates by position: even positions lower-cased, odd
        # positions capitalized. For labels of three or more words this is
        # not strict camelCase (e.g. 'a b c' -> 'aBc'); the same rule has to
        # be applied wherever these labels are turned into URIs.
        local_name = ''.join(
            token.capitalize() if position % 2 else token.lower()
            for position, token in enumerate(tokens)
        )
    else:
        local_name = tokens[0]

    relation_uri = URIRef(KGL + local_name)

    for predicate, obj in (
        (RDF.type, RDF.Property),
        (RDF.type, KGL.Relation),
        (WIKI.type, WIKI.Property),
        (RDFS.label, Literal(label)),
        (WIKI.id, Literal(wikidata_id)),
        (OWL.sameAs, URIRef(WIKI + wikidata_id)),
    ):
        g.add((relation_uri, predicate, obj))

In [8]:
# Describe every type that has both a non-empty label and a Wikidata ID:
# owl:sameAs link, two rdf:type statements, the Wikidata ID as a plain
# literal, and the original label.
for label, wikidata_id in types.items():
    if wikidata_id == "no-wikiID" or label == '':
        continue

    # Type naming convention: capitalize each word and join them without a
    # separator. Words that are already fully upper-case (e.g. 'ATP') are
    # kept as they are instead of being turned into 'Atp'.
    class_name = ''.join(
        word.upper() if word.isupper() else word.capitalize()
        for word in label.strip().split()
    )
    type_uri = URIRef(KGL + class_name)

    for predicate, obj in (
        (OWL.sameAs, URIRef(WIKI + wikidata_id)),
        (RDF.type, KGL.Type),
        (RDF.type, RDFS.Class),
        (WIKI.id, Literal(wikidata_id)),
        (RDFS.label, Literal(label)),
    ):
        g.add((type_uri, predicate, obj))
    

In [9]:
# Extra semantics for reasoners: declare which RDF/RDFS properties are
# equivalent to which Wikidata properties.
equivalent_properties = (
    (RDF.type, WIKI.P31),
    (RDFS.subClassOf, WIKI.P279),
    (RDFS.subPropertyOf, WIKI.P1647),
    (RDFS.member, WIKI.P361),
)

# Adding the pairs in a loop keeps the mapping in one table and avoids
# ending the cell on a bare `g.add(...)` expression, which would echo the
# Graph object's repr as cell output.
for rdf_property, wikidata_property in equivalent_properties:
    g.add((rdf_property, OWL.equivalentProperty, wikidata_property))

<Graph identifier=N7f6396057093481da373edbc237d797b (<class 'rdflib.graph.Graph'>)>

In [10]:
# Load the pre-processed records (one JSON document per line) whose triples
# are added to the graph in the following cells.
data = []
with open('data/preprocessed_data_with_KnowGL_extracted_triples_plus_wikidata.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        record = line.strip()
        # The earlier version called rstrip('None\n'), which suggests lines
        # may end with a stray "None" token -- TODO confirm against the file.
        # str.rstrip treats its argument as a *set of characters*, not as a
        # suffix, so the suffix is removed explicitly here instead.
        if record.endswith('None'):
            record = record[:-len('None')].rstrip()
        if not record:
            continue  # blank lines carry no record and would break json.loads
        data.append(json.loads(record))
print(len(data))


917


In [11]:
# First the Wikidata triples: each of the three components of a triple is
# appended to the wiki namespace and the resulting URI triple is added to
# the graph.
for record in data:
    for triple in record["Wikidata Triples"]:
        subject, predicate, obj = (URIRef(WIKI + triple[position]) for position in range(3))
        g.add((subject, predicate, obj))

In [12]:
# Add the triples that KnowGL extracted, using the same naming conventions as
# the entity and relation descriptions, so that both refer to the same URIs.
punctuation_remover = str.maketrans('', '', string.punctuation)

for line in data:
    for triple in line["Extracted Triples"]:
        # Subject and object follow the entity convention:
        # no punctuation, lower case, underscores for spaces.
        subj = triple[0].translate(punctuation_remover).strip().lower().replace(' ', '_')
        obj = triple[2].translate(punctuation_remover).strip().lower().replace(' ', '_')

        # Lower-case before splitting, exactly as the relation descriptions
        # do. Without this, a single-word predicate containing capitals
        # (e.g. 'Creator') would get a different URI than its description.
        pred_words = triple[1].translate(punctuation_remover).strip().lower().split()
        if not pred_words:
            # Nothing is left of the predicate after cleaning; skip the triple
            # instead of failing with an IndexError in the middle of the loop.
            continue

        if len(pred_words) > 1:
            # Case alternates by position: even -> lower case, odd -> capitalized.
            pred = ''.join(
                word.capitalize() if index % 2 else word.lower()
                for index, word in enumerate(pred_words)
            )
        else:
            pred = pred_words[0]

        g.add((URIRef(KGL + subj), URIRef(KGL + pred), URIRef(KGL + obj)))

In [13]:
# Add the per-entity 'type' and 'label' statements that come with each record.
punctuation_remover = str.maketrans('', '', string.punctuation)

for line in data:
    for triple in line['Entity Triples']:
        # Subject follows the entity convention:
        # no punctuation, lower case, underscores for spaces.
        sbj = triple[0].translate(punctuation_remover).strip().lower().replace(' ', '_')
        obj = triple[2].translate(punctuation_remover).strip()

        if triple[1] == 'type':
            # Type naming convention: capitalize each word and join them;
            # words that are already fully upper-case are kept as they are.
            new_obj = ''.join(
                word.upper() if word.isupper() else word.capitalize()
                for word in obj.split()
            )
            new_triple = (URIRef(KGL + sbj), RDF.type, URIRef(KGL + new_obj))
        elif triple[1] == 'label':
            new_triple = (URIRef(KGL + sbj), RDFS.label, Literal(obj))
        else:
            # Any other predicate is not handled by this cell. Skipping it
            # avoids re-adding the triple built in the previous iteration
            # (or a NameError if this is the very first triple).
            continue

        g.add(new_triple)
   

Let's implant a few data type inconsistencies:

In [14]:
# Build deliberately inconsistent wiki:id triples: for every entity with a
# Wikidata ID, the 'Q' characters are stripped from both ends of the ID and
# the remainder is typed as xsd:integer, unlike the plain string literal
# used for wiki:id in the entity descriptions.
punctuation_remover = str.maketrans('', '', string.punctuation)

triples_with_corrupted_wikidata_ids = [
    (
        URIRef(KGL + label.translate(punctuation_remover).strip().lower().replace(' ', '_')),
        WIKI.id,
        Literal(wikidata_id.strip('Q'), datatype=XSD.integer),
    )
    for label, wikidata_id in entities.items()
    if wikidata_id != "no-wikiID"
]



In [15]:
# How many corrupted triples were generated, followed by the first five.
print(len(triples_with_corrupted_wikidata_ids))

preview = triples_with_corrupted_wikidata_ids[:5]
for corrupted_triple in preview:
    print(corrupted_triple)
        

2472
(rdflib.term.URIRef('http://example.org/ibm-KnowGL/#dax'), rdflib.term.URIRef('https://www.wikidata.org/wiki/id'), rdflib.term.Literal('188878'))
(rdflib.term.URIRef('http://example.org/ibm-KnowGL/#eu_member_state'), rdflib.term.URIRef('https://www.wikidata.org/wiki/id'), rdflib.term.Literal('44318753'))
(rdflib.term.URIRef('http://example.org/ibm-KnowGL/#ecuador'), rdflib.term.URIRef('https://www.wikidata.org/wiki/id'), rdflib.term.Literal('736'))
(rdflib.term.URIRef('http://example.org/ibm-KnowGL/#engelhard_corporation'), rdflib.term.URIRef('https://www.wikidata.org/wiki/id'), rdflib.term.Literal('1342078'))
(rdflib.term.URIRef('http://example.org/ibm-KnowGL/#euro_stoxx_50'), rdflib.term.URIRef('https://www.wikidata.org/wiki/id'), rdflib.term.Literal('981010'))


In [16]:
# Inject only a limited number of the corrupted wiki:id triples into the
# graph; the rest of the list is left unused.
number_of_corrupted_triples = 100
for corrupted_triple in triples_with_corrupted_wikidata_ids[:number_of_corrupted_triples]:
    g.add(corrupted_triple)

In [17]:
# Inspect the result: serialize the whole graph to text and print it.
serialized_graph = g.serialize()
print(serialized_graph)

@prefix knowGL: <http://example.org/ibm-KnowGL/#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix wiki: <https://www.wikidata.org/wiki/> .

knowGL:ATPTennisTournament a knowGL:Type,
        rdfs:Class ;
    rdfs:label "ATP tennis tournament" ;
    owl:sameAs wiki:Q26863790 ;
    wiki:id "Q26863790" .

knowGL:Absorption a knowGL:Type,
        rdfs:Class ;
    rdfs:label "absorption" ;
    owl:sameAs wiki:Q332828 ;
    wiki:id "Q332828" .

knowGL:AcademicConference a knowGL:Type,
        rdfs:Class ;
    rdfs:label "academic conference" ;
    owl:sameAs wiki:Q2020153 ;
    wiki:id "Q2020153" .

knowGL:AcademicDepartment a knowGL:Type,
        rdfs:Class ;
    rdfs:label "academic department" ;
    owl:sameAs wiki:Q2467461 ;
    wiki:id "Q2467461" .

knowGL:AcademicDiscipline a knowGL:Type,
        rdfs:Class ;
    rdfs:label "academic discipline" ;
    owl:sameAs

In [18]:
# Persist the finished graph on disk in Turtle format.
output_path = "KGs/extracted_graph_for_enexa_hackathon_LLM_KnowGL.ttl"
g.serialize(destination=output_path, format="turtle")

<Graph identifier=N7f6396057093481da373edbc237d797b (<class 'rdflib.graph.Graph'>)>