## Parse the .nt Files 

In [1]:
from rdflib import Graph
import pprint

# Load training data
train_graph = Graph()
train_graph.parse("data/fokg-sw-train-2024.nt", format="nt")

#Load test data
test_graph = Graph()
test_graph.parse("data/fokg-sw-test-2024.nt", format="nt")

#Append data
graph = train_graph + test_graph

#Extract triples
triples = []
for subj, pred, obj in graph:
    triples.append((str(subj), str(pred), str(obj)))

print("Sample triples:")
pprint.pprint(triples[:5])

Sample triples:
[('http://dice-research.org/data/fb15k-237.ttl#727',
  'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
  'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement'),
 ('http://dice-research.org/data/fb15k-237.ttl#1221',
  'http://swc2017.aksw.org/hasTruthValue',
  '0.0'),
 ('http://dice-research.org/data/fb15k-237.ttl#1488',
  'http://swc2017.aksw.org/hasTruthValue',
  '0.0'),
 ('http://dice-research.org/data/fb15k-237.ttl#191',
  'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate',
  'http://rdf.freebase.com/ns/music.record_label.artist'),
 ('http://dice-research.org/data/fb15k-237.ttl#1214',
  'http://www.w3.org/1999/02/22-rdf-syntax-ns#object',
  'http://rdf.freebase.com/ns/m.01wbgdv')]


## Encode the Triples

In [2]:
from sklearn.preprocessing import LabelEncoder

#Flatten
entities = [t[0] for t in triples] + [t[2] for t in triples]
predicates = [t[1] for t in triples]


entity_encoder = LabelEncoder()
predicate_encoder = LabelEncoder()

encoded_entities = entity_encoder.fit_transform(entities)
encoded_predicates = predicate_encoder.fit_transform(predicates)

encoded_triples = [
    (encoded_entities[i], encoded_predicates[i], encoded_entities[len(triples) + i])
    for i in range(len(triples))
]

print("Encoded triples:")
pprint.pprint(encoded_triples[:5])

Encoded triples:
[(1200, 4, 3673), (251, 0, 0), (546, 0, 0), (605, 2, 3648), (243, 1, 1989)]


In [3]:
#Index train, test sets 
train_idx = [(entity_encoder.transform([s])[0], predicate_encoder.transform([p])[0], entity_encoder.transform([o])[0]) for s,p,o in train_graph]
test_idx = [(entity_encoder.transform([s])[0], predicate_encoder.transform([p])[0], entity_encoder.transform([o])[0]) for s,p,o in test_graph]

print("Encoded Train triples:")
pprint.pprint(train_idx[:5])
print("Encoded Test triples:")
pprint.pprint(test_idx[:5])

Encoded Train triples:
[(1200, 4, 3673), (251, 0, 0), (546, 0, 0), (826, 2, 1545), (867, 2, 3661)]
Encoded Test triples:
[(954, 4, 3673), (818, 3, 2701), (445, 4, 3673), (540, 3, 1563), (605, 2, 3648)]


## Validate the Encoding

In [4]:
decoded_triples = [
    (
        entity_encoder.inverse_transform([encoded_triples[i][0]])[0],
        predicate_encoder.inverse_transform([encoded_triples[i][1]])[0],
        entity_encoder.inverse_transform([encoded_triples[i][2]])[0]
    )
    for i in range(len(encoded_triples))
]

for i in range(5):
    print("Original:", triples[i])
    print("Decoded:", decoded_triples[i])
    print("--------")

Original: ('http://dice-research.org/data/fb15k-237.ttl#727', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#727', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')
--------
Original: ('http://dice-research.org/data/fb15k-237.ttl#1221', 'http://swc2017.aksw.org/hasTruthValue', '0.0')
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#1221', 'http://swc2017.aksw.org/hasTruthValue', '0.0')
--------
Original: ('http://dice-research.org/data/fb15k-237.ttl#1488', 'http://swc2017.aksw.org/hasTruthValue', '0.0')
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#1488', 'http://swc2017.aksw.org/hasTruthValue', '0.0')
--------
Original: ('http://dice-research.org/data/fb15k-237.ttl#191', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate', 'http://rdf.freebase.com/ns/music.record_label.artist')
Decoded: ('http://di

In [5]:
decoded_train_triples = [
    (
        entity_encoder.inverse_transform([train_idx[i][0]])[0],
        predicate_encoder.inverse_transform([train_idx[i][1]])[0],
        entity_encoder.inverse_transform([train_idx[i][2]])[0]
    )
    for i in range(len(train_idx))
]

for i in range(5):
    print("Encoded:", train_idx[i])
    print("Decoded:", decoded_train_triples[i])
    print("--------")

Encoded: (1200, 4, 3673)
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#727', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')
--------
Encoded: (251, 0, 0)
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#1221', 'http://swc2017.aksw.org/hasTruthValue', '0.0')
--------
Encoded: (546, 0, 0)
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#1488', 'http://swc2017.aksw.org/hasTruthValue', '0.0')
--------
Encoded: (826, 2, 1545)
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#390', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate', 'http://rdf.freebase.com/ns/location.location.contains')
--------
Encoded: (867, 2, 3661)
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#427', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate', 'http://rdf.freebase.com/ns/people.person.profession')
--------


## Prepare TensorFlow Data

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
from rdflib import URIRef

# Define constants for predicates
TRUTH_VALUE_PREDICATE = predicate_encoder.transform([str(URIRef("http://swc2017.aksw.org/hasTruthValue"))])[0]
OBJECT_PREDICATE = predicate_encoder.transform([str(URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#object"))])[0]
PREDICATE_PREDICATE = predicate_encoder.transform([str(URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate"))])[0]
SUBJECT_PREDICATE = predicate_encoder.transform([str(URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#subject"))])[0]
TYPE_PREDICATE = predicate_encoder.transform([str(URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"))])[0]

print(TRUTH_VALUE_PREDICATE, OBJECT_PREDICATE, PREDICATE_PREDICATE, SUBJECT_PREDICATE, TYPE_PREDICATE) #checking the encoded value

# Convert triples to a NumPy array
train_data = np.array(train_idx)

# Initialize dictionaries for X (type, subject, predicate, object) and Y (hasTruthValue)
facts = {}  # Key: Fact IRI, Value: List of triples for the fact
veracity_values = {}  # Key: Fact IRI, Value: Veracity value

# Populate facts and veracity values
for fact_iri, predicate, obj in train_idx:
    #print(fact_iri, predicate, obj)
    if predicate in [TYPE_PREDICATE, SUBJECT_PREDICATE, PREDICATE_PREDICATE, OBJECT_PREDICATE]:
        facts.setdefault(fact_iri, []).append((fact_iri, predicate, obj))
    elif predicate == TRUTH_VALUE_PREDICATE:
        veracity_values.setdefault(fact_iri, []).append((fact_iri, predicate, obj))

# X -> facts; Y -> veracity_values
X,Y = [],[]
for key, value in facts.items():
    X.append(value)
    v_value = veracity_values.get(key)
    Y.append(v_value)

X = np.array(X)
print("X : ",X.shape)
Y = np.array(Y)
print("Y : ",Y.shape)

pprint.pprint(X[:5])

0 1 2 3 4
X :  (1000, 4, 3)
Y :  (1000, 1, 3)
array([[[1200,    4, 3673],
        [1200,    3, 2303],
        [1200,    1, 2529],
        [1200,    2, 3659]],

       [[ 826,    2, 1545],
        [ 826,    3, 2226],
        [ 826,    1, 2531],
        [ 826,    4, 3673]],

       [[ 867,    2, 3661],
        [ 867,    4, 3673],
        [ 867,    3, 3210],
        [ 867,    1, 3266]],

       [[ 988,    2, 3644],
        [ 988,    3, 2854],
        [ 988,    1, 1823],
        [ 988,    4, 3673]],

       [[ 243,    1, 1989],
        [ 243,    2, 3656],
        [ 243,    4, 3673],
        [ 243,    3, 2972]]])


## Create and Train a TensorFlow Model