## Parse the .nt Files 

In [1]:
from rdflib import Graph
import pprint

# Load training data
train_graph = Graph()
train_graph.parse("data/fokg-sw-train-2024.nt", format="nt")

#Load test data
test_graph = Graph()
test_graph.parse("data/fokg-sw-test-2024.nt", format="nt")

#Append data
graph = train_graph + test_graph

#Extract triples
triples = []
for subj, pred, obj in graph:
    triples.append((str(subj), str(pred), str(obj)))

print("Sample triples:")
pprint.pprint(triples[:5])

Sample triples:
[('http://dice-research.org/data/fb15k-237.ttl#894',
  'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate',
  'http://rdf.freebase.com/ns/music.record_label.artist'),
 ('http://dice-research.org/data/fb15k-237.ttl#407',
  'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate',
  'http://rdf.freebase.com/ns/base.aareas.schema.administrative_area.administrative_parent'),
 ('http://dice-research.org/data/fb15k-237.ttl#797',
  'http://swc2017.aksw.org/hasTruthValue',
  '0.0'),
 ('http://dice-research.org/data/fb15k-237.ttl#1050',
  'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
  'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement'),
 ('http://dice-research.org/data/fb15k-237.ttl#824',
  'http://www.w3.org/1999/02/22-rdf-syntax-ns#object',
  'http://rdf.freebase.com/ns/m.06nns1')]


## Encode the Triples

In [2]:
from sklearn.preprocessing import LabelEncoder

# Flatten the triples list into a list of entities and predicates
entities = [t[0] for t in triples] + [t[2] for t in triples]
predicates = [t[1] for t in triples]


# Encoding entities and predicates
entity_encoder = LabelEncoder()
predicate_encoder = LabelEncoder()

encoded_entities = entity_encoder.fit_transform(entities)
encoded_predicates = predicate_encoder.fit_transform(predicates)

encoded_triples = [
    (encoded_entities[i], encoded_predicates[i], encoded_entities[len(triples) + i])
    for i in range(len(triples))
]

print("Encoded triples:")
pprint.pprint(encoded_triples[:5])

Encoded triples:
[(1385, 2, 3648), (845, 2, 1505), (1277, 0, 0), (61, 4, 3673), (1308, 1, 2914)]


In [3]:
#Index train, test sets 
train_idx = [(entity_encoder.transform([s])[0], predicate_encoder.transform([p])[0], entity_encoder.transform([o])[0]) for s,p,o in train_graph]
test_idx = [(entity_encoder.transform([s])[0], predicate_encoder.transform([p])[0], entity_encoder.transform([o])[0]) for s,p,o in test_graph]

print("Encoded Train triples:")
pprint.pprint(train_idx[:5])
print("Encoded Test triples:")
pprint.pprint(test_idx[:5])

Encoded Train triples:
[(1385, 2, 3648), (845, 2, 1505), (1277, 0, 0), (61, 4, 3673), (1308, 1, 2914)]
Encoded Test triples:
[(1401, 4, 3673),
 (739, 3, 1885),
 (1104, 3, 1617),
 (275, 4, 3673),
 (531, 3, 2506)]


## Validate the Encoding

In [4]:
decoded_triples = [
    (
        entity_encoder.inverse_transform([encoded_triples[i][0]])[0],
        predicate_encoder.inverse_transform([encoded_triples[i][1]])[0],
        entity_encoder.inverse_transform([encoded_triples[i][2]])[0]
    )
    for i in range(len(encoded_triples))
]

for i in range(5):
    print("Original:", triples[i])
    print("Decoded:", decoded_triples[i])
    print("--------")

Original: ('http://dice-research.org/data/fb15k-237.ttl#894', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate', 'http://rdf.freebase.com/ns/music.record_label.artist')
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#894', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate', 'http://rdf.freebase.com/ns/music.record_label.artist')
--------
Original: ('http://dice-research.org/data/fb15k-237.ttl#407', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate', 'http://rdf.freebase.com/ns/base.aareas.schema.administrative_area.administrative_parent')
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#407', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate', 'http://rdf.freebase.com/ns/base.aareas.schema.administrative_area.administrative_parent')
--------
Original: ('http://dice-research.org/data/fb15k-237.ttl#797', 'http://swc2017.aksw.org/hasTruthValue', '0.0')
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#797', 'http://swc2017.aksw.org/hasTruthValue', '0.0')

In [5]:
decoded_train_triples = [
    (
        entity_encoder.inverse_transform([train_idx[i][0]])[0],
        predicate_encoder.inverse_transform([train_idx[i][1]])[0],
        entity_encoder.inverse_transform([train_idx[i][2]])[0]
    )
    for i in range(len(train_idx))
]

for i in range(10):
    print("Encoded:", train_idx[i])
    print("Decoded:", decoded_train_triples[i])
    print("--------")

Encoded: (1385, 2, 3648)
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#894', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate', 'http://rdf.freebase.com/ns/music.record_label.artist')
--------
Encoded: (845, 2, 1505)
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#407', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate', 'http://rdf.freebase.com/ns/base.aareas.schema.administrative_area.administrative_parent')
--------
Encoded: (1277, 0, 0)
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#797', 'http://swc2017.aksw.org/hasTruthValue', '0.0')
--------
Encoded: (61, 4, 3673)
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#1050', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')
--------
Encoded: (1308, 1, 2914)
Decoded: ('http://dice-research.org/data/fb15k-237.ttl#824', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#object', 'http://rdf.freebase.com/ns/m.06nns1')
--------
Encoded: (208, 3, 2389)


## Prepare TensorFlow Data

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from rdflib import URIRef

# Define constants for different types of predicates used in the knowledge graph
# These are encoded values of common RDF predicates
TRUTH_VALUE_PREDICATE = predicate_encoder.transform([str(URIRef("http://swc2017.aksw.org/hasTruthValue"))])[0]
OBJECT_PREDICATE = predicate_encoder.transform([str(URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#object"))])[0]
PREDICATE_PREDICATE = predicate_encoder.transform([str(URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate"))])[0]
SUBJECT_PREDICATE = predicate_encoder.transform([str(URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#subject"))])[0]
TYPE_PREDICATE = predicate_encoder.transform([str(URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"))])[0]

print(TRUTH_VALUE_PREDICATE, OBJECT_PREDICATE, PREDICATE_PREDICATE, SUBJECT_PREDICATE, TYPE_PREDICATE) #checking the encoded value

# Convert triples(list) to a NumPy array
train_data = np.array(train_idx)
print("train_data: ", train_data.shape)
test_data = np.array(test_idx)
print("test_data: ", test_data.shape)

# Initialize dictionaries for X (type, subject, predicate, object) and Y (hasTruthValue)
facts_train = {}  # Key: Fact IRI, Value: List of triples for the fact
veracity_values_train = {}  # Key: Fact IRI, Value: Veracity value
facts_test = {} # Key: Fact IRI, Value: List of triples for the fact

# Populate facts and veracity values of train_idx
for fact_iri, predicate, obj in train_idx:
    if predicate in [TYPE_PREDICATE, SUBJECT_PREDICATE, PREDICATE_PREDICATE, OBJECT_PREDICATE]:
        facts_train.setdefault(fact_iri, []).append((fact_iri, predicate, obj))
    elif predicate == TRUTH_VALUE_PREDICATE:
        veracity_values_train.setdefault(fact_iri, []).append((fact_iri, predicate, obj))

# Populate facts and veracity values of test_idx
for fact_iri, predicate, obj in test_idx:
    if predicate in [TYPE_PREDICATE, SUBJECT_PREDICATE, PREDICATE_PREDICATE, OBJECT_PREDICATE]:
        facts_test.setdefault(fact_iri, []).append((fact_iri, predicate, obj))

0 1 2 3 4
train_data:  (5000, 3)
test_data:  (2000, 3)


In [7]:
# Prepare the training data
# X -> facts; Y -> veracity_values 
X,Y = [],[]
for key, value in facts_train.items():
    X.append(value)
    v_value = veracity_values_train.get(key)
    Y.append(v_value[0][2])

X = np.array(X)
print(type(X))
print("X : ", X.shape)
pprint.pprint(X[:2])
Y = np.array(Y)
print("Y : ", Y.shape)
pprint.pprint(Y[:2])

#Prepare the test data
X_test_nt = []
for key, value in facts_test.items():
    X_test_nt.append(value)

X_test_nt = np.array(X_test_nt)
print("X_test_nt : ", X_test_nt.shape)
pprint.pprint(X_test_nt[:2])

<class 'numpy.ndarray'>
X :  (1000, 4, 3)
array([[[1385,    2, 3648],
        [1385,    1, 1795],
        [1385,    3, 3299],
        [1385,    4, 3673]],

       [[ 845,    2, 1505],
        [ 845,    3, 2929],
        [ 845,    1, 2224],
        [ 845,    4, 3673]]])
Y :  (1000,)
array([0, 1])
X_test_nt :  (500, 4, 3)
array([[[1401,    4, 3673],
        [1401,    3, 2657],
        [1401,    1, 2307],
        [1401,    2, 1545]],

       [[ 739,    3, 1885],
        [ 739,    1, 3555],
        [ 739,    4, 3673],
        [ 739,    2, 3660]]])


In [8]:
X = X.reshape(X.shape[0], -1)
print(type(X))
print("X : ", X.shape)
pprint.pprint(X[:2])

Y = Y.reshape(Y.shape[0], -1)
print("Y : :", Y.shape)
pprint.pprint(Y[:2])

X_test_nt = X_test_nt.reshape(X_test_nt.shape[0], -1)
print("X_test_nt : ", X_test_nt.shape)
pprint.pprint(X_test_nt[:2])

<class 'numpy.ndarray'>
X :  (1000, 12)
array([[1385,    2, 3648, 1385,    1, 1795, 1385,    3, 3299, 1385,    4,
        3673],
       [ 845,    2, 1505,  845,    3, 2929,  845,    1, 2224,  845,    4,
        3673]])
Y : : (1000, 1)
array([[0],
       [1]])
X_test_nt :  (500, 12)
array([[1401,    4, 3673, 1401,    3, 2657, 1401,    1, 2307, 1401,    2,
        1545],
       [ 739,    3, 1885,  739,    1, 3555,  739,    4, 3673,  739,    2,
        3660]])


In [9]:
# Split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Print the shapes of the training and test sets
print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)
print("X_test shape:", X_test.shape)
print("Y_test shape:", Y_test.shape)

X_train shape: (800, 12)
Y_train shape: (800, 1)
X_test shape: (200, 12)
Y_test shape: (200, 1)


## Create and Train a MLPClassifier Model

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder

Y_train = Y_train.ravel()
Y_test = Y_test.ravel()

# Train a model (MLPClassifier)
model = MLPClassifier(hidden_layer_sizes=(256, 256, 128), max_iter=10000, random_state=42)

# Train the model
model.fit(X_train, Y_train)

In [11]:
# Evaluate the model
accuracy = model.score(X_test, Y_test)
print("Model accuracy on test data:", accuracy)

# Predict on the test set
Y_pred = model.predict(X_test)

# Print some predictions
print("Actual truth values:   ", Y_test[:25])
print("Predicted truth values:", Y_pred[:25])

Model accuracy on test data: 0.875
Actual truth values:    [1 1 1 0 1 0 0 1 1 0 0 1 0 1 0 0 0 1 1 1 1 1 1 0 0]
Predicted truth values: [1 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 1 1 1 1 1 0 1]


In [12]:
# Predict on the fokg-sw-test-2024.nt
Y_pred_nt = model.predict(X_test_nt)
print("Predicted truth values:", Y_pred_nt[:25])

Predicted truth values: [0 1 0 0 1 1 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0 1 0]


## Write it to result file

In [14]:
with open("result_nt.ttl", "w") as resultFile:
    for i in range(len(X_test_nt)):
        sub = entity_encoder.inverse_transform([X_test_nt[i][0]])[0]
        pred = predicate_encoder.inverse_transform([0])[0]
        obj = entity_encoder.inverse_transform([Y_pred_nt[i]])[0]
        line = f"<{sub}> <{pred}> \"{obj}\"^^<http://www.w3.org/2001/XMLSchema#double> .\n"
        print(line)
        resultFile.write(line)

<http://dice-research.org/data/fb15k-237.ttl#908> <http://swc2017.aksw.org/hasTruthValue> "0.0"^^<http://www.w3.org/2001/XMLSchema#double> .

<http://dice-research.org/data/fb15k-237.ttl#311> <http://swc2017.aksw.org/hasTruthValue> "1.0"^^<http://www.w3.org/2001/XMLSchema#double> .

<http://dice-research.org/data/fb15k-237.ttl#640> <http://swc2017.aksw.org/hasTruthValue> "0.0"^^<http://www.w3.org/2001/XMLSchema#double> .

<http://dice-research.org/data/fb15k-237.ttl#1243> <http://swc2017.aksw.org/hasTruthValue> "0.0"^^<http://www.w3.org/2001/XMLSchema#double> .

<http://dice-research.org/data/fb15k-237.ttl#1474> <http://swc2017.aksw.org/hasTruthValue> "1.0"^^<http://www.w3.org/2001/XMLSchema#double> .

<http://dice-research.org/data/fb15k-237.ttl#18> <http://swc2017.aksw.org/hasTruthValue> "1.0"^^<http://www.w3.org/2001/XMLSchema#double> .

<http://dice-research.org/data/fb15k-237.ttl#940> <http://swc2017.aksw.org/hasTruthValue> "0.0"^^<http://www.w3.org/2001/XMLSchema#double> .

<http