In [1]:
import rdflib
import pandas as pd
from rdflib import Graph, URIRef, RDF, Namespace
import numpy as np
import os
import torch
from joblib import dump

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score

from pykeen.triples import TriplesFactory
from pykeen.models import TransE
from pykeen.training import SLCWATrainingLoop
from pykeen.losses import MarginRankingLoss

  from .autonotebook import tqdm as notebook_tqdm


# Load Data

In [2]:
from rdflib import Graph
import pprint

def _load_ntriples(path, label):
    """Parse the N-Triples file at ``path`` into a fresh graph and print its size.

    ``label`` is the text printed in front of the triple count.
    Returns the populated ``Graph``.
    """
    graph = Graph()
    graph.parse(path, format="nt")
    print(label, len(graph))
    return graph


# Reference knowledge graph
reference_kg = _load_ntriples("data/reference-kg.nt", "Reference Knowledge Graph length.")

# Training data
train_graph = _load_ntriples("data/fokg-sw-train-2024.nt", "Training data length.")

# Test data
test_graph = _load_ntriples("data/fokg-sw-test-2024.nt", "Test data length.")


Reference Knowledge Graph length. 675859
Training data length. 5000
Test data length. 2000


# Training TransE with reference_kg triples

In [3]:
import numpy as np
# Keep only the triples whose subject and object are both URIRef instances;
# every kept term is converted to its plain string form.
reference_triples = [
    (str(s), str(p), str(o))
    for s, p, o in reference_kg
    if isinstance(s, rdflib.URIRef) and isinstance(o, rdflib.URIRef)
]

print("Number of triples in reference_kg: ",len(reference_triples))

# The triples factory is built from a labelled (string) triple array.
reference_triples_array = np.array(reference_triples, dtype=object)
reference_factory = TriplesFactory.from_labeled_triples(reference_triples_array)

# Hyperparameters of the embedding model and of its training run.
embedding_dim = 200
margin = 1.0
num_epochs = 10
batch_size = 256

# TransE with L1 distance scoring and a margin-based ranking loss.
ranking_loss = MarginRankingLoss(margin=margin)
model = TransE(
    triples_factory=reference_factory,
    embedding_dim=embedding_dim,
    scoring_fct_norm=1,
    loss=ranking_loss,
    random_seed= 42,
)

# sLCWA training loop: Adam optimizer, basic negative sampler with
# 10 negatives per positive triple.
training_kg_loop = SLCWATrainingLoop(
    model=model,
    triples_factory=reference_factory,
    optimizer="adam",
    optimizer_kwargs={"lr": 1e-3},
    negative_sampler="basic",
    negative_sampler_kwargs={"num_negs_per_pos": 10},
)

print(f"Training TransE for {num_epochs} epochs, batch_size={batch_size} ...")
# The return value of train() is not needed afterwards, hence the throwaway name.
_ = training_kg_loop.train(
    triples_factory=reference_factory,
    num_epochs=num_epochs,
    batch_size=batch_size,
    use_tqdm=True,
)

print("TransE model trained with reference_kg.")

Number of triples in reference_kg:  660000
Training TransE for 10 epochs, batch_size=256 ...


Training epochs on cpu: 100%|██████████| 10/10 [17:35<00:00, 105.51s/epoch, loss=0.0145, prev_loss=0.0147]

TransE model trained with reference_kg.





# Encoding entities and relations

In [4]:
# Label -> integer id lookup tables taken from the triples factory.
entity_to_id, relation_to_id = (
    reference_factory.entity_to_id,
    reference_factory.relation_to_id,
)

# First representation module of each kind held by the trained model.
entity_representation, relation_representation = (
    model.entity_representations[0],
    model.relation_representations[0],
)

# Preparing Train and Test data

In [5]:
from rdflib.plugins.sparql import prepareQuery
import rdflib

def _to_python_triple(subject, predicate, obj):
    """Convert three RDF terms into a tuple of their Python values."""
    return (subject.toPython(), predicate.toPython(), obj.toPython())


# --- Training statements: reified triples together with their truth value ---
query_train = prepareQuery("""
    SELECT ?stmt ?subject ?predicate ?object ?truthValue
    WHERE {
        ?stmt a <http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement> .
        ?stmt <http://www.w3.org/1999/02/22-rdf-syntax-ns#subject> ?subject .
        ?stmt <http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate> ?predicate .
        ?stmt <http://www.w3.org/1999/02/22-rdf-syntax-ns#object> ?object .
        ?stmt <http://swc2017.aksw.org/hasTruthValue> ?truthValue .
    }
""")

# Materialise the result once so both lists are built from the same row order.
train_rows = list(train_graph.query(query_train))
train_triples = [
    _to_python_triple(s, p, o) for _stmt, s, p, o, _truth in train_rows
]
train_truthValue = [truth.toPython() for *_terms, truth in train_rows]

# --- Test statements: reified triples plus the IRI of each statement ---
query_test = prepareQuery("""
    SELECT ?stmt ?subject ?predicate ?object
    WHERE {
        ?stmt a <http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement> .
        ?stmt <http://www.w3.org/1999/02/22-rdf-syntax-ns#subject> ?subject .
        ?stmt <http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate> ?predicate .
        ?stmt <http://www.w3.org/1999/02/22-rdf-syntax-ns#object> ?object .
    }
""")

test_rows = list(test_graph.query(query_test))
test_triples = [_to_python_triple(s, p, o) for _stmt, s, p, o in test_rows]
test_fact_iri = [stmt.toPython() for stmt, *_terms in test_rows]


# Encode facts with the trained embedding model

In [6]:
def get_embedding_for_fact(subj, pred, obj):
    """Build the feature vector of one (subject, predicate, object) fact.

    Uses the notebook-level ``model``, ``entity_to_id`` and ``relation_to_id``.

    Parameters
    ----------
    subj, obj : keys looked up in ``entity_to_id``.
    pred : key looked up in ``relation_to_id``.

    Returns
    -------
    numpy.ndarray
        The subject, predicate and object embeddings concatenated in that
        order. If any of the three keys is missing from its lookup table, an
        all-zero vector of length ``3 * entity embedding width`` is returned
        instead.
    """
    entity_module = model.entity_representations[0]

    fully_known = (
        subj in entity_to_id and obj in entity_to_id and pred in relation_to_id
    )
    if not fully_known:
        # Fallback width is derived from the entity embedding matrix.
        width = entity_module._embeddings.weight.shape[-1]
        return np.zeros(3 * width)

    relation_module = model.relation_representations[0]

    def _lookup(module, index):
        # A one-element index tensor is passed in; row 0 of the result is the
        # single embedding vector.
        return module(indices=torch.tensor([index]))[0]

    pieces = [
        _lookup(entity_module, entity_to_id[subj]),
        _lookup(relation_module, relation_to_id[pred]),
        _lookup(entity_module, entity_to_id[obj]),
    ]
    return torch.cat(pieces, dim=0).detach().cpu().numpy()


def _embed_all(triples):
    """Return an array holding one fact feature vector per entry of ``triples``."""
    return np.array([get_embedding_for_fact(*triple) for triple in triples])


# Features and labels for the training statements.
X_train = _embed_all(train_triples)
y_train = np.array(train_truthValue)

# Features for the test statements.
X_test = _embed_all(test_triples)

## Create and Train an MLPClassifier Model

In [7]:
# MLP with hidden layers of 256, 256 and 128 units, fitted on the training features.
mlp = MLPClassifier(
    hidden_layer_sizes=(256, 256, 128),
    activation="relu",
    solver="adam",
    max_iter=50,
    random_state=42,
).fit(X_train, y_train)


def _truth_scores(features):
    """Return column 1 of the classifier's predicted probabilities for ``features``."""
    return mlp.predict_proba(features)[:, 1]


# NOTE: this AUC is computed on the same data the classifier was fitted on,
# so it measures fit to the training set rather than generalization.
train_probs = _truth_scores(X_train)
train_auc = roc_auc_score(y_train, train_probs)
print(f"Train AUC: {train_auc:.4f}")

test_probs = _truth_scores(X_test)

Train AUC: 1.0000


## Write the predictions to the result file

In [8]:
# Write one hasTruthValue statement per test fact, pairing each statement IRI
# with its predicted score (typed as xsd:double).
# The encoding is pinned to UTF-8: without it, open() falls back to the
# platform's locale encoding, so non-ASCII characters in a fact IRI could
# either fail to encode or yield a file that is not valid UTF-8 Turtle.
with open("result.ttl", "w", encoding="utf-8") as resultFile:
    for fact_iri, score in zip(test_fact_iri, test_probs):
        line = f'<{fact_iri}> <http://swc2017.aksw.org/hasTruthValue> "{score}"^^<http://www.w3.org/2001/XMLSchema#double> .\n'
        resultFile.write(line)