# Knowledge Graph Embeddings

In [1]:
from rdflib import Graph, Namespace
from rdflib.namespace import RDF
import pandas as pd
import pykeen
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
import os
import numpy as np
import torch
from collections import defaultdict
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.getcwd()

'c:\\mahmoud uni\\TU\\SS2024\\KGs\\Portfolio'

## Load Knowledge Graph

In [3]:
BASE = Namespace("http://www.semanticweb.org/mabsa/ontologies/2024/10/ea-fc-ontology/")
g = Graph()
ontology_file = r"dataset\EA_FC_knowledge_graph.ttl" 
g.parse(ontology_file, format="ttl")

<Graph identifier=Nc79bce033432439c9dc3ade3d9e9338d (<class 'rdflib.graph.Graph'>)>

In [4]:
# Extract Entities
players = set(g.subjects(RDF.type, BASE.Player))
clubs = set(g.subjects(RDF.type, BASE.Club))
leagues = set(g.subjects(RDF.type, BASE.League))

print(f"Total Players: {len(players)}")
print(f"Total Clubs: {len(clubs)}")
print(f"Total Leagues: {len(leagues)}")

Total Players: 6276
Total Clubs: 2041
Total Leagues: 169


# Split Preperation

In [None]:
# Player extracting unique ID for each player
unique_player_ids = set()
player_id_to_triples = defaultdict(list)


players = list(g.subjects(RDF.type, BASE.Player)) # player nodes

# mapping players to unique id
player_to_id = {}

for player in players:
    for pred, obj in g.predicate_objects(player):
        if pred == BASE.player_id:
            player_id_full = str(obj)  
            player_unique_id = player_id_full.split('_')[0] # full ID e.g., '139068_21' -> wird zu 139068
            player_to_id[player] = player_unique_id
            unique_player_ids.add(player_unique_id)

for s, p, o in g:
    player_id = None

    if s in player_to_id:
        player_id = player_to_id[s]

    elif o in player_to_id:
        player_id = player_to_id[o]

    if player_id:
        player_id_to_triples[player_id].append((str(s), str(p), str(o)))

print(f"Total Players in KG: {len(players)}")
print(f"Total Players Extracted: {len(player_id_to_triples)}")


Total Players in KG: 6276
Total Players Extracted: 1569


In [None]:
# Step 4: Shuffle and split unique players
unique_player_ids = list(unique_player_ids)
np.random.seed(1120) 
np.random.shuffle(unique_player_ids)

# Define split ratios
train_ratio, val_ratio = 0.6, 0.2
num_players = len(unique_player_ids)

train_cutoff = int(train_ratio * num_players)
val_cutoff = int((train_ratio + val_ratio) * num_players)

# Assign players to train, validation, and test sets
train_players = set(unique_player_ids[:train_cutoff])
val_players = set(unique_player_ids[train_cutoff:val_cutoff])
test_players = set(unique_player_ids[val_cutoff:])

# Step 5: Assign triples to respective sets
train_triples, val_triples, test_triples = [], [], []

for player_id, triples in player_id_to_triples.items():
    if player_id in train_players:
        train_triples.extend(triples)
    elif player_id in val_players:
        val_triples.extend(triples)
    elif player_id in test_players:
        test_triples.extend(triples)

# Convert lists to numpy arrays
train_array = np.array(train_triples, dtype=str)
val_array = np.array(val_triples, dtype=str)
test_array = np.array(test_triples, dtype=str)

# Step 6: Create PyKEEN TriplesFactory objects
training = TriplesFactory.from_labeled_triples(train_array)
validation = TriplesFactory.from_labeled_triples(val_array)
testing = TriplesFactory.from_labeled_triples(test_array)


In [16]:
print(training)
print(validation)
print(testing)

TriplesFactory(num_entities=11679, num_relations=54, create_inverse_triples=False, num_triples=203256)
TriplesFactory(num_entities=4852, num_relations=54, create_inverse_triples=False, num_triples=67824)
TriplesFactory(num_entities=4747, num_relations=54, create_inverse_triples=False, num_triples=67824)


# KG Embeddings

In [17]:
print(torch.cuda.is_available()) 
print(torch.cuda.device_count())  
#print(torch.cuda.get_device_name(0))  # Name of the first GPU

True
1


In [None]:
result = pykeen.pipeline.pipeline(
    model="TransE",
    loss="softplus",
    
    training=training,
    testing=testing,
    validation=validation,

    training_kwargs=dict(
        num_epochs=2000,  # Maximum possible epochs
        batch_size=512,
        stopper="early",  # Enable early stopping
        stopper_kwargs=dict(frequency=50, patience=10, relative_delta=0.001) 
    ),
    
    optimizer="Adam",
    random_seed=1120,
    device="cuda",
    optimizer_kwargs=dict(lr=0.001),
)


Training epochs on cuda:0: 100%|██████████| 5/5 [00:06<00:00,  1.26s/epoch, loss=1.7, prev_loss=2.13] 
Evaluating on cuda:0: 100%|██████████| 67.8k/67.8k [00:34<00:00, 1.95ktriple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 89.17s seconds
INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=11679, num_relations=54, create_inverse_triples=False, num_triples=203256) to file:///C:/mahmoud%20uni/TU/SS2024/KGs/Portfolio/models/ea_fc_embeddings/training_triples
INFO:pykeen.pipeline.api:Saved to directory: C:\mahmoud uni\TU\SS2024\KGs\Portfolio\models\ea_fc_embeddings
