# Graph Neural Network

In [1]:
import pandas as pd
import numpy as np
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, OWL, XSD

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import torch
from torch_geometric.data import Data

import os

In [2]:
# Show the current working directory; the relative dataset path used when
# loading the graph is resolved against it.
os.getcwd()

'c:\\mahmoud uni\\TU\\SS2024\\KGs\\Portfolio'

## Load Knowledge Graph

In [3]:
# Load the knowledge graph (Turtle serialization) into an rdflib Graph.
# The path is assembled with os.path.join so the notebook also runs on
# non-Windows systems, where a literal backslash is not a path separator.
# To load the full graph instead, point ontology_file at
# "EA_FC_knowledge_graph.nt" in the same folder and parse with format="nt".
g = Graph()
ontology_file = os.path.join("dataset", "EA_FC_knowledge_graph_small.ttl")
g.parse(ontology_file, format="ttl")

<Graph identifier=N645c13b8f90d4de58e61fc699c342316 (<class 'rdflib.graph.Graph'>)>

In [4]:
# Namespaces used to address terms in the graph.
# BASE is the project ontology. The three W3C vocabularies are re-bound here
# as plain Namespace objects, replacing the RDF / RDFS / OWL names imported
# from rdflib.namespace in the import cell.
BASE = Namespace("http://www.semanticweb.org/mabsa/ontologies/2024/10/ea-fc-ontology/")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
OWL = Namespace("http://www.w3.org/2002/07/owl#")

In [5]:
# Overall graph size, measured in triples.
print(f"Total triples: {len(g)}")

# Number of instances per class: count the subjects typed with each class
# by streaming over the matches instead of materialising them in a list.
player_count = sum(1 for _ in g.subjects(RDF.type, BASE.Player))
print(f"Total Players: {player_count}")

team_count = sum(1 for _ in g.subjects(RDF.type, BASE.Club))
print(f"Total Teams: {team_count}")

league_count = sum(1 for _ in g.subjects(RDF.type, BASE.League))
print(f"Total Leagues: {league_count}")

Total triples: 50772
Total Players: 776
Total Teams: 608
Total Leagues: 143


## Analyze the KG

In [6]:
import networkx as nx
import matplotlib.pyplot as plt

In [17]:
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph

# Convert the rdflib graph into a networkx multi-digraph so that networkx
# can be used to inspect its structure (node and edge counts below).
nx_graph = rdflib_to_networkx_multidigraph(g)

In [18]:
# Basic size statistics of the networkx view of the knowledge graph.
num_nodes, num_edges = nx_graph.number_of_nodes(), nx_graph.number_of_edges()

print(f'Number of nodes: {num_nodes}')
print(f'Number of edges: {num_edges}')
# Average number of edges per node, rounded to two decimals.
print(f'Ratio edges to nodes: {round(num_edges / num_nodes, 2)}')

Number of nodes: 5131
Number of edges: 50772
Ratio edges to nodes: 9.9


In [26]:
from torch_geometric.utils.convert import to_networkx, from_networkx
# Convert the networkx graph into a PyTorch Geometric graph object.
# NOTE(review): `to_networkx` is imported but not used in this cell, and the
# cells under "Preprocessing" build a HeteroData object without reading
# `pyg_graph` — confirm this conversion is still needed.
pyg_graph = from_networkx(nx_graph)

5131

## Preprocessing

In [36]:
from sklearn.preprocessing import LabelEncoder
import torch
from torch_geometric.data import HeteroData

# Datatypes that are parsed with float(); xsd:integer is parsed with int().
_FLOAT_DATATYPES = (XSD.float, XSD.decimal)


def _literal_to_number(obj, string_codes):
    """Turn one rdflib Literal into a number usable as a feature value.

    Parameters
    ----------
    obj : rdflib.Literal
        The literal to convert.
    string_codes : dict
        Mapping from string value to integer code for the predicate this
        literal belongs to; only consulted for xsd:string literals.

    Returns
    -------
    int or float
        The parsed number, the label code for xsd:string literals, or 0.0
        when a literal of any other datatype cannot be parsed as a float.
    """
    if obj.datatype == XSD.integer:
        return int(obj)
    if obj.datatype in _FLOAT_DATATYPES:
        return float(obj)
    if obj.datatype == XSD.string:
        return string_codes[str(obj)]
    try:
        return float(obj)
    except ValueError:
        # Non-numeric literal of another datatype: keep the column, use 0.0.
        return 0.0


def extract_node_features(graph, node_class, label_encoders, exclude_predicates=()):
    """Build an aligned feature table for every instance of ``node_class``.

    The graph is scanned in two passes. The first pass collects the literal
    properties of each node and fixes one column per predicate (in order of
    first appearance). The second pass fits one LabelEncoder per xsd:string
    predicate over *all* observed values and fills the rows, so that

    * string values get distinct codes (fitting an encoder on a single value
      at a time would encode every value as 0),
    * column ``i`` of every row corresponds to ``feature_names[i]``, and
    * every row has the same length even when a node lacks a predicate
      (filled with 0.0) or carries several values for it (the first value
      encountered is kept).

    Parameters
    ----------
    graph : rdflib.Graph
        Graph to read from.
    node_class : rdflib.URIRef
        Class whose instances (``rdf:type`` subjects) become the nodes.
    label_encoders : dict
        Filled in place with ``{predicate: fitted LabelEncoder}`` for every
        xsd:string predicate.
    exclude_predicates : iterable of rdflib.URIRef, optional
        Predicates to leave out of the feature table.

    Returns
    -------
    nodes : list
        The node identifiers, in the row order of ``feature_rows``.
    feature_rows : list of list
        One numeric row per node.
    feature_names : list
        The predicate behind each column.
    """
    nodes = list(graph.subjects(RDF.type, node_class))
    excluded = set(exclude_predicates)

    column_of = {}       # predicate -> column index, in order of first appearance
    literal_rows = []    # one {predicate: Literal} dict per node
    string_values = {}   # predicate -> set of observed xsd:string values

    for node in nodes:
        literals = {}
        for predicate, obj in graph.predicate_objects(subject=node):
            if not isinstance(obj, Literal) or predicate in excluded:
                continue
            if predicate in literals:
                # Multi-valued property: keep the first value so rows stay rectangular.
                continue
            literals[predicate] = obj
            column_of.setdefault(predicate, len(column_of))
            if obj.datatype == XSD.string:
                string_values.setdefault(predicate, set()).add(str(obj))
        literal_rows.append(literals)

    # Fit each encoder once, on every value seen for its predicate.
    string_codes = {}
    for predicate, values in string_values.items():
        encoder = LabelEncoder().fit(sorted(values))
        label_encoders[predicate] = encoder
        string_codes[predicate] = {
            value: code for code, value in enumerate(encoder.classes_.tolist())
        }

    feature_names = list(column_of)
    feature_rows = []
    for literals in literal_rows:
        row = [0.0] * len(feature_names)
        for predicate, obj in literals.items():
            row[column_of[predicate]] = _literal_to_number(
                obj, string_codes.get(predicate, {})
            )
        feature_rows.append(row)

    return nodes, feature_rows, feature_names


def to_feature_tensor(feature_rows, num_features):
    """Return ``feature_rows`` as a float tensor of shape (num_rows, num_features).

    The explicit reshape keeps the tensor two-dimensional when there are no
    rows or no feature columns.
    """
    return torch.tensor(feature_rows, dtype=torch.float).reshape(
        len(feature_rows), num_features
    )


# Initialize a heterogeneous graph
data = HeteroData()

# Fitted label encoders for string features, keyed by predicate
player_label_encoders = {}
club_label_encoders = {}
league_label_encoders = {}

# Player nodes.
# NOTE(review): BASE.potential is read as the regression target in a later
# cell; if it is stored as a literal on player nodes it is also part of x
# here. Pass exclude_predicates=(BASE.potential,) to drop it — TODO confirm.
players, player_features, player_feature_names = extract_node_features(
    g, BASE.Player, player_label_encoders
)
data["player"].x = to_feature_tensor(player_features, len(player_feature_names))
data["player"].num_nodes = len(players)

# Club nodes.
clubs, club_features, club_feature_names = extract_node_features(
    g, BASE.Club, club_label_encoders
)
data["club"].x = to_feature_tensor(club_features, len(club_feature_names))
data["club"].num_nodes = len(clubs)

# League nodes.
leagues, league_features, league_feature_names = extract_node_features(
    g, BASE.League, league_label_encoders
)
data["league"].x = to_feature_tensor(league_features, len(league_feature_names))
data["league"].num_nodes = len(leagues)

# Optional: Debugging feature names
print("Extracted Player Features:", player_feature_names)
print("Extracted Club Features:", club_feature_names)
print("Extracted League Features:", league_feature_names)


Extracted Player Features: [rdflib.term.URIRef('http://www.semanticweb.org/mabsa/ontologies/2024/10/ea-fc-ontology/age'), rdflib.term.URIRef('http://www.semanticweb.org/mabsa/ontologies/2024/10/ea-fc-ontology/club_contract_valid_until_year'), rdflib.term.URIRef('http://www.semanticweb.org/mabsa/ontologies/2024/10/ea-fc-ontology/club_joined_date'), rdflib.term.URIRef('http://www.semanticweb.org/mabsa/ontologies/2024/10/ea-fc-ontology/club_position'), rdflib.term.URIRef('http://www.semanticweb.org/mabsa/ontologies/2024/10/ea-fc-ontology/defending'), rdflib.term.URIRef('http://www.semanticweb.org/mabsa/ontologies/2024/10/ea-fc-ontology/defending_marking_awareness'), rdflib.term.URIRef('http://www.semanticweb.org/mabsa/ontologies/2024/10/ea-fc-ontology/defending_sliding_tackle'), rdflib.term.URIRef('http://www.semanticweb.org/mabsa/ontologies/2024/10/ea-fc-ontology/defending_standing_tackle'), rdflib.term.URIRef('http://www.semanticweb.org/mabsa/ontologies/2024/10/ea-fc-ontology/dribbling'

In [37]:
def build_edge_list(graph, sources, targets, predicate):
    """Collect (source_index, target_index) pairs for one relation.

    For every node in ``sources`` the value of ``predicate`` is looked up in
    ``graph``; the pair is kept only when that value is one of ``targets``.
    Target positions are resolved through a dictionary instead of repeated
    ``list.index`` calls, so the work grows linearly with the number of nodes.

    Parameters
    ----------
    graph : rdflib.Graph
        Graph to read the relation from.
    sources, targets : list
        Node identifiers; the returned indices are positions in these lists.
    predicate : rdflib.URIRef
        The relation connecting a source to a target.

    Returns
    -------
    list of tuple
        One ``(source_index, target_index)`` pair per edge found.
    """
    target_position = {node: position for position, node in enumerate(targets)}
    edges = []
    for source_position, source in enumerate(sources):
        # graph.value yields None when the source has no such relation;
        # None is never a key of target_position, so the source is skipped.
        position = target_position.get(graph.value(source, predicate))
        if position is not None:
            edges.append((source_position, position))
    return edges


def to_edge_index(edges):
    """Return ``edges`` as a contiguous long tensor of shape (2, num_edges).

    An empty edge list yields a (2, 0) tensor rather than a one-dimensional
    empty tensor, so the result always has the two-row edge_index layout.
    """
    if not edges:
        return torch.empty((2, 0), dtype=torch.long)
    return torch.tensor(edges, dtype=torch.long).t().contiguous()


# Extract edges: Player -> Club
player_to_club_edges = build_edge_list(g, players, clubs, BASE.plays_for)
data["player", "plays_for", "club"].edge_index = to_edge_index(player_to_club_edges)

# Extract edges: Player -> League
player_to_league_edges = build_edge_list(g, players, leagues, BASE.competes_in)
data["player", "competes_in", "league"].edge_index = to_edge_index(player_to_league_edges)

# Extract edges: Club -> League
club_to_league_edges = build_edge_list(g, clubs, leagues, BASE.part_of_league)
data["club", "part_of", "league"].edge_index = to_edge_index(club_to_league_edges)


In [38]:
# Regression target: each player's `potential`, read from the graph in the
# same order as `players`; a player without that property gets 0.
player_targets = [
    int(g.value(node, BASE.potential, default=0)) for node in players
]

data["player"].y = torch.tensor(player_targets, dtype=torch.float)


In [34]:
# The targets are already assigned by the cell directly above; recomputing
# them here was an exact duplicate. Validate the result instead.
assert len(player_targets) == len(players), "expected one target per player"
assert data["player"].y.shape == (len(players),), "y must hold one value per player node"
assert data["player"].y.dtype == torch.float, "targets are expected as float for regression"
