# Graph Neural Network

In [1]:
import pandas as pd
import numpy as np
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, OWL, XSD
import os

from torch_geometric.data import Data
import torch
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.loader import DataLoader
from rdflib.namespace import XSD
from sklearn.preprocessing import StandardScaler


In [2]:
os.getcwd()

'c:\\mahmoud uni\\TU\\SS2024\\KGs\\Portfolio'

## Load Knowledge Graph

In [3]:
# # Load the ontology
BASE = Namespace("http://www.semanticweb.org/mabsa/ontologies/2024/10/ea-fc-ontology/")
g = Graph()
ontology_file = r"dataset\EA_FC_knowledge_graph.ttl" 
g.parse(ontology_file, format="ttl")

<Graph identifier=N0a4caf44e30e4e34ae528aaf323b822e (<class 'rdflib.graph.Graph'>)>

In [4]:
# Count all triples
print(f"Total triples: {len(g)}")

# Check for specific class membership
player_count = len(list(g.subjects(RDF.type, BASE.Player)))
print(f"Total Players: {player_count}")

team_count = len(list(g.subjects(RDF.type, BASE.Club)))
print(f"Total Teams: {team_count}")

league_count = len(list(g.subjects(RDF.type, BASE.League)))
print(f"Total Leagues: {league_count}")

Total triples: 51846
Total Players: 764
Total Teams: 618
Total Leagues: 146


## Data Preperation

### Node Extraction

In [5]:
# extract nodes
players = list(g.subjects(RDF.type, BASE.Player))
clubs = list(g.subjects(RDF.type, BASE.Club))
leagues = list(g.subjects(RDF.type, BASE.League))

# mapping for node indices
node_list = players + clubs + leagues
node_to_idx = {node: idx for idx, node in enumerate(node_list)}

In [6]:
# Label Encoders for Categorical Features
position_encoder = LabelEncoder()
foot_encoder = LabelEncoder()
work_rate_encoder = LabelEncoder()
body_type_encoder = LabelEncoder()
league_encoder = LabelEncoder()
nationality_encoder = LabelEncoder()

# Collect all unique categories for encoding
positions = set()
preferred_feet = set()
work_rate_components = set()
body_types = set()
league_names = set()
nationalities = set()

# Collect categories for players
for player in players:
    for p, o in g.predicate_objects(player):
        if p == BASE.position_category:
            positions.add(str(o))
        if p == BASE.preferred_foot:
            preferred_feet.add(str(o))
        if p == BASE.work_rate:
            work_rate = str(o)
            work_rate_att, work_rate_def = work_rate.split('/')
            work_rate_components.add(work_rate_att)
            work_rate_components.add(work_rate_def)
        if p == BASE.body_type:
            body_types.add(str(o))

# Collect categories for clubs and leagues
for club in clubs:
    for p, o in g.predicate_objects(club):
        if p == BASE.name:
            league_names.add(str(o))

for league in leagues:
    for p, o in g.predicate_objects(league):
        if p == BASE.league_nationality_name:
            nationalities.add(str(o))

# Fit Encoders
position_encoder.fit(list(positions))
foot_encoder.fit(list(preferred_feet))
work_rate_encoder.fit(list(work_rate_components))  # Now fitted on individual components
body_type_encoder.fit(list(body_types))
league_encoder.fit(list(league_names))
nationality_encoder.fit(list(nationalities))


### Feature Extraction

In [7]:
features = []
targets = []

for node in node_list:
    attrs = {}
    for p, o in g.predicate_objects(node):
        # Handle Numeric Features
        if isinstance(o, Literal):
            if o.datatype in [XSD.integer, XSD.float, XSD.double]:  # Numeric DataTypes
                attrs[p.split("/")[-1]] = float(o)
            elif o.datatype in [XSD.string, None]:  # Categorical/String
                key = p.split("/")[-1]
                if key != 'name':  # Exclude player name from features
                    attrs[key] = str(o)

    # Encode Categorical Features
    position_encoded = position_encoder.transform([attrs.get('position_category', 'DEF')])[0] if 'position_category' in attrs else 0
    preferred_foot_encoded = foot_encoder.transform([attrs.get('preferred_foot', 'Right')])[0] if 'preferred_foot' in attrs else 0
    body_type_encoded = body_type_encoder.transform([attrs.get('body_type', 'Normal')])[0] if 'body_type' in attrs else 0
    league_encoded = league_encoder.transform([attrs.get('name', 'Unknown')])[0] if 'name' in attrs else 0
    nationality_encoded = nationality_encoder.transform([attrs.get('league_nationality_name', 'Unknown')])[0] if 'league_nationality_name' in attrs else 0

    # Handle work_rate
    work_rate_att, work_rate_def = attrs.get('work_rate', 'Medium/Medium').split('/') if 'work_rate' in attrs else ('Medium', 'Medium')
    work_rate_att_encoded = work_rate_encoder.transform([work_rate_att])[0]
    work_rate_def_encoded = work_rate_encoder.transform([work_rate_def])[0]

    # Numeric Features
    numeric_features = [v for k, v in attrs.items() if isinstance(v, float) and k != 'name' and k != 'player_id'] 

    # Combine Features
    feature_vector = numeric_features + [
        position_encoded, 
        preferred_foot_encoded, 
        work_rate_att_encoded, 
        work_rate_def_encoded, 
        body_type_encoded,
        league_encoded,
        nationality_encoded
    ]

    # Correctly Extract 'potential'
    potential = attrs.get('potential', None)

    features.append(feature_vector)
    targets.append(potential if potential is not None else 0)

# Padding feature vectors to have the same length
max_len = max(len(f) for f in features)
features = [f + [0] * (max_len - len(f)) for f in features]

In [8]:
# # Debugging: Inspect sample player data
# sample = leagues[0]
# for p, o in g.predicate_objects(sample):
#     print(f"Predicate: {p}, Object: {o}, Data Type: {type(o)}, RDF DataType: {getattr(o, 'datatype', None)}")


### Edge Index

In [9]:
edge_index = []

for s, p, o in g:
    if s in node_to_idx and o in node_to_idx:
        edge_index.append([node_to_idx[s], node_to_idx[o]])

edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

### Train-Test-Val Split

In [10]:
# Step 1: Extract unique player IDs
unique_player_ids = set()
player_id_to_indices = {}

for idx, node in enumerate(players):
    for p, o in g.predicate_objects(node):
        if p == BASE.player_id:
            player_id_full = str(o)
            player_unique_id = player_id_full.split('_')[0]
            unique_player_ids.add(player_unique_id)

            # Map unique player ID to indices in node_list
            if player_unique_id not in player_id_to_indices:
                player_id_to_indices[player_unique_id] = []
            player_id_to_indices[player_unique_id].append(idx)

# Step 2: Shuffle and split player IDs
unique_player_ids = list(unique_player_ids)
np.random.shuffle(unique_player_ids)

num_players = len(unique_player_ids)
train_cutoff = int(0.6 * num_players)
val_cutoff = int(0.8 * num_players)

train_ids = unique_player_ids[:train_cutoff]
val_ids = unique_player_ids[train_cutoff:val_cutoff]
test_ids = unique_player_ids[val_cutoff:]

# Step 3: Create masks
train_mask = torch.zeros(len(node_list), dtype=torch.bool)
val_mask = torch.zeros(len(node_list), dtype=torch.bool)
test_mask = torch.zeros(len(node_list), dtype=torch.bool)  

for player_id in train_ids:
    for idx in player_id_to_indices[player_id]:
        train_mask[idx] = True

for player_id in val_ids:
    for idx in player_id_to_indices[player_id]:
        val_mask[idx] = True

for player_id in test_ids:
    for idx in player_id_to_indices[player_id]:
        test_mask[idx] = True

# Step 4: Create PyG Data Object
x = torch.tensor(features, dtype=torch.float)
y = torch.tensor(targets, dtype=torch.float)

# Ensure edge_index is already defined as in the previous code
data = Data(x=x, edge_index=edge_index, y=y, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
data.x = torch.tensor(StandardScaler().fit_transform(data.x), dtype=torch.float)

# Display the graph data
print(data)

Data(x=[1528, 52], edge_index=[2, 2344], y=[1528], train_mask=[1528], val_mask=[1528], test_mask=[1528])


In [11]:
print(f"Total nodes: {data.num_nodes}")
print(f"Player nodes: {len(players)}, Club nodes: {len(clubs)}, League nodes: {len(leagues)}")

Total nodes: 1528
Player nodes: 764, Club nodes: 618, League nodes: 146


In [12]:
def validate_split():
    train_players = set(train_ids)
    val_players = set(val_ids)
    test_players = set(test_ids)

    overlap_train_val = train_players.intersection(val_players)
    overlap_train_test = train_players.intersection(test_players)
    overlap_val_test = val_players.intersection(test_players)

    print("Validation Results:")
    print(f"Overlap between Train and Validation: {len(overlap_train_val)}")
    print(f"Overlap between Train and Test: {len(overlap_train_test)}")
    print(f"Overlap between Validation and Test: {len(overlap_val_test)}")

    assert len(overlap_train_val) == 0, "Overlap detected between Train and Validation sets!"
    assert len(overlap_train_test) == 0, "Overlap detected between Train and Test sets!"
    assert len(overlap_val_test) == 0, "Overlap detected between Validation and Test sets!"

    # Proportion Calculation
    total_players = len(train_players) + len(val_players) + len(test_players)
    print("\nProportion of each split:")
    print(f"Train Set: {len(train_players)} players ({(len(train_players)/total_players)*100:.2f}%)")
    print(f"Validation Set: {len(val_players)} players ({(len(val_players)/total_players)*100:.2f}%)")
    print(f"Test Set: {len(test_players)} players ({(len(test_players)/total_players)*100:.2f}%)")

    print("\nNo overlaps detected. The split is valid.")

# Run validation
validate_split()

Validation Results:
Overlap between Train and Validation: 0
Overlap between Train and Test: 0
Overlap between Validation and Test: 0

Proportion of each split:
Train Set: 114 players (59.69%)
Validation Set: 38 players (19.90%)
Test Set: 39 players (20.42%)

No overlaps detected. The split is valid.


## Model Preperation

In [13]:
# Step 1: Define GNN Model
class GNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim=1):
        super(GNN, self).__init__()
        
        # Graph Convolution Layers
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        
        # Fully Connected Layer for Regression
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        # Graph Convolution Layers with ReLU Activation
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)

        # Final Output Layer
        out = self.fc(x)
        return out.squeeze()  # Squeeze to match the shape of target (y)

# Step 2: Training Function
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    return loss.item()

# Step 3: Evaluation Function (Handles Both Validation & Test)
def evaluate(model, data, criterion, mask):
    model.eval()
    with torch.no_grad():
        out = model(data)
        loss = criterion(out[mask], data.y[mask])

        # Calculate MAE, RMSE, and R^2 Score
        mae = F.l1_loss(out[mask], data.y[mask]).item()
        rmse = torch.sqrt(F.mse_loss(out[mask], data.y[mask])).item()
        ss_res = torch.sum((data.y[mask] - out[mask]) ** 2)
        ss_tot = torch.sum((data.y[mask] - torch.mean(data.y[mask])) ** 2)
        r2 = 1 - ss_res / ss_tot

    return loss.item(), mae, rmse, r2.item()



In [None]:
# Step 4: Model Training Loop
input_dim = data.x.shape[1]  # Number of features per node
#hidden_dim = 64              # Hidden dimension size
hidden_dim = 32              # Hidden dimension size

model = GNN(input_dim, hidden_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
criterion = nn.MSELoss()

# Training Loop
num_epochs = 5000
best_val_loss = float("inf")  # Track best validation loss
best_model = None  # Store best model state

for epoch in range(1, num_epochs + 1):
    train_loss = train(model, data, optimizer, criterion)
    val_loss, val_mae, val_rmse, val_r2 = evaluate(model, data, criterion, data.val_mask)  # Use validation set

    # Save best model based on validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model.state_dict()

    if epoch % 500 == 0:
        print(f"Epoch {epoch:03d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val MAE: {val_mae:.4f} | Val RMSE: {val_rmse:.4f} | Val R^2: {val_r2:.4f}")

# Load best model before evaluating on test set
model.load_state_dict(best_model)

# Final Test Evaluation
test_loss, test_mae, test_rmse, test_r2 = evaluate(model, data, criterion, data.test_mask)
print(f"\nFinal Test Evaluation -> Loss: {test_loss:.4f} | MAE: {test_mae:.4f} | RMSE: {test_rmse:.4f} | R^2: {test_r2:.4f}")


Epoch 1000 | Train Loss: 6.8992 | Val Loss: 11.5937 | Val MAE: 2.7139 | Val RMSE: 3.4049 | Val R^2: 0.8485
Epoch 2000 | Train Loss: 1.6645 | Val Loss: 4.4612 | Val MAE: 1.7346 | Val RMSE: 2.1121 | Val R^2: 0.9417
Epoch 3000 | Train Loss: 0.6388 | Val Loss: 3.0053 | Val MAE: 1.4336 | Val RMSE: 1.7336 | Val R^2: 0.9607
Epoch 4000 | Train Loss: 0.3379 | Val Loss: 2.6106 | Val MAE: 1.3153 | Val RMSE: 1.6157 | Val R^2: 0.9659
Epoch 5000 | Train Loss: 0.1900 | Val Loss: 2.3177 | Val MAE: 1.2097 | Val RMSE: 1.5224 | Val R^2: 0.9697

Final Test Evaluation -> Loss: 1.7831 | MAE: 1.0381 | RMSE: 1.3353 | R^2: 0.9778
