# Graph Neural Network

In [34]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

import pandas as pd
import numpy as np
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, OWL, XSD

import os

In [35]:
os.getcwd()

'c:\\mahmoud uni\\TU\\SS2024\\KGs\\Portfolio'

## Load Knowledge Graph

In [37]:
# # Load the ontology
g = Graph()
ontology_file = r"C:\mahmoud uni\TU\SS2024\KGs\Portfolio\dataset\EA_FC_knowledge_graph_small.ttl" 
g.parse(ontology_file, format="ttl")

# g = Graph()
# ontology_file = r"dataset\EA_FC_knowledge_graph.nt" 
# g.parse(ontology_file, format="nt")


<Graph identifier=Nf526e332ff614d198b7471099ca5400e (<class 'rdflib.graph.Graph'>)>

In [38]:
# Other namespaces 
OWL = Namespace("http://www.w3.org/2002/07/owl#")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
BASE = Namespace("http://www.semanticweb.org/mabsa/ontologies/2024/10/ea-fc-ontology/")

In [39]:
# Count all triples
print(f"Total triples: {len(g)}")

# Check for specific class membership
player_count = len(list(g.subjects(RDF.type, BASE.Player)))
print(f"Total Players: {player_count}")

team_count = len(list(g.subjects(RDF.type, BASE.Club)))
print(f"Total Teams: {team_count}")

league_count = len(list(g.subjects(RDF.type, BASE.League)))
print(f"Total Leagues: {league_count}")

Total triples: 3854106
Total Players: 73350
Total Teams: 2687
Total Leagues: 181


## Define Graph Convolutional Neural Network

In [40]:
# Step 1: Extract Entities and Relationships
entities = set()  # Initialize a set to store all unique entities (nodes).
relations = set()  # Initialize a set to store all unique relationships (edges).

# Iterate through all triples in the knowledge graph.
for s, p, o in g.triples((None, None, None)):
    entities.add(str(s))  # Add the subject as an entity.
    entities.add(str(o))  # Add the object as an entity.
    relations.add(str(p))  # Add the predicate as a relation.

# Create a mapping of entities and relations to unique indices.
entity_to_index = {entity: idx for idx, entity in enumerate(entities)}
relation_to_index = {relation: idx for idx, relation in enumerate(relations)}

# Print the total number of entities and relations for debugging.
print(f"Total Entities: {len(entity_to_index)}")
print(f"Total Relations: {len(relation_to_index)}")

Total Entities: 188219
Total Relations: 76


In [41]:
# Step 2: Create Edge List
edge_index = []  # List to store the source and target node indices for each edge.
edge_attr = []  # List to store the relation type for each edge.

# Iterate through all triples in the knowledge graph.
for s, p, o in g.triples((None, None, None)):
    edge_index.append([entity_to_index[str(s)], entity_to_index[str(o)]])  # Map entities to indices and store the edge.
    edge_attr.append(relation_to_index[str(p)])  # Map the relation to its index and store it.

# Convert edge list to a PyTorch tensor and transpose it to match the format (2, num_edges).
edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
# Convert edge attributes to a PyTorch tensor.
edge_attr = torch.tensor(edge_attr, dtype=torch.long)

# Print the shapes of edge index and edge attributes for verification.
print(f"Edge Index Shape: {edge_index.shape}")
print(f"Edge Attributes Shape: {edge_attr.shape}")

Edge Index Shape: torch.Size([2, 3854106])
Edge Attributes Shape: torch.Size([3854106])


In [42]:
# Step 3: Generate Node Features
node_features = []  # List to store feature vectors for all nodes.

# Iterate through all entities in the graph.
for entity in entities:
    if "Player" in entity:  # Check if the entity is a player.
        # Extract player-specific features from the knowledge graph.
        features = [
            g.value(URIRef(entity), BASE.age, default=0),  # Player's age.
            g.value(URIRef(entity), BASE.pace, default=0),  # Player's pace rating.
            g.value(URIRef(entity), BASE.dribbling, default=0),  # Player's dribbling rating.
            g.value(URIRef(entity), BASE.shooting, default=0),  # Player's shooting rating.
            g.value(URIRef(entity), BASE.physic, default=0)  # Player's physical rating.
        ]
        # Append the features as a list of floats.
        node_features.append([float(f) for f in features])
    else:
        # For non-player entities, append placeholder features (all zeros).
        node_features.append([0.0] * 5)

# Convert node features to a PyTorch tensor.
node_features = torch.tensor(node_features, dtype=torch.float)
# Print the shape of the node feature matrix for verification.
print(f"Node Features Shape: {node_features.shape}")




Node Features Shape: torch.Size([188219, 5])


In [43]:
# Step 4: Generate Labels
labels = []  # List to store labels for all nodes.

# Iterate through all entities in the graph.
for entity in entities:
    if "Player" in entity:  # Check if the entity is a player.
        potential = g.value(URIRef(entity), BASE.potential, default=0)  # Extract the player's potential rating.
        labels.append(float(potential))  # Append the potential rating as a float.
    else:
        labels.append(-1.0)  # Assign a default label (-1) to non-player entities.

# Convert labels to a PyTorch tensor.
labels = torch.tensor(labels, dtype=torch.float)
# Print the shape of the label tensor for verification.
print(f"Labels Shape: {labels.shape}")




Labels Shape: torch.Size([188219])


In [44]:
# Step 5: Combine into a PyTorch Geometric Data Object

# Create a PyTorch Geometric Data object with nodes, edges, and labels.
data = Data(
    x=node_features,       # Node features matrix.
    edge_index=edge_index,  # Edge list (source and target node indices).
    edge_attr=edge_attr,   # Edge attributes (relation types).
    y=labels               # Labels for nodes.
)

# Print the Data object to inspect its components.
print(data)


Data(x=[188219, 5], edge_index=[2, 3854106], edge_attr=[3854106], y=[188219])


### P2

In [45]:
import torch
from torch_geometric.nn import GCNConv
from torch.nn import Linear
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

# Define the GNN Model
class GNNModel(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNNModel, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)  # First GCN layer
        self.conv2 = GCNConv(hidden_dim, hidden_dim)  # Second GCN layer
        self.fc = Linear(hidden_dim, output_dim)  # Fully connected layer for regression

    def forward(self, x, edge_index):
        # Graph Convolution Layers
        x = self.conv1(x, edge_index)  # First graph convolution
        x = F.relu(x)  # Activation function
        x = self.conv2(x, edge_index)  # Second graph convolution
        x = F.relu(x)  # Activation function
        
        # Fully connected layer for regression output
        x = self.fc(x)
        return x


In [48]:

# Define Hyperparameters
input_dim = data.x.shape[1]  # Number of features per node
hidden_dim = 64  # Number of hidden units in GCN layers
output_dim = 1  # Single output for regression (player potential)

# Initialize Model
model = GNNModel(input_dim, hidden_dim, output_dim)


# Split Data into Train/Test
train_mask, test_mask = train_test_split(
    torch.arange(data.num_nodes),
    test_size=0.2,
    random_state=42
)  # Randomly split nodes into training and testing sets


# Define Optimizer and Loss Function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)  # Adam optimizer
criterion = torch.nn.MSELoss()  # Mean Squared Error loss for regression


In [49]:
# Move data and model to GPU if available
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'
model = model.to(device)
data = data.to(device)


In [50]:
# Training Loop
for epoch in range(100):  # Train for 100 epochs
    model.train()  # Set the model to training mode
    optimizer.zero_grad()  # Zero the gradients from the last step
    
    # Forward pass
    out = model(data.x, data.edge_index)
    
    # Compute loss (only for training nodes)
    loss = criterion(out[train_mask], data.y[train_mask])
    
    # Backward pass
    loss.backward()
    optimizer.step()
    
    # Print progress
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

    
# Evaluation
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    # Forward pass for test nodes
    predictions = model(data.x, data.edge_index)[test_mask]
    true_values = data.y[test_mask]
    
    # Compute Mean Squared Error on test nodes
    mse = criterion(predictions, true_values).item()
    print(f"Test MSE: {mse}")


  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 90691322500 bytes.

In [None]:
# from torch_geometric.loader import NeighborLoader

# from sklearn.model_selection import train_test_split

# # Assuming `data.y` contains the target labels for all nodes
# num_nodes = data.num_nodes
# indices = np.arange(num_nodes)

# train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)
# train_idx, val_idx = train_test_split(train_idx, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# data.train_mask = torch.zeros(num_nodes, dtype=torch.bool)
# data.val_mask = torch.zeros(num_nodes, dtype=torch.bool)
# data.test_mask = torch.zeros(num_nodes, dtype=torch.bool)

# data.train_mask[train_idx] = True
# data.val_mask[val_idx] = True
# data.test_mask[test_idx] = True



# # Training loop
# def train_model(model, data, train_loader, optimizer, criterion, epochs=10):
#     model.train()  # Set the model to training mode
#     for epoch in range(epochs):
#         total_loss = 0
#         for batch in train_loader:
#             # Move data to the correct device
#             batch = batch.to(device)
#             optimizer.zero_grad()  # Clear gradients
            
#             # Forward pass
#             out = model(batch.x, batch.edge_index)
            
#             # Compute loss for training nodes
#             loss = criterion(out[batch.train_mask], batch.y[batch.train_mask])
#             loss.backward()  # Backward pass
#             optimizer.step()  # Update weights
            
#             total_loss += loss.item()  # Accumulate loss

#         print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

# # Define the GNN Model
# class GNNModel(torch.nn.Module):
#     def __init__(self, in_channels, hidden_channels, out_channels):
#         super(GNNModel, self).__init__()
#         self.conv1 = GCNConv(in_channels, hidden_channels)
#         self.conv2 = GCNConv(hidden_channels, out_channels)
#         self.fc = Linear(out_channels, 1)  # For regression (predicting potential)

#     def forward(self, x, edge_index):
#         x = self.conv1(x, edge_index)
#         x = F.relu(x)
#         x = self.conv2(x, edge_index)
#         return self.fc(x)

# # Initialize the model
# model = GNNModel(in_channels=5, hidden_channels=32, out_channels=16).to(device)


# # Optimizer and loss function
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# criterion = torch.nn.MSELoss()

# train_loader = NeighborLoader(
#     data, 
#     num_neighbors=[10, 10], 
#     batch_size=1024, 
#     input_nodes=data.train_mask,
#     shuffle=True,
# )


# # Train the model
# train_model(model, data, train_loader, optimizer, criterion, epochs=20)


ImportError: 'NeighborSampler' requires either 'pyg-lib' or 'torch-sparse'