# Graph Neural Networks in football analytics

This notebook aim is to explore and prototype methods to apply GNN in football analytics.

In [11]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix
import matplotlib.pyplot as plt
import os

import sys
import warnings
warnings.filterwarnings('ignore')
sys.path.append("../source")

from bokeh.io import output_notebook, show
output_notebook()


# Match analytics library (In progress)
from Match_Analytics import Match
from Tracking_Dynamics import calc_player_norm_positions
from Tracking_Visualization import plot_sliding_window, play_match,  draw_pitch
from Tracking_Filters import possesion_filter, ball_position_filter, time_window
from Tracking_Statistics import bivariate_normal_distribution

#GNN imports

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import Adam

from spektral.data import Dataset, Graph, DisjointLoader, SingleLoader
from spektral.layers import ECCConv, GlobalSumPool, MessagePassing, GraphMasking
from spektral.transforms import LayerPreprocess, NormalizeAdj, NormalizeSphere

from sklearn.preprocessing import MinMaxScaler


### Pass network dataset build

The first step is to read the data from all the matches available and build the pass network dataset to train the GNN.

In [12]:
m = [Match(data_source="metrica-sports", match_id=i) for i in [1,2]]

Initializing match: 1

Reading team: home
Reading team: away
Match preprocessed successfully.

Initializing match: 2

Reading team: home
Reading team: away
Match preprocessed successfully.



In [16]:
plot_sliding_window(m[1], background_fill_color="white", grass_alpha=0, line_color="black")

In [18]:
def build_graph(game, start_time, end_time):
    
    events = time_window(game.events, start_time, end_time)
    tracking_home = time_window(game.tracking_home, start_time, end_time)
    tracking_away = time_window(game.tracking_away, start_time, end_time)

    data = events[events['Type']=='PASS'].groupby(['From',
                                                             'To',
                                                             'Start Time [s]',
                                                             'End Time [s]',
                                                             'Start X',
                                                             'Start Y',
                                                             'End X',
                                                             'End Y']).size().reset_index(name="Freq")
    data = data.groupby(['From','To']).sum().reset_index()
    data = data.apply(lambda x: x/data['Freq'] if x.name in [ 'Start Time [s]', 
                                                              'End Time [s]',
                                                              'Start X',
                                                              'End X',
                                                              'Start Y', 
                                                              'End Y',
                                                             ] else x)
    
    home_stats = bivariate_normal_distribution(tracking_home, game.home_players)
    away_stats = bivariate_normal_distribution(tracking_away, game.away_players)
    ball_stats = bivariate_normal_distribution(tracking_home, game.home_players, ball = True)
    nodes_features = pd.concat([home_stats,away_stats], ignore_index=True).sort_values('player').astype({"player":'category'})
    
    players_node_dict = dict(zip(nodes_features['player'].cat.categories,nodes_features['player'].cat.codes))
    
    data["To"] = data["To"].str.replace(" ","").apply(lambda y: players_node_dict[y])
    data["From"] = data["From"].str.replace(" ","").apply(lambda y: players_node_dict[y])
    data["Pass Time [s]"] = data["End Time [s]"] - data["Start Time [s]"]

    a = data[["From", "To", "Freq"]]
    e = data[["Pass Time [s]" , "Start X", "Start Y", "End X", "End Y"]]
    x = nodes_features[['x_mean','y_mean','normx_mean','normy_mean']]
    y = pd.DataFrame()
    y['ball_meanx'] = ball_stats["x_mean"]
    y['ball_meany'] = ball_stats["y_mean"]
    
    vals = np.unique(a[['From', 'To']])
    df2 = pd.DataFrame(0, index=vals, columns=vals)
    f = df2.index.get_indexer
    df2.values[f(a.From), f(a.To)] = a.Freq.values
    a = df2
    
    a = a.astype("float32")
    e = e.astype("float32")
    x = x.astype("float32")
    y = y.astype("float32").to_numpy()[0]
    
    return a, e, x, y

In [19]:
class PassNetworkDataset(Dataset):
    """
    Pass Network Graph dataset from a list of matches, a rolling window size and time step must be provided.
    """
    def __init__(self, games, window_size, step_size, **kwargs):
        self.games = games
        self.window_size = window_size
        self.step_size = step_size
        
        super().__init__(**kwargs)
    def download(self):
        # Create the directory
        os.mkdir(self.path)

        # Write the data to file
        for game in self.games:
            start_time = 0
            end_time = start_time + self.window_size
            while end_time <= game.tracking_home["Time [s]"].max():
                start_time += self.step_size
                end_time += self.step_size
                a, e, x, y = build_graph(game, start_time, end_time)
                filename = os.path.join(self.path, f'graph_{start_time}_{end_time}')
                np.savez(filename, a=a, e=e, x=x, y=y)
    def read(self):
        # We must return a list of Graph objects
        output = []

        for game in self.games:
            start_time = 0
            end_time = start_time + self.window_size
            while end_time <= game.tracking_home["Time [s]"].max():
                start_time += self.step_size
                end_time += self.step_size
                data = np.load(os.path.join(self.path, f'graph_{start_time}_{end_time}.npz'))
                output.append(
                    Graph(x=data['x'], a=csr_matrix(data['a']), y=data['y'], e=data['e'])
                )

        return output

In [20]:
################################################################################
# Config
################################################################################
learning_rate = 1e-3  # Learning rate
epochs = 1000  # Number of training epochs
batch_size = 10  # Batch size

In [21]:
################################################################################
# Load data
################################################################################
dataset = PassNetworkDataset(m, 60*5, 30, transforms=[NormalizeAdj(symmetric=False)])

# Parameters
F = dataset.n_node_features  # Dimension of node features
S = dataset.n_edge_features  # Dimension of edge features
n_out = dataset.n_labels  # Dimension of the target

# Train/valid/test split
idxs = np.random.permutation(len(dataset))
split_va, split_te = int(0.8 * len(dataset)), int(0.9 * len(dataset))
idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te])
dataset_tr = dataset[idx_tr]
dataset_va = dataset[idx_va]
dataset_te = dataset[idx_te]

In [24]:
print(f"Train samples: {len(dataset_tr)}\n\
Validation samples: {len(dataset_va)}\n\
Test samples: {len(dataset_te)}")

Train samples: 290
Validation samples: 36
Test samples: 37


In [47]:
################################################################################
# Build model
################################################################################
class BaselineFootballGNN(Model):
    def __init__(self):
        super().__init__()
        self.conv1 = ECCConv(64, activation="relu")
        self.global_pool = GlobalSumPool()
        self.dense1 = Dense(256, activation="relu")
        self.dense2 = Dense(256, activation="relu")
        self.dense = Dense(n_out, activation="linear")

    def call(self, inputs):
        x, a, e, i = inputs
        x = self.conv1([x, a, e])
        output = self.dense1(x)
        output = self.dense2(output)
        output = self.dense(output)

        return output


model = BaselineFootballGNN()
optimizer = Adam(learning_rate)
loss_fn = MeanSquaredError()
model.compile(optimizer, loss_fn)

In [48]:
# Data loaders
loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs)
loader_va = DisjointLoader(dataset_va, batch_size=batch_size)
loader_te = DisjointLoader(dataset_te, batch_size=batch_size)
model.fit(loader_tr.load(), epochs=epochs, steps_per_epoch=loader_tr.steps_per_epoch)

Epoch 1/1000


InvalidArgumentError:  Incompatible shapes: [223,2] vs. [10,2]
	 [[node gradient_tape/mean_squared_error/BroadcastGradientArgs (defined at var/folders/9y/t37n4bgs7mq466bvfxxgcnk00000gn/T/ipykernel_81108/3152981193.py:5) ]] [Op:__inference_train_function_1211498]

Function call stack:
train_function


In [27]:
model.summary()

Model: "baseline_football_gnn_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
global_sum_pool_2 (GlobalSum multiple                  0         
_________________________________________________________________
dense_6 (Dense)              multiple                  1280      
_________________________________________________________________
dense_7 (Dense)              multiple                  65792     
_________________________________________________________________
dense_8 (Dense)              multiple                  514       
Total params: 67,586
Trainable params: 67,586
Non-trainable params: 0
_________________________________________________________________


In [29]:
eval_results = model.evaluate(loader_va.load(), steps=1)



In [30]:
loader_test = DisjointLoader(dataset, batch_size=10)
model_predictions = model.predict(loader_test.load(), steps=1)
model_predictions

array([[  7.812377  ,  -5.689783  ],
       [-10.165786  ,  -4.4672236 ],
       [  5.1535234 ,  -0.7932325 ],
       [-13.591016  ,  -5.9459047 ],
       [ -7.559742  ,   1.4153159 ],
       [  1.9084967 ,  -6.59728   ],
       [  5.250418  ,   0.86373526],
       [  7.3023896 ,  -1.6272063 ],
       [ -8.373605  ,   4.7530117 ],
       [ -7.721531  ,  -5.2628202 ]], dtype=float32)

In [40]:
loader_test.dataset[9].y

array([-6.212249 , -3.8676364], dtype=float32)