# Graph Neural Networks in football analytics

This notebook aim is to explore and prototype methods to apply GNN in football analytics.

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import os

import sys
import warnings
warnings.filterwarnings('ignore')
sys.path.append("../source")

from bokeh.io import output_notebook, show
output_notebook()


# Match analytics library (In progress)
from Match_Analytics import Match
from Tracking_Dynamics import calc_player_norm_positions
from Tracking_Visualization import plot_sliding_window, play_match,  draw_pitch
from Tracking_Filters import possesion_filter, ball_position_filter, time_window
from Tracking_Statistics import bivariate_normal_distribution

#GNN imports

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import Adam

from spektral.data import Dataset, Graph, DisjointLoader, SingleLoader
from spektral.layers import ECCConv, GlobalSumPool, MessagePassing
from spektral.transforms import LayerPreprocess, NormalizeAdj, NormalizeSphere

from sklearn.preprocessing import MinMaxScaler


### Pass network dataset build

The first step is to read the data from all the matches available and build the pass network dataset to train the GNN.

In [2]:
m = [Match(data_source="metrica-sports", match_id=i) for i in [1,2]]

Initializing match: 1

Reading team: home
Reading team: away
Filtering dead time...

Match preprocessed successfully.

Initializing match: 2

Reading team: home
Reading team: away
Filtering dead time...

Match preprocessed successfully.



In [354]:
start_time = 0
end_time = 300

events = time_window(m[0].events, start_time, end_time)
tracking_home = time_window(m[0].tracking_home, start_time, end_time)
tracking_away = time_window(m[0].tracking_away, start_time, end_time)

home_stats = bivariate_normal_distribution(tracking_home, m[0].home_players)
away_stats = bivariate_normal_distribution(tracking_away, m[0].away_players)
nodes_features = pd.concat([home_stats,away_stats], ignore_index=True).astype({"player":'category'})
nodes_features['node_id'] = nodes_features['player'].cat.codes

In [72]:
plot_sliding_window(m[1], normalized = "Both")

In [71]:
m[1].tracking_home.shape[0]*0.04/60


66.002

In [47]:
def build_graph(game, start_time, end_time):
    
    events = time_window(game.events, start_time, end_time)
    tracking_home = time_window(game.tracking_home, start_time, end_time)
    tracking_away = time_window(game.tracking_away, start_time, end_time)

    data = events[events['Type']=='PASS'].groupby(['From',
                                                             'To',
                                                             'Start Time [s]',
                                                             'End Time [s]',
                                                             'Start X',
                                                             'Start Y',
                                                             'End X',
                                                             'End Y']).size().reset_index(name="Freq")
    data = data.groupby(['From','To']).sum().reset_index()
    data = data.apply(lambda x: x/data['Freq'] if x.name in [ 'Start Time [s]', 
                                                              'End Time [s]',
                                                              'Start X',
                                                              'End X',
                                                              'Start Y', 
                                                              'End Y',
                                                             ] else x)
    
    home_stats = bivariate_normal_distribution(tracking_home, game.home_players)
    away_stats = bivariate_normal_distribution(tracking_away, game.away_players)
    nodes_features = pd.concat([home_stats,away_stats], ignore_index=True).sort_values('player').astype({"player":'category'})
    
    players_node_dict = dict(zip(nodes_features['player'].cat.categories,nodes_features['player'].cat.codes))
    
    data["To"] = data["To"].str.replace(" ","").apply(lambda y: players_node_dict[y])
    data["From"] = data["From"].str.replace(" ","").apply(lambda y: players_node_dict[y])
    data["Pass Time [s]"] = data["End Time [s]"] - data["Start Time [s]"]

    a = data[["From", "To", "Freq"]]
    e = data[["Pass Time [s]" , "Start X", "Start Y", "End X", "End Y"]]
    x = nodes_features[['x_mean','y_mean','normx_mean','normy_mean']]
    y = nodes_features[['x_std', 'y_std', 'distance']]
    
    vals = np.unique(a[['From', 'To']])
    df2 = pd.DataFrame(0, index=vals, columns=vals)
    f = df2.index.get_indexer
    df2.values[f(a.From), f(a.To)] = a.Freq.values
    a = df2
    
    a = a.astype("float32")
    e = e.astype("float32")
    x = x.astype("float32")
    y = y.astype("float32")
    
    return a, e, x, y

In [48]:
a, e, x, y = build_graph(m[0], 0, 60*15)

In [53]:
e

Unnamed: 0,Pass Time [s],Start X,Start Y,End X,End Y
0,1.44,16.959999,10.200000,26.500000,8.160000
1,0.64,5.300000,21.080000,-5.300000,12.920000
2,1.14,0.530000,22.100000,-6.360000,18.360001
3,0.92,-12.720000,28.559999,-26.500000,19.719999
4,2.12,3.180000,2.040000,-2.120000,-19.040001
...,...,...,...,...,...
75,1.36,34.980000,-0.680000,34.980000,-2.720000
76,2.80,-10.600000,-2.040000,-14.840000,-27.879999
77,0.80,3.180000,-6.120000,0.000000,-10.200000
78,1.20,48.759998,27.200001,34.980000,33.320000


In [56]:
scaler_e = MinMaxScaler()
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

scaler_e.fit(e)
scaler_x.fit(x)
scaler_y.fit(y)

MinMaxScaler()

In [57]:
class PassNetworkDataset(Dataset):
    """
    Pass Network Graph dataset from a list of matches, a rolling window size and time step must be provided.
    """
    def __init__(self, games, window_size, step_size, **kwargs):
        self.games = games
        self.window_size = window_size
        self.step_size = step_size
        
        super().__init__(**kwargs)
    def download(self):
        # Create the directory
        os.mkdir(self.path)

        # Write the data to file
        for game in self.games:
            start_time = 0
            end_time = start_time + self.window_size
            while end_time <= game.tracking_home["Time [s]"].max():
                start_time += self.step_size
                end_time += self.step_size
                a, e, x, y = build_graph(game, start_time, end_time)
                filename = os.path.join(self.path, f'graph_{start_time}_{end_time}')
                np.savez(filename, a=a, e=e, x=x, y=y)
    def read(self):
        # We must return a list of Graph objects
        output = []

        for game in self.games:
            start_time = 0
            end_time = start_time + self.window_size
            while end_time <= game.tracking_home["Time [s]"].max():
                start_time += self.step_size
                end_time += self.step_size
                data = np.load(os.path.join(self.path, f'graph_{start_time}_{end_time}.npz'))
                output.append(
                    Graph(x=scaler_x.transform(data['x']), a=csr_matrix(data['a']), y=scaler_y.transform(data['y']), e=scaler_e.transform(data['e']))
                )

        return output

In [58]:
################################################################################
# Config
################################################################################
learning_rate = 1e-2  # Learning rate
epochs = 50  # Number of training epochs
batch_size = 32  # Batch size

In [59]:
################################################################################
# Load data
################################################################################
dataset = PassNetworkDataset(m, 60*15, 30, transforms=[NormalizeAdj(symmetric=False)])

# Parameters
F = dataset.n_node_features  # Dimension of node features
S = dataset.n_edge_features  # Dimension of edge features
n_out = dataset.n_labels  # Dimension of the target

# Train/valid/test split
idxs = np.random.permutation(len(dataset))
split_va, split_te = int(0.8 * len(dataset)), int(0.9 * len(dataset))
idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te])
dataset_tr = dataset[idx_tr]
dataset_va = dataset[idx_va]
dataset_te = dataset[idx_te]

In [60]:
print(dataset[0].y)

[[ 0.85937476  0.4579314   0.8301678 ]
 [ 0.70138156  0.47048146  0.7667153 ]
 [ 0.76896274  0.41308022  0.78033864]
 [ 0.8794972   0.5040419   0.7907065 ]
 [ 0.974702    0.4812039   1.0077597 ]
 [ 0.8912078   0.59283185  0.93285656]
 [ 0.7893902   0.55070525  0.8503872 ]
 [ 0.9100988   0.67604643  0.7482178 ]
 [ 0.7490555   0.5887899   0.7612901 ]
 [ 0.8051895   0.7193366   0.7975198 ]
 [ 0.06301826 -0.05469709 -0.02409491]
 [ 1.0759875   0.4812445   0.9010371 ]
 [ 0.8470253   0.51641774  0.8303685 ]
 [-0.03149694  0.03104673  0.12039617]
 [ 0.6200862   0.4882776   0.73637676]
 [ 0.65958416  0.4391902   0.704365  ]
 [ 0.8201915   0.43248338  0.9048121 ]
 [ 0.92322123  0.6692934   0.9741174 ]
 [ 0.68014646  0.65184927  0.846812  ]
 [ 1.1069009   0.5788281   1.1032739 ]
 [ 0.96744     0.64040256  1.0158182 ]
 [ 0.83513117  0.80810535  0.7128906 ]]


In [61]:
print(f"Train samples: {len(dataset_tr)}\n\
Validation samples: {len(dataset_va)}\n\
Test samples: {len(dataset_te)}")

Train samples: 258
Validation samples: 32
Test samples: 33


In [62]:
################################################################################
# Build model
################################################################################
class BaselineFootballGNN(Model):
    def __init__(self):
        super().__init__()
        self.conv1 = ECCConv(256, activation="relu")
        self.conv2 = ECCConv(256, activation="relu")
        self.conv3 = ECCConv(3, activation="linear")

    def call(self, inputs):
        x, a, e, i = inputs
        x = self.conv1([x, a, e])
        x = self.conv2([x, a, e])
        x = self.conv3([x, a, e])

        return x


model = BaselineFootballGNN()
optimizer = Adam(learning_rate)
loss_fn = MeanSquaredError()
model.compile(optimizer, loss_fn)

2022-05-04 18:25:45.668610: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-05-04 18:25:45.675967: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [63]:
# Data loaders
loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs, node_level=True)
loader_va = DisjointLoader(dataset_va, batch_size=batch_size, node_level=True)
loader_te = DisjointLoader(dataset_te, batch_size=batch_size, node_level=True)
model.fit(loader_tr.load(), epochs=epochs, steps_per_epoch=loader_tr.steps_per_epoch)

2022-05-04 18:25:50.711122: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50

KeyboardInterrupt: 

In [64]:
eval_results = model.evaluate(loader_va.load(), steps=1)
loader_va.steps_per_epoch



1

In [66]:
loader_test = DisjointLoader(dataset[0:1], batch_size=batch_size, node_level=True)
model_predictions = model.predict(loader_test.load(), steps=1)
model_predictions

array([[0.5149211 , 0.36516303, 0.3375267 ],
       [0.46559933, 0.4220923 , 0.44220665],
       [0.26492107, 0.21083173, 0.2758105 ],
       [0.6542553 , 0.5339158 , 0.6632182 ],
       [0.7345098 , 0.47384384, 0.48192564],
       [0.87060994, 0.6768361 , 0.7697088 ],
       [0.6747756 , 0.43659237, 0.48345545],
       [0.4678639 , 0.2838161 , 0.397673  ],
       [0.10919253, 0.12212954, 0.1597865 ],
       [0.6920363 , 0.39784393, 0.5773543 ],
       [0.0455641 , 0.0210865 , 0.0533072 ],
       [1.38106   , 1.2914697 , 1.6692159 ],
       [0.9171086 , 0.8241428 , 0.99389106],
       [1.1821918 , 0.6820285 , 0.7825303 ],
       [1.1514484 , 0.88262767, 1.0046681 ],
       [0.24845013, 0.177832  , 0.14843126],
       [1.4584209 , 1.0250872 , 1.1495156 ],
       [1.5038136 , 1.0829247 , 1.4810326 ],
       [1.1033167 , 0.7646451 , 0.9053337 ],
       [0.927487  , 0.62843335, 0.61118025],
       [1.2468741 , 0.934214  , 1.0038227 ],
       [1.2089005 , 1.0003514 , 1.2860439 ]], dtype=flo

ERROR:bokeh.server.protocol_handler:error handling message
 message: Message 'PATCH-DOC' content: {'events': [{'kind': 'ModelChanged', 'model': {'id': '242389'}, 'attr': 'value', 'new': [0, 4954.8]}], 'references': []} 
 error: KeyError(13)
Traceback (most recent call last):
  File "/Users/isidre/Documents/UAB/TFM/FootballAnalyticsTFM/.venv1/lib/python3.8/site-packages/pandas/core/indexes/range.py", line 385, in get_loc
    return self._range.index(new_key)
ValueError: 13 is not in range

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/isidre/Documents/UAB/TFM/FootballAnalyticsTFM/.venv1/lib/python3.8/site-packages/bokeh/server/protocol_handler.py", line 97, in handle
    work = await handler(message, connection)
  File "/Users/isidre/Documents/UAB/TFM/FootballAnalyticsTFM/.venv1/lib/python3.8/site-packages/bokeh/server/session.py", line 93, in _needs_document_lock_wrapper
    result = func(self, *args, **kwargs)
 

In [331]:
dataset[0:1][0].y

array([[ 0.5054156 ,  0.47939983,  0.65110755],
       [ 0.34424847,  0.50711954,  0.6371627 ],
       [ 0.38118178,  0.46473727,  0.6420908 ],
       [ 0.5798493 ,  0.5088868 ,  0.72407615],
       [ 0.72201926,  0.4906737 ,  1.0142263 ],
       [ 0.52551764,  0.5673492 ,  0.7857516 ],
       [ 0.5075229 ,  0.57717705,  0.7731726 ],
       [ 0.7242132 ,  0.6115916 ,  0.75973845],
       [ 0.51484567,  0.56948125,  0.7098863 ],
       [ 0.54580337,  0.7217463 ,  0.7224225 ],
       [-0.1081565 , -0.02951264, -0.09644738],
       [ 0.7993551 ,  0.4531122 ,  0.88478553],
       [ 0.45979744,  0.5064608 ,  0.67321694],
       [-0.10407141,  0.04832491,  0.07958069],
       [ 0.4484101 ,  0.5318415 ,  0.6768297 ],
       [ 0.509005  ,  0.5153173 ,  0.67696416],
       [ 0.59902054,  0.47518405,  0.8646128 ],
       [ 0.6943211 ,  0.7428477 ,  0.87494826],
       [ 0.4611054 ,  0.7209219 ,  0.81228316],
       [ 0.71425873,  0.55930436,  1.0077678 ],
       [ 0.6149476 ,  0.44961008,  0.931