# Graph Neural Networks in football analytics

This notebook aim is to explore and prototype methods to apply GNN in football analytics.

In [39]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix
import matplotlib.pyplot as plt
import os

import sys
import warnings
warnings.filterwarnings('ignore')
sys.path.append("../source")

from bokeh.io import output_notebook, show
output_notebook()


# Match analytics library (In progress)
from Match_Analytics import Match
from Tracking_Dynamics import calc_player_norm_positions
from Tracking_Visualization import plot_sliding_window, play_match,  draw_pitch
from Tracking_Filters import possesion_filter, ball_position_filter, time_window
from Tracking_Statistics import bivariate_normal_distribution

#GNN imports

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import Adam

from spektral.data import Dataset, Graph, DisjointLoader, SingleLoader
from spektral.layers import ECCConv, GlobalSumPool, MessagePassing
from spektral.transforms import LayerPreprocess, NormalizeAdj, NormalizeSphere

from sklearn.preprocessing import MinMaxScaler


### Pass network dataset build

The first step is to read the data from all the matches available and build the pass network dataset to train the GNN.

In [2]:
m = [Match(data_source="metrica-sports", match_id=i) for i in [1,2]]

Initializing match: 1

Reading team: home
Reading team: away
Filtering dead time...

Match preprocessed successfully.

Initializing match: 2

Reading team: home
Reading team: away
Filtering dead time...

Match preprocessed successfully.



In [3]:
start_time = 0
end_time = 300

events = time_window(m[0].events, start_time, end_time)
tracking_home = time_window(m[0].tracking_home, start_time, end_time)
tracking_away = time_window(m[0].tracking_away, start_time, end_time)

home_stats = bivariate_normal_distribution(tracking_home, m[0].home_players)
away_stats = bivariate_normal_distribution(tracking_away, m[0].away_players)
nodes_features = pd.concat([home_stats,away_stats], ignore_index=True).astype({"player":'category'})
nodes_features['node_id'] = nodes_features['player'].cat.codes

In [16]:
plot_sliding_window(m[1], normalized = "Both")

In [353]:
def build_graph(game, start_time, end_time):
    
    events = time_window(game.events, start_time, end_time)
    tracking_home = time_window(game.tracking_home, start_time, end_time)
    tracking_away = time_window(game.tracking_away, start_time, end_time)

    data = events[events['Type']=='PASS'].groupby(['From',
                                                             'To',
                                                             'Start Time [s]',
                                                             'End Time [s]',
                                                             'Start X',
                                                             'Start Y',
                                                             'End X',
                                                             'End Y']).size().reset_index(name="Freq")
    data = data.groupby(['From','To']).sum().reset_index()
    data = data.apply(lambda x: x/data['Freq'] if x.name in [ 'Start Time [s]', 
                                                              'End Time [s]',
                                                              'Start X',
                                                              'End X',
                                                              'Start Y', 
                                                              'End Y',
                                                             ] else x)
    
    home_stats = bivariate_normal_distribution(tracking_home, game.home_players)
    away_stats = bivariate_normal_distribution(tracking_away, game.away_players)
    nodes_features = pd.concat([home_stats,away_stats], ignore_index=True).sort_values('player').astype({"player":'category'})
    
    players_node_dict = dict(zip(nodes_features['player'].cat.categories,nodes_features['player'].cat.codes))
    
    data["To"] = data["To"].str.replace(" ","").apply(lambda y: players_node_dict[y])
    data["From"] = data["From"].str.replace(" ","").apply(lambda y: players_node_dict[y])
    data["Pass Time [s]"] = data["End Time [s]"] - data["Start Time [s]"]

    a = data[["From", "To", "Freq"]]
    e = data[["Pass Time [s]" , "Start X", "Start Y", "End X", "End Y"]]
    x = nodes_features[['x_mean','y_mean','normx_mean','normy_mean']]
    y = pd.DataFrame()
    y['home_distance'] = nodes_features[nodes_features['player'].str.split("_", expand = True)[0]=="Home"]['distance']
    y['away_distance'] = nodes_features[nodes_features['player'].str.split("_", expand = True)[0]=="Away"].reset_index()['distance']
    
    vals = np.unique(a[['From', 'To']])
    df2 = pd.DataFrame(0, index=vals, columns=vals)
    f = df2.index.get_indexer
    df2.values[f(a.From), f(a.To)] = a.Freq.values
    a = df2
    
    a = a.astype("float32")
    e = e.astype("float32")
    x = x.astype("float32")
    y = y.astype("float32").mean()
    
    return a, e, x, y

In [329]:
a, e, x, y, node_features = build_graph(m[0], 0, 300)

In [330]:
class PassNetworkDataset(Dataset):
    """
    Pass Network Graph dataset from a list of matches, a rolling window size and time step must be provided.
    """
    def __init__(self, games, window_size, step_size, **kwargs):
        self.games = games
        self.window_size = window_size
        self.step_size = step_size
        
        super().__init__(**kwargs)
    def download(self):
        # Create the directory
        os.mkdir(self.path)

        # Write the data to file
        for game in self.games:
            start_time = 0
            end_time = start_time + self.window_size
            while end_time <= game.tracking_home["Time [s]"].max():
                start_time += self.step_size
                end_time += self.step_size
                a, e, x, y = build_graph(game, start_time, end_time)
                filename = os.path.join(self.path, f'graph_{start_time}_{end_time}')
                np.savez(filename, a=a, e=e, x=x, y=y)
    def read(self):
        # We must return a list of Graph objects
        output = []

        for game in self.games:
            start_time = 0
            end_time = start_time + self.window_size
            while end_time <= game.tracking_home["Time [s]"].max():
                start_time += self.step_size
                end_time += self.step_size
                data = np.load(os.path.join(self.path, f'graph_{start_time}_{end_time}.npz'))
                output.append(
                    Graph(x=data['x'], a=csr_matrix(data['a']), y=data['y'], e=data['e'])
                )

        return output

In [354]:
################################################################################
# Config
################################################################################
learning_rate = 1e-2  # Learning rate
epochs = 200  # Number of training epochs
batch_size = 10  # Batch size

In [356]:
################################################################################
# Load data
################################################################################
dataset = PassNetworkDataset(m, 60*15, 30, transforms=[NormalizeAdj(symmetric=False)])

# Parameters
F = dataset.n_node_features  # Dimension of node features
S = dataset.n_edge_features  # Dimension of edge features
n_out = dataset.n_labels  # Dimension of the target

# Train/valid/test split
idxs = np.random.permutation(len(dataset))
split_va, split_te = int(0.8 * len(dataset)), int(0.9 * len(dataset))
idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te])
dataset_tr = dataset[idx_tr]
dataset_va = dataset[idx_va]
dataset_te = dataset[idx_te]

In [334]:
print(f"Train samples: {len(dataset_tr)}\n\
Validation samples: {len(dataset_va)}\n\
Test samples: {len(dataset_te)}")

Train samples: 258
Validation samples: 32
Test samples: 33


In [351]:
################################################################################
# Build model
################################################################################
class BaselineFootballGNN(Model):
    def __init__(self):
        super().__init__()
        self.conv1 = ECCConv(64, activation="relu")
        self.conv2 = ECCConv(64, activation="relu")
        self.conv3 = ECCConv(64, activation="relu")
        self.global_pool = GlobalSumPool()
        self.dense1 = Dense(200, activation="relu")
        self.dense2 = Dense(200, activation="relu")
        self.dense = Dense(n_out, activation="linear")

    def call(self, inputs):
        x, a, e, i = inputs
        x = self.conv1([x, a, e])
        x = self.conv2([x, a, e])
        x = self.conv3([x, a, e])
        output = self.global_pool([x, i])
        output = self.dense1(output)
        output = self.dense2(output)
        output = self.dense(output)

        return output


model = BaselineFootballGNN()
optimizer = Adam(learning_rate)
loss_fn = MeanSquaredError()
model.compile(optimizer, loss_fn)

In [352]:
# Data loaders
loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs)
loader_va = DisjointLoader(dataset_va, batch_size=batch_size)
loader_te = DisjointLoader(dataset_te, batch_size=batch_size)
model.fit(loader_tr.load(), epochs=epochs, steps_per_epoch=loader_tr.steps_per_epoch)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200

KeyboardInterrupt: 

In [337]:
eval_results = model.evaluate(loader_va.load(), steps=1)
loader_va.steps_per_epoch







4

In [348]:
loader_test = DisjointLoader(dataset[100:110], batch_size=batch_size)
model_predictions = model.predict(loader_test.load(), steps=1)
model_predictions

array([[1327.6465 , 1288.9764 ],
       [1652.6111 , 1559.8121 ],
       [1461.6605 , 1420.6996 ],
       [1805.9951 , 1695.9956 ],
       [1209.8632 , 1233.0382 ],
       [1268.924  , 1240.3374 ],
       [1734.7996 , 1641.4594 ],
       [1339.8123 , 1350.2394 ],
       [1264.9515 , 1228.7888 ],
       [ 984.0941 ,  924.41785]], dtype=float32)

In [347]:
dataset[100:110][9].y

array([1354.773 , 1355.0172], dtype=float32)