# Graph Neural Networks in football analytics

This notebook aim is to explore and prototype methods to apply GNN in football analytics.

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import os

import sys
import warnings
warnings.filterwarnings('ignore')
sys.path.append("../source")

from bokeh.io import output_notebook, show
output_notebook()


# Match analytics library (In progress)
from Match_Analytics import Match
from Tracking_Dynamics import calc_player_norm_positions
from Tracking_Visualization import plot_sliding_window, play_match,  draw_pitch
from Tracking_Filters import possesion_filter, ball_position_filter, time_window
from Tracking_Statistics import bivariate_normal_distribution

#GNN imports

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import Adam

from spektral.data import Dataset, Graph, DisjointLoader, SingleLoader
from spektral.layers import ECCConv, GlobalSumPool, MessagePassing

from sklearn.preprocessing import MinMaxScaler


### Pass network dataset build

The first step is to read the data from all the matches available and build the pass network dataset to train the GNN.

In [2]:
m = [Match(data_source="metrica-sports", match_id=i) for i in [1,2]]

Initializing match: 1

Reading team: home
Reading team: away
Filtering dead time...

Match preprocessed successfully.

Initializing match: 2

Reading team: home
Reading team: away
Filtering dead time...

Match preprocessed successfully.



In [24]:
start_time = 0
end_time = 300

events = time_window(m[0].events, start_time, end_time)
tracking_home = time_window(m[0].tracking_home, start_time, end_time)
tracking_away = time_window(m[0].tracking_away, start_time, end_time)

home_stats = bivariate_normal_distribution(tracking_home, m[0].home_players)
away_stats = bivariate_normal_distribution(tracking_away, m[0].away_players)
nodes_features = pd.concat([home_stats,away_stats], ignore_index=True).astype({"player":'category'})
nodes_features['node_id'] = nodes_features['player'].cat.codes

In [25]:
nodes_features

Unnamed: 0,player_number,player,x_mean,y_mean,distance,normx_mean,normy_mean,cov_x_std,cov_y_std,cov_angle,x_std,y_std,cov_normx_std,cov_normy_std,cov_norm_angle,normx_std,normy_std,node_id
0,1,Home_1,-13.665268,-17.255951,640.068363,-0.362144,-1.278479,15.188261,8.339028,-0.021348,15.185844,8.34343,0.287986,0.236241,-0.222168,0.28569,0.239012,11
1,10,Home_10,10.076814,2.039163,573.575775,1.169225,0.442604,17.455848,11.189985,-0.670171,15.344733,13.944948,0.351818,0.702507,0.171962,0.366882,0.694759,12
2,11,Home_11,-42.185777,0.255707,205.274654,-2.181775,0.286284,5.999636,1.849838,0.071361,5.985819,1.894068,0.330044,0.565005,0.036421,0.330466,0.564758,13
3,2,Home_2,-16.47036,-9.302369,630.235987,-0.547495,-0.555841,15.456587,8.606973,-0.114999,15.386224,8.732134,0.279621,0.205559,-0.352841,0.27184,0.215744,14
4,3,Home_3,-17.151524,-0.34916,623.587627,-0.594931,0.233988,16.099048,7.540678,0.044067,16.08685,7.566667,0.360113,0.161211,0.710914,0.292455,0.26484,15
5,4,Home_4,-16.146418,11.783978,551.406082,-0.518871,1.313957,13.390284,7.612208,0.156618,13.27958,7.803728,0.152379,0.409343,0.287791,0.186676,0.394884,16
6,5,Home_5,1.968625,-19.277157,690.726313,0.61014,-1.468607,17.501119,8.081026,-0.071168,17.466272,8.15607,0.325584,0.28153,0.398621,0.319336,0.288598,17
7,6,Home_6,-1.659665,-7.052561,663.822569,0.376973,-0.361146,18.200579,10.858423,-0.357345,17.468733,12.000408,0.250879,0.507357,0.114298,0.25587,0.504858,18
8,7,Home_7,-8.032603,0.604457,668.9104,0.002225,0.325124,15.088343,10.469426,-0.189338,14.94913,10.667262,0.237576,0.341877,0.175486,0.241422,0.339172,19
9,8,Home_8,3.322796,13.554076,731.614848,0.683595,1.480911,19.702372,8.466795,-0.011732,19.701267,8.469367,0.367474,0.453787,0.311994,0.376452,0.446367,20


In [122]:
def build_graph(game, start_time, end_time):
    
    events = time_window(game.events, start_time, end_time)
    tracking_home = time_window(game.tracking_home, start_time, end_time)
    tracking_away = time_window(game.tracking_away, start_time, end_time)

    data = events[events['Type']=='PASS'].groupby(['From',
                                                             'To',
                                                             'Start Time [s]',
                                                             'End Time [s]',
                                                             'Start X',
                                                             'Start Y',
                                                             'End X',
                                                             'End Y']).size().reset_index(name="Freq")
    data = data.groupby(['From','To']).sum().reset_index()
    data = data.apply(lambda x: x/data['Freq'] if x.name in [ 'Start Time [s]', 
                                                              'End Time [s]',
                                                              'Start X',
                                                              'End X',
                                                              'Start Y', 
                                                              'End Y',
                                                             ] else x)
    
    home_stats = bivariate_normal_distribution(tracking_home, game.home_players)
    away_stats = bivariate_normal_distribution(tracking_away, game.away_players)
    nodes_features = pd.concat([home_stats,away_stats], ignore_index=True).sort_values('player').astype({"player":'category'})
    
    players_node_dict = dict(zip(nodes_features['player'].cat.categories,nodes_features['player'].cat.codes))
    
    data["To"] = data["To"].str.replace(" ","").apply(lambda y: players_node_dict[y])
    data["From"] = data["From"].str.replace(" ","").apply(lambda y: players_node_dict[y])

    a = data[["From", "To", "Freq"]]
    e = data[["Start Time [s]", "End Time [s]", "Start X", "Start Y", "End X", "End Y"]]
    x = nodes_features[['x_mean','y_mean','normx_mean','normy_mean']]
    y = nodes_features[['x_std', 'y_std', 'distance']]
    
    vals = np.unique(a[['From', 'To']])
    df2 = pd.DataFrame(0, index=vals, columns=vals)
    f = df2.index.get_indexer
    df2.values[f(a.From), f(a.To)] = a.Freq.values
    a = df2
    
    a = a.astype("float32")
    e = e.astype("float32")
    x = x.astype("float32")
    y = y.astype("float32")
    
    scaler_a = MinMaxScaler()
    scaler_e = MinMaxScaler()
    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()
    
    a = scaler_a.fit_transform(a)
    e = scaler_e.fit_transform(e)
    x = scaler_x.fit_transform(x)
    y = scaler_y.fit_transform(y)
    
    return a, e, x, y

In [124]:
a, e, x, y = build_graph(m[0], 0, 300)

In [126]:
m[1].events[m[1].events["To"] == "Away_ 26"]

Unnamed: 0,Team,Type,Subtype,Period,Start Frame,Start Time [s],End Frame,End Time [s],From,To,Start X,Start Y,End X,End Y
1536,Away,PASS,,2,102118,4084.72,102148,4085.92,Away_15,Away_ 26,14.84,-21.76,31.8,-29.24
1659,Away,PASS,,2,111530,4461.2,111570,4462.8,Away_20,Away_ 26,15.9,-3.4,27.56,1.36
1683,Away,PASS,GOAL KICK,2,118048,4721.92,118135,4725.4,Away_25,Away_ 26,-48.76,4.08,5.3,-19.72
1713,Away,PASS,,2,120717,4828.68,120747,4829.88,Away_15,Away_ 26,-25.44,-32.64,-13.78,-24.48
1726,Away,PASS,,2,122962,4918.48,123022,4920.88,Away_20,Away_ 26,-6.36,2.72,4.24,-29.24
1734,Away,PASS,,2,123425,4937.0,123447,4937.88,Away_15,Away_ 26,46.64,-33.32,50.88,-31.96
1764,Away,PASS,,2,126350,5054.0,126369,5054.76,Away_20,Away_ 26,-7.42,-15.64,-3.18,-25.16
1873,Away,PASS,,2,134529,5381.16,134549,5381.96,Away_20,Away_ 26,-2.12,-11.56,11.66,-12.92
1901,Away,PASS,CROSS,2,136020,5440.8,136060,5442.4,Away_23,Away_ 26,42.4,22.44,47.7,-2.72


In [127]:
class PassNetworkDataset(Dataset):
    """
    Pass Network Graph dataset from a list of matches, a rolling window size and time step must be provided.
    """
    def __init__(self, games, window_size, step_size, **kwargs):
        self.games = games
        self.window_size = window_size
        self.step_size = step_size
        
        super().__init__(**kwargs)
    def download(self):
        # Create the directory
        os.mkdir(self.path)

        # Write the data to file
        for game in self.games:
            start_time = 0
            end_time = start_time + self.window_size
            while end_time <= game.tracking_home["Time [s]"].max():
                start_time += self.step_size
                end_time += self.step_size
                a, e, x, y = build_graph(game, start_time, end_time)
                filename = os.path.join(self.path, f'graph_{start_time}_{end_time}')
                np.savez(filename, a=a, e=e, x=x, y=y)
    def read(self):
        # We must return a list of Graph objects
        output = []

        for game in self.games:
            start_time = 0
            end_time = start_time + self.window_size
            while end_time <= game.tracking_home["Time [s]"].max():
                start_time += self.step_size
                end_time += self.step_size
                data = np.load(os.path.join(self.path, f'graph_{start_time}_{end_time}.npz'))
                output.append(
                    Graph(x=data['x'], a=csr_matrix(data['a']), y=data['y'], e=data['e'])
                )

        return output

In [128]:
################################################################################
# Config
################################################################################
learning_rate = 1e-2  # Learning rate
epochs = 100  # Number of training epochs
batch_size = 32  # Batch size

In [129]:
################################################################################
# Load data
################################################################################
dataset = PassNetworkDataset(m, 60*5, 30)

# Parameters
F = dataset.n_node_features  # Dimension of node features
S = dataset.n_edge_features  # Dimension of edge features
n_out = dataset.n_labels  # Dimension of the target

# Train/valid/test split
idxs = np.random.permutation(len(dataset))
split_va, split_te = int(0.8 * len(dataset)), int(0.9 * len(dataset))
idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te])
dataset_tr = dataset[idx_tr]
dataset_va = dataset[idx_va]
dataset_te = dataset[idx_te]

KeyError: 'Away_ 26'

In [91]:
print(f"Train samples: {len(dataset_tr)}\n\
Validation samples: {len(dataset_va)}\n\
Test samples: {len(dataset_te)}")

Train samples: 290
Validation samples: 36
Test samples: 37


In [92]:
################################################################################
# Build model
################################################################################
class BaselineFootballGNN(Model):
    def __init__(self):
        super().__init__()
        self.conv1 = ECCConv(256, activation="relu")
        self.conv2 = ECCConv(256, activation="relu")
        self.conv3 = ECCConv(3, activation="relu")

    def call(self, inputs):
        x, a, e, i = inputs
        x = self.conv1([x, a, e])
        x = self.conv2([x, a, e])
        x = self.conv3([x, a, e])

        return x


model = BaselineFootballGNN()
optimizer = Adam(learning_rate)
loss_fn = MeanSquaredError()
model.compile(optimizer, loss_fn)

In [93]:
# Data loaders
loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs, node_level=True)
loader_va = DisjointLoader(dataset_va, batch_size=batch_size, node_level=True)
loader_te = DisjointLoader(dataset_te, batch_size=batch_size, node_level=True)
model.fit(loader_tr.load(), epochs=epochs, steps_per_epoch=loader_tr.steps_per_epoch)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100

KeyboardInterrupt: 

In [None]:
eval_results = model.evaluate(loader_va.load(), steps=1)
loader_va.steps_per_epoch

In [None]:
loader_test = DisjointLoader(dataset[0:1], batch_size=batch_size, node_level=True)
model_predictions = model.predict(loader_test.load(), steps=1)
model_predictions

In [None]:
dataset[0:1][0].y