In [1]:
%load_ext autoreload
%autoreload 2
import os; import sys; sys.path.insert(0, '../')
import pandas as pd
import tqdm
import pickle

import numpy as np
import warnings

In [2]:
### Configure file and folder names
data_h5 = "../data/paper/soccermix_all_data.h5"

d_weights = "../data/paper/soccermix_all_dirweights.pkl"

spadl_h5 = "../data/tomd/spadl-statsbomb.h5"

In [3]:
X = pd.read_hdf(data_h5, "X")

In [4]:
def loadall(filename):
    with open(filename, "rb") as f:
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                break

d_w = loadall(d_weights)

In [5]:
dir_weights = next(d_w)

In [6]:
games = pd.read_hdf(spadl_h5, "games")

games_1819 = games[games.season_name == '2018/2019']
games_1718 = games[games.season_name == '2017/2018']

In [7]:
players = pd.read_hdf(spadl_h5, "players")
pg = pd.read_hdf(spadl_h5, "player_games")

In [8]:
pg_1819 = pg[pg.game_id.isin(games_1819.game_id)]
pg_1718 = pg[pg.game_id.isin(games_1718.game_id)]

In [9]:
players_1819 = players[players.player_id.isin(pg_1819.player_id)]
players_1718 = players[players.player_id.isin(pg_1718.player_id)]

In [10]:
mp_1819 = pg_1819[["player_id", "minutes_played"]].groupby("player_id").sum().reset_index()
mp_1718 = pg_1718[["player_id", "minutes_played"]].groupby("player_id").sum().reset_index()

In [11]:
# Get player vectors

merged_weights = dir_weights.copy()
merged_weights["player_id"] = X.player_id.values
merged_weights["game_id"] = X.game_id.values

vectors_1718 = {}
for p in tqdm.tqdm(list(players_1718.player_id.unique())):
    vectors_1718[int(p)] = merged_weights.loc[((merged_weights.player_id == p)
                                              & (merged_weights.game_id.isin(games_1718.game_id))),
                                              dir_weights.columns].sum().values
    
vectors_1819 = {}
for p in tqdm.tqdm(list(players_1819.player_id.unique())):
    vectors_1819[int(p)] = merged_weights.loc[((merged_weights.player_id == p)
                                              & (merged_weights.game_id.isin(games_1819.game_id))),
                                              dir_weights.columns].sum().values
    
vectors_1718_pd = pd.concat({k: pd.DataFrame(v).T for k,v in vectors_1718.items()}).droplevel(level=1)
vectors_1718_pd.index.name = "player_id"
vectors_1718_pd.columns = dir_weights.columns

vectors_1819_pd = pd.concat({k: pd.DataFrame(v).T for k,v in vectors_1819.items()}).droplevel(level=1)
vectors_1819_pd.index.name = "player_id"
vectors_1819_pd.columns = dir_weights.columns

100%|██████████| 515/515 [02:00<00:00,  4.29it/s]
100%|██████████| 505/505 [01:46<00:00,  4.73it/s]


In [12]:
# Normalize vectors per 90 min game time

vectors_1718_norm = pd.merge(vectors_1718_pd, mp_1718, left_index=True, right_on='player_id').set_index('player_id')
df1 = vectors_1718_norm.loc[:, dir_weights.columns] * 90
vectors_1718_norm.loc[:, dir_weights.columns] = df1.divide(vectors_1718_norm.minutes_played, axis='rows')
vectors_1718_norm.drop(columns=['minutes_played'], inplace=True)

vectors_1819_norm = pd.merge(vectors_1819_pd, mp_1819, left_index=True, right_on='player_id').set_index('player_id')
df1 = vectors_1819_norm.loc[:, dir_weights.columns] * 90
vectors_1819_norm.loc[:, dir_weights.columns] = df1.divide(vectors_1819_norm.minutes_played, axis='rows')
vectors_1819_norm.drop(columns=['minutes_played'], inplace=True)

In [13]:
# Code below mainly from Pieter's implementation of this experiment with soccer vectors
# https://github.com/probberechts/soccer-player-vectors-thesis/blob/master/notebooks/5-experiments.ipynb

# Select correct players to test on 

train_players = pg_1718.groupby('player_id').agg({
    'minutes_played': 'sum',
    'team_id': set
}).merge(players_1718, on="player_id", how='left')

test_players = pg_1819.groupby('player_id').agg({
    'minutes_played': 'sum',
    'team_id': set
}).merge(players_1819, on="player_id", how='left')

In [14]:
all_players = pd.merge(train_players, test_players, on="player_id", suffixes=("_train", "_test"))
all_players['nb_teams'] = all_players.apply(lambda x: len(x.team_id_train | x.team_id_test), axis=1)
all_players = all_players[all_players.nb_teams == 1]

In [15]:
# Only players who played >= 900 minutes in both train and test season
all_players = all_players[(all_players.minutes_played_train >= 900) & (all_players.minutes_played_test >= 900)]

In [16]:
all_players = all_players.player_id.unique()
print("Number of players: ", len(all_players))

Number of players:  193


In [17]:
# Compute pairwise distances

from sklearn.metrics import pairwise_distances
from sklearn import preprocessing

# D = pairwise_distances(
#     vectors_1718_norm.loc[all_players],
#     vectors_1819_norm.loc[all_players],
#     metric='manhattan'
# )

D = pairwise_distances(
    preprocessing.normalize(vectors_1718_norm.loc[all_players], norm="l1"),
    preprocessing.normalize(vectors_1819_norm.loc[all_players], norm="l1"),
    metric="manhattan")

# sort each row
k_d = np.sort(D, axis = 1) 
# sort each row and replace distances by index
k_i = np.argsort(D, axis = 1) 
# replace indices by player ids
p_i = np.take(all_players, k_i, axis = 0)

In [18]:
rs = np.argmax(np.array([p_i[i,:] == all_players[i] for i in range(p_i.shape[0])]), axis=1)
rs

array([ 13,   1,   4,   0,   4,   0,  90,   0,   0,   0,   0,   0,   0,
         4,   0,   0,   0,   0,  20,   6,   0,   0,  13,   0,   1,   0,
         5,   0,   0,  29,   0,   3,   2,   1,  15,   1,   1, 142,   0,
         1,   1,   0,   0,   0,   3,   0,   0,   0,   6,   7,   1,   0,
         0,   4,   5,   0,   0,   0,   0,   0,   2,   7,   0,  15,   0,
         7,   0,   5,   2,   0,   0,  11,   5,  12,   0,   0,   4,   0,
         0,   2,   0,   1,   0,   0,   0,  60,   0,   8,   3,   0,   8,
         2,   0,   0,   0,  10,  13,   0,   0,   3,  25,  27,  23,   0,
         2,   0,   0,  34,   0,   1,  20,   1,   0,   0,   1,   0,   2,
        16,   3,   0,   0,   0,  13,   1,  11,  11,   9,   0,   8,   3,
       158,   0,   0, 106,   0,   0,   0,   5,   0,   4,   0,  40,   1,
         0,  90,   6,   0,   0,   0,   0,   0,   0,   0,   0,   1,   4,
         1,   0,   0,   0,   2,   0,   6,   0,   1,   0,   0,  24,  11,
        37,  11,  17,   4,   4,  73,  53,   1,   1,   6,   2,   

In [19]:
def mean_reciprocal_rank(rs):
    return np.mean(1. / (rs + 1))

def top_k(rs, k):
    return (rs < k).sum() / len(rs)

In [20]:
mean_reciprocal_rank(rs)

0.5885390745244184

In [21]:
top_k(rs, 10)

0.8082901554404145

In [22]:
print(top_k(rs, 5))
print(top_k(rs, 3))
print(top_k(rs, 1))

0.7150259067357513
0.6269430051813472
0.48186528497409326


# Get similar players to player

In [23]:
def get_similar_players(player_id):
    player_index = np.where(all_players == player_id)[0][0]
    print(player_index)
    sims = p_i[player_index,:]
    names = players_1819.set_index("player_id").loc[sims, "player_name"].values
    dists = k_d[player_index,:]
    return pd.DataFrame({"name": names, "dist": dists})

In [24]:
print(train_players[train_players.player_name.str.contains('Jesus')].player_id)
print(test_players[test_players.player_name.str.contains('Jesus')].player_id)

print(train_players[train_players.player_name.str.contains('Agüero')].player_id)
print(test_players[test_players.player_name.str.contains('Agüero')].player_id)

55    3202
Name: player_id, dtype: int64
48    3202
Name: player_id, dtype: int64
60    3237
Name: player_id, dtype: int64
53    3237
Name: player_id, dtype: int64


In [25]:
get_similar_players(3237) # Similar to Aguero

43


Unnamed: 0,name,dist
0,Sergio Leonel Agüero del Castillo,0.208176
1,Marko Arnautović,0.293974
2,Gabriel Fernando de Jesus,0.319744
3,Cenk Tosun,0.322811
4,Jamie Vardy,0.349722
...,...,...
188,Alex McCarthy,1.914147
189,Martin Dúbravka,1.914678
190,Asmir Begović,1.915225
191,Hugo Lloris,1.924485


In [26]:
get_similar_players(3202) # Similar to Jesus

39


Unnamed: 0,name,dist
0,Sergio Leonel Agüero del Castillo,0.232574
1,Gabriel Fernando de Jesus,0.235393
2,Jamie Vardy,0.289722
3,Harry Kane,0.297915
4,Troy Deeney,0.314603
...,...,...
188,Alex McCarthy,1.896625
189,Mathew Ryan,1.897377
190,Asmir Begović,1.899667
191,Hugo Lloris,1.905453
