In [38]:
%load_ext autoreload
%autoreload 2
import os; import sys; sys.path.append('../')
import pandas as pd
import tqdm
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

import socceraction.vaep as vaep

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [39]:
## Configure file and folder names
# Both HDF5 stores used by this notebook live in the same data folder.
datafolder = "../data"
spadl_h5, predictions_h5 = (
    os.path.join(datafolder, filename)
    for filename in ("spadl.h5", "predictions.h5")
)

In [11]:
# Read the games table from the SPADL store and keep only the rows whose
# competition name is "FIFA World Cup".
all_games = pd.read_hdf(spadl_h5, "games")
is_world_cup = all_games["competition_name"] == "FIFA World Cup"
games = all_games.loc[is_world_cup]
print("nb of games:", games.shape[0])

nb of games: 64


In [40]:
# Lookup tables stored alongside the actions in the SPADL store.
players = pd.read_hdf(spadl_h5, "players")
teams = pd.read_hdf(spadl_h5, "teams")
actiontypes = pd.read_hdf(spadl_h5, "actiontypes")
bodyparts = pd.read_hdf(spadl_h5, "bodyparts")
results = pd.read_hdf(spadl_h5, "results")

# One frame per game (actions + predictions + values), concatenated once at the end.
rated_games = []
for game in tqdm.tqdm(list(games.itertuples())):
    game_key = f"game_{game.game_id}"
    actions = pd.read_hdf(spadl_h5, "actions/" + game_key)
    # These merges use pandas' default key selection (the columns shared by both frames).
    for lookup in (actiontypes, results, bodyparts):
        actions = actions.merge(lookup)
    # Left joins keep every action even if its player/team is absent from the lookup table.
    actions = actions.merge(players, how="left", on="player_id")
    actions = actions.merge(teams, how="left", on="team_id")
    actions = actions.sort_values(["period_id", "time_seconds", "timestamp"])
    actions = actions.reset_index(drop=True)

    preds = pd.read_hdf(predictions_h5, game_key)
    values = vaep.value(actions, preds.scores, preds.concedes)
    # Column-wise concat aligns the three frames on their row index.
    rated_games.append(pd.concat([actions, preds, values], axis=1))

A = pd.concat(rated_games)
A = A.sort_values(["game_id", "period_id", "time_seconds", "timestamp"])
A = A.reset_index(drop=True)
A.columns

100%|██████████| 64/64 [00:06<00:00,  9.55it/s]


Index(['game_id', 'period_id', 'time_seconds', 'timestamp', 'team_id',
       'player_id', 'start_x', 'start_y', 'end_x', 'end_y', 'type_id',
       'result_id', 'bodypart_id', 'type_name', 'result_name', 'bodypart_name',
       'country_id', 'country_name', 'extra', 'jersey_number', 'player_name',
       'player_nickname', 'team_name', 'scores', 'concedes', 'offensive_value',
       'defensive_value', 'vaep_value'],
      dtype='object')

### Most valuable players in the World Cup

In [42]:
# Display name per action: the nickname when one is present, otherwise the
# full name. The previous row-wise lambda indexed each row with x[0]/x[1]
# (positional access on a label-indexed Series, deprecated in pandas) and
# treated a NaN nickname as truthy, which left NaN in "player" and made the
# groupby below drop those rows. Missing and empty nicknames now both fall
# back to player_name.
nickname = A["player_nickname"]
has_nickname = nickname.notna() & (nickname != "")
A["player"] = nickname.where(has_nickname, A["player_name"])

# A constant 1 per action, so that summing it gives the number of actions.
A["count"] = 1

A_ = A[A.type_name != "shot_penalty"] # ignore penalties
# Sum the VAEP value and the action count per (player, team).
# NOTE: this rebinds `players`, which until now held the player lookup table
# read from the SPADL store.
players = (
    A_[["player_id","team_name","player","vaep_value","count"]]
    .groupby(["player_id","team_name","player"])
    .sum()
    .reset_index()
)
players = players.sort_values("vaep_value",ascending=False)
players

Unnamed: 0,player_id,team_name,player,vaep_value,count
6,3009,France,Kylian Mbappé,4.022927,495
50,3308,England,Kieran Trippier,3.787583,689
122,4320,Brazil,Neymar,3.383777,749
92,3621,Belgium,Eden Hazard,3.328818,690
152,5186,Russia,Denis Cheryshev,3.194341,213
241,5463,Croatia,Luka Modrić,3.177810,1034
17,3089,Belgium,Kevin De Bruyne,3.129208,716
599,20004,France,Paul Pogba,3.081210,676
248,5470,Croatia,Ivan Rakitić,2.989283,835
36,3244,England,John Stones,2.883331,937


### Normalize for minutes played

In [43]:
# Players at or below this many total minutes are excluded from the ratings.
MIN_MINUTES_PLAYED = 150

# Per-player, per-game records, restricted to the games selected earlier.
pg = pd.read_hdf(spadl_h5,"player_games")
pg = pg[pg.game_id.isin(games.game_id)]
# Total minutes played per player over those games.
mp = pg[["player_id","minutes_played"]].groupby("player_id").sum().reset_index()
# player_id is the only column the two frames share; name it explicitly so the
# join key does not silently change if either frame gains a column.
stats = players.merge(mp, on="player_id")
# .copy() makes the filtered frame independent, so the column assignment below
# does not raise SettingWithCopyWarning.
stats = stats[stats.minutes_played > MIN_MINUTES_PLAYED].copy()
# VAEP value per 90 minutes played.
stats["vaep_rating"] = stats.vaep_value * 90 / stats.minutes_played
# NOTE(review): the table is ordered by total vaep_value, not by the
# per-90 vaep_rating computed above -- confirm this is the intended ranking.
stats.sort_values("vaep_value",ascending=False)

Unnamed: 0,player_id,team_name,player,vaep_value,count,minutes_played,vaep_rating
0,3009,France,Kylian Mbappé,4.022927,495,550,0.658297
1,3308,England,Kieran Trippier,3.787583,689,602,0.566250
2,4320,Brazil,Neymar,3.383777,749,478,0.637113
3,3621,Belgium,Eden Hazard,3.328818,690,540,0.554803
4,5186,Russia,Denis Cheryshev,3.194341,213,312,0.921444
5,5463,Croatia,Luka Modrić,3.177810,1034,720,0.397226
6,3089,Belgium,Kevin De Bruyne,3.129208,716,566,0.497577
7,20004,France,Paul Pogba,3.081210,676,563,0.492556
8,5470,Croatia,Ivan Rakitić,2.989283,835,677,0.397394
9,3244,England,John Stones,2.883331,937,672,0.386160
