In [1]:
# Notebook setup: autoreload, imports and warning filters.
# autoreload mode 2 lets edits to imported modules be picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
# Add the parent of the working directory to the import path
# (presumably so project-local packages can be imported — confirm).
import os; import sys; sys.path.append('../')
import pandas as pd
import tqdm
import warnings
# Silence pandas PerformanceWarning for the rest of the session.
# NOTE(review): presumably triggered by the HDF5 reads below — confirm.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

import socceraction.vaep as vaep

In [2]:
## Configure file and folder names
# Both HDF5 stores live in the same data folder:
#   spadl_h5       -> games, players, teams, lookup tables and per-game actions
#   predictions_h5 -> per-game prediction frames
datafolder = "../data"
spadl_h5, predictions_h5 = (
    os.path.join(datafolder, filename)
    for filename in ("spadl-statsbomb.h5", "predictions.h5")
)

In [3]:
# Read the games table and keep only the FIFA World Cup fixtures.
games = (
    pd.read_hdf(spadl_h5, "games")
    .loc[lambda g: g["competition_name"] == "FIFA World Cup"]
)
print("nb of games:", games.shape[0])

nb of games: 64


In [4]:
# Lookup tables stored alongside the actions in the SPADL HDF5 store.
actiontypes = pd.read_hdf(spadl_h5, "actiontypes")
results = pd.read_hdf(spadl_h5, "results")
bodyparts = pd.read_hdf(spadl_h5, "bodyparts")
players = pd.read_hdf(spadl_h5, "players")
teams = pd.read_hdf(spadl_h5, "teams")

# Ordering of actions within one game.
chronological = ["period_id", "time_seconds", "timestamp"]

# One enriched frame per game; they are stacked into `A` after the loop.
game_frames = []
for game in tqdm.tqdm(list(games.itertuples())):
    actions = pd.read_hdf(spadl_h5, f"actions/game_{game.game_id}")
    # The first three merges give no key, so pandas joins on the columns the
    # frames have in common. Players and teams are left-joined so an action
    # without a matching player/team row is kept.
    actions = (
        actions.merge(actiontypes)
        .merge(results)
        .merge(bodyparts)
        .merge(players, how="left", on="player_id")
        .merge(teams, how="left", on="team_id")
        .sort_values(chronological)
        .reset_index(drop=True)
    )
    preds = pd.read_hdf(predictions_h5, f"game_{game.game_id}")
    values = vaep.value(actions, preds.scores, preds.concedes)
    # Column-wise concat aligns the three frames on their row index.
    # NOTE(review): assumes preds is indexed 0..n-1 in the same order as the
    # sorted actions — confirm against the notebook that wrote predictions.h5.
    game_frames.append(pd.concat([actions, preds, values], axis=1))

A = (
    pd.concat(game_frames)
    .sort_values(["game_id", *chronological])
    .reset_index(drop=True)
)
A.columns

100%|██████████| 64/64 [00:04<00:00, 15.79it/s]


Index(['game_id', 'period_id', 'time_seconds', 'timestamp', 'team_id',
       'player_id', 'start_x', 'start_y', 'end_x', 'end_y', 'type_id',
       'result_id', 'bodypart_id', 'type_name', 'result_name', 'bodypart_name',
       'player_name', 'player_nickname', 'jersey_number', 'country_id',
       'country_name', 'extra', 'team_name', 'scores', 'concedes',
       'offensive_value', 'defensive_value', 'vaep_value'],
      dtype='object')

### Most valuable players in the World Cup

In [5]:
# Display name: the nickname when one is recorded, the full name otherwise.
# Done column-wise instead of a row-wise apply with x[0]/x[1]: positional
# indexing of a label-indexed Series is deprecated in pandas 2.x, and a plain
# truthiness test keeps NaN nicknames (NaN is truthy), which would then be
# dropped silently by the groupby below.
nickname = A["player_nickname"]
has_nickname = nickname.notna() & nickname.ne("")
A["player"] = nickname.where(has_nickname, A["player_name"])

# One row per action, so summing this column yields the number of actions.
A["count"] = 1

A_ = A[A.type_name != "shot_penalty"] # ignore penalties

# Total VAEP value and action count per (player, team), best players first.
players = (
    A_[["player_id","team_name","player","vaep_value","count"]]
    .groupby(["player_id","team_name","player"])
    .sum()
    .reset_index()
)
players = players.sort_values("vaep_value",ascending=False)
players

Unnamed: 0,player_id,team_name,player,vaep_value,count
6,3009,France,Kylian Mbappé,4.221722,495
50,3308,England,Kieran Trippier,3.879793,689
122,4320,Brazil,Neymar,3.576450,749
599,20004,France,Paul Pogba,3.401471,676
92,3621,Belgium,Eden Hazard,3.344575,690
...,...,...,...,...,...
30,3202,Brazil,Gabriel Jesus,-0.926258,273
186,5223,Morocco,Aziz Bouhaddouz,-0.960557,9
117,4269,Serbia,Aleksandar Mitrović,-1.259143,171
462,5685,Colombia,Carlos Sánchez,-1.337455,144


### Normalize for minutes played

In [6]:
# Players must have played more than this many minutes to be rated; with very
# little playing time the per-90 rating is dominated by a handful of actions.
MIN_MINUTES_PLAYED = 150

# Minutes played per player, restricted to the selected games.
pg = pd.read_hdf(spadl_h5,"player_games")
pg = pg[pg.game_id.isin(games.game_id)]
mp = pg[["player_id","minutes_played"]].groupby("player_id").sum().reset_index()

# Inner join: players without a minutes_played record are dropped.
stats = players.merge(mp, on="player_id")
# .copy() makes the filtered frame independent, so adding a column below does
# not raise SettingWithCopyWarning.
stats = stats[stats.minutes_played > MIN_MINUTES_PLAYED].copy()
# VAEP value per 90 minutes played.
stats["vaep_rating"] = stats.vaep_value * 90 / stats.minutes_played
# NOTE(review): the table is ordered by the raw total, not by the normalized
# rating computed above; sort by "vaep_rating" instead if the per-90 ranking
# is what this section is meant to show.
stats.sort_values("vaep_value",ascending=False)

Unnamed: 0,player_id,team_name,player,vaep_value,count,minutes_played,vaep_rating
0,3009,France,Kylian Mbappé,4.221722,495,550,0.690827
1,3308,England,Kieran Trippier,3.879793,689,602,0.580035
2,4320,Brazil,Neymar,3.576450,749,478,0.673390
3,20004,France,Paul Pogba,3.401471,676,563,0.543752
4,3621,Belgium,Eden Hazard,3.344575,690,540,0.557429
...,...,...,...,...,...,...,...
595,3604,France,Olivier Giroud,-0.885633,324,566,-0.140825
596,11098,Sweden,Marcus Berg,-0.886605,207,435,-0.183435
597,3202,Brazil,Gabriel Jesus,-0.926258,273,416,-0.200392
599,4269,Serbia,Aleksandar Mitrović,-1.259143,171,271,-0.418166
