In [1]:
%load_ext autoreload
%autoreload 2
import os; import sys; sys.path.append('../')
import pandas as pd
import tqdm
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

import socceraction.vaep as vaep

In [2]:
## Configure file and folder names
# Both HDF5 stores live in the same data folder, addressed relative to the
# notebook's working directory.
datafolder = "../data"
# Store read below for the games, lookup tables and per-game actions.
spadl_h5 = os.path.join(datafolder, "spadl-statsbomb.h5")
# Store read below for the per-game "scores"/"concedes" predictions.
predictions_h5 = os.path.join(datafolder, "predictions.h5")

In [3]:
# Load the game index and keep only the competition analysed in this notebook.
games = (
    pd.read_hdf(spadl_h5, "games")
    .loc[lambda df: df.competition_name == "FIFA World Cup"]
)
print("nb of games:", len(games))

nb of games: 64


In [4]:
# Lookup tables used to attach readable names to the id columns of the actions.
players, teams, actiontypes, bodyparts, results = (
    pd.read_hdf(spadl_h5, key)
    for key in ("players", "teams", "actiontypes", "bodyparts", "results")
)

# One frame per game: actions + stored predictions + computed values.
valued_games = []
for game in tqdm.tqdm(list(games.itertuples())):
    actions = pd.read_hdf(spadl_h5, f"actions/game_{game.game_id}")
    # Left-join each lookup in turn; with no explicit key, pandas joins on the
    # column names the two frames share. The order fixes the column order of
    # the result, so it is kept as-is.
    for lookup in (actiontypes, results, bodyparts, players, teams):
        actions = actions.merge(lookup, how="left")
    preds = pd.read_hdf(predictions_h5, f"game_{game.game_id}")
    values = vaep.value(actions, preds.scores, preds.concedes)
    # NOTE(review): the column-wise concat aligns rows on the index, so it
    # assumes `preds` and `values` share the 0..n-1 index of the merged
    # actions — confirm against the notebook that wrote predictions.h5.
    valued_games.append(pd.concat([actions, preds, values], axis=1))

# Stack all games and put the actions in chronological order per game.
A = (
    pd.concat(valued_games)
    .sort_values(["game_id", "period_id", "time_seconds", "timestamp"])
    .reset_index(drop=True)
)
A.columns

100%|██████████| 64/64 [00:05<00:00, 10.68it/s]


Index(['game_id', 'period_id', 'time_seconds', 'timestamp', 'team_id',
       'player_id', 'start_x', 'start_y', 'end_x', 'end_y', 'type_id',
       'result_id', 'bodypart_id', 'action_id', 'type_name', 'result_name',
       'bodypart_name', 'player_name', 'player_nickname', 'jersey_number',
       'country_id', 'country_name', 'extra', 'team_name', 'scores',
       'concedes', 'offensive_value', 'defensive_value', 'vaep_value'],
      dtype='object')

### Most valuable players in the World Cup

In [5]:
# Display name: prefer the nickname and fall back to the full name when the
# nickname is missing (None/NaN) or empty. This is done column-wise because
# the previous row-wise `x[0] if x[0] else x[1]` (a) indexed a string-labelled
# Series by position, which recent pandas deprecates, and (b) treated NaN as
# truthy, so a NaN nickname won over the real name and the player was then
# dropped by the groupby below (NaN group keys are excluded by default).
nickname = A["player_nickname"]
has_nickname = nickname.notna() & (nickname != "")
A["player"] = nickname.where(has_nickname, A["player_name"])

# Constant column: summing it per player yields that player's action count.
A["count"] = 1

# Total VAEP value and number of actions per player, best players first.
players = (
    A[["player_id", "team_name", "player", "vaep_value", "count"]]
    .groupby(["player_id", "team_name", "player"])
    .sum()
    .reset_index()
    .sort_values("vaep_value", ascending=False)
)
players.head(10)

Unnamed: 0,player_id,team_name,player,vaep_value,count
6,3009,France,Kylian Mbappé,3.788137,495
50,3308,England,Kieran Trippier,3.564976,690
122,4320,Brazil,Neymar,3.235207,749
17,3089,Belgium,Kevin De Bruyne,3.213972,716
92,3621,Belgium,Eden Hazard,3.092135,691
152,5186,Russia,Denis Cheryshev,3.022713,214
599,20004,France,Paul Pogba,3.011299,676
36,3244,England,John Stones,2.85667,937
352,5574,Germany,Toni Kroos,2.769772,647
121,4319,Uruguay,Edinson Cavani,2.769654,226


### Normalize for minutes played

In [6]:
# Players must have strictly more than this many minutes on the pitch
# (180 = two full 90-minute matches) to appear in the per-90 ranking.
MIN_MINUTES_PLAYED = 180

# Minutes played per player, restricted to the games selected above.
pg = pd.read_hdf(spadl_h5, "player_games")
pg = pg[pg.game_id.isin(games.game_id)]
mp = pg[["player_id", "minutes_played"]].groupby("player_id").sum().reset_index()

# Join on the explicit key rather than on whatever columns happen to overlap.
stats = players.merge(mp, on="player_id")
# .copy() makes `stats` an independent frame, so adding a column below does
# not trigger pandas' SettingWithCopyWarning on a filtered slice.
stats = stats[stats.minutes_played > MIN_MINUTES_PLAYED].copy()
# Total value rescaled to a per-90-minutes rate.
stats["vaep_rating"] = stats.vaep_value * 90 / stats.minutes_played
stats.sort_values("vaep_rating", ascending=False).head(10)

Unnamed: 0,player_id,team_name,player,vaep_value,count,minutes_played,vaep_rating
5,5186,Russia,Denis Cheryshev,3.022713,214,312,0.871937
8,5574,Germany,Toni Kroos,2.769772,647,289,0.862559
21,5473,Nigeria,Ahmed Musa,1.798709,162,220,0.735835
26,5674,Senegal,Moussa Wagué,1.699423,168,208,0.735327
9,4319,Uruguay,Edinson Cavani,2.769654,226,355,0.702166
32,3237,Argentina,Sergio Agüero,1.458189,137,193,0.679984
11,6196,Colombia,Yerry Mina,2.337824,325,315,0.66795
0,3009,France,Kylian Mbappé,3.788137,495,550,0.619877
2,4320,Brazil,Neymar,3.235207,749,478,0.609139
22,3083,South Korea,Son Heung-Min,1.792473,230,288,0.560148
