In [1]:
import warnings
import pandas as pd
import tqdm
import socceraction.spadl as spadl
import socceraction.vaep.features as fs
from socceraction.data.statsbomb import StatsBombLoader
from markovsoccer.extended_spadl import convert_to_extended_spadl
from markovsoccer.team_model import TeamModel
# Silence pandas' PerformanceWarning so it does not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
# Silence the warning whose message starts with the text below; no credentials are
# supplied to the loader in this notebook, so only open data is accessed.
warnings.filterwarnings(action="ignore", message="credentials were not supplied. open data access only")

In [2]:
# Remote StatsBomb loader created without credentials (user and password are None).
SBL = StatsBombLoader(
    getter="remote",
    creds={"user": None, "passwd": None},
)

In [3]:
# Fetch the competition table and list the distinct competition names it contains
competitions = SBL.competitions()
set(competitions["competition_name"])

{'Champions League',
 "FA Women's Super League",
 'FIFA World Cup',
 'Indian Super league',
 'La Liga',
 'NWSL',
 'Premier League',
 'UEFA Euro',
 "UEFA Women's Euro",
 "Women's World Cup"}

In [10]:
# Data of the FIFA World Cup is used: every season of that competition present in the competition
# table is selected. Note: The paper uses data of all teams from the
# Premier League season 2019/2020, but this data is not available for free.
selected_competitions = competitions[competitions.competition_name == "FIFA World Cup"]
selected_competitions

Unnamed: 0,season_id,competition_id,competition_name,country_name,competition_gender,season_name
18,3,43,FIFA World Cup,International,male,2018


In [11]:
# Collect the games of each selected (competition, season) pair and stack them into one frame
games_per_season = [
    SBL.games(competition.competition_id, competition.season_id)
    for competition in selected_competitions.itertuples()
]
games = pd.concat(games_per_season)
games[["home_team_id", "away_team_id", "game_date", "home_score", "away_score"]]

Unnamed: 0,home_team_id,away_team_id,game_date,home_score,away_score
0,781,795,2018-06-22 14:00:00,2,0
1,774,783,2018-06-15 14:00:00,0,1
2,791,794,2018-06-23 17:00:00,1,2
3,783,799,2018-06-20 17:00:00,1,0
4,782,798,2018-06-18 17:00:00,3,0
...,...,...,...,...,...
59,796,774,2018-06-19 20:00:00,3,1
60,796,785,2018-07-07 20:00:00,2,2
61,776,771,2018-06-26 16:00:00,0,0
62,778,787,2018-06-24 17:00:00,2,2


## Load and convert match data

In [12]:
# For every selected game: load teams and events, convert the events to SPADL actions,
# attach the team columns, normalise the playing direction and convert to extended SPADL.
# The converted actions are stored per game in `actions` (keyed by game id).
games_verbose = tqdm.tqdm(list(games.itertuples()), desc="Loading game data")
teams_per_game = []
actions = {}
for game in games_verbose:
    # load data; the team table is fetched once per game and reused for the merge below
    teams_of_game = SBL.teams(game.game_id)
    teams_per_game.append(teams_of_game)
    events = SBL.events(game.game_id)
    # convert data
    actions_of_game = spadl.statsbomb.convert_to_actions(events, game.home_team_id)
    # resolve team id's: left-join the team table on the columns both frames share
    # (presumably team_id -- verify against the loader's schema)
    actions_of_game = (
        actions_of_game
        .merge(teams_of_game, how="left")
        .reset_index(drop=True)
    )
    # play_left_to_right takes and returns a list of frames, hence the one-element lists
    [actions_of_game] = fs.play_left_to_right([actions_of_game], game.home_team_id)
    # convert to extended SPADL
    actions[game.game_id] = convert_to_extended_spadl(actions_of_game)

# one row per team over all loaded games
teams = pd.concat(teams_per_game).drop_duplicates(subset="team_id")

Loading game data: 100%|██████████| 64/64 [02:22<00:00,  2.22s/it]


In [13]:
# Show the first 25 converted actions of one game. `game` is the loop variable left over
# from the loading loop, i.e. the last game that was processed.
actions[game.game_id].head(25)

Unnamed: 0,game_id,period_id,timestamp,team_name,player_id,start_x,end_x,start_y,end_y,type_name,result_name,bodypart_name,ball_recovery,modelled_possession_sequence
0,7559,1,0 days 00:00:00,Egypt,5252.0,52.058824,37.941176,34.43038,28.405063,kick_off,success,foot,False,True
1,7559,1,0 days 00:00:01,Egypt,3568.0,37.941176,37.941176,28.405063,30.126582,dribble,success,foot,False,True
2,7559,1,0 days 00:00:01,Egypt,3568.0,37.941176,32.647059,30.126582,6.025316,pass,success,foot,False,True
3,7559,1,0 days 00:00:02,Egypt,5250.0,32.647059,32.647059,6.025316,6.886076,dribble,success,foot,False,True
4,7559,1,0 days 00:00:03,Egypt,5250.0,32.647059,20.294118,6.886076,18.075949,pass,success,foot,False,True
5,7559,1,0 days 00:00:05,Egypt,5254.0,20.294118,17.647059,18.075949,42.177215,pass,success,foot,False,True
6,7559,1,0 days 00:00:07,Egypt,4063.0,17.647059,17.647059,42.177215,44.759494,dribble,success,foot,False,True
7,7559,1,0 days 00:00:09,Egypt,4063.0,17.647059,26.470588,44.759494,65.417722,pass,success,foot,False,True
8,7559,1,0 days 00:00:10,Egypt,5263.0,26.470588,22.941176,65.417722,64.556962,dribble,success,foot,False,True
9,7559,1,0 days 00:00:11,Egypt,5263.0,22.941176,17.647059,64.556962,53.367089,pass,success,foot,False,True


## Build team models

In [14]:
# Minimum number of games a team must have played for a team model to be built;
# teams with fewer games are skipped by the model-building loop.
NMIN_GAMES = 5
# Display the competition selection again for reference.
selected_competitions

Unnamed: 0,season_id,competition_id,competition_name,country_name,competition_gender,season_name
18,3,43,FIFA World Cup,International,male,2018


In [15]:
# Build a team model for every team that played at least NMIN_GAMES of the loaded games
# and write it to "../models/<team name>.prism".
for _, team in teams.iterrows():
    # games in which the team took part, either as home or as away side
    involves_team = (
        (games['home_team_id'] == team['team_id'])
        | (games['away_team_id'] == team['team_id'])
    )
    games_of_team = games[involves_team]
    if len(games_of_team) < NMIN_GAMES:
        # too few games for this team: no model is built
        continue
    # concatenate all the actions that occurred in the matches that the team has played
    team_actions = pd.concat([actions[game_id] for game_id in games_of_team['game_id']])
    # build team model and export it; the output path is computed once and reused
    model_path = f"../models/{team['team_name']}.prism"
    team_model = TeamModel.build_from(team_actions, team['team_name'])
    team_model.convert_to_prism_file(model_path)