In [1]:
import warnings
import pandas as pd
import tqdm
import socceraction.spadl as spadl
import socceraction.vaep.features as fs
from socceraction.data.statsbomb import StatsBombLoader
from markovsoccer.extended_spadl import convert_to_extended_spadl
from markovsoccer.team_model import TeamModel
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings(action="ignore", message="credentials were not supplied. open data access only")

ModuleNotFoundError: No module named 'pandas'

In [2]:
# Remote loader without credentials (user and passwd are both None);
# presumably this restricts access to the open data set — confirm against the loader docs.
SBL = StatsBombLoader(
    getter="remote",
    creds={"user": None, "passwd": None},
)

In [3]:
# List the distinct names of all competitions the loader exposes
competitions = SBL.competitions()
set(competitions["competition_name"])

{'Champions League',
 "FA Women's Super League",
 'FIFA World Cup',
 'Indian Super league',
 'La Liga',
 'NWSL',
 'Premier League',
 'UEFA Euro',
 "UEFA Women's Euro",
 "Women's World Cup"}

In [4]:
# Arsenal's matches from the 2003/2004 Premier League season are used here.
# Note: the paper uses all teams from the 2019/2020 Premier League season,
# but that data is not available for free.
selected_competitions = competitions.loc[competitions["competition_name"].eq("Premier League")]
selected_competitions

Unnamed: 0,season_id,competition_id,competition_name,country_name,competition_gender,season_name
38,44,2,Premier League,England,male,2003/2004


In [5]:
# Fetch the games of every selected (competition, season) pair and stack them into one frame
season_keys = zip(selected_competitions["competition_id"], selected_competitions["season_id"])
games = pd.concat([
    SBL.games(competition_id, season_id)
    for competition_id, season_id in season_keys
])
games[["home_team_id", "away_team_id", "game_date", "home_score", "away_score"]]

Unnamed: 0,home_team_id,away_team_id,game_date,home_score,away_score
0,46,1,2004-02-07 16:00:00,1,3
1,1,46,2003-12-26 13:00:00,3,0
2,1,39,2004-03-28 17:05:00,1,1
3,1,22,2004-05-15 16:00:00,2,1
4,1,75,2004-02-28 16:00:00,2,1
5,47,1,2003-08-24 17:05:00,0,4
6,101,1,2003-11-01 16:00:00,1,4
7,24,1,2003-10-04 13:30:00,1,2
8,1,33,2003-10-18 16:00:00,2,1
9,98,1,2004-03-13 16:00:00,0,2


## Load and convert match data

In [6]:
# Load every selected game, convert its events to extended SPADL actions
# (stored in `actions`, keyed by game id) and collect the team tables in `teams`.
games_verbose = tqdm.tqdm(list(games.itertuples()), desc="Loading game data")
teams_per_game = []
actions = {}
for game in games_verbose:
    # load data; the team table is requested once per game and reused for the
    # merge below instead of being fetched a second time
    teams_of_game = SBL.teams(game.game_id)
    teams_per_game.append(teams_of_game)
    events = SBL.events(game.game_id)
    # convert data
    actions_of_game = spadl.statsbomb.convert_to_actions(events, game.home_team_id)
    # resolve team id's: left-join on the columns both frames share
    # (presumably team_id — verify against the loader schema)
    actions_of_game = actions_of_game.merge(teams_of_game, how="left")\
        .reset_index(drop=True)
    [actions_of_game] = fs.play_left_to_right([actions_of_game], game.home_team_id)
    # convert to extended SPADL
    actions[game.game_id] = convert_to_extended_spadl(actions_of_game)

# one row per team across all loaded games
teams = pd.concat(teams_per_game).drop_duplicates(subset="team_id")

Loading game data: 100%|██████████| 33/33 [01:00<00:00,  1.83s/it]


In [7]:
# inspect the first rows of the most recently stored game.
# The key is taken from `actions` itself (dicts keep insertion order) rather than
# from a loop variable left over from another cell.
last_game_id = list(actions)[-1]
actions[last_game_id][:25]

Unnamed: 0,game_id,period_id,timestamp,team_name,player_id,start_x,end_x,start_y,end_y,type_name,result_name,bodypart_name,ball_recovery,modelled_possession_sequence
0,3749431,1,0 days 00:00:00,Newcastle United,40234.0,52.058824,51.264706,34.43038,33.655696,kick_off,success,foot,False,True
1,3749431,1,0 days 00:00:01,Newcastle United,40226.0,51.264706,50.823529,33.655696,33.311392,dribble,success,foot,False,True
2,3749431,1,0 days 00:00:01,Newcastle United,40226.0,50.823529,47.735294,33.311392,35.893671,pass,success,foot,False,True
3,3749431,1,0 days 00:00:02,Newcastle United,24799.0,47.735294,66.352941,35.893671,66.106329,pass,success,foot,False,True
4,3749431,1,0 days 00:00:04,Newcastle United,40227.0,66.352941,70.588235,66.106329,42.091139,pass,fail,foot,False,True
5,3749431,1,0 days 00:00:06,Arsenal,15637.0,33.617647,57.441176,26.683544,29.007595,pass,fail,head,False,False
6,3749431,1,0 days 00:00:09,Newcastle United,38418.0,46.764706,45.705882,39.767089,36.582278,dribble,success,foot,True,True
7,3749431,1,0 days 00:00:10,Newcastle United,38418.0,45.705882,55.147059,36.582278,31.589873,pass,success,foot,False,True
8,3749431,1,0 days 00:00:11,Newcastle United,24799.0,55.147059,48.352941,31.589873,29.610127,dribble,success,foot,False,True
9,3749431,1,0 days 00:00:12,Newcastle United,24799.0,48.352941,47.735294,29.610127,9.037975,pass,success,foot,False,True


## Build team models

In [8]:
# Minimum number of games a team must have played for a team model to be built;
# teams with fewer games are skipped by the model-building loop.
NMIN_GAMES = 5
# re-display the selected competition(s) for reference
selected_competitions

Unnamed: 0,season_id,competition_id,competition_name,country_name,competition_gender,season_name
38,44,2,Premier League,England,male,2003/2004


In [9]:
# Build one team model per team with at least NMIN_GAMES games and export it as a PRISM file.
for _, team in teams.iterrows():
    team_id, team_name = team['team_id'], team['team_name']
    # every game in which the team played, either at home or away
    plays_at_home = games['home_team_id'] == team_id
    plays_away = games['away_team_id'] == team_id
    games_of_team = games[plays_at_home | plays_away]
    if len(games_of_team) < NMIN_GAMES:
        # too few games for this team: no model is built
        continue
    # concatenate all the actions that occurred in the matches that the team has played
    team_actions = pd.concat([actions[game_id] for game_id in games_of_team['game_id']])
    # build team model and write it to a single, named output path
    # NOTE(review): assumes the ../models directory already exists — confirm whether
    # convert_to_prism_file creates it.
    prism_path = f"../models/{team_name}.prism"
    team_model = TeamModel.build_from(team_actions, team_name)
    team_model.convert_to_prism_file(prism_path)