In [None]:
!pip install socceraction==1.5.3 pandera==0.15.2 multimethod==1.9.1 jupyter notebook ipykernel

In [1]:
import os
import warnings
import pandas as pd
pd.set_option('display.max_columns', None)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings(action="ignore", message="credentials were not supplied. open data access only")
import tqdm

In [2]:
%load_ext autoreload
%autoreload 2

#socceraction version==1.5.3
#pandera version==0.15.2
#multimethod version==1.9.1

from socceraction.data.statsbomb import StatsBombLoader
import socceraction.spadl as spadl

In [3]:
SBL = StatsBombLoader(getter="remote", creds={"user": None, "passwd": None})

In [4]:
# View all available competitions
competitions = SBL.competitions()
set(competitions.competition_name)

{'1. Bundesliga',
 'African Cup of Nations',
 'Champions League',
 'Copa America',
 'Copa del Rey',
 "FA Women's Super League",
 'FIFA U20 World Cup',
 'FIFA World Cup',
 'Indian Super league',
 'La Liga',
 'Liga Profesional',
 'Ligue 1',
 'Major League Soccer',
 'NWSL',
 'North American League',
 'Premier League',
 'Serie A',
 'UEFA Euro',
 'UEFA Europa League',
 "UEFA Women's Euro",
 "Women's World Cup"}

In [5]:
# Fifa world cup
selected_competitions = competitions[
    (competitions.competition_name == "FIFA World Cup")
    & (competitions.season_name == "2018")
]

In [6]:
selected_competitions.head()

Unnamed: 0,season_id,competition_id,competition_name,country_name,competition_gender,season_name
30,3,43,FIFA World Cup,International,male,2018


In [7]:
# Get games from all selected competitions
games = pd.concat([
    SBL.games(row.competition_id, row.season_id)
    for row in selected_competitions.itertuples()
])
#games[["home_team_id", "away_team_id", "game_date", "home_score", "away_score"]]
games.head()

Unnamed: 0,game_id,season_id,competition_id,competition_stage,game_day,game_date,home_team_id,away_team_id,home_score,away_score,venue,referee
0,7585,3,43,Round of 16,4,2018-07-03 20:00:00,769,768,1,1,Otkritie Bank Arena,Mark Geiger
1,7570,3,43,Group Stage,3,2018-06-28 20:00:00,768,782,0,1,Stadion Kaliningrad,Damir Skomina
2,7586,3,43,Round of 16,4,2018-07-03 16:00:00,790,773,1,0,Saint-Petersburg Stadium,Damir Skomina
3,7557,3,43,Group Stage,3,2018-06-25 20:00:00,797,780,1,1,Mordovia Arena,Enrique Cáceres
4,7542,3,43,Group Stage,2,2018-06-20 14:00:00,780,788,1,0,Stadion Luzhniki,Mark Geiger


## Load and Convert data

In [8]:
games_verbose = tqdm.tqdm(list(games.itertuples()), desc="Loading game data")
teams, players = [], []
actions = {}
for game in games_verbose:
    # load data
    teams.append(SBL.teams(game.game_id))
    players.append(SBL.players(game.game_id))
    events = SBL.events(game.game_id)
    # convert data
    actions[game.game_id] = spadl.statsbomb.convert_to_actions(
        events, #DataFrame containing StatsBomb events from a single game.
        home_team_id=game.home_team_id, #ID of the home team in the corresponding game.
        xy_fidelity_version=1,
        shot_fidelity_version=1
    )

teams = pd.concat(teams).drop_duplicates(subset="team_id")
players = pd.concat(players)

Loading game data: 100%|██████████| 64/64 [00:46<00:00,  1.38it/s]


In [9]:
teams.head()

Unnamed: 0,team_id,team_name
0,768,England
1,769,Colombia
0,782,Belgium
0,773,Switzerland
1,790,Sweden


In [10]:
players.head()

Unnamed: 0,game_id,team_id,player_id,player_name,nickname,jersey_number,is_starter,starting_position_id,starting_position_name,minutes_played
0,7585,768,3094,Bamidele Alli,Dele Alli,20,True,20,Left Attacking Midfield,84
1,7585,768,3205,Kyle Walker,,2,True,3,Right Center Back,123
2,7585,768,3233,Raheem Sterling,Raheem Sterling,10,True,22,Right Center Forward,91
3,7585,768,3244,John Stones,,5,True,4,Center Back,132
4,7585,768,3293,Jesse Lingard,,7,True,18,Right Attacking Midfield,132


In [11]:
first_match_actions = next(iter(actions.values())) #DataFrame
print( first_match_actions )


      game_id                     original_event_id  period_id  time_seconds  \
0        7585  d4883f20-ce68-4f84-b26a-a049a13cb6be          1          0.24   
1        7585  b948f032-4c54-4782-a71a-ffeed8908d00          1          0.48   
2        7585  9bdb71f9-c87b-4a66-96f0-def5312ca921          1          2.12   
3        7585  2ffa2904-8b47-4817-af26-aa9ac8d2881a          1          3.44   
4        7585  6cb0d85d-bd14-42e3-9c2d-7f99ce437796          1          4.20   
...       ...                                   ...        ...           ...   
2286     7585  fd52691f-c11b-4464-9515-42aab2f0c61d          5        238.64   
2287     7585  656b7495-8667-4026-be53-1d5423daeb92          5        290.20   
2288     7585  248aad43-431e-40e2-a896-540d5c533e0b          5        337.68   
2289     7585  0d5eb466-acfb-4432-a8ca-ea8533f72625          5        385.20   
2290     7585  90523b61-4f2c-40bb-ad54-29bf70c0e6ec          5        434.84   

      team_id  player_id  start_x  star

In [12]:
datafolder = "data-fifa"

# Create data folder if it doesn't exist
if not os.path.exists(datafolder):
    os.mkdir(datafolder)
    print(f"Directory {datafolder} created.")

spadl_h5 = os.path.join(datafolder, "spadl-fifa2018.h5")

# Store all spadl data in h5-file
with pd.HDFStore(spadl_h5) as spadlstore:
    spadlstore["competitions"] = selected_competitions
    spadlstore["games"] = games
    spadlstore["teams"] = teams
    spadlstore["players"] = players[['player_id', 'player_name', 'nickname']].drop_duplicates(subset='player_id')
    spadlstore["player_games"] = players[['player_id', 'game_id', 'team_id', 'is_starter', 'starting_position_id', 'starting_position_name', 'minutes_played']]
    for game_id in actions.keys():
        spadlstore[f"actions/game_{game_id}"] = actions[game_id]