# Data Extraction
Information Geometry Soft Clustering for Sport Analytics

Joaquin Garay

In [18]:
#!pip install socceraction==1.5.3 pandera==0.15.2 multimethod==1.9.1 jupyter notebook ipykernel

In [19]:
import os
import warnings

import pandas as pd
import tqdm

# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Silence warnings that would otherwise clutter the cell outputs.
warnings.simplefilter("ignore", category=FutureWarning)
warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)
warnings.filterwarnings(
    "ignore",
    message="credentials were not supplied. open data access only",
)

In [20]:
%load_ext autoreload
%autoreload 2

#socceraction version==1.5.3
#pandera version==0.15.2
#multimethod version==1.9.1

from socceraction.data.statsbomb import StatsBombLoader
import socceraction.spadl as spadl

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
# Remote loader with no credentials supplied (both fields are None);
# presumably this limits access to the open-data subset -- confirm against the loader docs.
SBL = StatsBombLoader(
    getter="remote",
    creds={"user": None, "passwd": None},
)

In [22]:
# Fetch the competitions table from the loader and display the distinct competition names
competitions = SBL.competitions()
set(competitions["competition_name"])

{'1. Bundesliga',
 'African Cup of Nations',
 'Champions League',
 'Copa America',
 'Copa del Rey',
 "FA Women's Super League",
 'FIFA U20 World Cup',
 'FIFA World Cup',
 'Indian Super league',
 'La Liga',
 'Liga Profesional',
 'Ligue 1',
 'Major League Soccer',
 'NWSL',
 'North American League',
 'Premier League',
 'Serie A',
 'UEFA Euro',
 'UEFA Europa League',
 "UEFA Women's Euro",
 "Women's World Cup"}

In [23]:
# FIFA World Cup, restricted to the 2018 season
is_world_cup = competitions.competition_name == "FIFA World Cup"
is_2018_season = competitions.season_name == "2018"
selected_competitions = competitions[is_world_cup & is_2018_season]

# Alternative selection: Premier League (all seasons)
#selected_competitions = competitions[competitions.competition_name == "Premier League"]

In [24]:
# Preview the competition/season rows kept by the selection above.
selected_competitions.head()

Unnamed: 0,season_id,competition_id,competition_name,country_name,competition_gender,season_name
30,3,43,FIFA World Cup,International,male,2018


In [25]:
# Load the game list of each selected (competition, season) pair and stack them into one frame
games_per_competition = [
    SBL.games(competition_id, season_id)
    for competition_id, season_id in zip(
        selected_competitions.competition_id,
        selected_competitions.season_id,
    )
]
games = pd.concat(games_per_competition)
#games[["home_team_id", "away_team_id", "game_date", "home_score", "away_score"]]
# Summary statistics of the loaded games
games.describe()

Unnamed: 0,game_id,season_id,competition_id,game_day,game_date,home_team_id,away_team_id,home_score,away_score
count,64.0,64.0,64.0,64.0,64,64.0,64.0,64.0,64.0
mean,7693.21875,3.0,43.0,2.71875,2018-06-25 20:06:33.750000128,782.28125,782.734375,1.421875,1.21875
min,7525.0,3.0,43.0,1.0,2018-06-14 17:00:00,768.0,768.0,0.0,0.0
25%,7543.75,3.0,43.0,1.75,2018-06-19 19:15:00,775.0,774.0,0.0,0.75
50%,7559.5,3.0,43.0,2.5,2018-06-25 06:00:00,782.0,782.5,1.0,1.0
75%,7578.25,3.0,43.0,3.25,2018-06-29 07:00:00,789.25,791.25,2.0,2.0
max,8658.0,3.0,43.0,7.0,2018-07-15 17:00:00,799.0,799.0,6.0,3.0
std,366.178223,0.0,0.0,1.537559,,8.8344,9.620154,1.354647,0.916667


## Load and Convert data

In [26]:
# Download teams, players and events for every game, and convert the events of
# each game to SPADL actions. A progress bar tracks the loop.
games_verbose = tqdm.tqdm(list(games.itertuples()), desc="Loading game data")
team_frames = []
player_frames = []
actions = {}  # game_id -> converted actions for that game
for game in games_verbose:
    game_id = game.game_id
    # load data
    team_frames.append(SBL.teams(game_id))
    player_frames.append(SBL.players(game_id))
    events = SBL.events(game_id)
    # convert data
    actions[game_id] = spadl.statsbomb.convert_to_actions(
        events,  # StatsBomb events of this single game
        home_team_id=game.home_team_id,  # ID of the home team in this game
        xy_fidelity_version=1,
        shot_fidelity_version=1,
    )

# The same team shows up in several games: keep a single row per team_id.
teams = pd.concat(team_frames).drop_duplicates(subset="team_id")
# Player rows are stacked as-is (not de-duplicated here).
players = pd.concat(player_frames)

Loading game data: 100%|██████████| 64/64 [00:47<00:00,  1.35it/s]


In [27]:
# Preview the de-duplicated teams table.
teams.head()

Unnamed: 0,team_id,team_name
0,768,England
1,769,Colombia
0,782,Belgium
0,773,Switzerland
1,790,Sweden


In [28]:
# Preview the concatenated per-game players table.
players.head()

Unnamed: 0,game_id,team_id,player_id,player_name,nickname,jersey_number,is_starter,starting_position_id,starting_position_name,minutes_played
0,7585,768,3094,Bamidele Alli,Dele Alli,20,True,20,Left Attacking Midfield,84
1,7585,768,3205,Kyle Walker,,2,True,3,Right Center Back,123
2,7585,768,3233,Raheem Sterling,Raheem Sterling,10,True,22,Right Center Forward,91
3,7585,768,3244,John Stones,,5,True,4,Center Back,132
4,7585,768,3293,Jesse Lingard,,7,True,18,Right Attacking Midfield,132


In [29]:
datafolder = "data"
filename = "spadl-fifa2018"

# Create the data folder if it doesn't exist. makedirs also creates any missing
# parent directories, so a nested datafolder (e.g. "data/raw") works too.
if not os.path.isdir(datafolder):
    os.makedirs(datafolder, exist_ok=True)
    print(f"Directory {datafolder} created.")

spadl_h5 = os.path.join(datafolder, f'{filename}.h5')

# Columns describing a player regardless of game, and columns describing a
# player's participation in one specific game.
player_identity_columns = ['player_id', 'player_name', 'nickname']
player_game_columns = [
    'player_id', 'game_id', 'team_id', 'is_starter',
    'starting_position_id', 'starting_position_name', 'minutes_played',
]

# Store all spadl data in h5-file.
# mode="w" starts from an empty file on every run: in the default append mode,
# "actions/game_*" keys written by an earlier run with a different selection
# would stay in the file, and HDF5 does not reclaim the space of overwritten
# nodes, so the file would grow on each re-run.
with pd.HDFStore(spadl_h5, mode="w") as spadlstore:
    spadlstore["competitions"] = selected_competitions
    spadlstore["games"] = games
    spadlstore["teams"] = teams
    # One row per player
    spadlstore["players"] = players[player_identity_columns].drop_duplicates(subset='player_id')
    # Per-game player rows
    spadlstore["player_games"] = players[player_game_columns]
    # One key per game holding that game's converted actions
    for game_id, game_actions in actions.items():
        spadlstore[f"actions/game_{game_id}"] = game_actions