# Features and labels
This notebook generates features (shot distance, angle, etc.) and labels (goal/no goal) for all shots and stores them in an HDF file. Storing intermediate data is good practice: it saves computation time when you want to experiment with multiple pipelines.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from socceraction import spadl
from socceraction import vaep

In [2]:
%load_ext autoreload
%autoreload 2
    
from soccer_xg.data import HDFDataset
import soccer_xg.attributes as fs
import soccer_xg.xg as xg

  from .autonotebook import tqdm as notebook_tqdm


## Configuration

In [3]:
# see 1-load-and-convert-statsbomb-data
# directory holding the data files, relative to the working directory
data_fp = Path("../data")
# The store is opened with mode='a' because the features and labels computed
# further down are written back into this same dataset
# (presumably 'a' is the HDF append / read-write mode — confirm against HDFDataset).
dataset = HDFDataset(data_fp / "spadl-statsbomb-bigfive-1516.h5", mode='a')

## Feature and label generators

By default, all features defined in `soccer_xg.attributes.default_features` are computed. It is also possible to compute a subset of these features or add additional feature generators. Each feature generator is a function that expects either a DataFrame object containing SPADL actions, a list of DataFrame objects containing consecutive SPADL actions (i.e., game states) or the raw provider-specific events. Let's take some data and look at some of these feature generators.

In [4]:
# Use one example game for the feature-generator demonstrations that follow.
game_id = 3890561
game = dataset.games().loc[game_id]
# add_names attaches the *_name columns (e.g. type_name, result_name) that the
# examples below use to select the shots
actions = spadl.utils.add_names(dataset.actions(game_id=game_id))
events = dataset.events(game_id=game_id)

### Action-based features

Feature generators which calculate a set of features based on the shot and all preceding actions. The input is a Pandas DataFrame of actions in SPADL format and a boolean mask to select the shots for which features should be computed.

In [5]:
# Re-orient the actions so that play is expressed left to right.
ltr_actions = spadl.utils.play_left_to_right(actions, game.home_team_id)
# Boolean mask selecting the shots: every shot type, except the ones whose
# result is an own goal or an offside.
is_shot_type = actions["type_name"].isin(["shot", "shot_penalty", "shot_freekick"])
is_discarded_result = actions["result_name"].isin(["owngoal", "offside"])
shot_mask = is_shot_type & ~is_discarded_result
# Compute the shot distance feature for the selected shots and preview it.
shot_dist_features = fs.shot_dist(ltr_actions, shot_mask)
shot_dist_features.head()

Unnamed: 0_level_0,dist_shot
action_id,Unnamed: 1_level_1
151,12.881039
207,8.294462
240,9.495718
359,19.15699
430,14.870452


### Gamestate-based features

Feature generators which calculate a set of features based on the shot and the N previous actions (i.e., shot context). The input is a list of gamestates. Internally each game state is represented as a list of SPADL action dataframes `[a_0, a_1, ...]` where each row in the `a_i` dataframe contains the previous action of the action in the same row in the `a_{i-1}` dataframe. `a_0` is the shot action.

In [6]:
# convert actions to Left-to-Right gamestates
# With nb_prev_actions=3 each game state consists of the action itself (a_0)
# and its two predecessors, which matches the *_a01 / *_a02 columns in the
# output of this cell.
gamestates = vaep.features.gamestates(actions, nb_prev_actions=3)
ltr_gamestates = vaep.features.play_left_to_right(gamestates, game.home_team_id)
# get gamestates corresponding to shots
# (shot_mask from the action-based example is applied to every frame of the game state)
shot_gamestates = [states.loc[shot_mask] for states in ltr_gamestates]
# compute feature
fs.speed(shot_gamestates).head()

Unnamed: 0_level_0,speedx_a01,speedy_a01,speed_a01,speedx_a02,speedy_a02,speed_a02
action_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
151,0.0,0.0,0.0,3.510238,10.442959,11.01713
207,0.0,0.0,0.0,4.817636,5.487703,7.302362
240,0.76087,5.26087,5.315606,0.498826,3.449027,3.484913
359,0.0,0.0,0.0,6.564303,0.420445,6.577754
430,0.0,0.0,0.0,1.135184,1.3233,1.743493


### Event-based features

Feature generators which calculate a set of features based on the original event data. These feature generators are provider-specific. The input is a pandas DataFrame of events and a series with event IDs to select the shots for which features should be computed.

In [7]:
# IDs of the original provider events that the selected shots were converted from
shot_events_idx = actions.loc[shot_mask, "original_event_id"]
# goalkeeper position features for those shot events (first rows only)
fs.statsbomb_goalkeeper_position(events, shot_events_idx).head()

Unnamed: 0,goalkeeper_x,goalkeeper_y,goalkeeper_dist_to_ball,goalkeeper_dist_to_goal,goalkeeper_angle_to_goal
ba46e9d6-e828-4599-952c-39c1f7d22659,104.117647,35.549367,11.5834,1.782999,-1.053111
85d67225-30fb-47c8-b478-cf568941a164,101.470588,32.708861,4.529539,3.758163,0.350701
adac17d3-5e67-4e8c-b482-4bae2f36e06e,103.764706,35.118987,8.715584,1.666759,-0.736036
abffd193-62bc-4c8d-8636-1e3f0f0ebbe5,84.705882,45.792405,4.290909,23.471515,-0.526388
d9cea903-f92a-40e1-a393-1a849d83f157,103.5,37.443038,11.711233,3.755597,-1.15993


### Defining your own feature generator

In [8]:
@fs.ftype("actions")
def rebound(actions, shot_mask):
    """Determine whether the shot was a rebound.

    A shot counts as a rebound when one of the two preceding actions was
    also a shot, happened in the same period and took place less than
    5 seconds earlier.

    Parameters
    ----------
    actions : pd.DataFrame
        The actions of a game in SPADL format, with the ``type_name`` column
        added.
    shot_mask : pd.Series
        A boolean mask to select the shots for which features should be
        computed.

    Returns
    -------
    pd.DataFrame
        A dataframe with a column indicating whether the shot was a rebound
        ('rebound').
    """
    shot_types = ["shot", "shot_penalty", "shot_freekick"]
    max_seconds = 5
    shot = actions.loc[shot_mask]

    def shot_shortly_before(nb_actions_back):
        # Row i of the shifted frame holds the action that happened
        # `nb_actions_back` actions before row i. Rows without such a
        # predecessor are all-NaN, so every comparison below is False for them.
        prev = actions.shift(nb_actions_back).loc[shot_mask]
        return (
            prev["type_name"].isin(shot_types)
            # `time_seconds` restarts every period: without this check a shot
            # right after a restart gets a negative time difference with the
            # last actions of the previous period and passes the `<` test.
            & (prev["period_id"] == shot["period_id"])
            & (shot["time_seconds"] - prev["time_seconds"] < max_seconds)
        )

    # the previous action was a shot, or there was a shot two actions before
    is_rebound = shot_shortly_before(1) | shot_shortly_before(2)
    return pd.DataFrame({"rebound": is_rebound}, index=shot.index)

rebound(ltr_actions, shot_mask).head()

Unnamed: 0_level_0,rebound
action_id,Unnamed: 1_level_1
151,False
207,False
240,False
359,False
430,False


### Computing a list of feature generators

In [9]:
# Feature generators to compute for every shot. Their output columns appear in
# this order in the combined feature table shown in the next cell.
feature_generators = [
    # generic shot features (shot_dist is computed from the SPADL actions, see
    # above; presumably the other two are as well — verify in soccer_xg.attributes)
    fs.shot_dist,
    fs.shot_visible_angle,
    fs.shot_bodypart,
    # StatsBomb-specific features (statsbomb_goalkeeper_position is computed
    # from the raw events, see above; presumably the others are too)
    fs.statsbomb_open_goal,
    fs.statsbomb_first_touch,
    fs.statsbomb_free_projection,
    fs.statsbomb_goalkeeper_position,
    fs.statsbomb_defenders_position,
    fs.statsbomb_assist,
    fs.statsbomb_counterattack,
    fs.statsbomb_shot_impact_height
]

In [10]:
# Compute all selected features (xfns) together with the goal label (yfns) for
# the example game. Note that the actions are passed as loaded from the
# dataset, without the spadl.utils.add_names(...) call used in the earlier examples.
df_features, df_labels = fs.compute_attributes(
    game=dataset.games().loc[3890561], 
    actions=dataset.actions(game_id=3890561), 
    events=dataset.events(game_id=3890561), 
    xfns=feature_generators,
    yfns=[fs.goal_from_shot]
)
# show the features and the label side by side
pd.concat([df_features, df_labels], axis=1).head()

Unnamed: 0_level_0,dist_shot,visible_angle_shot,bodypart_name_shot,open_goal,first_touch,free_projection_gaps,free_projection_pct,goalkeeper_x,goalkeeper_y,goalkeeper_dist_to_ball,goalkeeper_dist_to_goal,goalkeeper_angle_to_goal,dist_to_defender,under_pressure,nb_defenders_in_shot_line,nb_defenders_behind_ball,one_on_one,end_x_assist,end_y_assist,carry_dist,type_assist,height_assist,from_counterattack,impact_height,goal
action_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
151,12.881039,0.465099,foot,False,True,2,0.505297,104.117647,35.549367,11.5834,1.782999,-1.053111,4.374854,False,2,4,False,94.5,42.005063,0.0,cross,high,False,low,False
207,8.294462,0.813487,foot,False,True,2,0.60973,101.470588,32.708861,4.529539,3.758163,0.350701,1.36758,False,0,0,True,96.970588,32.192405,0.0,standard_pass,high,False,low,True
240,9.495718,0.177482,foot,False,True,1,0.138353,103.764706,35.118987,8.715584,1.666759,-0.736036,1.047878,True,1,1,False,,,,,,False,low,False
359,19.15699,0.31927,foot,False,False,1,1.0,84.705882,45.792405,4.290909,23.471515,-0.526388,6.794148,False,0,2,False,80.294118,44.070886,8.708532,through_ball,ground,True,ground,True
430,14.870452,0.320055,foot,False,False,2,0.739963,103.5,37.443038,11.711233,3.755597,-1.15993,2.179715,False,0,3,False,91.147059,51.21519,6.792372,standard_pass,ground,False,ground,False


## Compute features and labels

We can easily compute all features and labels for an entire dataset.

In [11]:
# Compute the selected features and the labels for the entire dataset.
# This is slow: the recorded run below took about 15 minutes for 1823 iterations.
X, y = xg.prepare(dataset, xfns=feature_generators)

Preparing dataset: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [15:09<00:00,  2.00it/s]


In [12]:
# A categorical dtype cannot be stored in the HDF file, so categorical columns
# are cast to plain 'object' columns before each frame is written.
for key, frame in (("xg/features", X), ("xg/labels", y)):
    categorical_columns = frame.select_dtypes(include='category').columns
    dataset[key] = frame.astype({column: 'object' for column in categorical_columns})

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['bodypart_name_shot', 'type_assist', 'height_assist', 'impact_height'], dtype='object')]

  dataset["xg/features"] = X.astype({c: 'object' for c in X.select_dtypes(include='category').columns})


## Load features and labels

In [13]:
# read both stored tables back from the dataset and preview their first rows
for key in ("xg/features", "xg/labels"):
    display(dataset[key].head())

Unnamed: 0_level_0,Unnamed: 1_level_0,dist_shot,visible_angle_shot,bodypart_name_shot,open_goal,first_touch,free_projection_gaps,free_projection_pct,goalkeeper_x,goalkeeper_y,goalkeeper_dist_to_ball,goalkeeper_dist_to_goal,goalkeeper_angle_to_goal,dist_to_defender,under_pressure,nb_defenders_in_shot_line,nb_defenders_behind_ball,one_on_one,end_x_assist,end_y_assist,carry_dist,type_assist,height_assist,from_counterattack,impact_height
game_id,action_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
3890561,151,12.881039,0.465099,foot,False,True,2.0,0.505297,104.117647,35.549367,11.5834,1.782999,-1.053111,4.374854,False,2.0,4.0,False,94.5,42.005063,0.0,cross,high,False,low
3890561,207,8.294462,0.813487,foot,False,True,2.0,0.60973,101.470588,32.708861,4.529539,3.758163,0.350701,1.36758,False,0.0,0.0,True,96.970588,32.192405,0.0,standard_pass,high,False,low
3890561,240,9.495718,0.177482,foot,False,True,1.0,0.138353,103.764706,35.118987,8.715584,1.666759,-0.736036,1.047878,True,1.0,1.0,False,,,,,,False,low
3890561,359,19.15699,0.31927,foot,False,False,1.0,1.0,84.705882,45.792405,4.290909,23.471515,-0.526388,6.794148,False,0.0,2.0,False,80.294118,44.070886,8.708532,through_ball,ground,True,ground
3890561,430,14.870452,0.320055,foot,False,False,2.0,0.739963,103.5,37.443038,11.711233,3.755597,-1.15993,2.179715,False,0.0,3.0,False,91.147059,51.21519,6.792372,standard_pass,ground,False,ground


Unnamed: 0_level_0,Unnamed: 1_level_0,goal
game_id,action_id,Unnamed: 2_level_1
3890561,151,False
3890561,207,True
3890561,240,False
3890561,359,True
3890561,430,False


In [14]:
# close the dataset now that the features and labels have been stored
dataset.close()