# Creating a training dataset

This notebook creates a dataset of passes and generates features and labels.

In [1]:
from pathlib import Path

import pandas as pd
# Never truncate columns when displaying wide DataFrames.
pd.options.display.max_columns = None

In [2]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell runs and edits to source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from unxpass.databases import SQLiteDatabase
from unxpass.datasets import PassesDataset

## Configure folder names

First, we define where the processed data should be stored.

In [4]:
# Root folder for all stored data, given relative to the working directory.
DATA_DIR = Path("..") / "stores"

## Create database connection

We need a database with StatsBomb 360 data to extract passes from.

In [5]:
# The SQLite file lives directly inside the data folder.
DB_PATH = DATA_DIR.joinpath("database.sqlite")
# Database handle shared by every cell below; it is closed in the last cell.
db = SQLiteDatabase(DB_PATH)

In [6]:
from socceraction.spadl.utils import add_names

# Game used as the worked example in the cells below
game_id = 3795107

# Fetch the game's SPADL actions and attach the readable
# type / result / bodypart name columns
df_actions = db.actions(game_id).pipe(add_names)
df_actions.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,original_event_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,bodypart_id,type_id,result_id,possession_team_id,play_pattern_name,under_pressure,extra,visible_area_360,in_visible_area_360,freeze_frame_360,type_name,result_name,bodypart_name
game_id,action_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3795107,0,cac7b0b7-e051-4266-874a-37d5185c8a4e,1,1.0,782,3289,52.058824,34.43038,46.941176,37.61519,5,0,1,782.0,From Kick Off,0,"{'pass': {'recipient': {'id': 5642, 'name': 'A...","[[39.95293111994783, 2.994426202079026], [9.12...",1,"[{'teammate': True, 'actor': False, 'keeper': ...",pass,success,foot_right
3795107,1,5c1ef827-e053-44f8-a0f0-b312a3ca093d,1,1.0,782,5642,46.941176,37.61519,44.470588,39.853165,0,21,1,782.0,From Kick Off,0,"{'carry': {'end_location': [51.4, 33.7]}}","[[40.54724912231884, 3.620299352250072], [11.6...",1,"[{'teammate': True, 'actor': False, 'keeper': ...",dribble,success,foot
3795107,2,ba26ec0b-3274-481c-90d1-c58bd71bc81f,1,2.0,782,5642,44.470588,39.853165,29.205882,53.797468,5,0,1,782.0,From Kick Off,0,"{'pass': {'recipient': {'id': 3077, 'name': 'J...","[[38.72217799304041, 0.04784815699034084], [8....",1,"[{'teammate': True, 'actor': False, 'keeper': ...",pass,success,foot_right
3795107,3,1783a21c-bb70-4353-bed4-5883f66a1f68,1,4.0,782,3077,29.205882,53.797468,28.235294,56.207595,0,21,1,782.0,From Kick Off,0,"{'carry': {'end_location': [33.0, 14.7]}}","[[36.00339793873086, 3.8648379122013665], [6.9...",1,"[{'teammate': True, 'actor': False, 'keeper': ...",dribble,success,foot
3795107,4,f8b6772a-4272-4aa1-8eda-184d2a8248a8,1,6.0,782,3077,28.235294,56.207595,35.382353,67.483544,4,0,1,782.0,From Kick Off,0,"{'pass': {'recipient': {'id': 5632, 'name': 'T...","[[33.14790413631122, 6.700149200225219], [2.90...",1,"[{'teammate': True, 'actor': False, 'keeper': ...",pass,success,foot_left


## Select passes

We only use passes that
- are performed by foot
- are part of open play
- have both their start and end location inside the 360 snapshot

In [7]:
# Apply the dataset's action filter to pick out the passes of interest.
# The result is reused further down as a row selector via `.loc`.
passes_idx = PassesDataset.actionfilter(df_actions)
# Preview the first few selected passes
df_actions.loc[passes_idx].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,original_event_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,bodypart_id,type_id,result_id,possession_team_id,play_pattern_name,under_pressure,extra,visible_area_360,in_visible_area_360,freeze_frame_360,type_name,result_name,bodypart_name
game_id,action_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3795107,0,cac7b0b7-e051-4266-874a-37d5185c8a4e,1,1.0,782,3289,52.058824,34.43038,46.941176,37.61519,5,0,1,782.0,From Kick Off,0,"{'pass': {'recipient': {'id': 5642, 'name': 'A...","[[39.95293111994783, 2.994426202079026], [9.12...",1,"[{'teammate': True, 'actor': False, 'keeper': ...",pass,success,foot_right
3795107,2,ba26ec0b-3274-481c-90d1-c58bd71bc81f,1,2.0,782,5642,44.470588,39.853165,29.205882,53.797468,5,0,1,782.0,From Kick Off,0,"{'pass': {'recipient': {'id': 3077, 'name': 'J...","[[38.72217799304041, 0.04784815699034084], [8....",1,"[{'teammate': True, 'actor': False, 'keeper': ...",pass,success,foot_right
3795107,4,f8b6772a-4272-4aa1-8eda-184d2a8248a8,1,6.0,782,3077,28.235294,56.207595,35.382353,67.483544,4,0,1,782.0,From Kick Off,0,"{'pass': {'recipient': {'id': 5632, 'name': 'T...","[[33.14790413631122, 6.700149200225219], [2.90...",1,"[{'teammate': True, 'actor': False, 'keeper': ...",pass,success,foot_left
3795107,10,c8e37b3e-a26f-46c7-b9fe-0fb8bb29a0ea,1,14.0,782,20005,6.882353,23.240506,22.941176,5.939241,5,0,1,782.0,From Kick Off,0,"{'pass': {'recipient': {'id': 3176, 'name': 'T...","[[21.532267836858974, 0.0], [0.0, 21.529731410...",1,"[{'teammate': True, 'actor': False, 'keeper': ...",pass,success,foot_right
3795107,34,af79c04b-94e3-429c-a34a-8c3bbc7d8bfb,1,50.0,782,5642,66.529412,54.227848,69.0,45.534177,5,0,1,782.0,From Counter,0,"{'pass': {'recipient': {'id': 3089, 'name': 'K...","[[55.557024445034784, 18.836771932086435], [56...",1,"[{'teammate': True, 'actor': False, 'keeper': ...",pass,success,foot_right


## Compute features and labels

The `unxpass.features` and `unxpass.labels` modules implement various feature generation and labeling functions, respectively.

In [8]:
from unxpass import features as fs
from unxpass import labels as ls

# Print the names of all available feature functions, then of all label functions
for title, fns in (("Features:", fs.all_features), ("Labels:", ls.all_labels)):
    print(title, [fn.__name__ for fn in fns])

Features: ['actiontype', 'actiontype_onehot', 'result', 'result_onehot', 'actiontype_result_onehot', 'bodypart', 'bodypart_onehot', 'time', 'startlocation', 'relative_startlocation', 'endlocation', 'relative_endlocation', 'startpolar', 'endpolar', 'movement', 'team', 'time_delta', 'space_delta', 'goalscore', 'angle', 'under_pressure', 'packing_rate', 'ball_height', 'ball_height_onehot', 'player_possession_time', 'speed', 'nb_opp_in_path', 'dist_defender', 'freeze_frame_360', 'defenders_in_3m_radius', 'defenders_in_5m_radius']
Labels: ['scores', 'scores_xg', 'concedes', 'concedes_xg', 'success', 'receiver']


As some of these functions require data of the entire game (e.g., to determine the current scoreline), they should always be applied to the game state representation of the full game. The relevant actions can be selected afterwards.

In [9]:
from socceraction.vaep.features import gamestates as to_gamestates
from unxpass.utils import play_left_to_right

# Convert the actions of the full game to game states. With nb_prev_actions=3
# each state spans three actions, which is why the feature columns shown
# further down carry the suffixes _a0, _a1 and _a2.
#
# The away team id is not needed here. It is bound to a named throwaway
# instead of `_`, because assigning to `_` in an IPython kernel stops the
# automatic "last output" variable from being updated.
home_team_id, _away_team_id = db.get_home_away_team_id(game_id)
# NOTE(review): play_left_to_right presumably normalizes the direction of play
# using the home team id -- confirm against unxpass.utils.
gamestates = play_left_to_right(
    to_gamestates(df_actions, nb_prev_actions=3),
    home_team_id,
)

In [10]:
# Compute one feature and one label on the full game, place them side by
# side, and only afterwards keep the rows of the selected passes
pd.concat(
    [fs.actiontype(gamestates), ls.success(df_actions)],
    axis="columns",
).loc[passes_idx]

Unnamed: 0_level_0,Unnamed: 1_level_0,type_id_a0,type_id_a1,type_id_a2,success
game_id,action_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3795107,0,0,0,0,True
3795107,2,0,21,0,True
3795107,4,0,21,0,True
3795107,10,0,21,0,True
3795107,34,0,19,21,True
3795107,...,...,...,...,...
3795107,2187,0,21,0,True
3795107,2189,0,21,0,True
3795107,2195,0,21,7,True
3795107,2197,0,21,0,True


In [11]:
# Shorthand for the cell above: the helpers take the database, the game id,
# the feature/label functions and the action filter in a single call
pd.concat(
    [
        fs.get_features(
            db,
            game_id,
            xfns=[fs.actiontype],
            actionfilter=PassesDataset.actionfilter,
        ),
        ls.get_labels(
            db,
            game_id,
            yfns=[ls.success],
            actionfilter=PassesDataset.actionfilter,
        ),
    ],
    axis="columns",
)

Unnamed: 0_level_0,Unnamed: 1_level_0,type_id_a0,type_id_a1,type_id_a2,success
game_id,action_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3795107,0,0,0,0,True
3795107,2,0,21,0,True
3795107,4,0,21,0,True
3795107,10,0,21,0,True
3795107,34,0,19,21,True
3795107,...,...,...,...,...
3795107,2187,0,21,0,True
3795107,2189,0,21,0,True
3795107,2195,0,21,7,True
3795107,2197,0,21,0,True


## The "PassesDataset" interface

To make things easier, we provide an interface that does all of the above. Additionally, it can store all computed features and labels locally. This is recommended when experimenting with multiple model configurations. It also functions as a PyTorch dataset.

In [12]:
# Configure the dataset:
#   path -- folder under DATA_DIR used for this dataset
#   xfns -- feature functions to compute, referenced by name
#   yfns -- label functions to compute, referenced by name
dataset = PassesDataset(
    path=DATA_DIR / "datasets" / "euro2020",
    xfns=["actiontype"],
    yfns=["success"]
)
# Compute the configured features and labels from the database
dataset.create(db)

You can now retrieve the computed features and labels as a Pandas DataFrame.

In [13]:
# Computed features as a DataFrame indexed by (game_id, action_id)
dataset.features

Unnamed: 0_level_0,Unnamed: 1_level_0,type_id_a0,type_id_a1,type_id_a2
game_id,action_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3788741,28,0,21,19
3788741,30,0,21,0
3788741,32,0,21,0
3788741,34,0,21,0
3788741,52,0,21,0
...,...,...,...,...
3795506,2776,0,21,0
3795506,2780,0,21,0
3795506,2782,0,21,0
3795506,2786,0,21,0


In [14]:
# Computed labels as a DataFrame indexed by (game_id, action_id)
dataset.labels

Unnamed: 0_level_0,Unnamed: 1_level_0,success
game_id,action_id,Unnamed: 2_level_1
3788741,28,True
3788741,30,True
3788741,32,True
3788741,34,False
3788741,52,True
...,...,...
3795506,2776,True
3795506,2780,True
3795506,2782,True
3795506,2786,True


Alternatively, you can index into (or iterate over) the dataset; each example is returned as a dictionary with its features and labels.

In [15]:
# Fetch the first example by position
dataset[0]

{'game_id': 3788741,
 'action_id': 28,
 'type_id_a0': 0,
 'type_id_a1': 21,
 'type_id_a2': 19,
 'success': True}

In [16]:
# Close the database handle opened at the top of the notebook
db.close()