In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from mplsoccer import Standardizer
from mplsoccer.dimensions import create_pitch_dims

Load the data

In [2]:
STATSBOMB = os.path.join('..', '..', 'data', 'statsbomb')

In [3]:
df_statsbomb_match = pd.read_parquet(os.path.join(STATSBOMB, 'match.parquet'))

In [4]:
df_statsbomb_related = pd.read_parquet(os.path.join(STATSBOMB, 'related.parquet'))

In [5]:
df_statsbomb_event = pd.read_parquet(os.path.join(STATSBOMB, 'event.parquet'))

In [6]:
df_statsbomb_lineup = pd.read_parquet(os.path.join(STATSBOMB, 'lineup.parquet'))
df_statsbomb_lineup = df_statsbomb_lineup[['player_id', 'country_name']].copy()
df_statsbomb_lineup.drop_duplicates('player_id', inplace=True)

Standardize coordinates

In [7]:
standard = Standardizer(pitch_from='statsbomb', pitch_to='uefa')

new_x, new_y = standard.transform(df_statsbomb_event.x, df_statsbomb_event.y)
df_statsbomb_event['x'] = new_x
df_statsbomb_event['y'] = new_y

new_end_x, new_end_y = standard.transform(df_statsbomb_event.end_x, df_statsbomb_event.end_y)
df_statsbomb_event['end_x'] = new_end_x
df_statsbomb_event['end_y'] = new_end_y

Get goalkeeper for shots

In [8]:
df_gk = df_statsbomb_related.loc[(df_statsbomb_related.type_name == 'Shot') & 
                                 (df_statsbomb_related.type_name_related == 'Goal Keeper'), ['id', 'id_related']]
df_gk = df_gk.merge((df_statsbomb_event[['id', 'player_id', 'type_name', 'sub_type_name']]
                     .rename({'id': 'id_related'}, axis=1)), how='left', on='id_related')

Create a dataframe of the strongest foot

Assumptions:
- based on all events with body_part_name left or right
- both feet if the percentage of events on either foot are within 10 percentage points of each other (if the players have more than 20 events)
- left foot if left is used more often (other than when marked both feet)
- otherwise right foot

In [9]:
df_foot = df_statsbomb_event[df_statsbomb_event.body_part_name.isin(['Right Foot', 'Left Foot'])].copy()
df_foot = df_foot.groupby('player_id').body_part_name.value_counts()
df_foot = df_foot.unstack()
df_foot.reset_index(drop=False, inplace=True)
df_foot.index.name = ''
df_foot.rename({'Left Foot': 'left', 'Right Foot': 'right'}, axis=1, inplace=True)
df_foot.fillna(0, inplace=True)
n_total = df_foot[['left', 'right']].sum(axis=1)
df_foot['left'] = df_foot['left'] / n_total
df_foot['right'] = df_foot['right'] / n_total
df_foot.loc[(np.abs(df_foot.left - df_foot.right) < 0.1) & n_total > 20, 'foot'] = 'both'
df_foot.loc[(df_foot.foot.isnull()) & (df_foot.left > df_foot.right), 'foot'] = 'left'
df_foot.loc[(df_foot.foot.isnull()), 'foot'] = 'right'

Fast attack, win ball in own third, shoot in last quarter in 7-25 seconds

In [10]:
df_statsbomb_event['eventSec'] = df_statsbomb_event.timestamp.apply(lambda x: x.minute * 60 + x.second + x.microsecond / 1000000)
mask_gk = ((df_statsbomb_event.type_name == 'Goal Keeper') & 
           (~df_statsbomb_event.sub_type_name.isin(['Shot Faced', 'Goal Conceded', 'Penalty Conceded'])))
mask_50 =((df_statsbomb_event.type_name == '50/50') & 
          (df_statsbomb_event.outcome_name.isin(['Success To Team', 'Won','Success To Opposition'])))
mask_other = (df_statsbomb_event.type_name == 'Interception') | (df_statsbomb_event.sub_type_name == 'Tackle')
mask_defence_win = (mask_gk | mask_50 | mask_other) & (df_statsbomb_event.x < 35)
df_statsbomb_event.loc[mask_defence_win, 'defence_win'] = df_statsbomb_event.loc[mask_defence_win, 'team_id']
df_statsbomb_event.loc[mask_defence_win, 'defence_sec'] = df_statsbomb_event.loc[mask_defence_win, 'eventSec']
group_match = df_statsbomb_event.groupby(['match_id', 'period'])
df_statsbomb_event[['defence_win', 'defence_sec']] = group_match[['defence_win', 'defence_sec']].ffill()
mask_fast = (((df_statsbomb_event.eventSec - df_statsbomb_event.defence_sec) <= 25) &
             (df_statsbomb_event.x > 78.75) &
             (df_statsbomb_event.type_name == 'Shot') & (df_statsbomb_event.team_id == df_statsbomb_event.defence_win))
df_statsbomb_event['fast_break'] = mask_fast

Add on previous info

In [11]:
# first filter out some events so the previous event is the correct assist type
on_ball = ['Ball Recovery','Block','Clearance','Dispossessed','Dribble','Referee Ball-Drop','Shield',
           'Error','Foul Won','Goal Keeper','Interception','Miscontrol','Own Goal Against','Pass','Shot']
# won 50/50s
mask_50_50_success = ((df_statsbomb_event.type_name=='50/50')&
                      (df_statsbomb_event.outcome_name.isin(['Won','Success To Team','Success To Opposition'])))
# off ball goal keeper
mask_goalkeeper_off_ball = df_statsbomb_event.sub_type_name.isin(['Shot Faced', 'Goal Conceded',
                                                                  'Penalty Conceded', 'Smother'])
# all on the ball events
mask_on_ball = ((df_statsbomb_event.type_name.isin(on_ball))&(~mask_goalkeeper_off_ball))|(mask_50_50_success)
df_statsbomb_event = df_statsbomb_event[mask_on_ball].copy()
match_group = df_statsbomb_event.groupby(['match_id', 'period'])
for i in range(1, 4):
    df_statsbomb_event[f'prev_id_{i}'] = match_group.id.shift(i)
    df_statsbomb_event[f'prev_type_name_{i}'] = match_group.type_name.shift(i)
    df_statsbomb_event[f'prev_player_id_{i}'] = match_group.player_id.shift(i)
    df_statsbomb_event[f'prev_team_id_{i}'] = match_group.team_id.shift(i)

Add set piece column

In [12]:
mask_set_piece = df_statsbomb_event.sub_type_name.isin(['Throw-in', 'Corner', 'Free Kick'])
df_statsbomb_event.loc[mask_set_piece, 'set_piece'] = df_statsbomb_event.loc[mask_set_piece, 'sub_type_name']

Flag 10 seconds from a corner or freekick/ 20 seconds from a throw-in

In [13]:
group_match = df_statsbomb_event.groupby(['match_id', 'period'])
for set_piece in ['Corner', 'Throw-in', 'Free Kick']:
    mask = df_statsbomb_event.set_piece == set_piece
    name = set_piece.replace(' ', '_').replace('-', '_').lower()
    df_statsbomb_event.loc[mask, f'{name}_sec'] = df_statsbomb_event.loc[mask, 'eventSec']
    df_statsbomb_event.loc[mask, f'{name}_team'] = df_statsbomb_event.loc[mask, 'team_id']
    df_statsbomb_event[f'{name}_sec'] = group_match[f'{name}_sec'].ffill()
    df_statsbomb_event[f'{name}_team'] = group_match[f'{name}_team'].ffill()
    df_statsbomb_event[f'{name}_sec'] = df_statsbomb_event.eventSec - df_statsbomb_event[f'{name}_sec']
df_statsbomb_event.loc[df_statsbomb_event.throw_in_sec > 20, 'throw_in_sec'] = np.nan
df_statsbomb_event.loc[df_statsbomb_event.free_kick_sec > 10, 'free_kick_sec'] = np.nan
df_statsbomb_event.loc[df_statsbomb_event.corner_sec > 10, 'corner_sec'] = np.nan
df_statsbomb_event['play_type'] = df_statsbomb_event[['throw_in_sec', 'free_kick_sec', 'corner_sec']].idxmin(axis=1).str[:-4]
# if throw-in and defensive set to null
mask_defensive = ((df_statsbomb_event.play_type == 'throw_in') &
                  (df_statsbomb_event['throw_in_team'] != df_statsbomb_event.team_id))
df_statsbomb_event.loc[mask_defensive, 'play_type'] = np.nan

  df_statsbomb_event['play_type'] = df_statsbomb_event[['throw_in_sec', 'free_kick_sec', 'corner_sec']].idxmin(axis=1).str[:-4]


Add player name

In [14]:
player_name_series = df_statsbomb_event.player_name.str.strip().str.replace(pat='"', repl="'").str.split(' ')
df_statsbomb_event['firstName'] = player_name_series.apply(lambda x: x[0] if isinstance(x, list) else None)
df_statsbomb_event['middleName'] = player_name_series.apply(lambda x: ' '.join(x[1:-1]) if isinstance(x, list) else None)
df_statsbomb_event['middleName'] = df_statsbomb_event['middleName'].str.strip()
df_statsbomb_event['lastName'] = player_name_series.apply(lambda x: x[-1] if isinstance(x, list) else None)
df_statsbomb_event['Name'] = ((df_statsbomb_event['firstName'] + ' ' + df_statsbomb_event['middleName']).str.strip()
                              + ' ' + df_statsbomb_event['lastName'])

### Filter Shots

In [15]:
df_statsbomb_shots = (df_statsbomb_event[(df_statsbomb_event.type_name=='Shot')&
                                        (~df_statsbomb_event.sub_type_name.isin(['Penalty', 'Corner']))]
                      .dropna(how='all', axis=1)
                      .copy())
print('Number of shots:',len(df_statsbomb_shots))
print('Number of goals:', (df_statsbomb_shots.outcome_name == 'Goal').sum())

Number of shots: 85773
Number of goals: 8687


Add on competition gender

In [16]:
df_statsbomb_shots = df_statsbomb_shots.merge(df_statsbomb_match[['match_id', 'home_team_gender']].rename({'home_team_gender': 'competition_gender'},
                                                                                                          axis='columns'),
                                              on='match_id', how='left')
# fill in when the match dataset is missing
women = [7298, 22536, 265905, 266234, 266466, 266574, 266933, 267161, 267405, 267609, 267679]
men = [18235, 18236, 18237, 18240, 18241, 18242, 18245, 18236, 18237, 18240, 18241, 18242, 18243, 18244, 18245]
df_statsbomb_shots.loc[df_statsbomb_shots.match_id.isin(women), 'competitio_gender'] = 'female'
df_statsbomb_shots.loc[df_statsbomb_shots.match_id.isin(men), 'competition_gender'] = 'male'

In [17]:
additional_higher_fidelity_shots = [22536, 265905, 266234, 266466, 266574, 266933, 267161, 267405,
                                    267609, 267679]
df_statsbomb_shots = df_statsbomb_shots.merge(df_statsbomb_match[['match_id', 'metadata_shot_fidelity_version']],
                                              on='match_id', how='left', validate='m:1')
df_statsbomb_shots.loc[df_statsbomb_shots.match_id.isin(additional_higher_fidelity_shots),
                       'metadata_shot_fidelity_version'] = '2'

Add on pass assist information

In [18]:
pass_cols = ['pass_assisted_shot_id', 'end_x', 'end_y', 'pass_height_name',
             'pass_switch', 'technique_name', 'pass_cross', 'pass_cut_back']
df_pass = df_statsbomb_event.loc[df_statsbomb_event.pass_assisted_shot_id.notnull(), pass_cols]
df_pass.rename({'pass_assisted_shot_id': 'id', 'end_x': 'pass_end_x', 'end_y': 'pass_end_y'}, axis=1, inplace=True)
df_pass['pass_switch'].fillna(False, inplace=True)
df_pass['pass_cross'].fillna(False, inplace=True)
df_pass['pass_cut_back'].fillna(False, inplace=True)
df_statsbomb_shots = df_statsbomb_shots.merge(df_pass, on='id', how='left', suffixes=['', '_pass'])
df_statsbomb_shots.rename({'technique_name_pass': 'pass_technique_name'}, axis=1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_pass['pass_switch'].fillna(False, inplace=True)
  df_pass['pass_switch'].fillna(False, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_pass['pass_cross'].fillna(False, inplace=True)
  df_pass['pass_cross'].fillna(False, inplace=True)
The behavior will change i

Simplify the pass height name column

In [19]:
df_statsbomb_shots.pass_height_name.replace({'Ground Pass': 'Ground/ Low Pass',
                                             'Low Pass': 'Ground/ Low Pass'}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_statsbomb_shots.pass_height_name.replace({'Ground Pass': 'Ground/ Low Pass',


Add carry length

In [20]:
df_statsbomb_shots['carry_length'] = (((df_statsbomb_shots.x - df_statsbomb_shots.pass_end_x)) ** 2 +
                                      ((df_statsbomb_shots.y - df_statsbomb_shots.pass_end_y)) ** 2) ** 0.5

Simplify body part name. As wyscout just has either foot and other

In [21]:
df_statsbomb_shots['body_part_name'] = df_statsbomb_shots.body_part_name.replace({'Head': 'Other'})

Merge on strongest foot

In [22]:
df_statsbomb_shots = df_statsbomb_shots.merge(df_foot, how='left', on='player_id')

Flag for whether shot is on the strongest foot

In [23]:
df_statsbomb_shots['strong_foot'] = (((df_statsbomb_shots.foot.isin(['left', 'both'])) &
                                     (df_statsbomb_shots.body_part_name == 'Left Foot')) |
                                    ((df_statsbomb_shots.foot.isin(['right', 'both'])) & 
                                     (df_statsbomb_shots.body_part_name == 'Right Foot')))

Angles/ distance to goals

In [24]:
dim_uefa = create_pitch_dims(pitch_type='uefa')
goal_width = dim_uefa.goal_width
dx = abs(dim_uefa.right - df_statsbomb_shots.x)
dy = abs(dim_uefa.center_width - df_statsbomb_shots.y)
df_statsbomb_shots['visible_angle'] = np.arctan2(goal_width * dx , (dx**2 + dy**2 - (goal_width / 2.) ** 2))
df_statsbomb_shots['middle_angle'] = np.arctan2(dy, dx)
df_statsbomb_shots['distance_to_goal'] = round((dy**2 + dx**2)**0.5, 1)

Interaction between angle and distance

In [25]:
df_statsbomb_shots['distance_visible_angle'] = df_statsbomb_shots.distance_to_goal * df_statsbomb_shots.visible_angle

Log distance

In [26]:
df_statsbomb_shots['log_distance_to_goal'] = np.log(df_statsbomb_shots.distance_to_goal)

Counter attack

In [27]:
df_statsbomb_shots['counter_attack'] = df_statsbomb_shots.play_pattern_name == 'From Counter'

Assist type

In [28]:
df_statsbomb_shots.loc[df_statsbomb_shots.pass_end_x.notnull(), 'assist_type'] = 'pass'
df_statsbomb_shots.loc[df_statsbomb_shots.sub_type_name.isin(['Free Kick', 'Corner', 'Kick Off']), 'assist_type'] = 'direct'
df_statsbomb_shots.loc[df_statsbomb_shots.assist_type.isnull() &
                       ((df_statsbomb_shots.prev_type_name_3.isin(['Shot', 'Goal Keeper'])) |
                        (df_statsbomb_shots.prev_type_name_2.isin(['Shot', 'Goal Keeper'])) |
                        (df_statsbomb_shots.prev_type_name_1.isin(['Shot', 'Goal Keeper']))), 'assist_type'] = 'rebound'
df_statsbomb_shots.loc[df_statsbomb_shots.assist_type.isnull() &
                       ((df_statsbomb_shots.prev_type_name_3.isin(['Clearance'])) |
                        (df_statsbomb_shots.prev_type_name_2.isin(['Clearance'])) |
                        (df_statsbomb_shots.prev_type_name_1.isin(['Clearance']))), 'assist_type'] = 'clearance'
df_statsbomb_shots.loc[df_statsbomb_shots.assist_type.isnull(), 'assist_type'] = 'recovery'

Shot type name

In [29]:
df_statsbomb_shots.sub_type_name.replace({'Open Play': np.nan,
                                          'Free Kick': 'direct_set_piece',
                                          'Kick Off': 'direct_set_piece'}, inplace=True)
mask_null = df_statsbomb_shots.sub_type_name.isnull()
df_statsbomb_shots.loc[mask_null, 'sub_type_name'] = df_statsbomb_shots.loc[mask_null, 'play_type']
df_statsbomb_shots.loc[df_statsbomb_shots.sub_type_name.isnull(), 'sub_type_name'] = 'open_play'
df_statsbomb_shots.rename({'sub_type_name': 'shot_type_name'}, axis=1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_statsbomb_shots.sub_type_name.replace({'Open Play': np.nan,


Add goal boolean

In [30]:
df_statsbomb_shots['goal'] = df_statsbomb_shots.outcome_name == 'Goal'

Turn some of the StatsBomb columns to boolean columns

In [31]:
df_statsbomb_shots['shot_open_goal'].fillna(False, inplace=True)
df_statsbomb_shots['under_pressure'] = df_statsbomb_shots['under_pressure'].fillna(0).astype(bool)
df_statsbomb_shots['shot_one_on_one'].fillna(False, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_statsbomb_shots['shot_open_goal'].fillna(False, inplace=True)
  df_statsbomb_shots['shot_open_goal'].fillna(False, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_statsbomb_shots['shot_one_on_one'].fillna(False, inplace=True)
  df_statsbomb_shots['shot_one_on_

Reduce columns

In [32]:
cols_to_keep = ['match_id', 'id', 'eventSec', 'period', 'goal', 'team_id', 'team_name', 'player_id', 'firstName',
                'middleName', 'lastName', 'Name', 'shot_type_name', 'x', 'y', 'counter_attack',
                'fast_break', 'strong_foot', 'body_part_name', 'assist_type',
                'pass_end_y', 'pass_end_x', 'pass_switch', 'pass_cross',
                'pass_cut_back', 'pass_height_name', 'pass_technique_name',
                'carry_length', 'visible_angle', 'middle_angle', 'distance_to_goal', 
                'distance_visible_angle', 'log_distance_to_goal', 'competition_gender',
                'shot_one_on_one', 'shot_open_goal', 'under_pressure']
df_statsbomb_shots = df_statsbomb_shots[cols_to_keep].copy()

Add on the goalkeeper player id

In [33]:
df_statsbomb_shots = df_statsbomb_shots.merge(df_gk[['id', 'player_id']], how='left', on='id', suffixes=['', '_goalkeeper'])

Add on country name

In [34]:
df_statsbomb_shots = df_statsbomb_shots.merge(df_statsbomb_lineup, how='left', on='player_id', validate='m:1')

Save dataset

In [35]:
df_statsbomb_shots.reset_index(drop=True, inplace=True)
df_statsbomb_shots.to_parquet(os.path.join(STATSBOMB, 'shots.parquet'))

Show information

In [36]:
df_statsbomb_shots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85773 entries, 0 to 85772
Data columns (total 39 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   match_id                85773 non-null  int64  
 1   id                      85773 non-null  object 
 2   eventSec                85773 non-null  float64
 3   period                  85773 non-null  int64  
 4   goal                    85773 non-null  bool   
 5   team_id                 85773 non-null  int64  
 6   team_name               85773 non-null  object 
 7   player_id               85773 non-null  float64
 8   firstName               85773 non-null  object 
 9   middleName              85773 non-null  object 
 10  lastName                85773 non-null  object 
 11  Name                    85773 non-null  object 
 12  shot_type_name          85773 non-null  object 
 13  x                       85773 non-null  float64
 14  y                       85773 non-null