In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from mplsoccer import Standardizer
from mplsoccer.dimensions import create_pitch_dims
from shapely.geometry import Polygon
import geopandas as gpd

Load the data

In [2]:
WYSCOUT = os.path.join('..', '..', 'data', 'wyscout')
df_wyscout_event = pd.read_parquet(os.path.join(WYSCOUT, 'event.parquet'))
df_wyscout_player = pd.read_parquet(os.path.join(WYSCOUT, 'player.parquet'))
df_wyscout_match = pd.read_parquet(os.path.join(WYSCOUT, 'match.parquet'))
df_wyscout_formation = pd.read_parquet(os.path.join(WYSCOUT, 'formation.parquet'))
df_wyscout_sub = pd.read_parquet(os.path.join(WYSCOUT, 'substitution.parquet'))

Get the time in minutes when the goalkeepers come on the pitch

In [3]:
df_wyscout_gk = df_wyscout_player.loc[df_wyscout_player.role_name == 'Goalkeeper', ['player_id', 'role_name']].copy()
df_wyscout_formation = df_wyscout_formation.merge(df_wyscout_gk, how='inner')
df_wyscout_sub_in = df_wyscout_sub[['match_id', 'player_id_in', 'minute']].copy()
df_wyscout_sub_in.rename({'player_id_in': 'player_id', 'minute': 'minute_in'}, axis=1, inplace=True)
df_wyscout_formation = df_wyscout_formation.merge(df_wyscout_sub_in, how='left')
df_wyscout_formation.loc[df_wyscout_formation.bench == False, 'minute_in'] = 0
df_wyscout_formation = df_wyscout_formation[df_wyscout_formation.minute_in.notnull()].copy()
df_wyscout_formation.sort_values('minute_in', inplace=True)

Add on team name

In [4]:
df_team = pd.concat([(df_wyscout_match[['away_team_id', 'away_team_name']]
                      .rename({'away_team_id': 'team_id', 'away_team_name': 'team_name'}, axis=1)),
                     (df_wyscout_match[['home_team_id', 'home_team_name']]
                      .rename({'home_team_id': 'team_id', 'home_team_name': 'team_name'}, axis=1))])
df_team.drop_duplicates('team_id', inplace=True)
df_wyscout_event = df_wyscout_event.merge(df_team, on='team_id', how='left')

Add on smart pass marker

In [5]:
df_wyscout_event['smart_pass'] = (df_wyscout_event.subEventName == 'Smart pass')

Replace player_id = Zero with null

In [6]:
df_wyscout_event.player_id.replace({0: np.nan}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_wyscout_event.player_id.replace({0: np.nan}, inplace=True)


Rename Free Kick to Set Piece

In [7]:
df_wyscout_event.eventName.replace('Free Kick', 'Set Piece', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_wyscout_event.eventName.replace('Free Kick', 'Set Piece', inplace=True)


Add on a column for a pass attempt

In [8]:
mask_pass = ((df_wyscout_event.eventName == 'Pass') | 
             df_wyscout_event.subEventName.isin(['Throw in', 'Free Kick', 'Goal kick', 'Corner', 'Free kick cross']))
df_wyscout_event['pass_attempt'] = mask_pass

Add shot attempt boolean column

In [9]:
mask_corner_goal = (df_wyscout_event.subEventName=='Corner') & (df_wyscout_event.goal==True)
df_wyscout_event['shot'] = ((df_wyscout_event.eventName == 'Shot') | 
                            (df_wyscout_event.subEventName=='Free kick shot') | 
                            (df_wyscout_event.subEventName == 'Penalty') |
                            (mask_corner_goal))

Add on switch (StatsBomb definition = ball transitioned at least 50% of the pitch vertically). Note that I have already removed dodgy end locations near the corner flags so this works.

In [10]:
mask_switch = (abs(df_wyscout_event.end_y - df_wyscout_event.y) >= 50) & (df_wyscout_event.pass_attempt)
df_wyscout_event['pass_switch'] = mask_switch

Add on cross (StatsBomb definition: See Appendix 6 of the Docs for Events). We are using Shapely geom to calculate whether a pass is a cross

In [11]:
dim = create_pitch_dims(pitch_type='wyscout')
# 69.50952 is 84.1 (the minimum x value for a cross in StatsBomb's open-data) converted to Wyscout coordinates
# using mplsoccer Standardizer (rounded to 5 digits)
cross_right_start = np.array([[dim.right, dim.six_yard_bottom],
                              [dim.right, dim.bottom],
                              [69.50952, dim.bottom],
                              [69.50952, dim.penalty_area_bottom],
                              [dim.penalty_area_right, dim.penalty_area_bottom]])
cross_left_start = np.array([[dim.right, dim.six_yard_top],
                              [dim.right, dim.top],
                              [69.50952, dim.top],
                              [69.50952, dim.penalty_area_top],
                              [dim.penalty_area_right, dim.penalty_area_top]])
cross_right_end = np.array([[dim.right, dim.top],
                            [dim.penalty_area_right, dim.top],
                            [dim.penalty_area_right, dim.six_yard_bottom],
                            [dim.right, dim.six_yard_bottom]])
cross_left_end = np.array([[dim.right, dim.bottom],
                           [dim.penalty_area_right, dim.bottom],
                           [dim.penalty_area_right, dim.six_yard_top],
                           [dim.right, dim.six_yard_top]])

cross_left_start = Polygon(cross_left_start)
cross_right_start = Polygon(cross_right_start)
cross_left_end = Polygon(cross_left_end)
cross_right_end = Polygon(cross_right_end)

# find intersection of passes and cross polygons
df_pass = df_wyscout_event[df_wyscout_event.pass_attempt].copy()
start_pass_points = gpd.GeoSeries.from_xy(df_pass['x'], df_pass['y'])
end_pass_points = gpd.GeoSeries.from_xy(df_pass['end_x'], df_pass['end_y'])
mask_cross = ((start_pass_points.intersects(cross_left_start) & end_pass_points.intersects(cross_left_end)) |
              (start_pass_points.intersects(cross_right_start) & end_pass_points.intersects(cross_right_end)))
cross_ids = df_pass[mask_cross].id
df_wyscout_event['cross'] = df_wyscout_event.id.isin(cross_ids)

Add on cut-back StatsBomb definition (StatsBomb definition: See Appendix 5 of the Docs for Events)

In [12]:
cut_left_start = np.array([[dim.right, dim.six_yard_top],
                           [dim.right, dim.top],
                           [dim.six_yard_right, dim.top],
                           [dim.six_yard_right, dim.six_yard_top]])
cut_right_start = np.array([[dim.right, dim.six_yard_bottom],
                            [dim.right, dim.bottom],
                            [dim.six_yard_right, dim.bottom],
                            [dim.six_yard_right, dim.six_yard_bottom]])

cut_left_end = np.array([[dim.six_yard_right, dim.penalty_area_top],
                         [dim.penalty_area_right, dim.penalty_area_top],
                         [dim.penalty_area_right, dim.six_yard_bottom],
                         [dim.six_yard_right, dim.six_yard_bottom]])
cut_right_end = np.array([[dim.six_yard_right, dim.penalty_area_bottom],
                         [dim.penalty_area_right, dim.penalty_area_bottom],
                         [dim.penalty_area_right, dim.six_yard_top],
                         [dim.six_yard_right, dim.six_yard_top]])

cut_left_start = Polygon(cut_left_start)
cut_right_start = Polygon(cut_right_start)
cut_left_end = Polygon(cut_left_end)
cut_right_end = Polygon(cut_right_end)

# find intersection of passes and cut polygons
mask_cut = ((start_pass_points.intersects(cut_left_start) & end_pass_points.intersects(cut_left_end)) |
            (start_pass_points.intersects(cut_right_start) & end_pass_points.intersects(cut_right_end)))
# not high and comes from a normal pass (not corner etc.)
cut_ids = df_pass[((mask_cut) & (df_pass.high == False) & (df_pass.eventName == 'Pass'))].id
df_wyscout_event['cut_back'] = df_wyscout_event.id.isin(cut_ids)

Fix a corner at the wrong end

In [13]:
df_wyscout_event.loc[(df_wyscout_event.y == 100) & (df_wyscout_event.x == 0) & 
                     (df_wyscout_event.subEventName == 'Corner'), 'x'] = 100

Correct a few shots that appear to be on the wrong side of the pitch. I haven't checked, but these seem to be too far away to be real shots. Especially as some are goals

In [14]:
mask_correct_shot = (df_wyscout_event.x < 34) & (df_wyscout_event.shot)
df_wyscout_event.loc[mask_correct_shot, 'x'] = 100 - df_wyscout_event.loc[mask_correct_shot, 'x']

Standardize coordinates

In [15]:
standard = Standardizer(pitch_from='wyscout', pitch_to='uefa')

new_x, new_y = standard.transform(df_wyscout_event.x, df_wyscout_event.y)
df_wyscout_event['x'] = new_x
df_wyscout_event['y'] = new_y

new_end_x, new_end_y = standard.transform(df_wyscout_event.end_x, df_wyscout_event.end_y)
df_wyscout_event['end_x'] = new_end_x
df_wyscout_event['end_y'] = new_end_y

Add goal scored (excluding shootouts)

In [16]:
df_wyscout_event['goal_scored_excl_shootout'] = (((df_wyscout_event.goal) &
                                                  (df_wyscout_event['matchPeriod']!='P') &
                                                  (df_wyscout_event['shot'])) |
                                                 (df_wyscout_event.own_goal))

Check for missing Goals - there is one Kevin de Bruyne goal missing in the Wyscout event data

In [17]:
goals_per_game = pd.DataFrame(df_wyscout_event[df_wyscout_event['goal_scored_excl_shootout']]
                              .groupby('match_id')
                              .match_id.count())
goals_per_game.columns = ['Goals']
goals_per_game.reset_index(inplace=True)
goals_per_game = goals_per_game.merge(df_wyscout_match[['home_score', 'label', 'away_score','match_id', 'kick_off']], 
                                      how='right', on='match_id')
goals_per_game = goals_per_game.fillna(0)
# only one game missing a goal it's a Kevin de Bruyne goal and it's not in the event data
goals_per_game[goals_per_game.Goals != (goals_per_game.home_score + goals_per_game.away_score)]

Unnamed: 0,match_id,Goals,home_score,label,away_score,kick_off
313,2499781,0.0,0.0,"Chelsea - Manchester City, 0 - 1",1.0,2017-09-30 18:30:00


#### Create a pass_height_name feature.

Assumptions:
- headed passes are high (roughly 60% are in the StatsBomb data)
- smart passes with the through ball tag are Ground/ low. It says in the Wyscout docs that through ball is added to smart pass if the pass is on the ground or it’s over the heads of the opposite players, but it’s on short distance – 5-10 meters.
- smart passes without through balls are high passes
- hand passes are ground/low (hopefully launch catches high passes)
- throw-in / goal kick are high
- free-kick, crosses and corners are low unless high=True

In [18]:
# assumption made here that head passes are high passes (in StatsBomb roughly 60% are)
df_wyscout_event.loc[df_wyscout_event.subEventName.isin(['High pass', 'Launch', 'Head pass']),
                     'pass_height_name'] = 'High Pass'
df_wyscout_event.loc[(df_wyscout_event.high == True) & (df_wyscout_event.eventName == 'Pass'),
                     'pass_height_name'] = 'High Pass'
# if smart pass and not a through ball assumed high
df_wyscout_event.loc[(df_wyscout_event.subEventName == 'Smart pass') & (df_wyscout_event.through == False),
                     'pass_height_name'] = 'High Pass'
df_wyscout_event.loc[df_wyscout_event.subEventName == 'Simple pass',
                     'pass_height_name'] = 'Ground/ Low Pass' 
df_wyscout_event.loc[(df_wyscout_event.subEventName.isin(['Corner', 'Free kick cross', 'Free Kick', 'Cross'])) &
                     (df_wyscout_event.high == False),
                     'pass_height_name'] = 'Ground/ Low Pass'
df_wyscout_event.loc[df_wyscout_event.subEventName.isin(['Throw in', 'Goal kick']),
                     'pass_height_name'] = 'High Pass'
df_wyscout_event.loc[(df_wyscout_event.subEventName.isin(['Corner', 'Free kick cross', 'Cross'])) &
                     (df_wyscout_event.high),
                     'pass_height_name'] = 'High Pass'
# assumption made here that smart through balls are ground/ low
df_wyscout_event.loc[(df_wyscout_event.subEventName == 'Smart pass') & (df_wyscout_event.through),
                     'pass_height_name'] = 'Ground/ Low Pass'
df_wyscout_event.loc[(df_wyscout_event.subEventName == 'Hand pass'),
                     'pass_height_name'] = 'Ground/ Low Pass'

Seperate out names in player dataset

In [19]:
df_wyscout_player['fullName'] = (df_wyscout_player.firstName + ' ' + df_wyscout_player.lastName).str.strip()
player_name_series = df_wyscout_player.fullName.str.split(' ')
df_wyscout_player['firstName'] = player_name_series.apply(lambda x: x[0] if isinstance(x, list) else None)
df_wyscout_player['middleName'] = player_name_series.apply(lambda x: ' '.join(x[1:-1]) if isinstance(x, list) else None)
df_wyscout_player['lastName'] = player_name_series.apply(lambda x: x[-1] if isinstance(x, list) else None)
df_wyscout_player['middleName'] = df_wyscout_player['middleName'].str.strip()
df_wyscout_player['Name'] = ((df_wyscout_player['firstName'] + ' ' + df_wyscout_player['middleName']).str.strip()
                             + ' ' + df_wyscout_player['lastName'])

Add on the player name/ foot

In [20]:
df_wyscout_player.foot.replace({'null': None, '': None}, inplace=True)
df_wyscout_player.rename({'birthArea_name': 'country_name'}, axis='columns', inplace=True)
df_wyscout_event = df_wyscout_event.merge(df_wyscout_player[['player_id', 'firstName', 'middleName', 'lastName', 'Name', 'foot', 'country_name']],
                                          how='left', on='player_id')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_wyscout_player.foot.replace({'null': None, '': None}, inplace=True)


Create a pass_technique name.

Assumptions:
- outswinging, different foot as the side of the pitch
- inswinging, same foot as the side of the pitch
- missing/ both foot = inswinging
- each player takes the kick with the foot in the player table.

This misses straight corner kicks and may not be 100% correct. 

In [21]:
mask_corner = df_wyscout_event.subEventName == 'Corner'
mask_right = df_wyscout_event.foot == 'right'
mask_left = df_wyscout_event.foot == 'left'
mask_right_side = df_wyscout_event.y >= dim.center_width
mask_left_side = df_wyscout_event.y < dim.center_width
mask_both = df_wyscout_event.foot == 'both'
mask_missing = df_wyscout_event.foot.isnull()
mask_inswing = mask_corner & ((mask_left & mask_left_side) | (mask_right & mask_right_side) | mask_both | mask_missing)
mask_outswing = mask_corner & ((mask_left & mask_right_side) | (mask_right & mask_left_side))
df_wyscout_event.loc[mask_inswing, 'pass_technique_name'] = 'Inswinging'
df_wyscout_event.loc[mask_outswing, 'pass_technique_name'] = 'Outswinging'
df_wyscout_event.loc[df_wyscout_event.through, 'pass_technique_name'] = 'Through Ball'

Fast attack, win ball in own third, shoot in last quarter in 7-25 seconds

In [22]:
mask_defence_win = ((df_wyscout_event.subEventName.isin(['Ground defending duel', 'Air duel', 'Save attempt'])) |
                    (df_wyscout_event.interception)) & (df_wyscout_event.x < 35)
df_wyscout_event.loc[mask_defence_win, 'defence_win'] = df_wyscout_event.loc[mask_defence_win, 'team_id']
df_wyscout_event.loc[mask_defence_win, 'defence_sec'] = df_wyscout_event.loc[mask_defence_win, 'eventSec']
group_match = df_wyscout_event.groupby(['match_id', 'matchPeriod'])
df_wyscout_event[['defence_win', 'defence_sec']] = group_match[['defence_win', 'defence_sec']].ffill()
mask_fast = (((df_wyscout_event.eventSec - df_wyscout_event.defence_sec) <= 25) &
             (df_wyscout_event.x > 78.75) &
             (df_wyscout_event.shot) & (df_wyscout_event.team_id == df_wyscout_event.defence_win))
df_wyscout_event['fast_break'] = mask_fast

Flag 10 seconds from a corner or freekick/ 20 seconds from a throw-in

In [23]:
for set_piece in ['Corner', 'Throw in', ['Free kick cross', 'Free kick shot']]:
    if isinstance(set_piece, list):
        mask = df_wyscout_event.subEventName.isin(set_piece)
        name = 'free_kick'
    else:
        mask = df_wyscout_event.subEventName.isin([set_piece])
        name = set_piece.replace(' ', '_').lower()
    df_wyscout_event.loc[mask, f'{name}_sec'] = df_wyscout_event.loc[mask, 'eventSec']
    df_wyscout_event.loc[mask, f'{name}_team'] = df_wyscout_event.loc[mask, 'team_id']
    df_wyscout_event[f'{name}_sec'] = group_match[f'{name}_sec'].ffill()
    df_wyscout_event[f'{name}_team'] = group_match[f'{name}_team'].ffill()
    df_wyscout_event[f'{name}_sec'] = df_wyscout_event.eventSec - df_wyscout_event[f'{name}_sec']
df_wyscout_event.loc[df_wyscout_event.throw_in_sec > 20, 'throw_in_sec'] = np.nan
df_wyscout_event.loc[df_wyscout_event.free_kick_sec > 10, 'free_kick_sec'] = np.nan
df_wyscout_event.loc[df_wyscout_event.corner_sec > 10, 'corner_sec'] = np.nan
df_wyscout_event['play_type'] = df_wyscout_event[['throw_in_sec', 'free_kick_sec', 'corner_sec']].idxmin(axis=1).str[:-4]
# if throw-in and defensive set to null
mask_defensive = ((df_wyscout_event.play_type == 'throw_in') &
                  (df_wyscout_event['throw_in_team'] != df_wyscout_event.team_id))
df_wyscout_event.loc[mask_defensive, 'play_type'] = np.nan

  df_wyscout_event['play_type'] = df_wyscout_event[['throw_in_sec', 'free_kick_sec', 'corner_sec']].idxmin(axis=1).str[:-4]


Add on previous info

In [24]:
# first filter out some events so the previous event is the correct assist type
mask_exclude = ((df_wyscout_event.eventName.isin(['Goalkeeper leaving line', 'Interruption'])) |
                (df_wyscout_event.subEventName == 'Acceleration'))
df_wyscout_event = df_wyscout_event[~mask_exclude].copy()
match_group = df_wyscout_event.groupby(['match_id', 'matchPeriod'])
for i in range(1, 4):
    df_wyscout_event[f'prev_id_{i}'] = match_group.id.shift(i)
    df_wyscout_event[f'prevEventName_{i}'] = match_group.eventName.shift(i)
    df_wyscout_event[f'prevSubEventName_{i}'] = match_group.subEventName.shift(i)
    df_wyscout_event[f'prev_player_id_{i}'] = match_group.player_id.shift(i)
    df_wyscout_event[f'prev_team_id_{i}'] = match_group.team_id.shift(i)
    df_wyscout_event[f'prev_pass_attempt_{i}'] = match_group.pass_attempt.shift(i)
    df_wyscout_event[f'prev_shot_{i}'] = match_group.shot.shift(i)

### Filter Non-penalty/ non-corner shots

In [25]:
mask_shot1 = (((df_wyscout_event.eventName=='Shot') | (df_wyscout_event.subEventName=='Free kick shot')) & 
              (df_wyscout_event['matchPeriod']!='P'))
mask_shot2 = (df_wyscout_event.subEventName=='Corner') & (df_wyscout_event.goal==True)
mask_corner_goal = (df_wyscout_event.subEventName=='Corner') & (df_wyscout_event.goal==True)
mask_shot = (mask_shot1 | mask_shot2) & (~mask_corner_goal)
df_wyscout_shots = df_wyscout_event[mask_shot].copy()
print('Number of shots:', len(df_wyscout_shots))
print('Number of goals:', df_wyscout_shots.goal.sum())

Number of shots: 42957
Number of goals: 4397


Add body part name

In [26]:
df_wyscout_shots.loc[df_wyscout_shots.left_foot, 'body_part_name'] = 'Left Foot'
df_wyscout_shots.loc[df_wyscout_shots.right_foot, 'body_part_name'] = 'Right Foot'
df_wyscout_shots.loc[df_wyscout_shots.head_body, 'body_part_name'] = 'Other'
df_wyscout_shots.drop(['left_foot', 'right_foot', 'head_body'], axis=1, inplace=True)

Strongest foot column

In [27]:
mask_strong_foot = (((df_wyscout_shots.foot.isin(['right', 'both'])) &
                     (df_wyscout_shots.body_part_name == 'Right Foot')) |
                    (((df_wyscout_shots.foot.isin(['left', 'both'])) & 
                      (df_wyscout_shots.body_part_name == 'Left Foot'))))
df_wyscout_shots['strong_foot'] = mask_strong_foot

Add shot type name

In [28]:
# note there are three goals that come direct from corner kicks - they are tagged 'Free Kick' originally though
# this was renamed earlier to Set Piece
df_wyscout_shots['shot_type_name']  = df_wyscout_shots.eventName.replace({'Shot': 'Open Play',
                                                                          'Set Piece': 'Direct Set Piece'})

Assist type, and get event id if pass

In [29]:
# direct from set pieces
df_wyscout_shots.loc[df_wyscout_shots.shot_type_name == 'Direct Set Piece', 'assist_type'] = 'direct'
# rebound/ clearance from the previous event
df_wyscout_shots.loc[(df_wyscout_shots.assist_type.isnull()) &
                     (df_wyscout_shots.prevEventName_1.isin(['Save attempt', 'Shot', 'Set Piece'])), 'assist_type'] = 'rebound'
df_wyscout_shots.loc[(df_wyscout_shots.assist_type.isnull()) &
                     (df_wyscout_shots.prevSubEventName_1 == 'Clearance'), 'assist_type'] = 'clearance'
# pass from the previous event
mask_pass1 = ((df_wyscout_shots.prev_pass_attempt_1 == True) &
              (df_wyscout_shots.team_id == df_wyscout_shots.prev_team_id_1) &
              (df_wyscout_shots.assist_type.isnull()))
df_wyscout_shots.loc[mask_pass1, 'assist_type'] = 'pass'
df_wyscout_shots.loc[mask_pass1, 'pass_id'] = df_wyscout_shots.loc[mask_pass1, 'prev_id_1']
# pass from the third previous event if and there are two duels in-between
mask_duel = ((df_wyscout_shots.prevEventName_1 == 'Duel') &
             (df_wyscout_shots.prevEventName_2 == 'Duel') &
             (df_wyscout_shots.assist_type.isnull()))             
mask_pass2 = (mask_duel & (df_wyscout_shots.prev_pass_attempt_3) &
              (df_wyscout_shots.team_id == df_wyscout_shots.prev_team_id_3))
df_wyscout_shots.loc[mask_pass2, 'assist_type'] = 'pass'
df_wyscout_shots.loc[mask_pass2, 'pass_id'] = df_wyscout_shots.loc[mask_pass2, 'prev_id_3'] 
# rebound/clearance if the third previous event involved a shot or save and there are two duels in-between
df_wyscout_shots.loc[(df_wyscout_shots.assist_type.isnull()) & mask_duel &
                     ((df_wyscout_shots.prev_shot_3) | (df_wyscout_shots.prevEventName_3 == 'Save attempt')),
                     'assist_type'] = 'rebound'
df_wyscout_shots.loc[(df_wyscout_shots.assist_type.isnull()) & mask_duel &
                     (df_wyscout_shots.prevSubEventName_3 =='Clearance'),
                     'assist_type'] = 'clearance'
# if still null and the second event involves a shot or save set to a rebound
df_wyscout_shots.loc[(df_wyscout_shots.assist_type.isnull()) & 
                     ((df_wyscout_shots.prevEventName_2 == 'Save attempt') | (df_wyscout_shots.prev_shot_2)),
                     'assist_type'] = 'rebound'
# if still null and the second event involves a clearance set to a clearance
df_wyscout_shots.loc[df_wyscout_shots.assist_type.isnull() & (df_wyscout_shots.prevSubEventName_2 == 'Clearance'),
                     'assist_type'] = 'clearance'
# if still null and the third event involves a pass set to a pass
mask_pass3 = ((df_wyscout_shots.assist_type.isnull()) & (df_wyscout_shots.prev_pass_attempt_3) & 
              (df_wyscout_shots.prev_team_id_3 == df_wyscout_shots.team_id))
df_wyscout_shots.loc[mask_pass3, 'assist_type'] = 'pass'
df_wyscout_shots.loc[mask_pass3, 'pass_id'] = df_wyscout_shots.loc[mask_pass3, 'prev_id_3']
# if still null set to recovery
df_wyscout_shots.loc[df_wyscout_shots.assist_type.isnull(), 'assist_type'] = 'recovery'

Keep subset of columns

In [30]:
df_wyscout_shots = df_wyscout_shots[['match_id', 'matchPeriod', 'eventSec', 'id', 'goal', 'team_id',
                                     'team_name', 'player_id',
                                     'firstName', 'middleName', 'lastName', 'Name', 'country_name',
                                     'shot_type_name', 'play_type',
                                     'x', 'y', 'counter_attack', 'fast_break',
                                     'strong_foot', 'body_part_name', 'assist_type', 'pass_id']].copy()                               

Add on pass information

In [31]:
df_pass = df_wyscout_event.loc[df_wyscout_event.pass_attempt, ['id', 'end_y', 'end_x', 'smart_pass',
                                                               'pass_switch', 'cross', 'cut_back', 'pass_height_name',
                                                               'pass_technique_name']].copy()
df_pass.rename({'id': 'pass_id', 'end_x': 'pass_end_x', 'end_y': 'pass_end_y',
                'cross': 'pass_cross', 'cut_back': 'pass_cut_back'}, axis=1, inplace=True)
df_wyscout_shots = df_wyscout_shots.merge(df_pass, how='left', on='pass_id')

Angles/ distance to goals

In [32]:
dim_uefa = create_pitch_dims(pitch_type='uefa')
goal_width = dim_uefa.goal_width
dx = abs(dim_uefa.right - df_wyscout_shots.x)
dy = abs(dim_uefa.center_width - df_wyscout_shots.y)
df_wyscout_shots['visible_angle'] = np.arctan2(goal_width * dx , (dx**2 + dy**2 - (goal_width / 2.) ** 2))
df_wyscout_shots['middle_angle'] = np.arctan2(dy, dx)
df_wyscout_shots['distance_to_goal'] = round((dy**2 + dx**2)**0.5, 1)

Interaction between angle and distance

In [33]:
df_wyscout_shots['distance_visible_angle'] = df_wyscout_shots.distance_to_goal * df_wyscout_shots.visible_angle

Log distance

In [34]:
df_wyscout_shots['log_distance_to_goal'] = np.log(df_wyscout_shots.distance_to_goal)

Amend shot type to take into account the play_type (set piece column) made earlier

In [35]:
mask_amend = (df_wyscout_shots.shot_type_name != 'Direct Set Piece') & (df_wyscout_shots.play_type.notnull())
df_wyscout_shots.loc[mask_amend, 'shot_type_name'] = df_wyscout_shots.loc[mask_amend, 'play_type']
df_wyscout_shots['shot_type_name'] = df_wyscout_shots.shot_type_name.str.lower().str.replace(' ', '_')
df_wyscout_shots.drop('play_type', axis=1, inplace=True)

Add Men

In [36]:
df_wyscout_shots['competition_gender'] = 'male'

Add on the goalkeeper

In [37]:
df_wyscout_shots['minute'] = np.ceil(df_wyscout_shots.eventSec / 60)
df_wyscout_shots.loc[(df_wyscout_shots.matchPeriod == '1H') & (df_wyscout_shots.minute > 45), 'minute'] = 45
df_wyscout_shots.sort_values('minute', inplace=True)
df_wyscout_shots = pd.merge_asof(df_wyscout_shots, df_wyscout_formation[['player_id', 'minute_in']],
                                 left_on='minute', right_on='minute_in',
                                 allow_exact_matches=True, direction='backward', suffixes=['', '_goalkeeper'])

Match Period consistent with StatsBomb

In [38]:
df_wyscout_shots.rename({'matchPeriod': 'period'}, axis=1, inplace=True)
df_wyscout_shots['period'] = df_wyscout_shots.period.map({'1H': 1, '2H': 2, 'E1': 3, 'E2': 4})

Save dataset

In [39]:
df_wyscout_shots.drop(['pass_id', 'minute', 'minute_in'], axis=1, inplace=True)
df_wyscout_shots.reset_index(drop=True, inplace=True)
df_wyscout_shots.to_parquet(os.path.join(WYSCOUT, 'shots.parquet'))

Info on dataset

In [40]:
df_wyscout_shots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42957 entries, 0 to 42956
Data columns (total 36 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   match_id                42957 non-null  int64  
 1   period                  42957 non-null  int64  
 2   eventSec                42957 non-null  float64
 3   id                      42957 non-null  int64  
 4   goal                    42957 non-null  bool   
 5   team_id                 42957 non-null  int64  
 6   team_name               42957 non-null  object 
 7   player_id               42954 non-null  float64
 8   firstName               42954 non-null  object 
 9   middleName              42954 non-null  object 
 10  lastName                42954 non-null  object 
 11  Name                    42954 non-null  object 
 12  country_name            42954 non-null  object 
 13  shot_type_name          42957 non-null  object 
 14  x                       42957 non-null