In [None]:
import pandas as pd
import numpy as np
import json
import os
# Display every column and every row when a DataFrame is rendered (no truncation)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Switch off pandas' chained-assignment (SettingWithCopy) check for the whole notebook
pd.options.mode.chained_assignment = None

In [None]:
# (disabled) read the events of every match in data/events; superseded by the La Liga-only loading in the next cell
#events_folder = 'data/events'
#events = pd.concat([pd.DataFrame(json.load(open(os.path.join(events_folder, filename), encoding='utf8'))).assign(match_id=filename.split('.')[0]) for filename in os.listdir(events_folder)])
#events.reset_index(inplace=True, drop=True)

In [None]:
# Collect the match_ids of all La Liga matches: every JSON file in the
# competition folder is read and its 'match_id' values are gathered.
match_ids = []
laliga_comps = 'data/matches/11'
# sorted() makes the load order (and therefore the row order of `events`)
# reproducible; os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(laliga_comps)):
    if not filename.endswith('.json'):
        # skip stray non-JSON entries (e.g. OS metadata files)
        continue
    with open(os.path.join(laliga_comps, filename), encoding='utf8') as f:
        matches = pd.DataFrame(json.load(f))
    match_ids.extend(matches['match_id'].unique())

# Read the events of every La Liga match, one JSON file per match, and tag
# each event with the match it belongs to.
events_folder = 'data/events'
event_frames = []
for match_id in match_ids:
    # `with` closes each file right away instead of leaving one open handle per match
    with open(os.path.join(events_folder, str(match_id) + '.json'), encoding='utf8') as f:
        event_frames.append(pd.DataFrame(json.load(f)).assign(match_id=match_id))

# ignore_index=True gives a fresh 0..n-1 index without adding a leftover
# index column to the event table.
events = pd.concat(event_frames, ignore_index=True)

In [None]:
def area(x1, y1, x2, y2, x3, y3):
    """Return the (non-negative) area of the triangle (x1, y1), (x2, y2), (x3, y3).

    Uses the shoelace formula; the absolute value makes the result
    independent of the orientation of the three vertices.
    """
    twice_signed_area = x1 * (y2 - y3) + x2 * (y3 - y1) + x3 * (y1 - y2)
    return abs(twice_signed_area / 2.0)

def point_in_triangle(x_shot, y_shot, x_player, y_player):
    """Tell whether a player stands inside the triangle between a shot and the goal.

    The triangle is spanned by the shot location and two points on the line
    x = 120 at y = 35 and y = 45 (presumably the goal posts at 36 and 44,
    each widened by one unit -- confirm against the pitch specification).

    The player is considered inside when the three sub-triangles formed with
    the player position cover the same area as the full triangle, allowing a
    tolerance of 0.1 area units.

    Returns True or False.
    """
    goal_line_x = 120
    lower_post_y = 36 - 1
    upper_post_y = 44 + 1

    full_triangle = area(x_shot, y_shot, goal_line_x, lower_post_y, goal_line_x, upper_post_y)
    player_with_posts = area(x_player, y_player, goal_line_x, lower_post_y, goal_line_x, upper_post_y)
    player_with_lower_post = area(x_player, y_player, x_shot, y_shot, goal_line_x, lower_post_y)
    player_with_upper_post = area(x_player, y_player, x_shot, y_shot, goal_line_x, upper_post_y)

    surplus = (player_with_posts + player_with_lower_post + player_with_upper_post) - full_triangle
    return bool(surplus < 0.1)

# Add goalkeeper_id for the opposing goalkeeper at a shot
def get_goalkeeper(players):
    """Return the player id of the first opposing goalkeeper in `players`.

    `players` is an iterable of dicts, each with a 'player' dict (holding 'id'),
    a 'position' dict (holding 'id') and a 'teammate' flag. An entry counts as
    the opposing goalkeeper when its position id is 1 and 'teammate' is False.
    Returns None when no such entry exists.
    """
    opposing_keeper_ids = []
    for entry in players:
        if entry['position']['id'] == 1 and entry['teammate'] == False:
            opposing_keeper_ids.append(entry['player']['id'])
    if opposing_keeper_ids:
        return opposing_keeper_ids[0]
    return None

In [None]:
# Event/attribute ids -> output column names of the one-hot encoded features.
# The dict order defines the column order of the result.
_PLAY_PATTERN_COLUMNS = {1: 'regular_play', 2: 'from_corner', 3: 'from_free_kick', 4: 'from_throw_in',
                         6: 'from_counter', 7: 'from_goal_kick', 8: 'from_keeper', 9: 'from_kick_off'}
_TECHNIQUE_COLUMNS = {89: 'backheel', 90: 'diving_header', 91: 'half_volley', 92: 'lob',
                      94: 'overhead_kick', 95: 'volley'}
_SHOT_TYPE_COLUMNS = {61: 'corner', 62: 'free_kick', 87: 'open_play', 88: 'penalty', 65: 'kick_off'}


def _one_hot(values, wanted):
    """Return a 0/1 list marking the entries of `values` equal to `wanted`."""
    return [1 if value == wanted else 0 for value in values]


def _collect_assists(passes):
    """Map assisted shot id -> (pass height id, passer dict) for the given pass events.

    Only passes carrying an 'assisted_shot_id' are kept; when several passes
    point at the same shot the first one wins. A pass without 'height' gets
    height id 1.
    """
    assists = {}
    if len(passes) == 0:
        return assists
    for pass_info, passer in zip(passes['pass'], passes['player']):
        if 'assisted_shot_id' not in pass_info:
            continue
        height_id = pass_info['height']['id'] if 'height' in pass_info else 1
        assists.setdefault(pass_info['assisted_shot_id'], (height_id, passer))
    return assists


def _counterpress_times(events):
    """Map match_id -> (minutes, seconds) arrays of all events flagged as counterpress."""
    times = {}
    if 'counterpress' not in events.columns:
        return times
    flagged = events.loc[events['counterpress'] == True, ['match_id', 'minute', 'second']]
    for match_id, group in flagged.groupby('match_id'):
        times[match_id] = (group['minute'].to_numpy(), group['second'].to_numpy())
    return times


def _follows_counterpress(press_minutes, press_seconds, minute, second):
    """Tell whether a counterpress event lies in the 8 seconds up to (minute, second)."""
    if second >= 8:
        in_window = (press_minutes == minute) & (press_seconds >= second - 8) & (press_seconds <= second)
    else:
        # the window reaches back into the previous minute
        same_minute = (press_minutes == minute) & (press_seconds <= second)
        previous_minute = ((press_minutes == minute - 1)
                           & (press_seconds >= 60 + second - 8) & (press_seconds <= 60))
        in_window = same_minute | previous_minute
    return bool(in_window.any())


def _freeze_frame_features(shot_info, shot_x, shot_y):
    """Return (goalkeeper_id, num_teammates, num_opposition, num_keepers) for one shot.

    The three counts refer to freeze-frame players inside the triangle between
    the shot and the goal (see `point_in_triangle`). A shot without a
    'freeze_frame' yields (-1, 0, 0, 0); goalkeeper_id is also -1 when
    `get_goalkeeper` finds no opposing goalkeeper.
    """
    if not (isinstance(shot_info, dict) and 'freeze_frame' in shot_info):
        return -1, 0, 0, 0
    frame = shot_info['freeze_frame']
    keeper_id = get_goalkeeper(frame)
    teammates = opponents = keepers = 0
    for player in frame:
        # evaluate the geometric test once per player and reuse it for all three counts
        if not point_in_triangle(shot_x, shot_y, player['location'][0], player['location'][1]):
            continue
        if player['teammate']:
            teammates += 1
        if player['teammate'] == False:
            opponents += 1
        if player['position']['id'] == 1:
            keepers += 1
    return (-1 if keeper_id is None else int(keeper_id)), teammates, opponents, keepers


def get_df_shots(events):
    """Build a feature table with one row per shot from an event table.

    Parameters
    ----------
    events : pandas.DataFrame
        One row per event. The function reads the columns 'type', 'id',
        'pass', 'player', 'team', 'position', 'play_pattern', 'shot',
        'location', 'duration', 'match_id', 'minute', 'second' and, when
        present, 'under_pressure' and 'counterpress'. The frame is only read,
        never modified.

    Returns
    -------
    pandas.DataFrame
        One row per event of type id 16 (shot), in the order the shots appear
        in `events`, with a fresh 0..n-1 index and 48 columns: the shot
        metadata (duration, match_id, minute, second), assist information,
        location, ids of team/player/opposing goalkeeper, outcome and
        statsbomb_xg, body-part/position flags, goal distance and angle,
        one-hot encoded play pattern / technique / shot type, `from_pressing`
        and the freeze-frame counts.

    Notes
    -----
    * Requires `get_goalkeeper` and `point_in_triangle` from this notebook.
    * Every output column is assembled by name, so the labels cannot drift
      away from the data when the column order of `events` changes or when
      `events` carries additional columns.
    * `from_pressing` matches counterpress events by match_id, minute and
      second only; the period of the events is not taken into account.
    """
    event_type_ids = np.array([event_type['id'] for event_type in events['type']])

    # shot assists: passes (type id 30) that reference the shot they set up
    assists = _collect_assists(events.loc[event_type_ids == 30])

    # shots (type id 16)
    shots = events.loc[event_type_ids == 16]
    shot_infos = shots['shot'].tolist()
    locations = shots['location'].tolist()
    num_shots = len(shots)

    # height and player id of the assisting pass (defaults: height 1, assistant -1)
    pass_height, assistant_id = [], []
    for shot_id in shots['id']:
        if shot_id in assists:
            height_id, passer = assists[shot_id]
            pass_height.append(height_id)
            assistant_id.append(passer['id'])
        else:
            pass_height.append(1)
            assistant_id.append(-1)

    # shot coordinates and offsets to the goal centre at (120, 40)
    location_x = [location[0] for location in locations]
    location_y = [location[1] for location in locations]
    dx = 120 - np.asarray(location_x, dtype=float)
    dy = 40 - np.asarray(location_y, dtype=float)

    if 'under_pressure' in shots.columns:
        # missing values (NaN) compare unequal to True and therefore become 0
        under_pressure = [1 if flag == True else 0 for flag in shots['under_pressure']]
    else:
        under_pressure = [0] * num_shots

    body_part_ids = [info['body_part']['id'] for info in shot_infos]
    position_ids = [position['id'] for position in shots['position']]
    pattern_ids = [pattern['id'] for pattern in shots['play_pattern']]
    technique_ids = [info['technique']['id'] for info in shot_infos]
    shot_type_ids = [info['type']['id'] for info in shot_infos]

    # a shot results from pressing when it is an open-play shot (type id 87)
    # and a counterpress event of the same match happened in the 8 seconds before it
    press_times = _counterpress_times(events)
    from_pressing = []
    for match_id, minute, second, shot_type_id in zip(shots['match_id'], shots['minute'],
                                                      shots['second'], shot_type_ids):
        times = press_times.get(match_id)
        pressed = (times is not None and shot_type_id == 87
                   and _follows_counterpress(times[0], times[1], minute, second))
        from_pressing.append(1 if pressed else 0)

    # angle under which the 8-unit wide goal is seen from the shot location, in degrees
    with np.errstate(divide='ignore', invalid='ignore'):
        # a zero denominator yields inf, i.e. an angle of 90 degrees
        goal_angle = np.arctan(8 * dx / (dx ** 2 + np.abs(dy) ** 2 - (8 / 2) ** 2))
    goal_angle = np.where(goal_angle < 0, goal_angle + np.pi, goal_angle)
    goal_angle = goal_angle * 180 / np.pi

    # freeze-frame features: opposing goalkeeper and players between shot and goal
    goalkeeper_id, num_teammates, num_opposition, gk_in_goal = [], [], [], []
    for info, shot_x, shot_y in zip(shot_infos, location_x, location_y):
        keeper_id, teammates, opponents, keepers = _freeze_frame_features(info, shot_x, shot_y)
        goalkeeper_id.append(keeper_id)
        num_teammates.append(teammates)
        num_opposition.append(opponents)
        # two goalkeepers inside the triangle are treated like one
        gk_in_goal.append(1 if keepers == 2 else keepers)

    # assemble the result column by column; the insertion order is the column order
    columns = {
        'duration': shots['duration'].to_numpy(),
        'match_id': shots['match_id'].to_numpy(),
        'minute': shots['minute'].to_numpy(),
        'second': shots['second'].to_numpy(),
        'under_pressure': under_pressure,
        'pass_height': pass_height,
        'assistant_id': assistant_id,
        'location_x': location_x,
        'location_y': location_y,
        'team_id': [team['id'] for team in shots['team']],
        'player_id': [player['id'] for player in shots['player']],
        'goalkeeper_id': goalkeeper_id,
        # outcome id 97 marks a goal
        'goal': [1 if info['outcome']['id'] == 97 else 0 for info in shot_infos],
        'statsbomb_xg': [info['statsbomb_xg'] for info in shot_infos],
        'first_touch': [1 if info.get('first_time') else 0 for info in shot_infos],
        'follows_dribble': [1 if info.get('follows_dribble') else 0 for info in shot_infos],
        # body part ids 38 and 40 are the feet, 37 is the head
        'foot_shot': [1 if body_part_id in (38, 40) else 0 for body_part_id in body_part_ids],
        'header': _one_hot(body_part_ids, 37),
        'open_goal': [1 if info.get('open_goal') else 0 for info in shot_infos],
        # position ids: below 9 -> defender, 9..21 -> midfielder, above 21 -> striker
        'defender': [1 if position_id < 9 else 0 for position_id in position_ids],
        'midfielder': [1 if 8 < position_id < 22 else 0 for position_id in position_ids],
        'striker': [1 if position_id > 21 else 0 for position_id in position_ids],
        'goal_distance': np.sqrt(np.square(dx) + np.square(dy)),
        # pitch units scaled to a 105 x 70 pitch
        'goal_distance_m': np.sqrt(np.square(dx / 120 * 105) + np.square(dy / 80 * 70)),
    }
    for pattern_id, column in _PLAY_PATTERN_COLUMNS.items():
        columns[column] = _one_hot(pattern_ids, pattern_id)
    for technique_id, column in _TECHNIQUE_COLUMNS.items():
        columns[column] = _one_hot(technique_ids, technique_id)
    for shot_type_id, column in _SHOT_TYPE_COLUMNS.items():
        columns[column] = _one_hot(shot_type_ids, shot_type_id)
    columns['from_pressing'] = from_pressing
    columns['goal_angle'] = goal_angle
    columns['num_teammates'] = num_teammates
    columns['num_opposition'] = num_opposition
    columns['gk_in_goal'] = gk_in_goal

    return pd.DataFrame(columns)

In [None]:
# Build the shot feature table (one row per shot) from the La Liga events loaded above
shots = get_df_shots(events)

In [None]:
# Preview the first rows of the shot feature table
shots.head()

In [None]:
# Export the shot feature table to Excel (all columns, without the index).
# The output folder is created first so the export also works on a fresh checkout.
output_path = 'extracted_data/shots.xlsx'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
shots.to_excel(output_path, index=False)