In [None]:
import pandas as pd
import numpy as np
import json
import os
# Never truncate rendered frames: show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.set_option('mode.chained_assignment', None)

In [None]:
# get all match_ids for La Liga matches
# Every JSON file in the folder is read as a list of match records
# (presumably one file per season -- confirm against the data layout).
match_ids = []
laliga_comps = 'data/matches/11'
# os.listdir returns entries in arbitrary order; sorting keeps the order of
# match_ids (and of everything derived from it) reproducible across runs.
for filename in sorted(os.listdir(laliga_comps)):
    if not filename.endswith('.json'):
        # skip stray entries such as .DS_Store or .ipynb_checkpoints
        continue
    # Only the parsing needs the open handle; close it before further work.
    with open(os.path.join(laliga_comps, filename), encoding='utf8') as f:
        matches = json.load(f)
    matches = pd.DataFrame(matches)
    match_ids.extend(matches['match_id'].unique())

In [None]:
# read the events from all La Liga matches
events_folder = 'data/events'
match_events = []
for match_id in match_ids:
    # `with` closes each file as soon as it is parsed; a bare open() inside a
    # comprehension would leave one handle open per match.
    with open(os.path.join(events_folder, str(match_id) + '.json'), encoding='utf8') as f:
        # tag every event with the match it belongs to
        match_events.append(pd.DataFrame(json.load(f)).assign(match_id=match_id))
events = pd.concat(match_events)
# The per-match row index is kept as a column: pandas names it 'index', or
# 'level_0' when the frame already has an 'index' column.
events = events.reset_index()

In [None]:
def get_df_passes(events: pd.DataFrame) -> pd.DataFrame:
    """Build a flat feature table with one row per pass event.

    The output is assembled column by column *by name*, so it does not depend
    on the column order of ``events`` or on which other event columns exist.
    ``events`` itself is left untouched.

    Parameters
    ----------
    events : pandas.DataFrame
        One row per event. Columns read here:

        * ``type``, ``team``, ``player``, ``position`` -- dicts with an ``id``
        * ``pass`` -- dict of pass attributes (only read for pass rows)
        * ``location`` -- ``[x, y]`` start location
        * ``under_pressure`` -- ``True`` when set, anything else otherwise
        * ``duration``, ``match_id``, ``minute``, ``period``, ``second``

        (The layout appears to follow the StatsBomb event schema -- confirm.)

    Returns
    -------
    pandas.DataFrame
        51 columns, indexed ``0..n_passes-1``: the scalar event columns, the
        ids of position/team/player/recipient (``-1`` when a pass has no
        recipient), start/end location and direction, 0/1 flags for optional
        pass attributes and one-hot columns for height, body part, pass type,
        outcome and technique. ``complete`` is 1 when the pass has no outcome.

    Raises
    ------
    KeyError
        If a required column is missing, or a pass lacks ``length`` or
        ``end_location``.
    """
    # Event type id 30 marks a pass. A boolean mask selects those rows without
    # writing a helper column into the caller's frame.
    is_pass = np.array([event_type['id'] == 30 for event_type in events['type']], dtype=bool)
    passes = events[is_pass]
    pass_info = passes['pass'].tolist()

    def flag(key):
        """1 where the optional pass attribute `key` is present and truthy, else 0."""
        return [1 if info.get(key) else 0 for info in pass_info]

    def one_hot(key, wanted_id):
        """1 where the optional pass attribute `key` carries id `wanted_id`, else 0."""
        return [1 if key in info and info[key]['id'] == wanted_id else 0 for info in pass_info]

    start_x = [location[0] for location in passes['location']]
    start_y = [location[1] for location in passes['location']]
    end_x = [info['end_location'][0] for info in pass_info]
    end_y = [info['end_location'][1] for info in pass_info]

    # Insertion order of this dict is the column order of the result.
    features = {
        'duration': passes['duration'].to_numpy(),
        'match_id': passes['match_id'].to_numpy(),
        'minute': passes['minute'].to_numpy(),
        'period': passes['period'].to_numpy(),
        # id of the position of the player who played the pass
        'position': [position['id'] for position in passes['position']],
        'second': passes['second'].to_numpy(),
        # anything other than True (e.g. NaN when the attribute is absent) maps to 0
        'under_pressure': passes['under_pressure'].eq(True).astype(int).to_numpy(),
        'recipient': [info['recipient']['id'] if 'recipient' in info else -1 for info in pass_info],
        'length': [info['length'] for info in pass_info],
        'angle': [info.get('angle', 0) for info in pass_info],
        'end_location_x': end_x,
        'end_location_y': end_y,
        'backheel': flag('backheel'),
        'deflected': flag('deflected'),
        'miscommunication': flag('miscommunication'),
        'cross': flag('cross'),
        'cut_back': flag('cut_back'),
        'switch': flag('switch'),
        'shot_assist': flag('shot_assist'),
        'goal_assist': flag('goal_assist'),
        'team_id': [team['id'] for team in passes['team']],
        'player_id': [player['id'] for player in passes['player']],
        'location_x': start_x,
        'location_y': start_y,
        # direction of the pass: end location minus start location
        'direction_x': [end - start for end, start in zip(end_x, start_x)],
        'direction_y': [end - start for end, start in zip(end_y, start_y)],
    }

    # One-hot encodings: pass attribute -> {attribute id: output column}
    pass_heights = {1: 'ground_pass', 2: 'low_pass', 3: 'high_pass'}
    body_parts = {68: 'drop_kick', 37: 'head', 69: 'keeper_arm', 38: 'left_foot',
                  40: 'right_foot', 106: 'no_touch'}
    pass_types = {61: 'corner', 62: 'free_kick', 63: 'goal_kick', 64: 'interception',
                  65: 'kick_off', 66: 'recovery', 67: 'throw_in'}
    outcomes = {9: 'incomplete', 74: 'injury_clearance', 75: 'out', 76: 'pass_offside'}
    techniques = {104: 'inswinging', 105: 'outswinging', 107: 'straight', 108: 'through_ball'}

    for key, id_to_column in (('height', pass_heights), ('body_part', body_parts),
                              ('type', pass_types), ('outcome', outcomes)):
        for wanted_id, column in id_to_column.items():
            features[column] = one_hot(key, wanted_id)

    # A pass without any recorded outcome counts as complete.
    features['complete'] = [0 if 'outcome' in info else 1 for info in pass_info]

    for wanted_id, column in techniques.items():
        features[column] = one_hot('technique', wanted_id)

    return pd.DataFrame(features, index=np.arange(len(passes)))

In [None]:
# Build the per-pass feature table from the loaded events.
passes = get_df_passes(events)

In [None]:
# Preview the first rows of the pass feature table.
passes.head()

In [None]:
# (number of passes, number of feature columns)
passes.shape

In [None]:
#passes.to_excel('extracted_data/passes.xlsx', columns=passes.columns, index=False)