In [2]:
import pandas as pd
import numpy as np
import json
import os
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
# read all events
#events_folder = 'data/events'
#events = pd.concat([pd.DataFrame(json.load(open(os.path.join(events_folder, filename), encoding='utf8'))).assign(match_id=filename.split('.')[0]) for filename in os.listdir(events_folder)])
#events.reset_index(inplace=True, drop=True)

In [4]:
# get all match_ids for La Liga matches
match_ids = []
laliga_comps = 'data/matches/11'
for filename in os.listdir(laliga_comps):
    with open(os.path.join(laliga_comps, filename), encoding='utf8') as f:
        matches = json.load(f)
        matches = pd.DataFrame(matches)
        match_ids.extend(matches['match_id'].unique())

# read the events from all La Liga matches
events_folder = 'data/events'
events = pd.concat([pd.DataFrame(json.load(open(os.path.join(events_folder, str(match_id) + '.json'), encoding='utf8'))).assign(match_id=match_id) for match_id in match_ids])
events.reset_index(inplace=True)

In [5]:
# Add goalkeeper_id for the opposing goalkeeper at a shot
def get_goalkeeper(players):
    return next(iter([player['player']['id'] for player in players if (player['position']['id'] == 1) and (player['teammate'] == False)]), None)

In [6]:
def get_df_penalties(events):
    # get all penalties
    events.loc[:, 'shot_event'] = [1 if events['type'].iloc[i]['id'] == 16 else 0 for i in range(len(events))]
    shots = events[events['shot_event'] == 1]
    shots.loc[:, 'penalty'] = [1 if shots['shot'].iloc[i]['type']['id'] == 88 else 0 for i in range(len(shots))]
    penalties = shots[shots['penalty'] == 1]
    penalties.drop(['shot_event', 'penalty'], axis=1, inplace=True)
    penalties.reset_index(inplace=True, drop=True)
   
    # Add team_id and player_id from whom a shot was taken
    penalties = penalties.assign(team_id = [penalties['team'].iloc[i]['id'] for i in range(len(penalties))],
                                 player_id = [penalties['player'].iloc[i]['id'] for i in range(len(penalties))])

    penalties_with_gk_idx = [i for i in penalties.index if isinstance(penalties['shot'].iloc[i], dict) and ('freeze_frame' in penalties['shot'].iloc[i].keys())]
    penalties.loc[penalties_with_gk_idx, 'goalkeeper_id'] = [get_goalkeeper(penalties['shot'].loc[i]['freeze_frame']) for i in penalties_with_gk_idx]

    # Generation of features for a shot
    penalties = penalties.assign(goal = [1 if penalties['shot'].iloc[i]['outcome']['id'] == 97 else 0 for i in range(len(penalties))],
                                 statsbomb_xg = [penalties['shot'].iloc[i]['statsbomb_xg'] for i in range(len(penalties))],
                                 defender = [1 if penalties['position'].iloc[i]['id'] < 9 else 0 for i in range(len(penalties))],
                                 midfielder = [1 if (penalties['position'].iloc[i]['id'] > 8) and (penalties['position'].iloc[i]['id'] < 22) else 0 for i in range(len(penalties))],
                                 striker = [1 if penalties['position'].iloc[i]['id'] > 21 else 0 for i in range(len(penalties))],
                                 end_location_x = [penalties['shot'].iloc[i]['end_location'][0] for i in range(len(penalties))],
                                 end_location_y = [penalties['shot'].iloc[i]['end_location'][1] for i in range(len(penalties))],
                                 end_location_z = [penalties['shot'].iloc[i]['end_location'][2] if len(penalties['shot'].iloc[i]['end_location']) > 2 else -1 for i in range(len(penalties))])

    # Dropping of irrelevant columns
    penalties.drop(['level_0', 'index', '50_50', 'clearance', 'half_end', 'half_start', 'injury_stoppage', 'miscontrol',
                    'period', 'position', 'possession_team', 'player', 'team', 'shot', 'location', 'off_camera', 'out',
                    'player_off', 'interception', 'pass', 'play_pattern', 'possession', 'id', 'related_events', 'under_pressure',
                    'substitution', 'tactics', 'timestamp', 'type', 'bad_behaviour', 'ball_receipt', 'ball_recovery', 'block',
                    'carry', 'counterpress', 'dribble', 'duel', 'foul_committed', 'foul_won', 'goalkeeper'], axis=1, inplace=True, errors='ignore')
    
    # data preprocessing
    penalties['goalkeeper_id'] = penalties['goalkeeper_id'].fillna(-1).astype(int)

    return penalties

In [7]:
penalties = get_df_penalties(events)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [8]:
penalties.head()

Unnamed: 0,duration,match_id,minute,second,team_id,player_id,goalkeeper_id,goal,statsbomb_xg,defender,midfielder,striker,end_location_x,end_location_y,end_location_z
0,0.671,9609,20,1,217,5503,-1,1,0.76,0,0,1,120.0,37.4,1.2
1,0.581456,9827,47,50,208,6930,-1,1,0.76,0,0,1,120.0,43.1,2.3
2,2.16,9880,86,7,207,6595,-1,1,0.76,0,1,0,120.0,42.0,0.2
3,0.619538,9581,38,37,217,5503,-1,0,0.76,0,0,1,119.3,36.9,0.8
4,0.437,9726,69,39,217,5503,-1,0,0.76,0,0,1,117.8,41.5,1.9


In [9]:
penalties.to_excel('extracted_data/penalties.xlsx', columns=penalties.columns, index=False)