In [1]:
import os
import math
import json
import pandas as pd
import numpy as np

In [2]:
def get_files(DIR="", size=1):
    files = os.listdir(DIR)
    subset = size*len(files)
    subset = int(subset)
    new_files = files[:subset]
    return new_files

In [3]:
all_files = get_files(DIR="events", size=1)

In [4]:
with open("events/15946.json", 'r', encoding='utf-8' ) as out_file:
    raw_data = json.load(out_file)
    pd_data = pd.DataFrame(raw_data)

In [5]:
pd_data.columns

Index(['id', 'index', 'period', 'timestamp', 'minute', 'second', 'type',
       'possession', 'possession_team', 'play_pattern', 'team', 'duration',
       'tactics', 'related_events', 'player', 'position', 'location', 'pass',
       'carry', 'ball_receipt', 'under_pressure', 'duel', 'counterpress',
       'interception', 'off_camera', 'ball_recovery', 'shot', 'goalkeeper',
       'clearance', 'block', 'dribble', 'foul_committed', 'foul_won', 'out',
       'miscontrol', 'bad_behaviour', 'substitution'],
      dtype='object')

In [6]:
len(all_files)

1321

In [7]:
def convert_json_pandas(files=[]):
    list_dfs = []
    for file in files:
        with open("events/"+file, 'r', encoding='utf-8' ) as json_file:
            data = json.load(json_file)
            data_frame = pd.DataFrame(data)
            match_id = file[:-5]
            data_frame["match_id"] = match_id
            data_frame = data_frame[["match_id","type", "play_pattern", "team", "tactics", "player", "position", "location",
                        "under_pressure","shot", "goalkeeper"]]
        list_dfs.append(data_frame)
    final_df = pd.concat(list_dfs, ignore_index=True)
    return final_df

In [8]:
df = convert_json_pandas(all_files)

In [9]:
df

Unnamed: 0,match_id,type,play_pattern,team,tactics,player,position,location,under_pressure,shot,goalkeeper
0,2275050,"{'id': 35, 'name': 'Starting XI'}","{'id': 1, 'name': 'Regular Play'}","{'id': 974, 'name': 'Reading WFC'}","{'formation': 41212, 'lineup': [{'player': {'i...",,,,,,
1,2275050,"{'id': 35, 'name': 'Starting XI'}","{'id': 1, 'name': 'Regular Play'}","{'id': 969, 'name': 'Birmingham City WFC'}","{'formation': 4231, 'lineup': [{'player': {'id...",,,,,,
2,2275050,"{'id': 18, 'name': 'Half Start'}","{'id': 1, 'name': 'Regular Play'}","{'id': 969, 'name': 'Birmingham City WFC'}",,,,,,,
3,2275050,"{'id': 18, 'name': 'Half Start'}","{'id': 1, 'name': 'Regular Play'}","{'id': 974, 'name': 'Reading WFC'}",,,,,,,
4,2275050,"{'id': 30, 'name': 'Pass'}","{'id': 9, 'name': 'From Kick Off'}","{'id': 969, 'name': 'Birmingham City WFC'}",,"{'id': 31563, 'name': 'Rachel Williams'}","{'id': 23, 'name': 'Center Forward'}","[61.0, 40.1]",,,
...,...,...,...,...,...,...,...,...,...,...,...
4666308,3795108,"{'id': 23, 'name': 'Goal Keeper'}","{'id': 5, 'name': 'Other'}","{'id': 772, 'name': 'Spain'}",,"{'id': 11748, 'name': 'Unai Simón Mendibil'}","{'id': 1, 'name': 'Goalkeeper'}","[1.0, 40.0]",,,"{'position': {'id': 44, 'name': 'Set'}, 'type'..."
4666309,3795108,"{'id': 16, 'name': 'Shot'}","{'id': 5, 'name': 'Other'}","{'id': 772, 'name': 'Spain'}",,"{'id': 6685, 'name': 'Mikel Oyarzabal Ugarte'}","{'id': 17, 'name': 'Right Wing'}","[108.0, 40.0]",,"{'one_on_one': True, 'statsbomb_xg': 0.7835, '...",
4666310,3795108,"{'id': 23, 'name': 'Goal Keeper'}","{'id': 5, 'name': 'Other'}","{'id': 773, 'name': 'Switzerland'}",,"{'id': 5550, 'name': 'Yann Sommer'}","{'id': 1, 'name': 'Goalkeeper'}","[1.0, 40.0]",,,"{'type': {'id': 28, 'name': 'Penalty Conceded'..."
4666311,3795108,"{'id': 34, 'name': 'Half End'}","{'id': 5, 'name': 'Other'}","{'id': 772, 'name': 'Spain'}",,,,,,,


In [10]:
def get_id(x):
    return x["id"]

def get_event_name(x):
    return x["name"]

df["event_id"] = df["type"].map(get_id)
df["event_type"] = df["type"].map(get_event_name)

In [11]:
df

Unnamed: 0,match_id,type,play_pattern,team,tactics,player,position,location,under_pressure,shot,goalkeeper,event_id,event_type
0,2275050,"{'id': 35, 'name': 'Starting XI'}","{'id': 1, 'name': 'Regular Play'}","{'id': 974, 'name': 'Reading WFC'}","{'formation': 41212, 'lineup': [{'player': {'i...",,,,,,,35,Starting XI
1,2275050,"{'id': 35, 'name': 'Starting XI'}","{'id': 1, 'name': 'Regular Play'}","{'id': 969, 'name': 'Birmingham City WFC'}","{'formation': 4231, 'lineup': [{'player': {'id...",,,,,,,35,Starting XI
2,2275050,"{'id': 18, 'name': 'Half Start'}","{'id': 1, 'name': 'Regular Play'}","{'id': 969, 'name': 'Birmingham City WFC'}",,,,,,,,18,Half Start
3,2275050,"{'id': 18, 'name': 'Half Start'}","{'id': 1, 'name': 'Regular Play'}","{'id': 974, 'name': 'Reading WFC'}",,,,,,,,18,Half Start
4,2275050,"{'id': 30, 'name': 'Pass'}","{'id': 9, 'name': 'From Kick Off'}","{'id': 969, 'name': 'Birmingham City WFC'}",,"{'id': 31563, 'name': 'Rachel Williams'}","{'id': 23, 'name': 'Center Forward'}","[61.0, 40.1]",,,,30,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4666308,3795108,"{'id': 23, 'name': 'Goal Keeper'}","{'id': 5, 'name': 'Other'}","{'id': 772, 'name': 'Spain'}",,"{'id': 11748, 'name': 'Unai Simón Mendibil'}","{'id': 1, 'name': 'Goalkeeper'}","[1.0, 40.0]",,,"{'position': {'id': 44, 'name': 'Set'}, 'type'...",23,Goal Keeper
4666309,3795108,"{'id': 16, 'name': 'Shot'}","{'id': 5, 'name': 'Other'}","{'id': 772, 'name': 'Spain'}",,"{'id': 6685, 'name': 'Mikel Oyarzabal Ugarte'}","{'id': 17, 'name': 'Right Wing'}","[108.0, 40.0]",,"{'one_on_one': True, 'statsbomb_xg': 0.7835, '...",,16,Shot
4666310,3795108,"{'id': 23, 'name': 'Goal Keeper'}","{'id': 5, 'name': 'Other'}","{'id': 773, 'name': 'Switzerland'}",,"{'id': 5550, 'name': 'Yann Sommer'}","{'id': 1, 'name': 'Goalkeeper'}","[1.0, 40.0]",,,"{'type': {'id': 28, 'name': 'Penalty Conceded'...",23,Goal Keeper
4666311,3795108,"{'id': 34, 'name': 'Half End'}","{'id': 5, 'name': 'Other'}","{'id': 772, 'name': 'Spain'}",,,,,,,,34,Half End


In [12]:
shots_df = df[df["event_type"] == "Shot"]

In [15]:
shots_df_v2 = shots_df.copy()

In [16]:
shots_df_v2

Unnamed: 0,match_id,type,play_pattern,team,tactics,player,position,location,under_pressure,shot,goalkeeper,event_id,event_type
0,2275050,"{'id': 16, 'name': 'Shot'}","{'id': 1, 'name': 'Regular Play'}","{'id': 974, 'name': 'Reading WFC'}",,"{'id': 10251, 'name': 'Fara Williams'}","{'id': 19, 'name': 'Center Attacking Midfield'}","[85.0, 31.2]",,"{'statsbomb_xg': 0.006284028, 'end_location': ...",,16,Shot
1,2275050,"{'id': 16, 'name': 'Shot'}","{'id': 1, 'name': 'Regular Play'}","{'id': 974, 'name': 'Reading WFC'}",,"{'id': 10251, 'name': 'Fara Williams'}","{'id': 19, 'name': 'Center Attacking Midfield'}","[100.6, 49.2]",,"{'statsbomb_xg': 0.057841927, 'end_location': ...",,16,Shot
2,2275050,"{'id': 16, 'name': 'Shot'}","{'id': 3, 'name': 'From Free Kick'}","{'id': 969, 'name': 'Birmingham City WFC'}",,"{'id': 31565, 'name': 'Abbi Grant'}","{'id': 21, 'name': 'Left Wing'}","[113.5, 36.0]",,"{'statsbomb_xg': 0.124636784, 'end_location': ...",,16,Shot
3,2275050,"{'id': 16, 'name': 'Shot'}","{'id': 3, 'name': 'From Free Kick'}","{'id': 974, 'name': 'Reading WFC'}",,"{'id': 26570, 'name': 'Amalie Vevle Eikeland'}","{'id': 24, 'name': 'Left Center Forward'}","[101.8, 52.6]",,"{'statsbomb_xg': 0.033757873, 'end_location': ...",,16,Shot
4,2275050,"{'id': 16, 'name': 'Shot'}","{'id': 4, 'name': 'From Throw In'}","{'id': 974, 'name': 'Reading WFC'}",,"{'id': 15723, 'name': 'Brooke Chaplen'}","{'id': 22, 'name': 'Right Center Forward'}","[97.3, 18.2]",True,"{'statsbomb_xg': 0.0097342925, 'end_location':...",,16,Shot
...,...,...,...,...,...,...,...,...,...,...,...,...,...
33622,3795108,"{'id': 16, 'name': 'Shot'}","{'id': 5, 'name': 'Other'}","{'id': 772, 'name': 'Spain'}",,"{'id': 6765, 'name': 'Rodrigo Hernández Cascan...","{'id': 15, 'name': 'Left Center Midfield'}","[108.0, 40.0]",,"{'one_on_one': True, 'statsbomb_xg': 0.7835, '...",,16,Shot
33623,3795108,"{'id': 16, 'name': 'Shot'}","{'id': 5, 'name': 'Other'}","{'id': 773, 'name': 'Switzerland'}",,"{'id': 5549, 'name': 'Manuel Obafemi Akanji'}","{'id': 5, 'name': 'Left Center Back'}","[108.0, 40.0]",,"{'one_on_one': True, 'statsbomb_xg': 0.7835, '...",,16,Shot
33624,3795108,"{'id': 16, 'name': 'Shot'}","{'id': 5, 'name': 'Other'}","{'id': 772, 'name': 'Spain'}",,"{'id': 6766, 'name': 'Gerard Moreno Balaguero'}","{'id': 23, 'name': 'Center Forward'}","[108.0, 40.0]",,"{'one_on_one': True, 'statsbomb_xg': 0.7835, '...",,16,Shot
33625,3795108,"{'id': 16, 'name': 'Shot'}","{'id': 5, 'name': 'Other'}","{'id': 773, 'name': 'Switzerland'}",,"{'id': 30401, 'name': 'Ruben Vargas'}","{'id': 17, 'name': 'Right Wing'}","[108.0, 40.0]",,"{'one_on_one': True, 'statsbomb_xg': 0.7835, '...",,16,Shot


In [13]:
shots_df.reset_index(drop=True, inplace=True)

In [17]:
def get_stats_bomb(x):
    return x["statsbomb_xg"]

def get_body_part(x):
    return x["body_part"]["name"]

def get_end_loc(x):
    return x["end_location"]

def get_outcome(x):
    return x["outcome"]["name"]

def get_shot_type(x):
    return x["type"]["name"]

def get_freeze(x):
    try:
         output = x["freeze_frame"]
    except KeyError:
        output = np.nan
    return output

In [None]:
keys = ['statsbomb_xg', 'end_location', '']
#List of shot event names needed
"""
statsbomb_xg
end_location
outcome
type
freeze_frame
body_part
"""

In [18]:
shots_df["stats_xg"] = shots_df["shot"].apply(get_stats_bomb)
shots_df["loc_end"] = shots_df["shot"].apply(get_end_loc)
shots_df["shot_outcome"] = shots_df["shot"].apply(get_outcome)
shots_df["shot_type"] = shots_df["shot"].apply(get_shot_type)
shots_df["frame_freezed"] = shots_df["shot"].apply(get_freeze)
shots_df["body_part"] = shots_df["shot"].apply(get_body_part)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shots_df["stats_xg"] = shots_df["shot"].apply(get_stats_bomb)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shots_df["loc_end"] = shots_df["shot"].apply(get_end_loc)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shots_df["shot_outcome"] = shots_df["shot"].apply(get_outcome)
A value is trying to b

### Extracting information about Competition and Match ID

In [19]:
#Crawl the different directories to create a single directory containing all json files

def crawl_directories(root_dir):
    json_files = []
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if file.endswith(".json"):
                json_files.append(os.path.join(root, file))
    return json_files

In [20]:
files = crawl_directories("matches")

In [21]:
df_files = []
for ind_file in files:
    with open(ind_file, 'r', encoding='utf-8') as match_file:
        json_file = json.load(match_file)
        df = pd.DataFrame(json_file)
        df_files.append(df)
all_df = pd.concat(df_files, ignore_index=True)

In [22]:
new_all = all_df[["match_id", "match_date", "competition", "season"]]

In [23]:
new_all

Unnamed: 0,match_id,match_date,competition,season
0,3827767,2022-03-20,"{'competition_id': 1238, 'country_name': 'Indi...","{'season_id': 108, 'season_name': '2021/2022'}"
1,3827335,2022-03-15,"{'competition_id': 1238, 'country_name': 'Indi...","{'season_id': 108, 'season_name': '2021/2022'}"
2,3827336,2022-03-16,"{'competition_id': 1238, 'country_name': 'Indi...","{'season_id': 108, 'season_name': '2021/2022'}"
3,3827338,2022-03-12,"{'competition_id': 1238, 'country_name': 'Indi...","{'season_id': 108, 'season_name': '2021/2022'}"
4,3827337,2022-03-11,"{'competition_id': 1238, 'country_name': 'Indi...","{'season_id': 108, 'season_name': '2021/2022'}"
...,...,...,...,...
1306,3749108,2003-08-27,"{'competition_id': 2, 'country_name': 'England...","{'season_id': 44, 'season_name': '2003/2004'}"
1307,3749153,2004-01-10,"{'competition_id': 2, 'country_name': 'England...","{'season_id': 44, 'season_name': '2003/2004'}"
1308,3749403,2004-03-20,"{'competition_id': 2, 'country_name': 'England...","{'season_id': 44, 'season_name': '2003/2004'}"
1309,3749526,2003-10-26,"{'competition_id': 2, 'country_name': 'England...","{'season_id': 44, 'season_name': '2003/2004'}"


In [24]:
def get_competition_id(x):
    return x["competition_id"]

def get_season(x):
    return x["season_id"]

new_all["competition_id"] = new_all["competition"].apply(get_competition_id)
new_all["season_id"] = new_all["season"].apply(get_season)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_all["competition_id"] = new_all["competition"].apply(get_competition_id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_all["season_id"] = new_all["season"].apply(get_season)


In [25]:
new_all.drop(columns=["competition", "season"], inplace=True)
new_all

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_all.drop(columns=["competition", "season"], inplace=True)


Unnamed: 0,match_id,match_date,competition_id,season_id
0,3827767,2022-03-20,1238,108
1,3827335,2022-03-15,1238,108
2,3827336,2022-03-16,1238,108
3,3827338,2022-03-12,1238,108
4,3827337,2022-03-11,1238,108
...,...,...,...,...
1306,3749108,2003-08-27,2,44
1307,3749153,2004-01-10,2,44
1308,3749403,2004-03-20,2,44
1309,3749526,2003-10-26,2,44


In [26]:
new_all.dtypes
shots_df.dtypes
def change_to_int(x):
    return int(x)

shots_df["match_id"] = shots_df["match_id"].apply(change_to_int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shots_df["match_id"] = shots_df["match_id"].apply(change_to_int)


In [27]:
merged = pd.merge(new_all, shots_df, how="left", on="match_id")
with open("competitions.json", "r", encoding='utf-8') as comp_file:
    data = json.load(comp_file)
    comp_df = pd.DataFrame(data)

comp_df = comp_df[["competition_id", "season_id", "competition_name", "season_name"]]

In [28]:
comp_df

Unnamed: 0,competition_id,season_id,competition_name,season_name
0,16,4,Champions League,2018/2019
1,16,1,Champions League,2017/2018
2,16,2,Champions League,2016/2017
3,16,27,Champions League,2015/2016
4,16,26,Champions League,2014/2015
5,16,25,Champions League,2013/2014
6,16,24,Champions League,2012/2013
7,16,23,Champions League,2011/2012
8,16,22,Champions League,2010/2011
9,16,21,Champions League,2009/2010


In [29]:
merged

Unnamed: 0,match_id,match_date,competition_id,season_id,type,play_pattern,team,tactics,player,position,...,shot,goalkeeper,event_id,event_type,stats_xg,loc_end,shot_outcome,shot_type,frame_freezed,body_part
0,3827767,2022-03-20,1238,108,"{'id': 16, 'name': 'Shot'}","{'id': 4, 'name': 'From Throw In'}","{'id': 7289, 'name': 'Hyderabad'}",,"{'id': 164490, 'name': 'Souvik Chakrabarti'}","{'id': 9, 'name': 'Right Defensive Midfield'}",...,"{'statsbomb_xg': 0.0068775634, 'end_location':...",,16,Shot,0.006878,"[119.0, 38.7, 1.3]",Saved,Open Play,"[{'location': [97.6, 52.1], 'player': {'id': 1...",Right Foot
1,3827767,2022-03-20,1238,108,"{'id': 16, 'name': 'Shot'}","{'id': 4, 'name': 'From Throw In'}","{'id': 7283, 'name': 'Kerala Blasters'}",,"{'id': 28251, 'name': 'Jorge Rolando Pereyra D...","{'id': 22, 'name': 'Right Center Forward'}",...,"{'statsbomb_xg': 0.19758572, 'end_location': [...",,16,Shot,0.197586,"[120.0, 35.4, 1.6]",Off T,Open Play,"[{'location': [98.2, 60.0], 'player': {'id': 1...",Head
2,3827767,2022-03-20,1238,108,"{'id': 16, 'name': 'Shot'}","{'id': 3, 'name': 'From Free Kick'}","{'id': 7283, 'name': 'Kerala Blasters'}",,"{'id': 163354, 'name': 'Rahul Kannoly Praveen'}","{'id': 12, 'name': 'Right Midfield'}",...,"{'statsbomb_xg': 0.03577913, 'end_location': [...",,16,Shot,0.035779,"[120.0, 45.6, 5.9]",Off T,Open Play,"[{'location': [88.6, 13.3], 'player': {'id': 1...",Right Foot
3,3827767,2022-03-20,1238,108,"{'id': 16, 'name': 'Shot'}","{'id': 4, 'name': 'From Throw In'}","{'id': 7283, 'name': 'Kerala Blasters'}",,"{'id': 26304, 'name': 'Adrián Nicolás Luna Ret...","{'id': 16, 'name': 'Left Midfield'}",...,"{'statsbomb_xg': 0.006838626, 'end_location': ...",,16,Shot,0.006839,"[94.5, 22.3]",Blocked,Open Play,"[{'location': [100.7, 12.1], 'player': {'id': ...",Right Foot
4,3827767,2022-03-20,1238,108,"{'id': 16, 'name': 'Shot'}","{'id': 7, 'name': 'From Goal Kick'}","{'id': 7283, 'name': 'Kerala Blasters'}",,"{'id': 24035, 'name': 'Álvaro Vázquez García'}","{'id': 24, 'name': 'Left Center Forward'}",...,"{'statsbomb_xg': 0.031175608, 'end_location': ...",,16,Shot,0.031176,"[119.9, 37.6, 2.8]",Post,Open Play,"[{'location': [89.0, 24.2], 'player': {'id': 2...",Right Foot
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33364,3749431,2004-04-11,2,44,"{'id': 16, 'name': 'Shot'}","{'id': 1, 'name': 'Regular Play'}","{'id': 37, 'name': 'Newcastle United'}",,"{'id': 40227, 'name': 'Laurent Robert'}","{'id': 16, 'name': 'Left Midfield'}",...,"{'statsbomb_xg': 0.0052038087, 'end_location':...",,16,Shot,0.005204,"[103.8, 40.0]",Blocked,Open Play,"[{'location': [118.7, 39.9], 'player': {'id': ...",Left Foot
33365,3749431,2004-04-11,2,44,"{'id': 16, 'name': 'Shot'}","{'id': 1, 'name': 'Regular Play'}","{'id': 1, 'name': 'Arsenal'}",,"{'id': 23816, 'name': 'José Antonio Reyes Cald...","{'id': 24, 'name': 'Left Center Forward'}",...,"{'one_on_one': True, 'statsbomb_xg': 0.1367552...",,16,Shot,0.136755,"[118.3, 38.0, 1.3]",Saved,Open Play,"[{'location': [108.5, 33.6], 'player': {'id': ...",Head
33366,3749431,2004-04-11,2,44,"{'id': 16, 'name': 'Shot'}","{'id': 1, 'name': 'Regular Play'}","{'id': 1, 'name': 'Arsenal'}",,"{'id': 15516, 'name': 'Thierry Henry'}","{'id': 22, 'name': 'Right Center Forward'}",...,"{'statsbomb_xg': 0.023022754, 'end_location': ...",,16,Shot,0.023023,"[120.0, 45.4, 4.5]",Off T,Open Play,"[{'location': [104.3, 33.8], 'player': {'id': ...",Right Foot
33367,3749431,2004-04-11,2,44,"{'id': 16, 'name': 'Shot'}","{'id': 1, 'name': 'Regular Play'}","{'id': 37, 'name': 'Newcastle United'}",,"{'id': 40226, 'name': 'Craig Bellamy'}","{'id': 22, 'name': 'Right Center Forward'}",...,"{'statsbomb_xg': 0.038628004, 'end_location': ...",,16,Shot,0.038628,"[120.0, 36.1, 5.3]",Off T,Open Play,"[{'location': [119.0, 38.6], 'player': {'id': ...",Left Foot


In [84]:
dataframe = pd.merge(merged, comp_df, how="left", on=["competition_id","season_id"])

In [85]:
dataframe.columns

Index(['match_id', 'match_date', 'competition_id', 'season_id', 'type',
       'play_pattern', 'team', 'tactics', 'player', 'position', 'location',
       'under_pressure', 'shot', 'goalkeeper', 'event_id', 'event_type',
       'stats_xg', 'loc_end', 'shot_outcome', 'shot_type', 'frame_freezed',
       'body_part', 'competition_name', 'season_name'],
      dtype='object')

In [87]:
from collections import Counter
Counter(dataframe['shot_outcome'])

Counter({'Saved': 8006,
         'Off T': 10582,
         'Blocked': 8067,
         'Post': 786,
         'Wayward': 1578,
         'Goal': 4093,
         'Saved to Post': 96,
         'Saved Off Target': 161})

In [None]:
dataframe['shot_outcome'].isin(['Saved'
         'Blocked': 8067,
         'Post': 786,
         'Goal': 4093,
         'Saved to Post': 96,
         'Saved Off Target': 161])

In [88]:
dataframe[dataframe['shot_outcome'] == 'Off T']

Unnamed: 0,match_id,match_date,competition_id,season_id,type,play_pattern,team,tactics,player,position,...,event_id,event_type,stats_xg,loc_end,shot_outcome,shot_type,frame_freezed,body_part,competition_name,season_name
1,3827767,2022-03-20,1238,108,"{'id': 16, 'name': 'Shot'}","{'id': 4, 'name': 'From Throw In'}","{'id': 7283, 'name': 'Kerala Blasters'}",,"{'id': 28251, 'name': 'Jorge Rolando Pereyra D...","{'id': 22, 'name': 'Right Center Forward'}",...,16,Shot,0.197586,"[120.0, 35.4, 1.6]",Off T,Open Play,"[{'location': [98.2, 60.0], 'player': {'id': 1...",Head,Indian Super league,2021/2022
2,3827767,2022-03-20,1238,108,"{'id': 16, 'name': 'Shot'}","{'id': 3, 'name': 'From Free Kick'}","{'id': 7283, 'name': 'Kerala Blasters'}",,"{'id': 163354, 'name': 'Rahul Kannoly Praveen'}","{'id': 12, 'name': 'Right Midfield'}",...,16,Shot,0.035779,"[120.0, 45.6, 5.9]",Off T,Open Play,"[{'location': [88.6, 13.3], 'player': {'id': 1...",Right Foot,Indian Super league,2021/2022
6,3827767,2022-03-20,1238,108,"{'id': 16, 'name': 'Shot'}","{'id': 3, 'name': 'From Free Kick'}","{'id': 7283, 'name': 'Kerala Blasters'}",,"{'id': 163357, 'name': 'Lalthathanga Khawlhring'}","{'id': 11, 'name': 'Left Defensive Midfield'}",...,16,Shot,0.018697,"[120.0, 45.8, 6.9]",Off T,Open Play,"[{'location': [80.5, 35.8], 'player': {'id': 2...",Left Foot,Indian Super league,2021/2022
9,3827767,2022-03-20,1238,108,"{'id': 16, 'name': 'Shot'}","{'id': 1, 'name': 'Regular Play'}","{'id': 7289, 'name': 'Hyderabad'}",,"{'id': 26186, 'name': 'Bartholomew Owogbalor O...","{'id': 24, 'name': 'Left Center Forward'}",...,16,Shot,0.093055,"[120.0, 42.6, 4.5]",Off T,Open Play,"[{'location': [117.2, 42.2], 'player': {'id': ...",Right Foot,Indian Super league,2021/2022
10,3827767,2022-03-20,1238,108,"{'id': 16, 'name': 'Shot'}","{'id': 6, 'name': 'From Counter'}","{'id': 7283, 'name': 'Kerala Blasters'}",,"{'id': 163357, 'name': 'Lalthathanga Khawlhring'}","{'id': 11, 'name': 'Left Defensive Midfield'}",...,16,Shot,0.032857,"[120.0, 49.3, 0.4]",Off T,Open Play,"[{'location': [97.0, 19.3], 'player': {'id': 2...",Left Foot,Indian Super league,2021/2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33357,3749431,2004-04-11,2,44,"{'id': 16, 'name': 'Shot'}","{'id': 1, 'name': 'Regular Play'}","{'id': 1, 'name': 'Arsenal'}",,"{'id': 15512, 'name': 'Sylvain Wiltord'}","{'id': 12, 'name': 'Right Midfield'}",...,16,Shot,0.415773,"[120.0, 44.8, 0.2]",Off T,Open Play,"[{'location': [113.5, 38.6], 'player': {'id': ...",Left Foot,Premier League,2003/2004
33362,3749431,2004-04-11,2,44,"{'id': 16, 'name': 'Shot'}","{'id': 3, 'name': 'From Free Kick'}","{'id': 37, 'name': 'Newcastle United'}",,"{'id': 40233, 'name': 'Olivier Bernard'}","{'id': 6, 'name': 'Left Back'}",...,16,Shot,0.010018,"[120.0, 36.0, 0.3]",Off T,Open Play,"[{'location': [106.1, 34.1], 'player': {'id': ...",Left Foot,Premier League,2003/2004
33366,3749431,2004-04-11,2,44,"{'id': 16, 'name': 'Shot'}","{'id': 1, 'name': 'Regular Play'}","{'id': 1, 'name': 'Arsenal'}",,"{'id': 15516, 'name': 'Thierry Henry'}","{'id': 22, 'name': 'Right Center Forward'}",...,16,Shot,0.023023,"[120.0, 45.4, 4.5]",Off T,Open Play,"[{'location': [104.3, 33.8], 'player': {'id': ...",Right Foot,Premier League,2003/2004
33367,3749431,2004-04-11,2,44,"{'id': 16, 'name': 'Shot'}","{'id': 1, 'name': 'Regular Play'}","{'id': 37, 'name': 'Newcastle United'}",,"{'id': 40226, 'name': 'Craig Bellamy'}","{'id': 22, 'name': 'Right Center Forward'}",...,16,Shot,0.038628,"[120.0, 36.1, 5.3]",Off T,Open Play,"[{'location': [119.0, 38.6], 'player': {'id': ...",Left Foot,Premier League,2003/2004


### Feature Engineering - Extract features

In [61]:
def extract_names(x):
    return x["name"]

cols = ['play_pattern', 'team', 'player', 'position']

for col in cols:
    dataframe[col+"_name"] = dataframe[col].apply(extract_names)

In [62]:
dataframe.columns

Index(['match_id', 'match_date', 'competition_id', 'season_id', 'type',
       'play_pattern', 'team', 'tactics', 'player', 'position', 'location',
       'under_pressure', 'shot', 'goalkeeper', 'event_id', 'event_type',
       'stats_xg', 'loc_end', 'shot_outcome', 'shot_type', 'frame_freezed',
       'body_part', 'competition_name', 'season_name', 'play_pattern_name',
       'team_name', 'player_name', 'position_name'],
      dtype='object')

In [63]:
dataframe.drop(columns=['play_pattern', 'team', 'player', 'position', 
                        'tactics', 'goalkeeper','type', 'event_type'], inplace=True)

In [64]:
dataframe.columns

Index(['match_id', 'match_date', 'competition_id', 'season_id', 'location',
       'under_pressure', 'shot', 'event_id', 'stats_xg', 'loc_end',
       'shot_outcome', 'shot_type', 'frame_freezed', 'body_part',
       'competition_name', 'season_name', 'play_pattern_name', 'team_name',
       'player_name', 'position_name'],
      dtype='object')

In [65]:
def get_technique(x):
    return x["technique"]["name"]

dataframe["shot_technique"] = dataframe["shot"].apply(get_technique)

### Getting the shot distance using the shot coordinates on the pitch. 

In [66]:
loc_x, loc_y = [], [] 
for i in dataframe["location"]:
    loc_x.append(i[0])
    loc_y.append(i[1])

In [67]:
def get_dist(x):
    gp_center = [120, 40]
    
    x_val = gp_center[0] - x[0]
    y_val = gp_center[1] - x[1]
    dist_center = np.sqrt(x_val**2 + y_val**2)
    return dist_center

dataframe["shot_distance"] = dataframe["location"].apply(get_dist)

### Getting the shot angle using the shot coordinates and goal post coordinates.

In [69]:
def get_angle(x):
    #define goal post coordinates
    pos_a = [120, 36]
    pos_b = [120, 44]
    
    #Obtain the euclidean distance of the shots from the sides of the posts a and b.
    dist_pos_a = np.sqrt((pos_a[0] - x[0])**2 + (pos_a[1] - x[1])**2)
    dist_pos_b = np.sqrt((pos_b[0] - x[0])**2 + (pos_b[1] - x[1])**2)
    
    #find angle using cosine rule
    cos_angle = (dist_pos_a**2 + dist_pos_b**2 - 8**2) / (2 * dist_pos_a * dist_pos_b)

    # Change the angle to degrees
    shot_rad = np.arccos(cos_angle)
    cos_angle = np.rad2deg(shot_rad)
    
    return cos_angle

dataframe["shot_angle"] = dataframe["location"].apply(get_angle)

In [70]:
dataframe

Unnamed: 0,match_id,match_date,competition_id,season_id,location,under_pressure,shot,event_id,stats_xg,loc_end,...,body_part,competition_name,season_name,play_pattern_name,team_name,player_name,position_name,shot_technique,shot_distance,shot_angle
0,3827767,2022-03-20,1238,108,"[90.8, 19.0]",,"{'statsbomb_xg': 0.0068775634, 'end_location':...",16,0.006878,"[119.0, 38.7, 1.3]",...,Right Foot,Indian Super league,2021/2022,From Throw In,Hyderabad,Souvik Chakrabarti,Right Defensive Midfield,Normal,35.967207,10.361348
1,3827767,2022-03-20,1238,108,"[111.8, 39.3]",,"{'statsbomb_xg': 0.19758572, 'end_location': [...",16,0.197586,"[120.0, 35.4, 1.6]",...,Head,Indian Super league,2021/2022,From Throw In,Kerala Blasters,Jorge Rolando Pereyra Díaz,Right Center Forward,Normal,8.229824,51.741851
2,3827767,2022-03-20,1238,108,"[93.9, 49.1]",,"{'statsbomb_xg': 0.03577913, 'end_location': [...",16,0.035779,"[120.0, 45.6, 5.9]",...,Right Foot,Indian Super league,2021/2022,From Free Kick,Kerala Blasters,Rahul Kannoly Praveen,Right Midfield,Normal,27.640912,15.596381
3,3827767,2022-03-20,1238,108,"[90.2, 18.8]",,"{'statsbomb_xg': 0.006838626, 'end_location': ...",16,0.006839,"[94.5, 22.3]",...,Right Foot,Indian Super league,2021/2022,From Throw In,Kerala Blasters,Adrián Nicolás Luna Retamar,Left Midfield,Normal,36.571574,10.226382
4,3827767,2022-03-20,1238,108,"[104.5, 55.9]",,"{'statsbomb_xg': 0.031175608, 'end_location': ...",16,0.031176,"[119.9, 37.6, 2.8]",...,Right Foot,Indian Super league,2021/2022,From Goal Kick,Kerala Blasters,Álvaro Vázquez García,Left Center Forward,Normal,22.204954,14.570211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33364,3749431,2004-04-11,2,44,"[78.2, 38.9]",,"{'statsbomb_xg': 0.0052038087, 'end_location':...",16,0.005204,"[103.8, 40.0]",...,Left Foot,Premier League,2003/2004,Regular Play,Newcastle United,Laurent Robert,Left Midfield,Normal,41.814471,10.924958
33365,3749431,2004-04-11,2,44,"[113.8, 33.3]",,"{'one_on_one': True, 'statsbomb_xg': 0.1367552...",16,0.136755,"[118.3, 38.0, 1.3]",...,Head,Premier League,2003/2004,Regular Play,Arsenal,José Antonio Reyes Calderón,Left Center Forward,Normal,9.128527,36.377965
33366,3749431,2004-04-11,2,44,"[96.6, 24.7]",,"{'statsbomb_xg': 0.023022754, 'end_location': ...",16,0.023023,"[120.0, 45.4, 4.5]",...,Right Foot,Premier League,2003/2004,Regular Play,Arsenal,Thierry Henry,Right Center Forward,Normal,27.958004,13.739171
33367,3749431,2004-04-11,2,44,"[109.7, 25.1]",,"{'statsbomb_xg': 0.038628004, 'end_location': ...",16,0.038628,"[120.0, 36.1, 5.3]",...,Left Foot,Premier League,2003/2004,Regular Play,Newcastle United,Craig Bellamy,Right Center Forward,Half Volley,18.113531,14.789639


### Extracting defense Density information using freeze frame

### Using the Baricentric technique, find if a point is located in the shot angle view formed by the shot taker to the goal post.

In [263]:
dataframe['frame_freezed_v2'] = dataframe['frame_freezed']

In [71]:
def is_point_in_triangle(x1, y1, x2, y2, x3, y3, px, py):
    """
    x1 and y1 are the x and y coordinates of the shot taker
    x2 and y2 are the fixed coordinates of the goal post a
    x3 and y3 are the fixed coordinates of the goal post b
    px and py are the x and y coordinates of the opposing defender directly in 
    view of shot taker, captured from freeze frame.
    """
    
    # Calculate the barycentric coordinates
    denominator = (y2 - y3)*(x1 - x3) + (x3 - x2)*(y1 - y3)
    lambda1 = ((y2 - y3)*(px - x3) + (x3 - x2)*(py - y3)) / denominator
    lambda2 = ((y3 - y1)*(px - x3) + (x1 - x3)*(py - y3)) / denominator
    lambda3 = 1 - lambda1 - lambda2

    # Check if the point is inside the triangle
    if 0 <= lambda1 <= 1 and 0 <= lambda2 <= 1 and 0 <= lambda3 <= 1:
        return True
    else:
        return False

is_point_in_triangle(102,40,120,36,120,44, 115, 40)

True

In [72]:
def is_point_in_triangle_v2(x1, y1, x2, y2, x3, y3, px, py):
    #Find the area of the main triangle
    area_1 = abs(x1*(y2-y3) + x2*(y3-y1) + x3*(y1-y2))
    area_1 = int(area_1/2)
    
    #Find area the point makes with 2 other sides forming 3 different triangles
    tri_1 = abs(x1*(y2-py) + x2*(py-y1) + px*(y1-y2)) #px connected to x1 and x2
    tri_1 = int(tri_1/2)
    tri_2 = abs(px*(y2-y3) + x2*(y3-py) + x3*(py-y2)) #px connected to x2 and x3
    tri_2 = int(tri_2/2)
    tri_3 = abs(x1*(y3-py) + x3*(py-y1) + px*(y1-y3)) #px connected to x1 and x3
    tri_3 = int(tri_3/2)
    
    if area_1 == tri_1 + tri_2 + tri_3:
        return True
    else:
        return False
is_point_in_triangle_v2(90.8, 19.0,120,36,120,44, 118.7, 38.6)

True

In [73]:
def count_opp_v1(shot_loc, frame):
    try:
        total_players = 0
        for player in frame:
            loc_p = player['location']
            if not player['teammate']:
                if is_point_in_triangle(shot_loc[0], shot_loc[1], 120, 36, 120, 44, loc_p[0], loc_p[1]):
                    total_players += 1
    except Exception as e:
        total_players = 0
        
    return total_players


In [74]:
def count_opp_v2(shot_loc, frame):
    try:
        total_players = 0
        for player in frame:
            loc_p = player['location']
            if not player['teammate']:
                if is_point_in_triangle_v2(shot_loc[0], shot_loc[1], 120, 36, 120, 44, loc_p[0], loc_p[1]):
                    total_players += 1
    except Exception as e:
        total_players = 0
        
    return total_players


In [75]:
defence_dens = []
for i in range(len(dataframe)):
    defence_dens.append(count_opp_v1(dataframe['location'][i], dataframe['frame_freezed'][i]))
    
defence_dens_2 = []
for i in range(len(dataframe)):
    defence_dens_2.append(count_opp_v1(dataframe['location'][i], dataframe['frame_freezed'][i]))

In [76]:
for i in range(len(defence_dens)):
    assert defence_dens[i] == defence_dens_2[i]

In [77]:
dataframe['defence_density'] = defence_dens

In [78]:
new_list = [False if i != True else True for i in dataframe["under_pressure"]]
new_goals = ["No Goal" if i != "Goal" else "Goal" for i in dataframe["shot_outcome"]]

In [79]:
dataframe["under_pressure"] = new_list
dataframe["shot_outcome"] = new_goals

In [80]:
dataframe.drop(columns=["frame_freezed", 'shot'], inplace=True)

In [81]:
#seperate x and y coordinates
x_list = [loc_list[0] for loc_list in dataframe["location"]]
y_list = [loc_list[1] for loc_list in dataframe["location"]]

dataframe["x_list"] = x_list
dataframe["y_list"] = y_list

In [82]:
dataframe.to_csv("cleaned_football_v2.csv", index=False)

In [83]:
dataframe

Unnamed: 0,match_id,match_date,competition_id,season_id,location,under_pressure,event_id,stats_xg,loc_end,shot_outcome,...,play_pattern_name,team_name,player_name,position_name,shot_technique,shot_distance,shot_angle,defence_density,x_list,y_list
0,3827767,2022-03-20,1238,108,"[90.8, 19.0]",False,16,0.006878,"[119.0, 38.7, 1.3]",No Goal,...,From Throw In,Hyderabad,Souvik Chakrabarti,Right Defensive Midfield,Normal,35.967207,10.361348,2,90.8,19.0
1,3827767,2022-03-20,1238,108,"[111.8, 39.3]",False,16,0.197586,"[120.0, 35.4, 1.6]",No Goal,...,From Throw In,Kerala Blasters,Jorge Rolando Pereyra Díaz,Right Center Forward,Normal,8.229824,51.741851,1,111.8,39.3
2,3827767,2022-03-20,1238,108,"[93.9, 49.1]",False,16,0.035779,"[120.0, 45.6, 5.9]",No Goal,...,From Free Kick,Kerala Blasters,Rahul Kannoly Praveen,Right Midfield,Normal,27.640912,15.596381,1,93.9,49.1
3,3827767,2022-03-20,1238,108,"[90.2, 18.8]",False,16,0.006839,"[94.5, 22.3]",No Goal,...,From Throw In,Kerala Blasters,Adrián Nicolás Luna Retamar,Left Midfield,Normal,36.571574,10.226382,2,90.2,18.8
4,3827767,2022-03-20,1238,108,"[104.5, 55.9]",False,16,0.031176,"[119.9, 37.6, 2.8]",No Goal,...,From Goal Kick,Kerala Blasters,Álvaro Vázquez García,Left Center Forward,Normal,22.204954,14.570211,1,104.5,55.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33364,3749431,2004-04-11,2,44,"[78.2, 38.9]",False,16,0.005204,"[103.8, 40.0]",No Goal,...,Regular Play,Newcastle United,Laurent Robert,Left Midfield,Normal,41.814471,10.924958,3,78.2,38.9
33365,3749431,2004-04-11,2,44,"[113.8, 33.3]",False,16,0.136755,"[118.3, 38.0, 1.3]",No Goal,...,Regular Play,Arsenal,José Antonio Reyes Calderón,Left Center Forward,Normal,9.128527,36.377965,1,113.8,33.3
33366,3749431,2004-04-11,2,44,"[96.6, 24.7]",False,16,0.023023,"[120.0, 45.4, 4.5]",No Goal,...,Regular Play,Arsenal,Thierry Henry,Right Center Forward,Normal,27.958004,13.739171,2,96.6,24.7
33367,3749431,2004-04-11,2,44,"[109.7, 25.1]",False,16,0.038628,"[120.0, 36.1, 5.3]",No Goal,...,Regular Play,Newcastle United,Craig Bellamy,Right Center Forward,Half Volley,18.113531,14.789639,1,109.7,25.1
