# Preprocessing

Load World Cup 2018 data from StatsBomb JSON, filter to shots, and create the cleaned shot-level dataset with engineered features (x, y, distance, angle, etc.). Finally, save the result to CSV for use in the modeling notebook.

## Imports and configuration

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import json

pd.set_option('display.max_columns', None)


## Load matches and events for World Cup 2018


In [2]:
# Base path to StatsBomb open-data on your machine
BASE = Path(r"C:\\Users\\traik\\Desktop\\Final project data\\open-data-master\\data")

# World Cup 2018: competition_id=43, season_id=3
matches_file = BASE / "matches" / "43" / "3.json"

with open(matches_file, "r", encoding="utf-8") as f:
    matches_wc = json.load(f)

match_ids_wc = [m["match_id"] for m in matches_wc]

events_folder = BASE / "events"
all_events_wc = []

for mid in match_ids_wc:
    fp = events_folder / f"{mid}.json"
    with open(fp, "r", encoding="utf-8") as f:
        all_events_wc.extend(json.load(f))

df_wc = pd.json_normalize(all_events_wc, sep="_")
df_wc.head()


Unnamed: 0,id,index,period,timestamp,minute,second,possession,duration,type_id,type_name,possession_team_id,possession_team_name,play_pattern_id,play_pattern_name,team_id,team_name,tactics_formation,tactics_lineup,related_events,location,player_id,player_name,position_id,position_name,pass_recipient_id,pass_recipient_name,pass_length,pass_angle,pass_height_id,pass_height_name,pass_end_location,pass_type_id,pass_type_name,pass_body_part_id,pass_body_part_name,under_pressure,carry_end_location,pass_outcome_id,pass_outcome_name,counterpress,ball_receipt_outcome_id,ball_receipt_outcome_name,50_50_outcome_id,50_50_outcome_name,pass_cross,ball_recovery_recovery_failure,pass_switch,dribble_outcome_id,dribble_outcome_name,duel_type_id,duel_type_name,duel_outcome_id,duel_outcome_name,foul_committed_type_id,foul_committed_type_name,shot_statsbomb_xg,shot_end_location,shot_follows_dribble,shot_type_id,shot_type_name,shot_technique_id,shot_technique_name,shot_outcome_id,shot_outcome_name,shot_body_part_id,shot_body_part_name,shot_freeze_frame,goalkeeper_position_id,goalkeeper_position_name,goalkeeper_type_id,goalkeeper_type_name,goalkeeper_technique_id,goalkeeper_technique_name,goalkeeper_outcome_id,goalkeeper_outcome_name,goalkeeper_body_part_id,goalkeeper_body_part_name,interception_outcome_id,interception_outcome_name,pass_assisted_shot_id,pass_shot_assist,shot_key_pass_id,goalkeeper_end_location,clearance_aerial_won,foul_won_defensive,shot_open_goal,pass_aerial_won,foul_committed_advantage,foul_won_advantage,pass_backheel,dribble_nutmeg,pass_deflected,block_deflection,bad_behaviour_card_id,bad_behaviour_card_name,foul_committed_card_id,foul_committed_card_name,foul_committed_penalty,foul_won_penalty,block_offensive,substitution_outcome_id,substitution_outcome_name,substitution_replacement_id,substitution_replacement_name,shot_aerial_won,shot_first_time,pass_goal_assist,shot_deflected,dribble_overrun,block_save_block,foul_committed_offensive,injury_stoppage_in_chain,pass_technique_id,pass_technique_name,pass_through_ball,shot_one_on_one,pass_cut_back,ball_recovery_offensive,miscontrol_aerial_won,pass_miscommunication,shot_redirect
0,de3be98d-e227-475b-bd55-f57a6a89d308,1,1,00:00:00.000,0,0,1,0.0,35,Starting XI,769,Colombia,1,Regular Play,769,Colombia,433.0,"[{'player': {'id': 4276, 'name': 'David Ospina...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,f50ccda4-b768-4f07-9136-8f79fd17dac5,2,1,00:00:00.000,0,0,1,0.754,35,Starting XI,769,Colombia,1,Regular Play,768,England,352.0,"[{'player': {'id': 3468, 'name': 'Jordan Pickf...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,b5e98805-0a22-4a5e-a306-7d40651a0f6e,3,1,00:00:00.000,0,0,1,9.32,18,Half Start,769,Colombia,1,Regular Play,768,England,,,[762b829f-5f24-4dd7-bfe2-da7e289838bb],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,762b829f-5f24-4dd7-bfe2-da7e289838bb,4,1,00:00:00.000,0,0,1,9.053,18,Half Start,769,Colombia,1,Regular Play,769,Colombia,,,[b5e98805-0a22-4a5e-a306-7d40651a0f6e],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,d4883f20-ce68-4f84-b26a-a049a13cb6be,5,1,00:00:00.240,0,0,2,0.24,30,Pass,769,Colombia,9,From Kick Off,769,Colombia,,,[5fc9acb8-88c3-4cfb-ad9c-fc250c0dffde],"[60.0, 40.0]",3445.0,Radamel Falcao García Zárate,24.0,Left Center Forward,5692.0,Juan Fernando Quintero Paniagua,10.049875,3.041924,1.0,Ground Pass,"[50.0, 41.0]",65.0,Kick Off,38.0,Left Foot,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [3]:
df_wc['id'].duplicated().sum()


0

## Filter to shots only

In [4]:
df_shots = df_wc[df_wc["type_name"] == "Shot"].copy()
df_shots.shape


(1706, 121)

### Filter out penalty shootouts

In [5]:
df_penalty_shootouts = df_wc[df_wc["period"] == 5]
display(df_penalty_shootouts.shape)
df_penalty_shootouts[["type_name", "shot_type_name", "location", "period"]].head(15)


(94, 121)

Unnamed: 0,type_name,shot_type_name,location,period
3995,Half Start,,,5
3996,Half Start,,,5
3997,Shot,Penalty,"[109.0, 41.0]",5
3998,Goal Keeper,,"[1.0, 40.0]",5
3999,Shot,Penalty,"[109.0, 41.0]",5
4000,Goal Keeper,,"[1.0, 40.0]",5
4001,Shot,Penalty,"[109.0, 41.0]",5
4002,Goal Keeper,,"[1.0, 40.0]",5
4003,Shot,Penalty,"[109.0, 41.0]",5
4004,Goal Keeper,,"[1.0, 40.0]",5


In [6]:
shots = df_shots[df_shots["period"] != 5]


## Select relevant columns


In [7]:
cols = [
    "location",
    "counterpress",
    "shot_statsbomb_xg",
    "shot_end_location",
    "shot_type_id",
    "shot_technique_id",
    "shot_outcome_id",
    "shot_body_part_id",
    "shot_open_goal",
    "shot_first_time",
    "shot_one_on_one",
    "shot_aerial_won",
]

shots = shots[cols]
shots.head()


Unnamed: 0,location,counterpress,shot_statsbomb_xg,shot_end_location,shot_type_id,shot_technique_id,shot_outcome_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won
239,"[115.0, 18.0]",,0.009816,"[120.0, 42.2, 2.0]",62.0,93.0,100.0,40.0,,,,
270,"[112.0, 54.0]",,0.038204,"[113.0, 53.0]",87.0,93.0,96.0,40.0,,,,
439,"[98.0, 37.0]",,0.045128,"[105.0, 37.0]",87.0,93.0,96.0,40.0,,,,
548,"[119.0, 36.0]",,0.625074,"[120.0, 40.5, 3.1]",87.0,93.0,98.0,37.0,True,,,
815,"[97.0, 56.0]",,0.02176,"[100.0, 54.0]",87.0,93.0,96.0,40.0,,,,


## Drop unreliable columns



In [8]:
shots.isna().sum()

location                0
counterpress         1667
shot_statsbomb_xg       0
shot_end_location       0
shot_type_id            0
shot_technique_id       0
shot_outcome_id         0
shot_body_part_id       0
shot_open_goal       1650
shot_first_time      1310
shot_one_on_one      1612
shot_aerial_won      1543
dtype: int64

In [9]:
# Drop counterpress

shots.drop(["counterpress"], axis=1, inplace=True)

shots.isna().sum()


location                0
shot_statsbomb_xg       0
shot_end_location       0
shot_type_id            0
shot_technique_id       0
shot_outcome_id         0
shot_body_part_id       0
shot_open_goal       1650
shot_first_time      1310
shot_one_on_one      1612
shot_aerial_won      1543
dtype: int64

In [10]:
# Drop shot_end_location as execution of shot should not play a role in measuring the quality of shot.

shots.drop(["shot_end_location"], axis=1, inplace=True)

## Clean boolean shot flags

`shot_open_goal`, `shot_first_time`, `shot_one_on_one`, `shot_aerial_won` are mapped from `NaN` → 0 and cast to integer 0/1.

In [11]:
binary_cols = [
    "shot_open_goal",
    "shot_first_time",
    "shot_one_on_one",
    "shot_aerial_won",
]

for col in binary_cols:
    shots[col] = shots[col].fillna(0).astype(int)

shots[binary_cols].head()


Unnamed: 0,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won
239,0,0,0,0
270,0,0,0,0
439,0,0,0,0
548,1,0,0,0
815,0,0,0,0


## Extract shot location (x, y)

We convert the StatsBomb `location` list `[x, y]` into separate numeric `x` and `y` columns, then drop the original `location` column.

In [12]:
shots["x"] = shots["location"].apply(lambda loc: loc[0])
shots["y"] = shots["location"].apply(lambda loc: loc[1])

shots.drop("location", axis=1, inplace=True)
shots.head()


Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_outcome_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y
239,0.009816,62.0,93.0,100.0,40.0,0,0,0,0,115.0,18.0
270,0.038204,87.0,93.0,96.0,40.0,0,0,0,0,112.0,54.0
439,0.045128,87.0,93.0,96.0,40.0,0,0,0,0,98.0,37.0
548,0.625074,87.0,93.0,98.0,37.0,1,0,0,0,119.0,36.0
815,0.02176,87.0,93.0,96.0,40.0,0,0,0,0,97.0,56.0


## Create goal label `is_goal`

We use your mapping: `shot_outcome_id == 97` corresponds to goals in this dataset.

In [13]:
shots["is_goal"] = (shots["shot_outcome_id"] == 97).astype(int)
shots["is_goal"].value_counts()


is_goal
0    1510
1     157
Name: count, dtype: int64

In [14]:
# Drop the outcome_id now that we have the label
shots.drop("shot_outcome_id", axis=1, inplace=True)
shots.head()


Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y,is_goal
239,0.009816,62.0,93.0,40.0,0,0,0,0,115.0,18.0,0
270,0.038204,87.0,93.0,40.0,0,0,0,0,112.0,54.0,0
439,0.045128,87.0,93.0,40.0,0,0,0,0,98.0,37.0,0
548,0.625074,87.0,93.0,37.0,1,0,0,0,119.0,36.0,0
815,0.02176,87.0,93.0,40.0,0,0,0,0,97.0,56.0,0


## Add geometry features: distance and angle

We compute:
- `distance`: distance from shot location to the centre of the goal
- `angle`: angle between lines from the shot location to the two goalposts

Pitch and goal coordinates follow the StatsBomb convention: x ∈ [0,120], y ∈ [0,80], goal centered at (120, 40).

In [15]:
# Goal coordinates (StatsBomb pitch)
goal_x = 120
goal_y = 40
left_post_y = 36.8
right_post_y = 43.2

# Distance to goal centre
shots["distance"] = np.sqrt((goal_x - shots["x"])**2 + (goal_y - shots["y"])**2)

def calc_angle(row):
    x = row["x"]
    y = row["y"]
    angle_left = np.arctan2(left_post_y - y, goal_x - x)
    angle_right = np.arctan2(right_post_y - y, goal_x - x)
    return abs(angle_right - angle_left)



## Create final featured shots dataframe

We drop `shot_end_location` so the features only describe the chance **before** the shot outcome. This final dataframe `shots_featured` will be saved to CSV and used in the modeling notebook.

In [16]:
shots_featured = shots.copy()

shots_featured["angle"] = shots.apply(calc_angle, axis=1)
shots_featured[["x", "y", "distance", "angle"]].head()


shots_featured.head()


Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y,is_goal,distance,angle
239,0.009816,62.0,93.0,40.0,0,0,0,0,115.0,18.0,0,22.561028,0.064071
270,0.038204,87.0,93.0,40.0,0,0,0,0,112.0,54.0,0,16.124515,0.202196
439,0.045128,87.0,93.0,40.0,0,0,0,0,98.0,37.0,0,22.203603,0.283785
548,0.625074,87.0,93.0,37.0,1,0,0,0,119.0,36.0,0,4.123106,0.758049
815,0.02176,87.0,93.0,40.0,0,0,0,0,97.0,56.0,0,28.017851,0.187756


### Checking for duplicate columns

In [17]:
shots_featured.duplicated().sum()

22

In [18]:
dupes = shots_featured[shots_featured.duplicated(keep=False)]
dupes.head(60)



Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y,is_goal,distance,angle
1972,0.7835,88.0,93.0,40.0,0,0,0,0,108.0,40.0,1,12.0,0.521205
13367,0.7835,88.0,93.0,40.0,0,0,0,0,108.0,40.0,0,12.0,0.521205
14420,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
22589,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
23240,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
60457,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
61587,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
74316,0.7835,88.0,93.0,40.0,0,0,0,0,108.0,40.0,1,12.0,0.521205
77105,0.7835,88.0,93.0,38.0,0,0,0,0,108.0,40.0,1,12.0,0.521205
90799,0.7835,88.0,93.0,40.0,0,0,0,0,108.0,40.0,0,12.0,0.521205


In [19]:
# duplicates are mostly penalty shots. we dont drop them.


In [20]:
shots_featured.iloc[32:45,:]

Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y,is_goal,distance,angle
4454,0.257208,87.0,93.0,40.0,0,0,0,0,117.0,40.0,0,3.0,1.63529
4540,0.00695,87.0,93.0,40.0,0,0,0,0,91.0,20.0,0,35.22783,0.149673
4604,0.173503,87.0,93.0,37.0,0,0,0,0,112.0,42.0,0,8.246211,0.725265
5209,0.037858,87.0,93.0,40.0,0,0,0,0,109.0,55.0,0,18.601075,0.20668
5215,0.068467,87.0,93.0,40.0,0,0,0,0,107.0,33.0,0,14.764823,0.380905
5228,0.019955,87.0,93.0,40.0,0,0,0,0,91.0,49.0,0,30.364453,0.200819
5497,0.085507,87.0,93.0,37.0,0,0,0,0,113.0,40.0,0,7.0,0.857556
5511,0.024078,87.0,93.0,38.0,0,0,0,0,90.0,41.0,0,30.016662,0.212298
5516,0.10148,87.0,90.0,37.0,0,0,0,0,111.0,43.0,0,9.486833,0.625448
5559,0.071149,87.0,93.0,37.0,0,0,0,1,109.0,35.0,0,12.083046,0.478387


In [21]:
shots_featured.shape

(1667, 13)

In [22]:
shots_featured.is_goal.value_counts()

is_goal
0    1510
1     157
Name: count, dtype: int64

## Save preprocessed shots to CSV

In [23]:
output_path = Path("shots_featured_wc2018.csv")
shots_featured.to_csv(output_path, index=False)
print(f"Saved preprocessed shots to {output_path.resolve()}")


Saved preprocessed shots to C:\Users\traik\Desktop\ML_Bootcamp\Final_Project\Final_Project_Ironhack\shots_featured_wc2018.csv


# Loading Euro 2020 Data to test models on

In [24]:


# EURO 2020
matches_file_euro = BASE / "matches" / "55" / "43.json"
with open(matches_file_euro, "r", encoding="utf-8") as f:
    matches_euro = json.load(f)

match_ids_euro = [m["match_id"] for m in matches_euro]

# Load events
events_folder = BASE / "events"
all_events_euro = []

for mid in match_ids_euro:
    fp = events_folder / f"{mid}.json"
    with open(fp, "r", encoding="utf-8") as f:
        all_events_euro.extend(json.load(f))

df_euro = pd.json_normalize(all_events_euro, sep="_")
df_euro.head()

Unnamed: 0,id,index,period,timestamp,minute,second,possession,duration,type_id,type_name,possession_team_id,possession_team_name,play_pattern_id,play_pattern_name,team_id,team_name,tactics_formation,tactics_lineup,related_events,location,player_id,player_name,position_id,position_name,pass_recipient_id,pass_recipient_name,pass_length,pass_angle,pass_height_id,pass_height_name,pass_end_location,pass_body_part_id,pass_body_part_name,pass_type_id,pass_type_name,carry_end_location,under_pressure,pass_outcome_id,pass_outcome_name,ball_receipt_outcome_id,ball_receipt_outcome_name,duel_type_id,duel_type_name,clearance_head,clearance_body_part_id,clearance_body_part_name,clearance_aerial_won,duel_outcome_id,duel_outcome_name,interception_outcome_id,interception_outcome_name,shot_statsbomb_xg,shot_end_location,shot_technique_id,shot_technique_name,shot_body_part_id,shot_body_part_name,shot_type_id,shot_type_name,shot_outcome_id,shot_outcome_name,shot_freeze_frame,goalkeeper_end_location,goalkeeper_position_id,goalkeeper_position_name,goalkeeper_type_id,goalkeeper_type_name,off_camera,pass_cross,clearance_left_foot,goalkeeper_outcome_id,goalkeeper_outcome_name,counterpress,out,miscontrol_aerial_won,ball_recovery_recovery_failure,pass_switch,pass_assisted_shot_id,pass_shot_assist,pass_outswinging,pass_technique_id,pass_technique_name,shot_key_pass_id,shot_first_time,dribble_outcome_id,dribble_outcome_name,pass_aerial_won,clearance_right_foot,foul_committed_type_id,foul_committed_type_name,foul_won_defensive,pass_inswinging,substitution_outcome_id,substitution_outcome_name,substitution_replacement_id,substitution_replacement_name,goalkeeper_technique_id,goalkeeper_technique_name,goalkeeper_body_part_id,goalkeeper_body_part_name,shot_aerial_won,50_50_outcome_id,50_50_outcome_name,dribble_overrun,ball_recovery_offensive,foul_committed_advantage,foul_won_advantage,block_deflection,pass_through_ball,foul_committed_card_id,foul_committed_card_name,pass_goal_assist,block_offensive,pass_cut_back,pass_deflected,shot_one_on_one,foul_committed_offensive,injury_stoppage_in_chain,pass_straight,pass_no_touch,pass_miscommunication,goalkeeper_punched_out,foul_committed_penalty,foul_won_penalty,block_save_block,shot_redirect,dribble_nutmeg,shot_deflected,bad_behaviour_card_id,bad_behaviour_card_name,clearance_other,shot_open_goal,shot_saved_to_post,goalkeeper_shot_saved_to_post,shot_saved_off_target,goalkeeper_shot_saved_off_target,goalkeeper_lost_in_play,goalkeeper_success_in_play,dribble_no_touch,goalkeeper_penalty_saved_to_post,shot_follows_dribble,player_off_permanent
0,2e4b1b7f-e67e-4bef-8cf1-2eab2ab81af2,1,1,00:00:00.000,0,0,1,0.0,35,Starting XI,773,Switzerland,1,Regular Play,773,Switzerland,4231.0,"[{'player': {'id': 5550, 'name': 'Yann Sommer'...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,f73c3f38-8c45-4802-8cb9-5a87234f385b,2,1,00:00:00.000,0,0,1,0.0,35,Starting XI,773,Switzerland,1,Regular Play,772,Spain,433.0,"[{'player': {'id': 11748, 'name': 'Unai Simón ...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,9098375d-3f07-4241-bdc9-3602a2e5c754,3,1,00:00:00.000,0,0,1,0.0,18,Half Start,773,Switzerland,1,Regular Play,772,Spain,,,[18099197-a92a-4398-ac5e-39dad539c3eb],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,18099197-a92a-4398-ac5e-39dad539c3eb,4,1,00:00:00.000,0,0,1,0.0,18,Half Start,773,Switzerland,1,Regular Play,773,Switzerland,,,[9098375d-3f07-4241-bdc9-3602a2e5c754],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,ea57ee78-a4ef-4d62-81c7-98cdf1c11925,5,1,00:00:00.967,0,0,2,1.511008,30,Pass,772,Spain,9,From Kick Off,772,Spain,,,[66655fd3-2764-4532-b1bb-0a01074fed30],"[61.0, 40.1]",3477.0,Álvaro Borja Morata Martín,23.0,Center Forward,6892.0,Pau Francisco Torres,26.68108,-2.883887,1.0,Ground Pass,"[35.2, 33.3]",40.0,Right Foot,65.0,Kick Off,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### checking for duplicates in original data frame

In [25]:
df_euro["id"].duplicated().sum()

0

# Filter Data Frame to type shot

In [26]:
df_shots_euro = df_euro[df_euro["type_name"] == "Shot"].copy()
df_shots_euro.shape

(1289, 142)

remove penalty shootouts

In [27]:
df_penalty_shootouts_euro = df_euro[df_euro["period"] == 5]
display(df_penalty_shootouts_euro.shape)
df_penalty_shootouts_euro[["type_name", "shot_type_name", "location", "period"]].head(15)


(92, 142)

Unnamed: 0,type_name,shot_type_name,location,period
4968,Half Start,,,5
4969,Half Start,,,5
4970,Shot,Penalty,"[108.0, 40.0]",5
4971,Goal Keeper,,"[1.0, 40.0]",5
4972,Shot,Penalty,"[108.0, 40.0]",5
4973,Goal Keeper,,"[1.0, 40.0]",5
4974,Shot,Penalty,"[108.0, 40.0]",5
4975,Goal Keeper,,"[1.0, 40.0]",5
4976,Shot,Penalty,"[108.0, 40.0]",5
4977,Goal Keeper,,"[1.0, 40.0]",5


In [28]:
shots_euro = df_shots_euro[df_shots_euro["period"] != 5]
shots_euro.shape


(1251, 142)

Euro data set will only be used for evaluation. Thus, the same feature and pre-processing steps will be maintained and applied as before.

In [29]:
cols = [
    "location",
    "counterpress",
    "shot_statsbomb_xg",
    "shot_end_location",
    "shot_type_id",
    "shot_technique_id",
    "shot_outcome_id",
    "shot_body_part_id",
    "shot_open_goal",
    "shot_first_time",
    "shot_one_on_one",
    "shot_aerial_won",
]

shots_euro = shots_euro[cols]
display(shots_euro.head())
shots_euro.shape


Unnamed: 0,location,counterpress,shot_statsbomb_xg,shot_end_location,shot_type_id,shot_technique_id,shot_outcome_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won
72,"[91.9, 39.0]",,0.030169,"[99.7, 40.0]",87.0,93.0,96.0,38.0,,,,
400,"[96.5, 29.7]",,0.021315,"[110.2, 37.8]",87.0,91.0,96.0,38.0,,True,,
748,"[97.8, 25.9]",,0.047888,"[120.0, 38.3, 3.5]",62.0,93.0,98.0,40.0,,,,
1016,"[112.0, 40.7]",,0.100356,"[118.8, 38.7, 2.2]",87.0,93.0,100.0,37.0,,,,
1449,"[99.7, 58.6]",,0.008758,"[100.7, 57.7]",87.0,93.0,96.0,38.0,,,,


(1251, 12)

In [30]:
display(shots_euro.isna().sum())
shots_euro.shape

location                0
counterpress         1251
shot_statsbomb_xg       0
shot_end_location       0
shot_type_id            0
shot_technique_id       0
shot_outcome_id         0
shot_body_part_id       0
shot_open_goal       1236
shot_first_time       857
shot_one_on_one      1206
shot_aerial_won      1117
dtype: int64

(1251, 12)

In [31]:
# Drop counterpress

shots_euro.drop(["counterpress"], axis=1, inplace=True)

shots_euro.isna().sum()


location                0
shot_statsbomb_xg       0
shot_end_location       0
shot_type_id            0
shot_technique_id       0
shot_outcome_id         0
shot_body_part_id       0
shot_open_goal       1236
shot_first_time       857
shot_one_on_one      1206
shot_aerial_won      1117
dtype: int64

In [32]:
# Drop Shot_end_location
shots_euro.drop(["shot_end_location"], axis=1, inplace=True)


In [33]:
display(shots_euro.shape)
shots.shape


(1251, 10)

(1667, 12)

In [34]:
# map false to 0 for binary columns. apply astype(int)

binary_cols = [
    "shot_open_goal",
    "shot_first_time",
    "shot_one_on_one",
    "shot_aerial_won",
]

for col in binary_cols:
    shots_euro[col] = shots_euro[col].fillna(0).astype(int)

display(shots_euro[binary_cols].head())

shots_euro.isna().sum()



Unnamed: 0,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won
72,0,0,0,0
400,0,1,0,0
748,0,0,0,0
1016,0,0,0,0
1449,0,0,0,0


location             0
shot_statsbomb_xg    0
shot_type_id         0
shot_technique_id    0
shot_outcome_id      0
shot_body_part_id    0
shot_open_goal       0
shot_first_time      0
shot_one_on_one      0
shot_aerial_won      0
dtype: int64

In [35]:
# Extracting shot locations

shots_euro["x"] = shots_euro["location"].apply(lambda loc: loc[0])
shots_euro["y"] = shots_euro["location"].apply(lambda loc: loc[1])

shots_euro.drop("location", axis=1, inplace=True)
display(shots_euro.head())
shots_euro.shape


Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_outcome_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y
72,0.030169,87.0,93.0,96.0,38.0,0,0,0,0,91.9,39.0
400,0.021315,87.0,91.0,96.0,38.0,0,1,0,0,96.5,29.7
748,0.047888,62.0,93.0,98.0,40.0,0,0,0,0,97.8,25.9
1016,0.100356,87.0,93.0,100.0,37.0,0,0,0,0,112.0,40.7
1449,0.008758,87.0,93.0,96.0,38.0,0,0,0,0,99.7,58.6


(1251, 11)

In [36]:
# Replacing shou_outcome with is_goal feature

shots_euro["is_goal"] = (shots_euro["shot_outcome_id"] == 97).astype(int)
shots_euro["is_goal"].value_counts()


is_goal
0    1120
1     131
Name: count, dtype: int64

In [37]:
# Drop the outcome_id now that we have the label
shots_euro.drop("shot_outcome_id", axis=1, inplace=True)
display(shots_euro.head())
shots_euro.shape


Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y,is_goal
72,0.030169,87.0,93.0,38.0,0,0,0,0,91.9,39.0,0
400,0.021315,87.0,91.0,38.0,0,1,0,0,96.5,29.7,0
748,0.047888,62.0,93.0,40.0,0,0,0,0,97.8,25.9,0
1016,0.100356,87.0,93.0,37.0,0,0,0,0,112.0,40.7,0
1449,0.008758,87.0,93.0,38.0,0,0,0,0,99.7,58.6,0


(1251, 11)

In [38]:
# Adding geometry features

# Distance to goal centre

shots_euro["distance"] = np.sqrt((goal_x - shots_euro["x"])**2 + (goal_y - shots_euro["y"])**2)

shots_euro["angle"] = shots_euro.apply(calc_angle,axis=1)

In [39]:
shots_euro.head()

Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y,is_goal,distance,angle
72,0.030169,87.0,93.0,38.0,0,0,0,0,91.9,39.0,0,28.117788,0.2265
400,0.021315,87.0,91.0,38.0,0,1,0,0,96.5,29.7,0,25.658137,0.228027
748,0.047888,62.0,93.0,40.0,0,0,0,0,97.8,25.9,0,26.29924,0.205564
1016,0.100356,87.0,93.0,37.0,0,0,0,0,112.0,40.7,0,8.030567,0.756483
1449,0.008758,87.0,93.0,38.0,0,0,0,0,99.7,58.6,0,27.532708,0.172017


In [40]:
display(shots_featured.shape)
display(shots.shape)
shots_euro.shape

(1667, 13)

(1667, 12)

(1251, 13)

In [41]:
display(shots_featured.columns)
display(shots.columns)
display(shots_euro.columns)

Index(['shot_statsbomb_xg', 'shot_type_id', 'shot_technique_id',
       'shot_body_part_id', 'shot_open_goal', 'shot_first_time',
       'shot_one_on_one', 'shot_aerial_won', 'x', 'y', 'is_goal', 'distance',
       'angle'],
      dtype='object')

Index(['shot_statsbomb_xg', 'shot_type_id', 'shot_technique_id',
       'shot_body_part_id', 'shot_open_goal', 'shot_first_time',
       'shot_one_on_one', 'shot_aerial_won', 'x', 'y', 'is_goal', 'distance'],
      dtype='object')

Index(['shot_statsbomb_xg', 'shot_type_id', 'shot_technique_id',
       'shot_body_part_id', 'shot_open_goal', 'shot_first_time',
       'shot_one_on_one', 'shot_aerial_won', 'x', 'y', 'is_goal', 'distance',
       'angle'],
      dtype='object')

In [42]:
# checking for duplicates before saving file

shots_euro.duplicated().sum()

6

In [43]:
shots_euro.duplicated

<bound method DataFrame.duplicated of         shot_statsbomb_xg  shot_type_id  shot_technique_id  shot_body_part_id  \
72               0.030169          87.0               93.0               38.0   
400              0.021315          87.0               91.0               38.0   
748              0.047888          62.0               93.0               40.0   
1016             0.100356          87.0               93.0               37.0   
1449             0.008758          87.0               93.0               38.0   
...                   ...           ...                ...                ...   
192306           0.030229          87.0               93.0               38.0   
192352           0.043607          87.0               93.0               40.0   
192386           0.025420          87.0               91.0               40.0   
192531           0.058556          87.0               91.0               40.0   
192561           0.127872          87.0               93.0             

These are not duplicates but penalty shots taken in games. Penatly shootouts are already removed

# Save Euro 2020 Data to csv file


In [44]:
# save data frame to csv file

output_path = Path("shots_featured_eu2020.csv")
shots_euro.to_csv(output_path, index=False)
print(f"Saved Euro preprocessed shots to {output_path.resolve()}")


Saved Euro preprocessed shots to C:\Users\traik\Desktop\ML_Bootcamp\Final_Project\Final_Project_Ironhack\shots_featured_eu2020.csv


# WOrld Cup 2022 Data Set

In [45]:
# EURO 2020
matches_file_WC_22 = BASE / "matches" / "43" / "106.json"
with open(matches_file_WC_22, "r", encoding="utf-8") as f:
    matches_WC_22 = json.load(f)

match_ids_WC_22 = [m["match_id"] for m in matches_WC_22]

# Load events
events_folder = BASE / "events"
all_events_WC_22 = []

for mid in match_ids_WC_22:
    fp = events_folder / f"{mid}.json"
    with open(fp, "r", encoding="utf-8") as f:
        all_events_WC_22.extend(json.load(f))

df_WC_22 = pd.json_normalize(all_events_WC_22, sep="_")
df_WC_22.head()

Unnamed: 0,id,index,period,timestamp,minute,second,possession,duration,type_id,type_name,possession_team_id,possession_team_name,play_pattern_id,play_pattern_name,team_id,team_name,tactics_formation,tactics_lineup,related_events,location,player_id,player_name,position_id,position_name,pass_recipient_id,pass_recipient_name,pass_length,pass_angle,pass_height_id,pass_height_name,pass_end_location,pass_body_part_id,pass_body_part_name,pass_type_id,pass_type_name,carry_end_location,pass_cross,pass_outcome_id,pass_outcome_name,ball_receipt_outcome_id,ball_receipt_outcome_name,under_pressure,clearance_right_foot,clearance_body_part_id,clearance_body_part_name,shot_statsbomb_xg,shot_end_location,shot_technique_id,shot_technique_name,shot_body_part_id,shot_body_part_name,shot_type_id,shot_type_name,shot_outcome_id,shot_outcome_name,shot_first_time,shot_freeze_frame,goalkeeper_end_location,goalkeeper_position_id,goalkeeper_position_name,goalkeeper_type_id,goalkeeper_type_name,pass_assisted_shot_id,pass_shot_assist,shot_key_pass_id,goalkeeper_technique_id,goalkeeper_technique_name,goalkeeper_body_part_id,goalkeeper_body_part_name,goalkeeper_outcome_id,goalkeeper_outcome_name,off_camera,pass_deflected,counterpress,duel_type_id,duel_type_name,pass_aerial_won,interception_outcome_id,interception_outcome_name,clearance_left_foot,pass_switch,clearance_aerial_won,clearance_head,out,pass_outswinging,pass_technique_id,pass_technique_name,foul_won_defensive,duel_outcome_id,duel_outcome_name,dribble_outcome_id,dribble_outcome_name,shot_one_on_one,pass_cut_back,block_offensive,foul_committed_card_id,foul_committed_card_name,pass_goal_assist,shot_deflected,block_deflection,pass_through_ball,foul_committed_advantage,foul_won_advantage,pass_miscommunication,ball_recovery_recovery_failure,dribble_nutmeg,shot_open_goal,substitution_outcome_id,substitution_outcome_name,substitution_replacement_id,substitution_replacement_name,foul_committed_type_id,foul_committed_type_name,injury_stoppage_in_chain,bad_behaviour_card_id,bad_behaviour_card_name,shot_aerial_won,pass_no_touch,miscontrol_aerial_won,dribble_overrun,foul_committed_offensive,50_50_outcome_id,50_50_outcome_name,pass_straight,pass_inswinging,ball_recovery_offensive,clearance_other,foul_committed_penalty,foul_won_penalty,dribble_no_touch,shot_follows_dribble,goalkeeper_punched_out,block_save_block,shot_saved_to_post,goalkeeper_shot_saved_to_post,half_start_late_video_start,shot_saved_off_target,goalkeeper_shot_saved_off_target,goalkeeper_success_in_play,shot_redirect,goalkeeper_lost_in_play
0,093f898d-33b1-4425-b591-37dd9c9bf70b,1,1,00:00:00.000,0,0,1,0.0,35,Starting XI,786,Serbia,1,Regular Play,786,Serbia,3412.0,"[{'player': {'id': 20600, 'name': 'Vanja Milin...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,79a9efbb-b5e5-45b0-983d-8729bcc4a0e1,2,1,00:00:00.000,0,0,1,0.0,35,Starting XI,786,Serbia,1,Regular Play,773,Switzerland,4231.0,"[{'player': {'id': 17974, 'name': 'Gregor Kobe...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,c4cdfc71-b9f4-4751-a714-e816c1419457,3,1,00:00:00.000,0,0,1,0.0,18,Half Start,786,Serbia,1,Regular Play,773,Switzerland,,,[51f295b8-da96-428e-ab47-90f380f8cf53],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,51f295b8-da96-428e-ab47-90f380f8cf53,4,1,00:00:00.000,0,0,1,0.0,18,Half Start,786,Serbia,1,Regular Play,786,Serbia,,,[c4cdfc71-b9f4-4751-a714-e816c1419457],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,4acb4fd2-f46a-4d73-993c-e06597873924,5,1,00:00:00.521,0,0,2,1.220932,30,Pass,773,Switzerland,9,From Kick Off,773,Switzerland,,,[b0d0583a-d544-4ab1-9043-6b42a57c27a8],"[61.0, 40.1]",5545.0,Breel-Donald Embolo,23.0,Center Forward,6983.0,Remo Freuler,18.258423,2.943123,1.0,Ground Pass,"[43.1, 43.7]",40.0,Right Foot,65.0,Kick Off,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [46]:
df_WC_22.shape

(234652, 141)

In [47]:
df_shots_WC_22 = df_WC_22[df_WC_22["type_name"] == "Shot"].copy()
df_shots_WC_22.shape

(1494, 141)

In [48]:
# Remove penalty shootout shots
shots_WC_22 = df_shots_WC_22[df_shots_WC_22["period"] != 5]
print(shots_WC_22.shape)

# Keep selected cols as before
shots_WC_22 = shots_WC_22[cols].copy()
display(shots_WC_22.head())
print(shots_WC_22.shape)
display(shots_WC_22.isna().sum())

# Drop counterpress
shots_WC_22.drop(["counterpress"], axis=1, inplace=True)
print(shots_WC_22.isna().sum())

# Drop shot_end_location
shots_WC_22.drop(["shot_end_location"], axis=1, inplace=True)

# Binary columns — fill NaN with 0 and convert to int
for col in binary_cols:
    shots_WC_22[col] = shots_WC_22[col].fillna(0).astype(int)

display(shots_WC_22[binary_cols].head())
print(shots_WC_22.isna().sum())

# Extract x, y from shot location
shots_WC_22["x"] = shots_WC_22["location"].apply(lambda loc: loc[0])
shots_WC_22["y"] = shots_WC_22["location"].apply(lambda loc: loc[1])

shots_WC_22.drop("location", axis=1, inplace=True)
display(shots_WC_22.head())
print(shots_WC_22.shape)

# Create is_goal label
shots_WC_22["is_goal"] = (shots_WC_22["shot_outcome_id"] == 97).astype(int)
print(shots_WC_22["is_goal"].value_counts())

# Drop the original outcome ID
shots_WC_22.drop("shot_outcome_id", axis=1, inplace=True)
display(shots_WC_22.head())
print(shots_WC_22.shape)

# Add geometry features — using YOUR same functions & goal coordinates
shots_WC_22["distance"] = np.sqrt((goal_x - shots_WC_22["x"])**2 + (goal_y - shots_WC_22["y"])**2)

shots_WC_22["angle"] = shots_WC_22.apply(calc_angle, axis=1)

# Final check
display(shots_WC_22.head())
print(shots_WC_22.shape)


(1453, 141)


Unnamed: 0,location,counterpress,shot_statsbomb_xg,shot_end_location,shot_type_id,shot_technique_id,shot_outcome_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won
19,"[96.0, 38.8]",,0.036566,"[108.2, 38.5]",87.0,91.0,96.0,38.0,,True,,
24,"[113.1, 40.7]",,0.353289,"[114.8, 40.6, 1.6]",87.0,91.0,100.0,38.0,,True,,
27,"[103.8, 41.9]",,0.069527,"[115.5, 39.1, 1.0]",87.0,91.0,100.0,40.0,,True,,
195,"[112.2, 36.8]",,0.081609,"[120.0, 35.3, 3.5]",87.0,93.0,98.0,37.0,,,,
355,"[97.8, 51.5]",,0.030002,"[120.0, 36.1, 0.6]",87.0,93.0,99.0,38.0,,,,


(1453, 12)


location                0
counterpress         1453
shot_statsbomb_xg       0
shot_end_location       0
shot_type_id            0
shot_technique_id       0
shot_outcome_id         0
shot_body_part_id       0
shot_open_goal       1440
shot_first_time       985
shot_one_on_one      1373
shot_aerial_won      1324
dtype: int64

location                0
shot_statsbomb_xg       0
shot_end_location       0
shot_type_id            0
shot_technique_id       0
shot_outcome_id         0
shot_body_part_id       0
shot_open_goal       1440
shot_first_time       985
shot_one_on_one      1373
shot_aerial_won      1324
dtype: int64


Unnamed: 0,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won
19,0,1,0,0
24,0,1,0,0
27,0,1,0,0
195,0,0,0,0
355,0,0,0,0


location             0
shot_statsbomb_xg    0
shot_type_id         0
shot_technique_id    0
shot_outcome_id      0
shot_body_part_id    0
shot_open_goal       0
shot_first_time      0
shot_one_on_one      0
shot_aerial_won      0
dtype: int64


Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_outcome_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y
19,0.036566,87.0,91.0,96.0,38.0,0,1,0,0,96.0,38.8
24,0.353289,87.0,91.0,100.0,38.0,0,1,0,0,113.1,40.7
27,0.069527,87.0,91.0,100.0,40.0,0,1,0,0,103.8,41.9
195,0.081609,87.0,93.0,98.0,37.0,0,0,0,0,112.2,36.8
355,0.030002,87.0,93.0,99.0,38.0,0,0,0,0,97.8,51.5


(1453, 11)
is_goal
0    1284
1     169
Name: count, dtype: int64


Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y,is_goal
19,0.036566,87.0,91.0,38.0,0,1,0,0,96.0,38.8,0
24,0.353289,87.0,91.0,38.0,0,1,0,0,113.1,40.7,0
27,0.069527,87.0,91.0,40.0,0,1,0,0,103.8,41.9,0
195,0.081609,87.0,93.0,37.0,0,0,0,0,112.2,36.8,0
355,0.030002,87.0,93.0,38.0,0,0,0,0,97.8,51.5,0


(1453, 11)


Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y,is_goal,distance,angle
19,0.036566,87.0,91.0,38.0,0,1,0,0,96.0,38.8,0,24.029981,0.264461
24,0.353289,87.0,91.0,38.0,0,1,0,0,113.1,40.7,0,6.935416,0.862058
27,0.069527,87.0,91.0,40.0,0,1,0,0,103.8,41.9,0,16.311039,0.385068
195,0.081609,87.0,93.0,37.0,0,0,0,0,112.2,36.8,0,8.430896,0.687124
355,0.030002,87.0,93.0,38.0,0,0,0,0,97.8,51.5,0,25.0018,0.227095


(1453, 13)


In [49]:
shots_WC_22.is_goal.value_counts()

is_goal
0    1284
1     169
Name: count, dtype: int64

In [50]:
# remove shots with type_id 61 (Corners) as they only appear in the World cup 2022 set.
# they only appear twice so they are very rare and we can confidently take them out. 


shots_WC_22 = shots_WC_22[shots_WC_22["shot_type_id"] != 61.0]


In [51]:
# save data frame to csv file

output_path = Path("shots_WC_22.csv")
shots_WC_22.to_csv(output_path, index=False)
print(f"Saved WC 2022 preprocessed shots to {output_path.resolve()}")

Saved WC 2022 preprocessed shots to C:\Users\traik\Desktop\ML_Bootcamp\Final_Project\Final_Project_Ironhack\shots_WC_22.csv


# One-Hot Encoding

In [52]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline


In [53]:
categorical_cols = [
    "shot_type_id",
    "shot_technique_id",
    "shot_body_part_id",
]


In [54]:
shots_featured_enc = shots_featured.copy()
shots_euro_enc = shots_euro.copy()
shots_WC_22_enc = shots_WC_22.copy()

In [55]:
print("WC18:")
print(shots_featured_enc["shot_type_id"].value_counts())
print()

print("EURO:")
print(shots_euro_enc["shot_type_id"].value_counts())
print()

print("WC22:")
print(shots_WC_22_enc["shot_type_id"].value_counts())


WC18:
shot_type_id
87.0    1556
62.0      82
88.0      29
Name: count, dtype: int64

EURO:
shot_type_id
87.0    1193
62.0      41
88.0      17
Name: count, dtype: int64

WC22:
shot_type_id
87.0    1382
62.0      46
88.0      23
Name: count, dtype: int64


In [56]:
shots_featured_enc = pd.get_dummies(
    shots_featured_enc,
    columns=categorical_cols,
    drop_first=True
)


In [57]:
shots_featured_enc.shape

(1667, 21)

In [58]:
shots_featured_enc.head()

Unnamed: 0,shot_statsbomb_xg,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y,is_goal,distance,angle,shot_type_id_87.0,shot_type_id_88.0,shot_technique_id_90.0,shot_technique_id_91.0,shot_technique_id_92.0,shot_technique_id_93.0,shot_technique_id_94.0,shot_technique_id_95.0,shot_body_part_id_38.0,shot_body_part_id_40.0,shot_body_part_id_70.0
239,0.009816,0,0,0,0,115.0,18.0,0,22.561028,0.064071,False,False,False,False,False,True,False,False,False,True,False
270,0.038204,0,0,0,0,112.0,54.0,0,16.124515,0.202196,True,False,False,False,False,True,False,False,False,True,False
439,0.045128,0,0,0,0,98.0,37.0,0,22.203603,0.283785,True,False,False,False,False,True,False,False,False,True,False
548,0.625074,1,0,0,0,119.0,36.0,0,4.123106,0.758049,True,False,False,False,False,True,False,False,False,False,False
815,0.02176,0,0,0,0,97.0,56.0,0,28.017851,0.187756,True,False,False,False,False,True,False,False,False,True,False


In [59]:
shots_euro_enc = pd.get_dummies(
    shots_euro_enc,
    columns=categorical_cols,
    drop_first=True
)

shots_WC_22_enc = pd.get_dummies(
    shots_WC_22_enc,
    columns=categorical_cols,
    drop_first=True
)


In [60]:
display(shots_featured_enc.shape,shots_euro_enc.shape,shots_WC_22_enc.shape)

(1667, 21)

(1251, 21)

(1451, 21)

In [61]:
shots_featured_enc.columns

Index(['shot_statsbomb_xg', 'shot_open_goal', 'shot_first_time',
       'shot_one_on_one', 'shot_aerial_won', 'x', 'y', 'is_goal', 'distance',
       'angle', 'shot_type_id_87.0', 'shot_type_id_88.0',
       'shot_technique_id_90.0', 'shot_technique_id_91.0',
       'shot_technique_id_92.0', 'shot_technique_id_93.0',
       'shot_technique_id_94.0', 'shot_technique_id_95.0',
       'shot_body_part_id_38.0', 'shot_body_part_id_40.0',
       'shot_body_part_id_70.0'],
      dtype='object')

In [62]:
shots_WC_22_enc.columns

Index(['shot_statsbomb_xg', 'shot_open_goal', 'shot_first_time',
       'shot_one_on_one', 'shot_aerial_won', 'x', 'y', 'is_goal', 'distance',
       'angle', 'shot_type_id_87.0', 'shot_type_id_88.0',
       'shot_technique_id_90.0', 'shot_technique_id_91.0',
       'shot_technique_id_92.0', 'shot_technique_id_93.0',
       'shot_technique_id_94.0', 'shot_technique_id_95.0',
       'shot_body_part_id_38.0', 'shot_body_part_id_40.0',
       'shot_body_part_id_70.0'],
      dtype='object')

In [63]:
shots_euro_enc.columns

Index(['shot_statsbomb_xg', 'shot_open_goal', 'shot_first_time',
       'shot_one_on_one', 'shot_aerial_won', 'x', 'y', 'is_goal', 'distance',
       'angle', 'shot_type_id_87.0', 'shot_type_id_88.0',
       'shot_technique_id_90.0', 'shot_technique_id_91.0',
       'shot_technique_id_92.0', 'shot_technique_id_93.0',
       'shot_technique_id_94.0', 'shot_technique_id_95.0',
       'shot_body_part_id_38.0', 'shot_body_part_id_40.0',
       'shot_body_part_id_70.0'],
      dtype='object')

In [78]:
# saving our data frames to csv files and exporting them
# dictionary of names → dataframes
dfs_to_save = {
    "shots_featured_enc.csv": shots_featured_enc,
    "shots_euro_enc.csv": shots_euro_enc,
    "shots_WC_22_enc.csv": shots_WC_22_enc,
}

for filename, df in dfs_to_save.items():
    output_path = Path(filename)
    df.to_csv(output_path, index=False)
    print(f"Saved {filename} to {output_path.resolve()}")


Saved shots_featured_enc.csv to C:\Users\traik\Desktop\ML_Bootcamp\Final_Project\Final_Project_Ironhack\shots_featured_enc.csv
Saved shots_euro_enc.csv to C:\Users\traik\Desktop\ML_Bootcamp\Final_Project\Final_Project_Ironhack\shots_euro_enc.csv
Saved shots_WC_22_enc.csv to C:\Users\traik\Desktop\ML_Bootcamp\Final_Project\Final_Project_Ironhack\shots_WC_22_enc.csv


# Adding Freeze Frame feature to try and improve model performances

In [64]:
# Filter only shots
df_shots_wc18 = df_wc[df_wc["type_name"] == "Shot"].copy()

print("Total WC18 shot events:", df_shots_wc18.shape)


Total WC18 shot events: (1706, 121)


In [65]:
cols_wc18 = cols + ["shot_freeze_frame"]

shots_wc18 = df_shots_wc18[cols_wc18].copy()
display(shots_wc18.head())


Unnamed: 0,location,counterpress,shot_statsbomb_xg,shot_end_location,shot_type_id,shot_technique_id,shot_outcome_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,shot_freeze_frame
239,"[115.0, 18.0]",,0.009816,"[120.0, 42.2, 2.0]",62.0,93.0,100.0,40.0,,,,,"[{'location': [116.0, 41.0], 'player': {'id': ..."
270,"[112.0, 54.0]",,0.038204,"[113.0, 53.0]",87.0,93.0,96.0,40.0,,,,,"[{'location': [102.0, 40.0], 'player': {'id': ..."
439,"[98.0, 37.0]",,0.045128,"[105.0, 37.0]",87.0,93.0,96.0,40.0,,,,,"[{'location': [96.0, 25.0], 'player': {'id': 1..."
548,"[119.0, 36.0]",,0.625074,"[120.0, 40.5, 3.1]",87.0,93.0,98.0,37.0,True,,,,"[{'location': [115.0, 45.0], 'player': {'id': ..."
815,"[97.0, 56.0]",,0.02176,"[100.0, 54.0]",87.0,93.0,96.0,40.0,,,,,"[{'location': [100.0, 48.0], 'player': {'id': ..."


In [66]:
shots_wc18["shot_freeze_frame"].head()

239    [{'location': [116.0, 41.0], 'player': {'id': ...
270    [{'location': [102.0, 40.0], 'player': {'id': ...
439    [{'location': [96.0, 25.0], 'player': {'id': 1...
548    [{'location': [115.0, 45.0], 'player': {'id': ...
815    [{'location': [100.0, 48.0], 'player': {'id': ...
Name: shot_freeze_frame, dtype: object

In [67]:
# Remove penalty shootout shots
shots_wc18 = shots_wc18[df_shots_wc18["period"] != 5]

# Drop counterpress
shots_wc18.drop(["counterpress"], axis=1, inplace=True)

# Drop shot_end_location
shots_wc18.drop(["shot_end_location"], axis=1, inplace=True)

# Binary columns
binary_cols = [
    "shot_open_goal",
    "shot_first_time",
    "shot_one_on_one",
    "shot_aerial_won",
]

for col in binary_cols:
    shots_wc18[col] = shots_wc18[col].fillna(0).astype(int)

# Extract X and Y
shots_wc18["x"] = shots_wc18["location"].apply(lambda loc: loc[0])
shots_wc18["y"] = shots_wc18["location"].apply(lambda loc: loc[1])
shots_wc18.drop("location", axis=1, inplace=True)

# Create is_goal
shots_wc18["is_goal"] = (shots_wc18["shot_outcome_id"] == 97).astype(int)
shots_wc18.drop("shot_outcome_id", axis=1, inplace=True)

# Distance + Angle (your existing functions)
shots_wc18["distance"] = np.sqrt((goal_x - shots_wc18["x"])**2 +
                                 (goal_y - shots_wc18["y"])**2)

shots_wc18["angle"] = shots_wc18.apply(calc_angle, axis=1)


In [68]:
shots_wc18.head()

Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,shot_freeze_frame,x,y,is_goal,distance,angle
239,0.009816,62.0,93.0,40.0,0,0,0,0,"[{'location': [116.0, 41.0], 'player': {'id': ...",115.0,18.0,0,22.561028,0.064071
270,0.038204,87.0,93.0,40.0,0,0,0,0,"[{'location': [102.0, 40.0], 'player': {'id': ...",112.0,54.0,0,16.124515,0.202196
439,0.045128,87.0,93.0,40.0,0,0,0,0,"[{'location': [96.0, 25.0], 'player': {'id': 1...",98.0,37.0,0,22.203603,0.283785
548,0.625074,87.0,93.0,37.0,1,0,0,0,"[{'location': [115.0, 45.0], 'player': {'id': ...",119.0,36.0,0,4.123106,0.758049
815,0.02176,87.0,93.0,40.0,0,0,0,0,"[{'location': [100.0, 48.0], 'player': {'id': ...",97.0,56.0,0,28.017851,0.187756


In [69]:
import pprint
pprint.pprint(shots_wc18.iloc[10]["shot_freeze_frame"])


[{'location': [83.0, 63.0],
  'player': {'id': 10955, 'name': 'Harry Kane'},
  'position': {'id': 24, 'name': 'Left Center Forward'},
  'teammate': False},
 {'location': [118.0, 40.0],
  'player': {'id': 3468, 'name': 'Jordan Pickford'},
  'position': {'id': 1, 'name': 'Goalkeeper'},
  'teammate': False},
 {'location': [108.0, 43.0],
  'player': {'id': 3244, 'name': 'John Stones'},
  'position': {'id': 4, 'name': 'Center Back'},
  'teammate': False},
 {'location': [104.0, 29.0],
  'player': {'id': 5691, 'name': 'Johan Andrés Mojica Palacio'},
  'position': {'id': 6, 'name': 'Left Back'},
  'teammate': True},
 {'location': [106.0, 34.0],
  'player': {'id': 3308, 'name': 'Kieran Trippier'},
  'position': {'id': 12, 'name': 'Right Midfield'},
  'teammate': False},
 {'location': [107.0, 37.0],
  'player': {'id': 3205, 'name': 'Kyle Walker'},
  'position': {'id': 3, 'name': 'Right Center Back'},
  'teammate': False},
 {'location': [105.0, 53.0],
  'player': {'id': 3336, 'name': 'Harry Magui

In [70]:
shots_wc18.head()


Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,shot_freeze_frame,x,y,is_goal,distance,angle
239,0.009816,62.0,93.0,40.0,0,0,0,0,"[{'location': [116.0, 41.0], 'player': {'id': ...",115.0,18.0,0,22.561028,0.064071
270,0.038204,87.0,93.0,40.0,0,0,0,0,"[{'location': [102.0, 40.0], 'player': {'id': ...",112.0,54.0,0,16.124515,0.202196
439,0.045128,87.0,93.0,40.0,0,0,0,0,"[{'location': [96.0, 25.0], 'player': {'id': 1...",98.0,37.0,0,22.203603,0.283785
548,0.625074,87.0,93.0,37.0,1,0,0,0,"[{'location': [115.0, 45.0], 'player': {'id': ...",119.0,36.0,0,4.123106,0.758049
815,0.02176,87.0,93.0,40.0,0,0,0,0,"[{'location': [100.0, 48.0], 'player': {'id': ...",97.0,56.0,0,28.017851,0.187756


In [71]:
def count_defenders_between(goal_x, shooter_x, freeze):
    if freeze is None:
        return 0
    return sum(
        (not ff["teammate"]) and (ff["location"][0] > shooter_x)
        for ff in freeze
    )


In [72]:
def min_defender_distance(shooter_x, shooter_y, freeze):
    dists = []
    for ff in freeze:
        if ff["teammate"]:
            continue
        dx, dy = ff["location"]
        d = np.sqrt((dx - shooter_x)**2 + (dy - shooter_y)**2)
        dists.append(d)
    return min(dists) if len(dists) else 20.0


In [73]:
def defenders_in_cone(row):
    freeze = row["shot_freeze_frame"]
    if freeze is None:
        return 0
    
    shooter_x, shooter_y = row["x"], row["y"]
    gx, gy = goal_x, goal_y
    
    # shooter -> goal vector
    vec_goal = np.array([gx - shooter_x, gy - shooter_y])
    
    count = 0
    for ff in freeze:
        if ff["teammate"]:
            continue
        
        dx, dy = ff["location"]
        vec_def = np.array([dx - shooter_x, dy - shooter_y])
        
        # cosine similarity
        cosang = np.dot(vec_goal, vec_def) / (
            np.linalg.norm(vec_goal) * np.linalg.norm(vec_def)
        )
        
        # angle threshold (cos(20°)=0.94)
        if cosang > 0.94:
            count += 1
            
    return count


In [74]:
nan_rows = shots_wc18[shots_wc18["shot_freeze_frame"].isna()]
nan_rows.head(20)


Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,shot_freeze_frame,x,y,is_goal,distance,angle
1972,0.7835,88.0,93.0,40.0,0,0,0,0,,108.0,40.0,1,12.0,0.521205
13367,0.7835,88.0,93.0,40.0,0,0,0,0,,108.0,40.0,0,12.0,0.521205
14420,0.7835,88.0,93.0,40.0,0,0,0,0,,109.0,41.0,1,11.045361,0.56213
22589,0.7835,88.0,93.0,40.0,0,0,0,0,,109.0,41.0,1,11.045361,0.56213
23240,0.7835,88.0,93.0,40.0,0,0,0,0,,109.0,41.0,1,11.045361,0.56213
37294,0.7835,88.0,93.0,38.0,0,0,0,0,,109.0,41.0,1,11.045361,0.56213
60457,0.7835,88.0,93.0,40.0,0,0,0,0,,109.0,41.0,1,11.045361,0.56213
61587,0.7835,88.0,93.0,40.0,0,0,0,0,,109.0,41.0,1,11.045361,0.56213
74316,0.7835,88.0,93.0,40.0,0,0,0,0,,108.0,40.0,1,12.0,0.521205
77105,0.7835,88.0,93.0,38.0,0,0,0,0,,108.0,40.0,1,12.0,0.521205


In [75]:
shots_wc18["shot_freeze_frame"] = shots_wc18["shot_freeze_frame"].apply(
    lambda x: x if isinstance(x, list) else []
)


In [76]:
shots_wc18["defenders_between"] = shots_wc18.apply(
    lambda row: count_defenders_between(goal_x, row["x"], row["shot_freeze_frame"]),
    axis=1
)

shots_wc18["min_defender_dist"] = shots_wc18.apply(
    lambda row: min_defender_distance(row["x"], row["y"], row["shot_freeze_frame"]),
    axis=1
)

shots_wc18["defenders_in_cone"] = shots_wc18.apply(
    defenders_in_cone,
    axis=1
)

display(shots_wc18.head())


  cosang = np.dot(vec_goal, vec_def) / (


Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,shot_freeze_frame,x,y,is_goal,distance,angle,defenders_between,min_defender_dist,defenders_in_cone
239,0.009816,62.0,93.0,40.0,0,0,0,0,"[{'location': [116.0, 41.0], 'player': {'id': ...",115.0,18.0,0,22.561028,0.064071,6,10.049876,8
270,0.038204,87.0,93.0,40.0,0,0,0,0,"[{'location': [102.0, 40.0], 'player': {'id': ...",112.0,54.0,0,16.124515,0.202196,2,1.0,1
439,0.045128,87.0,93.0,40.0,0,0,0,0,"[{'location': [96.0, 25.0], 'player': {'id': 1...",98.0,37.0,0,22.203603,0.283785,5,1.414214,2
548,0.625074,87.0,93.0,37.0,1,0,0,0,"[{'location': [115.0, 45.0], 'player': {'id': ...",119.0,36.0,0,4.123106,0.758049,2,2.828427,2
815,0.02176,87.0,93.0,40.0,0,0,0,0,"[{'location': [100.0, 48.0], 'player': {'id': ...",97.0,56.0,0,28.017851,0.187756,6,2.0,2


In [None]:
shots_wc18["shot_freeze_frame"].apply(lambda x: type(x)).value_counts()



shot_freeze_frame
<class 'list'>    1667
Name: count, dtype: int64

In [None]:
def process_ff_dataset(df_shots, cols, binary_cols):
    

    # Only shots
    df = df_shots[df_shots["type_name"] == "Shot"].copy()

    # Filter penalty shootout (period 5)
    df = df[df["period"] != 5]

    # Keep base cols + freeze-frame
    df = df[cols + ["shot_freeze_frame"]].copy()

    # Drop unused
    df.drop(["counterpress", "shot_end_location"], axis=1, inplace=True)

    # Fix binary columns
    for col in binary_cols:
        df[col] = df[col].fillna(0).astype(int)

    # Extract x,y
    df["x"] = df["location"].apply(lambda loc: loc[0])
    df["y"] = df["location"].apply(lambda loc: loc[1])
    df.drop("location", axis=1, inplace=True)

    # is_goal
    df["is_goal"] = (df["shot_outcome_id"] == 97).astype(int)
    df.drop("shot_outcome_id", axis=1, inplace=True)

    # Geometry
    df["distance"] = np.sqrt((goal_x - df["x"])**2 + (goal_y - df["y"])**2)
    df["angle"] = df.apply(calc_angle, axis=1)

    # Freeze-frame -> ensure list
    df["shot_freeze_frame"] = df["shot_freeze_frame"].apply(
        lambda x: x if isinstance(x, list) else []
    )

    # FF features
    df["defenders_between"] = df.apply(
        lambda row: count_defenders_between(goal_x, row["x"], row["shot_freeze_frame"]),
        axis=1
    )

    df["min_defender_dist"] = df.apply(
        lambda row: min_defender_distance(row["x"], row["y"], row["shot_freeze_frame"]),
        axis=1
    )

    df["defenders_in_cone"] = df.apply(
        defenders_in_cone,
        axis=1
    )

    return df


In [81]:
shots_wc18_ff  = process_ff_dataset(df_wc,    cols, binary_cols)
shots_euro_ff  = process_ff_dataset(df_euro,  cols, binary_cols)
shots_wc22_ff  = process_ff_dataset(df_WC_22, cols, binary_cols)


  cosang = np.dot(vec_goal, vec_def) / (


In [82]:
categorical_cols = ["shot_type_id", "shot_technique_id", "shot_body_part_id"]

ff_datasets = {
    "WC18": shots_wc18_ff,
    "EURO": shots_euro_ff,
    "WC22": shots_wc22_ff,
}

def encode_and_align(dfs, categorical_cols):
    encoded = {}
    all_cols = set()

    # 1. Encode each dataset
    for name, df in dfs.items():
        df_enc = pd.get_dummies(df, columns=categorical_cols, drop_first=False)
        encoded[name] = df_enc
        all_cols.update(df_enc.columns)

    # 2. Align all datasets to the same column set
    all_cols = sorted(list(all_cols))
    aligned = {}
    for name, df in encoded.items():
        missing = set(all_cols) - set(df.columns)
        for col in missing:
            df[col] = 0
        aligned[name] = df[all_cols]

    return aligned

encoded_ff = encode_and_align(ff_datasets, categorical_cols)

shots_wc18_ff_enc  = encoded_ff["WC18"]
shots_euro_ff_enc  = encoded_ff["EURO"]
shots_wc22_ff_enc  = encoded_ff["WC22"]


In [84]:
display(shots_wc18_ff_enc.shape,shots_euro_ff_enc.shape,shots_wc22_ff_enc.shape)

(1667, 29)

(1251, 29)

(1453, 29)

In [85]:
display(shots_wc18_ff_enc.head(),shots_euro_ff_enc.head(),shots_wc22_ff_enc.head())

Unnamed: 0,angle,defenders_between,defenders_in_cone,distance,is_goal,min_defender_dist,shot_aerial_won,shot_body_part_id_37.0,shot_body_part_id_38.0,shot_body_part_id_40.0,shot_body_part_id_70.0,shot_first_time,shot_freeze_frame,shot_one_on_one,shot_open_goal,shot_statsbomb_xg,shot_technique_id_89.0,shot_technique_id_90.0,shot_technique_id_91.0,shot_technique_id_92.0,shot_technique_id_93.0,shot_technique_id_94.0,shot_technique_id_95.0,shot_type_id_61.0,shot_type_id_62.0,shot_type_id_87.0,shot_type_id_88.0,x,y
239,0.064071,6,8,22.561028,0,10.049876,0,False,False,True,False,0,"[{'location': [116.0, 41.0], 'player': {'id': ...",0,0,0.009816,False,False,False,False,True,False,False,0,True,False,False,115.0,18.0
270,0.202196,2,1,16.124515,0,1.0,0,False,False,True,False,0,"[{'location': [102.0, 40.0], 'player': {'id': ...",0,0,0.038204,False,False,False,False,True,False,False,0,False,True,False,112.0,54.0
439,0.283785,5,2,22.203603,0,1.414214,0,False,False,True,False,0,"[{'location': [96.0, 25.0], 'player': {'id': 1...",0,0,0.045128,False,False,False,False,True,False,False,0,False,True,False,98.0,37.0
548,0.758049,2,2,4.123106,0,2.828427,0,True,False,False,False,0,"[{'location': [115.0, 45.0], 'player': {'id': ...",0,1,0.625074,False,False,False,False,True,False,False,0,False,True,False,119.0,36.0
815,0.187756,6,2,28.017851,0,2.0,0,False,False,True,False,0,"[{'location': [100.0, 48.0], 'player': {'id': ...",0,0,0.02176,False,False,False,False,True,False,False,0,False,True,False,97.0,56.0


Unnamed: 0,angle,defenders_between,defenders_in_cone,distance,is_goal,min_defender_dist,shot_aerial_won,shot_body_part_id_37.0,shot_body_part_id_38.0,shot_body_part_id_40.0,shot_body_part_id_70.0,shot_first_time,shot_freeze_frame,shot_one_on_one,shot_open_goal,shot_statsbomb_xg,shot_technique_id_89.0,shot_technique_id_90.0,shot_technique_id_91.0,shot_technique_id_92.0,shot_technique_id_93.0,shot_technique_id_94.0,shot_technique_id_95.0,shot_type_id_61.0,shot_type_id_62.0,shot_type_id_87.0,shot_type_id_88.0,x,y
72,0.2265,6,2,28.117788,0,4.178516,0,False,True,False,False,0,"[{'location': [100.3, 40.3], 'player': {'id': ...",0,0,0.030169,False,False,False,False,True,False,False,0,False,True,False,91.9,39.0
400,0.228027,11,6,25.658137,0,3.612478,0,False,True,False,False,1,"[{'location': [114.6, 76.2], 'player': {'id': ...",0,0,0.021315,False,False,True,False,False,False,False,0,False,True,False,96.5,29.7
748,0.205564,11,7,26.29924,0,8.207923,0,False,False,True,False,0,"[{'location': [86.2, 26.5], 'player': {'id': 5...",0,0,0.047888,False,False,False,False,True,False,False,0,True,False,False,97.8,25.9
1016,0.756483,6,2,8.030567,0,1.529706,0,True,False,False,False,0,"[{'location': [116.1, 73.8], 'player': {'id': ...",0,0,0.100356,False,False,False,False,True,False,False,0,False,True,False,112.0,40.7
1449,0.172017,6,2,27.532708,0,1.442221,0,False,True,False,False,0,"[{'location': [103.4, 65.1], 'player': {'id': ...",0,0,0.008758,False,False,False,False,True,False,False,0,False,True,False,99.7,58.6


Unnamed: 0,angle,defenders_between,defenders_in_cone,distance,is_goal,min_defender_dist,shot_aerial_won,shot_body_part_id_37.0,shot_body_part_id_38.0,shot_body_part_id_40.0,shot_body_part_id_70.0,shot_first_time,shot_freeze_frame,shot_one_on_one,shot_open_goal,shot_statsbomb_xg,shot_technique_id_89.0,shot_technique_id_90.0,shot_technique_id_91.0,shot_technique_id_92.0,shot_technique_id_93.0,shot_technique_id_94.0,shot_technique_id_95.0,shot_type_id_61.0,shot_type_id_62.0,shot_type_id_87.0,shot_type_id_88.0,x,y
19,0.264461,7,3,24.029981,0,7.200694,0,False,True,False,False,1,"[{'location': [86.2, 51.6], 'player': {'id': 3...",0,0,0.036566,False,False,True,False,False,False,False,False,False,True,False,96.0,38.8
24,0.862058,1,1,6.935416,0,2.0,0,False,True,False,False,1,"[{'location': [104.9, 50.6], 'player': {'id': ...",0,0,0.353289,False,False,True,False,False,False,False,False,False,True,False,113.1,40.7
27,0.385068,7,4,16.311039,0,3.862642,0,False,False,True,False,1,"[{'location': [115.6, 40.2], 'player': {'id': ...",0,0,0.069527,False,False,True,False,False,False,False,False,False,True,False,103.8,41.9
195,0.687124,8,2,8.430896,0,1.0,0,True,False,False,False,0,"[{'location': [113.0, 36.2], 'player': {'id': ...",0,0,0.081609,False,False,False,False,True,False,False,False,False,True,False,112.2,36.8
355,0.227095,8,4,25.0018,0,2.968164,0,False,True,False,False,0,"[{'location': [100.1, 56.9], 'player': {'id': ...",0,0,0.030002,False,False,False,False,True,False,False,False,False,True,False,97.8,51.5


In [86]:
# save to csv files
output_dir = Path("ff_preprocessed_encoded")
output_dir.mkdir(exist_ok=True)

files = {
    "shots_wc18_ff_enc.csv": shots_wc18_ff_enc,
    "shots_euro_ff_enc.csv": shots_euro_ff_enc,
    "shots_wc22_ff_enc.csv": shots_wc22_ff_enc,
}

for fname, df in files.items():
    df.to_csv(output_dir / fname, index=False)
    print("Saved:", fname)


Saved: shots_wc18_ff_enc.csv
Saved: shots_euro_ff_enc.csv
Saved: shots_wc22_ff_enc.csv
