# Preprocessing

Load World Cup 2018 data from StatsBomb JSON, filter to shots, and create the cleaned shot-level dataset with engineered features (x, y, distance, angle, etc.). Finally, save the result to CSV for use in the modeling notebook.

## Imports and configuration

In [108]:
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Never truncate columns when a wide frame is rendered.
pd.options.display.max_columns = None


## Load matches and events for World Cup 2018


In [109]:
# Base path to the StatsBomb open-data "data" folder.
# Set the STATSBOMB_DATA_DIR environment variable to point at your own copy;
# when it is unset (or empty) the original hard-coded location is used, so
# existing setups behave exactly as before.
# NOTE(review): the fallback literal is a raw string that also doubles its
# backslashes; it is kept byte-for-byte because it is the path this notebook
# has been run with.
BASE = Path(
    os.environ.get("STATSBOMB_DATA_DIR")
    or r"C:\\Users\\traik\\Desktop\\Final project data\\open-data-master\\data"
)

# World Cup 2018: competition_id=43, season_id=3
matches_file = BASE / "matches" / "43" / "3.json"

# Fail early with an actionable message instead of a bare open() error.
if not matches_file.is_file():
    raise FileNotFoundError(
        f"StatsBomb matches file not found: {matches_file}. "
        "Set STATSBOMB_DATA_DIR to the open-data 'data' directory."
    )

with open(matches_file, "r", encoding="utf-8") as f:
    matches_wc = json.load(f)

match_ids_wc = [m["match_id"] for m in matches_wc]

events_folder = BASE / "events"
all_events_wc = []

# One events file per match; all events are appended to a single flat list.
for mid in match_ids_wc:
    fp = events_folder / f"{mid}.json"
    with open(fp, "r", encoding="utf-8") as f:
        all_events_wc.extend(json.load(f))

# Flatten nested event dicts into "_"-joined columns (e.g. type_name).
df_wc = pd.json_normalize(all_events_wc, sep="_")
df_wc.head()


Unnamed: 0,id,index,period,timestamp,minute,second,possession,duration,type_id,type_name,possession_team_id,possession_team_name,play_pattern_id,play_pattern_name,team_id,team_name,tactics_formation,tactics_lineup,related_events,location,player_id,player_name,position_id,position_name,pass_recipient_id,pass_recipient_name,pass_length,pass_angle,pass_height_id,pass_height_name,pass_end_location,pass_type_id,pass_type_name,pass_body_part_id,pass_body_part_name,under_pressure,carry_end_location,pass_outcome_id,pass_outcome_name,counterpress,ball_receipt_outcome_id,ball_receipt_outcome_name,50_50_outcome_id,50_50_outcome_name,pass_cross,ball_recovery_recovery_failure,pass_switch,dribble_outcome_id,dribble_outcome_name,duel_type_id,duel_type_name,duel_outcome_id,duel_outcome_name,foul_committed_type_id,foul_committed_type_name,shot_statsbomb_xg,shot_end_location,shot_follows_dribble,shot_type_id,shot_type_name,shot_technique_id,shot_technique_name,shot_outcome_id,shot_outcome_name,shot_body_part_id,shot_body_part_name,shot_freeze_frame,goalkeeper_position_id,goalkeeper_position_name,goalkeeper_type_id,goalkeeper_type_name,goalkeeper_technique_id,goalkeeper_technique_name,goalkeeper_outcome_id,goalkeeper_outcome_name,goalkeeper_body_part_id,goalkeeper_body_part_name,interception_outcome_id,interception_outcome_name,pass_assisted_shot_id,pass_shot_assist,shot_key_pass_id,goalkeeper_end_location,clearance_aerial_won,foul_won_defensive,shot_open_goal,pass_aerial_won,foul_committed_advantage,foul_won_advantage,pass_backheel,dribble_nutmeg,pass_deflected,block_deflection,bad_behaviour_card_id,bad_behaviour_card_name,foul_committed_card_id,foul_committed_card_name,foul_committed_penalty,foul_won_penalty,block_offensive,substitution_outcome_id,substitution_outcome_name,substitution_replacement_id,substitution_replacement_name,shot_aerial_won,shot_first_time,pass_goal_assist,shot_deflected,dribble_overrun,block_save_block,foul_committed_offensive,injury_stoppage_in_chain,pass_tec
hnique_id,pass_technique_name,pass_through_ball,shot_one_on_one,pass_cut_back,ball_recovery_offensive,miscontrol_aerial_won,pass_miscommunication,shot_redirect
0,de3be98d-e227-475b-bd55-f57a6a89d308,1,1,00:00:00.000,0,0,1,0.0,35,Starting XI,769,Colombia,1,Regular Play,769,Colombia,433.0,"[{'player': {'id': 4276, 'name': 'David Ospina...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,f50ccda4-b768-4f07-9136-8f79fd17dac5,2,1,00:00:00.000,0,0,1,0.754,35,Starting XI,769,Colombia,1,Regular Play,768,England,352.0,"[{'player': {'id': 3468, 'name': 'Jordan Pickf...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,b5e98805-0a22-4a5e-a306-7d40651a0f6e,3,1,00:00:00.000,0,0,1,9.32,18,Half Start,769,Colombia,1,Regular Play,768,England,,,[762b829f-5f24-4dd7-bfe2-da7e289838bb],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,762b829f-5f24-4dd7-bfe2-da7e289838bb,4,1,00:00:00.000,0,0,1,9.053,18,Half Start,769,Colombia,1,Regular Play,769,Colombia,,,[b5e98805-0a22-4a5e-a306-7d40651a0f6e],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,d4883f20-ce68-4f84-b26a-a049a13cb6be,5,1,00:00:00.240,0,0,2,0.24,30,Pass,769,Colombia,9,From Kick Off,769,Colombia,,,[5fc9acb8-88c3-4cfb-ad9c-fc250c0dffde],"[60.0, 40.0]",3445.0,Radamel Falcao García Zárate,24.0,Left Center Forward,5692.0,Juan Fernando Quintero Paniagua,10.049875,3.041924,1.0,Ground Pass,"[50.0, 41.0]",65.0,Kick Off,38.0,Left Foot,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [110]:
# Number of events whose id repeats an earlier event's id.
df_wc.duplicated(subset="id").sum()


0

## Filter to shots only

In [111]:
# Keep only the shot events; .copy() gives a frame independent of df_wc.
is_shot = df_wc["type_name"].eq("Shot")
df_shots = df_wc.loc[is_shot].copy()
df_shots.shape


(1706, 121)

## Select relevant columns


In [112]:
# Columns carried forward from the raw shot events (order is preserved).
cols = [
    "location", "counterpress", "shot_statsbomb_xg", "shot_end_location",
    "shot_type_id", "shot_technique_id", "shot_outcome_id", "shot_body_part_id",
    "shot_open_goal", "shot_first_time", "shot_one_on_one", "shot_aerial_won",
]

# Independent copy so the cleaning steps below never modify df_shots.
shots = df_shots.loc[:, cols].copy()
shots.head()


Unnamed: 0,location,counterpress,shot_statsbomb_xg,shot_end_location,shot_type_id,shot_technique_id,shot_outcome_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won
239,"[115.0, 18.0]",,0.009816,"[120.0, 42.2, 2.0]",62.0,93.0,100.0,40.0,,,,
270,"[112.0, 54.0]",,0.038204,"[113.0, 53.0]",87.0,93.0,96.0,40.0,,,,
439,"[98.0, 37.0]",,0.045128,"[105.0, 37.0]",87.0,93.0,96.0,40.0,,,,
548,"[119.0, 36.0]",,0.625074,"[120.0, 40.5, 3.1]",87.0,93.0,98.0,37.0,True,,,
815,"[97.0, 56.0]",,0.02176,"[100.0, 54.0]",87.0,93.0,96.0,40.0,,,,


## Drop unreliable columns



In [113]:
# Missing values per column.
shots.isnull().sum()

location                0
counterpress         1706
shot_statsbomb_xg       0
shot_end_location       0
shot_type_id            0
shot_technique_id       0
shot_outcome_id         0
shot_body_part_id       0
shot_open_goal       1689
shot_first_time      1349
shot_one_on_one      1651
shot_aerial_won      1582
dtype: int64

In [114]:
# Drop counterpress: it is NaN for every shot in this dataset, so it carries
# no information.
# Reassigning (instead of inplace=True) and ignoring an already-missing column
# lets this cell be re-run without raising KeyError.
shots = shots.drop(columns=["counterpress"], errors="ignore")

shots.isna().sum()


location                0
shot_statsbomb_xg       0
shot_end_location       0
shot_type_id            0
shot_technique_id       0
shot_outcome_id         0
shot_body_part_id       0
shot_open_goal       1689
shot_first_time      1349
shot_one_on_one      1651
shot_aerial_won      1582
dtype: int64

In [115]:
# Drop shot_end_location as execution of shot should not play a role in measuring the quality of shot.
# Reassigning with errors="ignore" keeps the cell safe to re-run.
shots = shots.drop(columns=["shot_end_location"], errors="ignore")

## Clean boolean shot flags

`shot_open_goal`, `shot_first_time`, `shot_one_on_one`, `shot_aerial_won` are mapped from `NaN` → 0 and cast to integer 0/1.

In [116]:
binary_cols = [
    "shot_open_goal", "shot_first_time", "shot_one_on_one", "shot_aerial_won",
]

# A missing flag means "not set": fill NaN with 0, then cast to integer 0/1.
cleaned_flags = {
    name: shots[name].fillna(0).astype(int) for name in binary_cols
}
for name, values in cleaned_flags.items():
    shots[name] = values

shots[binary_cols].head()


Unnamed: 0,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won
239,0,0,0,0
270,0,0,0,0
439,0,0,0,0
548,1,0,0,0
815,0,0,0,0


## Extract shot location (x, y)

We convert the StatsBomb `location` list `[x, y]` into separate numeric `x` and `y` columns, then drop the original `location` column.

In [117]:
# Split the [x, y] location list into numeric columns, then drop the list.
# The guard makes the cell safe to re-run: once `location` is gone there is
# nothing left to extract and the existing x / y columns are kept as they are.
if "location" in shots.columns:
    shots["x"] = shots["location"].map(lambda loc: loc[0])
    shots["y"] = shots["location"].map(lambda loc: loc[1])
    shots = shots.drop(columns="location")

shots.head()


Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_outcome_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y
239,0.009816,62.0,93.0,100.0,40.0,0,0,0,0,115.0,18.0
270,0.038204,87.0,93.0,96.0,40.0,0,0,0,0,112.0,54.0
439,0.045128,87.0,93.0,96.0,40.0,0,0,0,0,98.0,37.0
548,0.625074,87.0,93.0,98.0,37.0,1,0,0,0,119.0,36.0
815,0.02176,87.0,93.0,96.0,40.0,0,0,0,0,97.0,56.0


## Create goal label `is_goal`

In the StatsBomb event data, `shot_outcome_id == 97` is the goal outcome, so we use it to build the binary label `is_goal`.

In [118]:
# Outcome id 97 marks a goal; encode the label as integer 0/1.
scored = shots["shot_outcome_id"].eq(97)
shots["is_goal"] = scored.astype(int)
shots["is_goal"].value_counts()


is_goal
0    1523
1     183
Name: count, dtype: int64

In [119]:
# Drop the outcome_id now that we have the label.
# Reassigning with errors="ignore" keeps the cell safe to re-run.
shots = shots.drop(columns=["shot_outcome_id"], errors="ignore")
shots.head()


Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y,is_goal
239,0.009816,62.0,93.0,40.0,0,0,0,0,115.0,18.0,0
270,0.038204,87.0,93.0,40.0,0,0,0,0,112.0,54.0,0
439,0.045128,87.0,93.0,40.0,0,0,0,0,98.0,37.0,0
548,0.625074,87.0,93.0,37.0,1,0,0,0,119.0,36.0,0
815,0.02176,87.0,93.0,40.0,0,0,0,0,97.0,56.0,0


## Add geometry features: distance and angle

We compute:
- `distance`: distance from shot location to the centre of the goal
- `angle`: angle between lines from the shot location to the two goalposts

Pitch and goal coordinates follow the StatsBomb convention: x ∈ [0,120], y ∈ [0,80], goal centered at (120, 40).

In [120]:
# Goal coordinates (StatsBomb pitch)
goal_x, goal_y = 120, 40
left_post_y, right_post_y = 36.8, 43.2

# Distance to goal centre: Euclidean norm of the offset to (goal_x, goal_y)
dx_to_goal = goal_x - shots["x"]
dy_to_goal = goal_y - shots["y"]
shots["distance"] = np.sqrt(dx_to_goal**2 + dy_to_goal**2)

def calc_angle(row):
    """Return the angle (radians) between the lines from a shot to each post.

    `row` must support ``row["x"]`` and ``row["y"]``. The module-level
    ``goal_x``, ``left_post_y`` and ``right_post_y`` give the post positions.
    The result is the absolute difference of the two arctan2 bearings.
    """
    run_to_goal_line = goal_x - row["x"]
    bearing_left = np.arctan2(left_post_y - row["y"], run_to_goal_line)
    bearing_right = np.arctan2(right_post_y - row["y"], run_to_goal_line)
    return abs(bearing_right - bearing_left)



## Create final featured shots dataframe

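`shot_end_location` was already dropped above so that the features only describe the chance **before** the shot outcome. Note that `shot_statsbomb_xg` is still in the frame; it is StatsBomb's own xG value, so it is presumably kept as a benchmark rather than as a model input. This final dataframe `shots_featured` adds the `angle` feature and will be saved to CSV and used in the modeling notebook.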

In [121]:
# Work on a copy so `shots` keeps its pre-angle state.
shots_featured = shots.copy()

# calc_angle only does column lookups and NumPy arithmetic, so it can take the
# whole frame at once instead of being applied row by row with axis=1.
shots_featured["angle"] = calc_angle(shots_featured)

# The previous mid-cell `.head()` on the geometry columns was never displayed
# (only a cell's last expression is rendered), so it has been removed.
shots_featured.head()


Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y,is_goal,distance,angle
239,0.009816,62.0,93.0,40.0,0,0,0,0,115.0,18.0,0,22.561028,0.064071
270,0.038204,87.0,93.0,40.0,0,0,0,0,112.0,54.0,0,16.124515,0.202196
439,0.045128,87.0,93.0,40.0,0,0,0,0,98.0,37.0,0,22.203603,0.283785
548,0.625074,87.0,93.0,37.0,1,0,0,0,119.0,36.0,0,4.123106,0.758049
815,0.02176,87.0,93.0,40.0,0,0,0,0,97.0,56.0,0,28.017851,0.187756


### Checking for duplicate rows

In [122]:
# Rows that exactly repeat an earlier row across every column.
shots_featured.duplicated(keep="first").sum()

60

In [123]:
# keep=False flags every member of a duplicated group, not just the repeats.
in_duplicate_group = shots_featured.duplicated(keep=False)
dupes = shots_featured.loc[in_duplicate_group]
dupes.head(60)



Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y,is_goal,distance,angle
1972,0.7835,88.0,93.0,40.0,0,0,0,0,108.0,40.0,1,12.0,0.521205
3997,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
3999,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
4001,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
4003,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
4005,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
4007,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,0,11.045361,0.56213
4009,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,0,11.045361,0.56213
4011,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
4013,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,0,11.045361,0.56213


In [124]:
# Duplicates are mostly penalty shots. We don't drop them: event ids are unique, so each row is a distinct shot.


In [125]:
# Positional rows 32-44, all columns.
shots_featured.iloc[32:45]

Unnamed: 0,shot_statsbomb_xg,shot_type_id,shot_technique_id,shot_body_part_id,shot_open_goal,shot_first_time,shot_one_on_one,shot_aerial_won,x,y,is_goal,distance,angle
3999,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
4001,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
4003,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
4005,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
4007,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,0,11.045361,0.56213
4009,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,0,11.045361,0.56213
4011,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
4013,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,0,11.045361,0.56213
4015,0.7835,88.0,93.0,40.0,0,0,0,0,109.0,41.0,1,11.045361,0.56213
4320,0.022963,87.0,93.0,40.0,0,0,0,0,93.0,52.0,0,29.546573,0.197672


In [126]:
# (rows, columns) of the final feature table.
shots_featured.shape

(1706, 13)

In [127]:
# Class balance of the goal label.
shots_featured["is_goal"].value_counts()

is_goal
0    1523
1     183
Name: count, dtype: int64

## Save preprocessed shots to CSV

In [128]:
# Relative path, resolved against the current working directory.
output_path = Path("shots_featured_wc2018.csv")

# index=False: the row index is not written to the file.
shots_featured.to_csv(path_or_buf=output_path, index=False)

saved_to = output_path.resolve()
print(f"Saved preprocessed shots to {saved_to}")


Saved preprocessed shots to C:\Users\traik\Desktop\ML_Bootcamp\Final_Project\Final_Project_Ironhack\shots_featured_wc2018.csv
