In [67]:
import sys
import os
import pandas as pd

# Make the project root (the parent of the notebook's working directory)
# importable so the `src` package resolves. The path is normalised and only
# added when missing, so re-running this cell does not pile up duplicate
# "<cwd>/.." entries in sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

# Import the project modules
from src.config import setup_logging
from src.extract import fetch_statsbomb_event_data
from src.transform import transform_to_progressive_actions

# Reload modules when code is changed (uncomment for development)
%load_ext autoreload
%autoreload 2

# Init logging
logger = setup_logging(log_file="../logs/build_up_heatmaps.log")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [68]:
# Load the StatsBomb event data through the project's extract module.
# In the recorded run the log reports "Europe - UEFA Euro - 2024 - male" with
# 187858 events, and the fetch took roughly 15 seconds.
events = fetch_statsbomb_event_data()
# Uncomment to inspect the available columns:
# list(events.columns)

2025-10-02 14:11:41,888 - src.extract.statsbomb_data - INFO - Fetching StatsBomb event data for Europe - UEFA Euro - 2024 - male
2025-10-02 14:11:56,974 - src.extract.statsbomb_data - INFO - Found 187858 events!


In [None]:
# Reduce the raw events to progressive actions. According to the recorded log
# output, this keeps passes and carries, then the progressive ones among them,
# then those in the own half (x < 60).
prog_actions = transform_to_progressive_actions(events)
# Summarise columns, dtypes and non-null counts of the result.
prog_actions.info()

2025-10-02 14:57:46,418 - src.transform.build_up_events - INFO - Transforming 187858 records from events data to progressive actions...
2025-10-02 14:57:47,132 - src.transform.build_up_events - INFO - Found 87760 actions (passes and carries).
2025-10-02 14:57:47,373 - src.transform.build_up_events - INFO - Found 14870 progressive actions (passes and carries).
2025-10-02 14:57:47,400 - src.transform.build_up_events - INFO - Done! Found 8771 progressive actions in own half (x < 60).


<class 'pandas.core.frame.DataFrame'>
Index: 8771 entries, 340 to 149941
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              8771 non-null   object 
 1   match_id        8771 non-null   int64  
 2   team            8771 non-null   object 
 3   player          8771 non-null   object 
 4   position        8771 non-null   object 
 5   timestamp       8771 non-null   object 
 6   x               8771 non-null   float64
 7   y               8771 non-null   float64
 8   end_x           8771 non-null   float64
 9   end_y           8771 non-null   float64
 10  progression     8771 non-null   float64
 11  type            8771 non-null   object 
 12  under_pressure  1408 non-null   object 
 13  possession      8771 non-null   int64  
dtypes: float64(5), int64(2), object(7)
memory usage: 1.0+ MB


In [None]:
# Columns common to both selections: event identity/context, start location and
# event type.
_shared_cols = [
    "id", "match_id", "team", "player", "position", "timestamp",
    "location",
    "type",
]

# Columns for progressive actions. "type" values of interest: Carry, Pass.
# Dribbles don't matter here: they have no end location, so it is impossible to
# say whether they are a progressive action or not.
progression_cols = _shared_cols + [
    "carry_end_location",
    # Pass columns. Filter out goal kicks, corners, free kicks and throw-ins.
    "pass_outcome", "pass_end_location", "pass_type",
    "under_pressure", "possession",
]

# Columns for turnovers. "type" values of interest: Dispossessed, Miscontrol,
# Dribble, 50/50.
turnover_cols = _shared_cols + [
    # TODO: extract the "Outcome" column. Use the "Lost" and
    # "Success To Opposition" values for turnovers.
    "50_50",
    "ball_receipt_outcome",  # "Incomplete"
    "dribble_outcome",  # "Incomplete"
    # duel_type: Aerial Lost / Tackle; duel_outcome: "Lost" / "Lost In Play"
    "duel_type", "duel_outcome",
    # Pass columns. The end location is not needed: the turnover is at the
    # origin point. Filter out goal kicks, corners, free kicks and throw-ins.
    # Filter out injury clearances for outcomes.
    "pass_outcome", "pass_end_location", "pass_type",
    "under_pressure", "counterpress", "possession",
]

In [21]:
# Restrict the events to Spain and to the turnover-related columns in a single
# label-based selection, then summarise the resulting frame.
spain_events = events.loc[events["team"] == "Spain", turnover_cols]
spain_events.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15056 entries, 2 to 187847
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   match_id               15056 non-null  int64 
 1   team                   15056 non-null  object
 2   player                 15000 non-null  object
 3   position               15000 non-null  object
 4   timestamp              15056 non-null  object
 5   location               14944 non-null  object
 6   type                   15056 non-null  object
 7   50_50                  23 non-null     object
 8   ball_receipt_outcome   383 non-null    object
 9   dribble_outcome        120 non-null    object
 10  duel_type              192 non-null    object
 11  duel_outcome           100 non-null    object
 12  pass_outcome           527 non-null    object
 13  pass_end_location      4335 non-null   object
 14  pass_miscommunication  0 non-null      object
 15  pass_type              

In [65]:
# Spain's Miscontrol events, restricted to the turnover columns.
test = spain_events.loc[spain_events["type"].eq("Miscontrol"), turnover_cols]

In [66]:
# Summarise columns, dtypes and non-null counts of the `test` subset.
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 74 entries, 164539 to 165566
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   match_id               74 non-null     int64 
 1   team                   74 non-null     object
 2   player                 74 non-null     object
 3   position               74 non-null     object
 4   timestamp              74 non-null     object
 5   location               74 non-null     object
 6   type                   74 non-null     object
 7   50_50                  0 non-null      object
 8   ball_receipt_outcome   0 non-null      object
 9   dribble_outcome        0 non-null      object
 10  duel_type              0 non-null      object
 11  duel_outcome           0 non-null      object
 12  pass_outcome           0 non-null      object
 13  pass_end_location      0 non-null      object
 14  pass_miscommunication  0 non-null      object
 15  pass_type            

In [49]:
# Distribution of pass outcomes across Spain's events.
# This reads from spain_events rather than `test`: in notebook order `test`
# holds only Miscontrol rows, whose pass_outcome is entirely null, so counting
# on it would come back empty on a fresh Restart & Run All.
spain_events["pass_outcome"].value_counts()

pass_outcome
Incomplete          443
Out                  53
Unknown              17
Pass Offside          9
Injury Clearance      5
Name: count, dtype: int64

In [32]:
# Spain's events from match 3943043 ...
match = spain_events.loc[spain_events["match_id"] == 3943043]
# ... and, within that match, possession 3 ordered by timestamp.
match.loc[match["possession"] == 3].sort_values("timestamp")

Unnamed: 0,match_id,team,player,position,timestamp,location,type,50_50,ball_receipt_outcome,dribble_outcome,duel_type,duel_outcome,pass_outcome,pass_end_location,pass_miscommunication,pass_type,under_pressure,counterpress,possession
1399,3943043,Spain,Unai Simón Mendibil,Goalkeeper,00:00:34.440,"[6.9, 39.6]",Pass,,,,,,,"[9.1, 57.2]",,Goal Kick,,,3
55244,3943043,Spain,Robin Aime Robert Le Normand,Right Center Back,00:00:35.658,"[9.1, 57.2]",Ball Receipt*,,,,,,,,,,,,3
106751,3943043,Spain,Robin Aime Robert Le Normand,Right Center Back,00:00:35.658,"[9.1, 57.2]",Carry,,,,,,,,,,,,3
1400,3943043,Spain,Robin Aime Robert Le Normand,Right Center Back,00:00:36.279,"[8.9, 57.2]",Pass,,,,,,,"[20.6, 75.8]",,,,,3
55245,3943043,Spain,Daniel Carvajal Ramos,Right Back,00:00:37.670,"[20.6, 75.8]",Ball Receipt*,,,,,,,,,,,,3
106752,3943043,Spain,Daniel Carvajal Ramos,Right Back,00:00:37.670,"[20.6, 75.8]",Carry,,,,,,,,,,True,,3
1401,3943043,Spain,Daniel Carvajal Ramos,Right Back,00:00:39.436,"[28.6, 76.4]",Pass,,,,,,,"[50.2, 68.5]",,,,,3
55246,3943043,Spain,Daniel Olmo Carvajal,Center Attacking Midfield,00:00:40.513,"[50.2, 68.5]",Ball Receipt*,,,,,,,,,,True,,3
106753,3943043,Spain,Daniel Olmo Carvajal,Center Attacking Midfield,00:00:40.513,"[50.2, 68.5]",Carry,,,,,,,,,,True,,3
164539,3943043,Spain,Daniel Olmo Carvajal,Center Attacking Midfield,00:00:40.645,"[49.3, 68.9]",Miscontrol,,,,,,,,,,,,3


In [None]:
def _fifty_fifty_outcome(value):
    """Return the outcome name stored in one "50_50" cell, or None if absent.

    NOTE(review): assumes the cell is StatsBomb's nested payload, e.g.
    {"outcome": {"id": ..., "name": "Lost"}} — confirm against the data.
    A flat {"id": ..., "name": ...} dict and a plain string are also accepted;
    anything else (including NaN for non-50/50 events) yields None.
    """
    if isinstance(value, dict):
        outcome = value.get("outcome", value)
        return outcome.get("name") if isinstance(outcome, dict) else outcome
    return value if isinstance(value, str) else None


# Comparing the raw "50_50" column with the string "Lost" can never exclude a
# row when the cells are nested payloads (and NaN != "Lost" is True as well), so
# that filter returns the whole events frame. Extract the outcome name first and
# keep only actual 50/50 events whose outcome is not "Lost".
fifty_fifty_outcome = events["50_50"].map(_fifty_fifty_outcome)
events[fifty_fifty_outcome.notna() & (fifty_fifty_outcome != "Lost")]

Unnamed: 0,50_50,bad_behaviour_card,ball_receipt_outcome,ball_recovery_offensive,ball_recovery_recovery_failure,block_deflection,block_offensive,block_save_block,carry_end_location,clearance_aerial_won,...,substitution_outcome,substitution_outcome_id,substitution_replacement,substitution_replacement_id,tactics,team,team_id,timestamp,type,under_pressure
