<a href="https://colab.research.google.com/github/hannesstuehrenberg/Probabilistic-Machine-Learning_lecture-PROJECTS/blob/main/projects/08-1SHXXXX_football_analytics/notebooks/03_data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 3. Data Preprocessing
- Steps taken to clean or transform the data


In [1]:
!pip install mplsoccer

Collecting mplsoccer
  Downloading mplsoccer-1.5.0-py3-none-any.whl.metadata (4.8 kB)
Downloading mplsoccer-1.5.0-py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.2/86.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mplsoccer
Successfully installed mplsoccer-1.5.0


In [8]:
# Libraries used by the cells of this notebook
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from mplsoccer import VerticalPitch
from mplsoccer import Sbopen
from tqdm import tqdm

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides numerical RuntimeWarnings (e.g. a division by zero in the
# angle feature cell) - consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Shared Sbopen instance; later cells call parser.competition(), parser.match() and
# parser.event() on it to load the data.
parser = Sbopen()

In [9]:
# Load all matches of one competition/season (competition 72, season 107 - the
# Women's World Cup 2023 sample examined in the next cell).
matches_df = parser.match(competition_id=72, season_id=107)
match_ids = matches_df['match_id'].to_list()

# One DataFrame of shot events per match, each tagged with the match it belongs to.
all_shots = []
for match_id in match_ids:
    df_event, df_related, df_freeze, df_tactics = parser.event(match_id)
    is_shot = df_event['type_name'].eq('Shot')
    df_shots = df_event.loc[is_shot].assign(match_id=match_id)
    all_shots.append(df_shots)

# Stack the per-match frames into a single table with a fresh 0..n-1 index.
shots_master_women_world_cup_df = pd.concat(all_shots, ignore_index=True)

KeyboardInterrupt: 

In [7]:
# Revisit the example dataset from the Women's World Cup 2023: it consists of 1680 shots,
# 184 of which were goals. Count the shots whose outcome is 'Goal'.
is_goal = shots_master_women_world_cup_df['outcome_name'] == 'Goal'
num_goals = int(is_goal.sum())
print(f"Number of goals: {num_goals}")


Number of goals: 184


In [12]:
# Many more competitions are available, so the next step is to build shots_master_df with
# the shots of every competition/season, to enrich the dataset the models are trained on later.

# During data loading and exploration these columns were found to always be NaN for
# type_name == 'Shot'. They are therefore not of interest for shots_master_df.
irrelevant_columns = [
    'tactics_formation', 'pass_recipient_id', 'pass_recipient_name', 'pass_length',
    'pass_angle', 'pass_height_id', 'pass_height_name', 'counterpress', 'pass_switch',
    'block_deflection', 'pass_cross', 'pass_assisted_shot_id', 'pass_shot_assist',
    'goalkeeper_position_id', 'goalkeeper_position_name', 'foul_committed_card_id',
    'foul_committed_card_name', 'ball_recovery_recovery_failure', 'foul_committed_advantage',
    'foul_won_advantage', 'foul_won_defensive', 'dribble_nutmeg', 'pass_goal_assist',
    'substitution_replacement_id', 'substitution_replacement_name', 'bad_behaviour_card_id',
    'bad_behaviour_card_name', 'pass_cut_back', 'dribble_overrun', 'ball_recovery_offensive',
    'pass_no_touch', 'pass_deflected', 'foul_committed_penalty', 'foul_won_penalty',
    'injury_stoppage_in_chain', 'foul_committed_offensive', 'block_offensive',
    'pass_miscommunication', 'block_save_block', 'dribble_no_touch', 'player_off_permanent'
]

all_shots = []

# Everything that could not be loaded is recorded here, so that gaps in the final
# dataset can be inspected after the run instead of only scrolling past in the log.
failed_competitions = []  # (competition_name, season_name, error message)
failed_matches = []       # (match_id, error message)

df_competitions = parser.competition()

# mininterval throttles how often the progress bars are redrawn.
for _, row in tqdm(df_competitions.iterrows(), total=len(df_competitions), desc="Competitions", mininterval=2.0):
    comp_id = row['competition_id']
    season_id = row['season_id']
    comp_name = row['competition_name']
    season_name = row['season_name']

    tqdm.write(f"\n🔄 Processing {comp_name} - {season_name}...")

    try:
        matches_df = parser.match(competition_id=comp_id, season_id=season_id)
        match_ids = matches_df['match_id'].tolist()
    except Exception as e:
        # Best effort: skip this competition/season and carry on with the next one.
        failed_competitions.append((comp_name, season_name, str(e)))
        # tqdm.write (not print) so the message does not break the active progress bars.
        tqdm.write(f"❌ Could not fetch matches for {comp_name} - {season_name}: {e}")
        continue

    for match_id in tqdm(
        match_ids,
        desc=f"  Matches in {comp_name[:15]} {season_name}",
        leave=False,
        mininterval=20,
    ):
        try:
            df_event, df_related, df_freeze, df_tactics = parser.event(match_id)
            df_shots = df_event[df_event['type_name'] == 'Shot'].copy()
            df_shots['match_id'] = match_id
            all_shots.append(df_shots)
        except Exception as e:
            # Best effort: a single broken match must not abort the whole (long) run.
            failed_matches.append((match_id, str(e)))
            tqdm.write(f"     ❌ Error in match_id {match_id}: {e}")

# pd.concat raises an unspecific ValueError on an empty list; fail with a clear message instead.
if not all_shots:
    raise ValueError("No shot events could be loaded - see the errors reported above.")

shots_master_df = pd.concat(all_shots, ignore_index=True)
# errors='ignore' skips listed columns that are not present in the loaded data.
shots_master_df = shots_master_df.drop(columns=irrelevant_columns, errors='ignore')

print("\n✅ All done!")
print("📊 Final shape of shots_master_df:", shots_master_df.shape)
if failed_competitions or failed_matches:
    print(
        f"Skipped {len(failed_competitions)} competition(s) and {len(failed_matches)} match(es) "
        "because of errors; see failed_competitions / failed_matches."
    )


Competitions:   0%|          | 0/74 [00:00<?, ?it/s]


🔄 Processing 1. Bundesliga - 2023/2024...



  Matches in 1. Bundesliga 2023/2024:   0%|          | 0/34 [00:00<?, ?it/s][A
  Matches in 1. Bundesliga 2023/2024:  94%|█████████▍| 32/34 [00:20<00:01,  1.57it/s][A
Competitions:   1%|▏         | 1/74 [00:22<26:59, 22.19s/it]


🔄 Processing 1. Bundesliga - 2015/2016...



  Matches in 1. Bundesliga 2015/2016:   0%|          | 0/306 [00:00<?, ?it/s][A
  Matches in 1. Bundesliga 2015/2016:  12%|█▏        | 38/306 [00:20<02:23,  1.86it/s][A
  Matches in 1. Bundesliga 2015/2016:  12%|█▏        | 38/306 [00:40<02:23,  1.86it/s][A
  Matches in 1. Bundesliga 2015/2016:  24%|██▍       | 73/306 [00:40<02:09,  1.79it/s][A
  Matches in 1. Bundesliga 2015/2016:  24%|██▍       | 73/306 [00:50<02:09,  1.79it/s][A
  Matches in 1. Bundesliga 2015/2016:  37%|███▋      | 113/306 [01:00<01:43,  1.87it/s][A
  Matches in 1. Bundesliga 2015/2016:  37%|███▋      | 113/306 [01:20<01:43,  1.87it/s][A
  Matches in 1. Bundesliga 2015/2016:  50%|█████     | 154/306 [01:21<01:19,  1.91it/s][A
  Matches in 1. Bundesliga 2015/2016:  50%|█████     | 154/306 [01:40<01:19,  1.91it/s][A
  Matches in 1. Bundesliga 2015/2016:  59%|█████▉    | 182/306 [01:41<01:11,  1.72it/s][A
  Matches in 1. Bundesliga 2015/2016:  59%|█████▉    | 182/306 [02:00<01:11,  1.72it/s][A
  Matches in


🔄 Processing African Cup of Nations - 2023...



  Matches in African Cup of  2023:   0%|          | 0/52 [00:00<?, ?it/s][A
  Matches in African Cup of  2023:  54%|█████▍    | 28/52 [00:20<00:17,  1.38it/s][A
Competitions:   4%|▍         | 3/74 [04:13<1:39:14, 83.86s/it]


🔄 Processing Champions League - 2018/2019...



  Matches in Champions Leagu 2018/2019:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:   4%|▍         | 3/74 [04:14<1:39:14, 83.86s/it]


🔄 Processing Champions League - 2017/2018...



  Matches in Champions Leagu 2017/2018:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:   7%|▋         | 5/74 [04:15<42:33, 37.00s/it]


🔄 Processing Champions League - 2016/2017...



  Matches in Champions Leagu 2016/2017:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:   7%|▋         | 5/74 [04:16<42:33, 37.00s/it]


🔄 Processing Champions League - 2015/2016...



  Matches in Champions Leagu 2015/2016:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:   7%|▋         | 5/74 [04:17<42:33, 37.00s/it]


🔄 Processing Champions League - 2014/2015...



  Matches in Champions Leagu 2014/2015:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  11%|█         | 8/74 [04:18<18:56, 17.21s/it]


🔄 Processing Champions League - 2013/2014...



  Matches in Champions Leagu 2013/2014:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  11%|█         | 8/74 [04:18<18:56, 17.21s/it]


🔄 Processing Champions League - 2012/2013...



  Matches in Champions Leagu 2012/2013:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  11%|█         | 8/74 [04:19<18:56, 17.21s/it]


🔄 Processing Champions League - 2011/2012...



  Matches in Champions Leagu 2011/2012:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  15%|█▍        | 11/74 [04:20<10:31, 10.03s/it]


🔄 Processing Champions League - 2010/2011...



  Matches in Champions Leagu 2010/2011:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  15%|█▍        | 11/74 [04:21<10:31, 10.03s/it]


🔄 Processing Champions League - 2009/2010...



  Matches in Champions Leagu 2009/2010:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  15%|█▍        | 11/74 [04:22<10:31, 10.03s/it]


🔄 Processing Champions League - 2008/2009...



  Matches in Champions Leagu 2008/2009:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  19%|█▉        | 14/74 [04:23<06:30,  6.50s/it]


🔄 Processing Champions League - 2006/2007...



  Matches in Champions Leagu 2006/2007:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  19%|█▉        | 14/74 [04:24<06:30,  6.50s/it]


🔄 Processing Champions League - 2004/2005...



  Matches in Champions Leagu 2004/2005:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  19%|█▉        | 14/74 [04:25<06:30,  6.50s/it]


🔄 Processing Champions League - 2003/2004...



  Matches in Champions Leagu 2003/2004:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  23%|██▎       | 17/74 [04:26<04:18,  4.54s/it]


🔄 Processing Champions League - 1999/2000...



  Matches in Champions Leagu 1999/2000:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  23%|██▎       | 17/74 [04:27<04:18,  4.54s/it]


🔄 Processing Champions League - 1972/1973...



  Matches in Champions Leagu 1972/1973:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  23%|██▎       | 17/74 [04:28<04:18,  4.54s/it]


🔄 Processing Champions League - 1971/1972...



  Matches in Champions Leagu 1971/1972:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  27%|██▋       | 20/74 [04:29<02:59,  3.33s/it]


🔄 Processing Champions League - 1970/1971...



  Matches in Champions Leagu 1970/1971:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  27%|██▋       | 20/74 [04:29<02:59,  3.33s/it]


🔄 Processing Copa America - 2024...



Competitions:  27%|██▋       | 20/74 [04:43<02:59,  3.33s/it]
  Matches in Copa America 2024:  94%|█████████▍| 30/32 [00:20<00:01,  1.46it/s][A
Competitions:  30%|██▉       | 22/74 [04:52<04:38,  5.35s/it]


🔄 Processing Copa del Rey - 1983/1984...



  Matches in Copa del Rey 1983/1984:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  30%|██▉       | 22/74 [04:53<04:38,  5.35s/it]


🔄 Processing Copa del Rey - 1982/1983...



  Matches in Copa del Rey 1982/1983:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  30%|██▉       | 22/74 [04:54<04:38,  5.35s/it]


🔄 Processing Copa del Rey - 1977/1978...



  Matches in Copa del Rey 1977/1978:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  34%|███▍      | 25/74 [04:54<03:06,  3.81s/it]


🔄 Processing FA Women's Super League - 2020/2021...



Competitions:  34%|███▍      | 25/74 [05:13<03:06,  3.81s/it]
  Matches in FA Women's Supe 2020/2021:  21%|██▏       | 28/131 [00:20<01:14,  1.39it/s][A
  Matches in FA Women's Supe 2020/2021:  21%|██▏       | 28/131 [00:38<01:14,  1.39it/s][A
  Matches in FA Women's Supe 2020/2021:  44%|████▎     | 57/131 [00:40<00:52,  1.41it/s][A
  Matches in FA Women's Supe 2020/2021:  44%|████▎     | 57/131 [00:58<00:52,  1.41it/s][A
  Matches in FA Women's Supe 2020/2021:  65%|██████▍   | 85/131 [01:00<00:32,  1.40it/s][A
  Matches in FA Women's Supe 2020/2021:  65%|██████▍   | 85/131 [01:18<00:32,  1.40it/s][A
  Matches in FA Women's Supe 2020/2021:  86%|████████▋ | 113/131 [01:21<00:13,  1.38it/s][A
Competitions:  35%|███▌      | 26/74 [06:29<13:10, 16.47s/it]


🔄 Processing FA Women's Super League - 2019/2020...



  Matches in FA Women's Supe 2019/2020:   0%|          | 0/87 [00:00<?, ?it/s][A
  Matches in FA Women's Supe 2019/2020:  33%|███▎      | 29/87 [00:20<00:41,  1.39it/s][A
  Matches in FA Women's Supe 2019/2020:  33%|███▎      | 29/87 [00:34<00:41,  1.39it/s][A
  Matches in FA Women's Supe 2019/2020:  64%|██████▍   | 56/87 [00:40<00:22,  1.36it/s][A
  Matches in FA Women's Supe 2019/2020:  64%|██████▍   | 56/87 [00:54<00:22,  1.36it/s][A
  Matches in FA Women's Supe 2019/2020:  94%|█████████▍| 82/87 [01:01<00:03,  1.32it/s][A
Competitions:  36%|███▋      | 27/74 [07:34<19:14, 24.57s/it]


🔄 Processing FA Women's Super League - 2018/2019...



  Matches in FA Women's Supe 2018/2019:   0%|          | 0/108 [00:00<?, ?it/s][A
  Matches in FA Women's Supe 2018/2019:  25%|██▌       | 27/108 [00:20<01:01,  1.33it/s][A
  Matches in FA Women's Supe 2018/2019:  25%|██▌       | 27/108 [00:39<01:01,  1.33it/s][A
  Matches in FA Women's Supe 2018/2019:  52%|█████▏    | 56/108 [00:40<00:37,  1.39it/s][A
  Matches in FA Women's Supe 2018/2019:  52%|█████▏    | 56/108 [00:59<00:37,  1.39it/s][A
  Matches in FA Women's Supe 2018/2019:  80%|███████▉  | 86/108 [01:01<00:15,  1.41it/s][A
Competitions:  38%|███▊      | 28/74 [08:51<26:38, 34.74s/it]


🔄 Processing FIFA U20 World Cup - 1979...



  Matches in FIFA U20 World  1979:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  38%|███▊      | 28/74 [08:52<26:38, 34.74s/it]


🔄 Processing FIFA World Cup - 2022...



  Matches in FIFA World Cup 2022:   0%|          | 0/64 [00:00<?, ?it/s][A
  Matches in FIFA World Cup 2022:  41%|████      | 26/64 [00:20<00:29,  1.27it/s][A
  Matches in FIFA World Cup 2022:  41%|████      | 26/64 [00:30<00:29,  1.27it/s][A
  Matches in FIFA World Cup 2022:  83%|████████▎ | 53/64 [00:40<00:08,  1.32it/s][A
Competitions:  41%|████      | 30/74 [09:41<22:53, 31.22s/it]


🔄 Processing FIFA World Cup - 2018...



  Matches in FIFA World Cup 2018:   0%|          | 0/64 [00:00<?, ?it/s][A
  Matches in FIFA World Cup 2018:  44%|████▍     | 28/64 [00:20<00:26,  1.35it/s][A
  Matches in FIFA World Cup 2018:  44%|████▍     | 28/64 [00:32<00:26,  1.35it/s][A
  Matches in FIFA World Cup 2018:  88%|████████▊ | 56/64 [00:41<00:05,  1.36it/s][A
Competitions:  42%|████▏     | 31/74 [10:28<24:41, 34.46s/it]


🔄 Processing FIFA World Cup - 1990...



  Matches in FIFA World Cup 1990:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  42%|████▏     | 31/74 [10:29<24:41, 34.46s/it]


🔄 Processing FIFA World Cup - 1986...



  Matches in FIFA World Cup 1986:   0%|          | 0/3 [00:00<?, ?it/s][A
Competitions:  45%|████▍     | 33/74 [10:31<15:20, 22.45s/it]


🔄 Processing FIFA World Cup - 1974...



  Matches in FIFA World Cup 1974:   0%|          | 0/6 [00:00<?, ?it/s][A
Competitions:  45%|████▍     | 33/74 [10:36<15:20, 22.45s/it]


🔄 Processing FIFA World Cup - 1970...



  Matches in FIFA World Cup 1970:   0%|          | 0/6 [00:00<?, ?it/s][A
Competitions:  47%|████▋     | 35/74 [10:40<10:33, 16.25s/it]


🔄 Processing FIFA World Cup - 1962...



  Matches in FIFA World Cup 1962:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  47%|████▋     | 35/74 [10:41<10:33, 16.25s/it]


🔄 Processing FIFA World Cup - 1958...



  Matches in FIFA World Cup 1958:   0%|          | 0/2 [00:00<?, ?it/s][A
Competitions:  50%|█████     | 37/74 [10:43<07:01, 11.40s/it]


🔄 Processing Indian Super league - 2021/2022...



Competitions:  50%|█████     | 37/74 [10:53<07:01, 11.40s/it]
  Matches in Indian Super le 2021/2022:  25%|██▌       | 29/115 [00:20<01:00,  1.43it/s][A
  Matches in Indian Super le 2021/2022:  25%|██▌       | 29/115 [00:39<01:00,  1.43it/s][A
  Matches in Indian Super le 2021/2022:  50%|████▉     | 57/115 [00:40<00:41,  1.41it/s][A
  Matches in Indian Super le 2021/2022:  50%|████▉     | 57/115 [00:59<00:41,  1.41it/s][A
  Matches in Indian Super le 2021/2022:  75%|███████▍  | 86/115 [01:00<00:20,  1.41it/s][A
  Matches in Indian Super le 2021/2022:  75%|███████▍  | 86/115 [01:19<00:20,  1.41it/s][A
Competitions:  51%|█████▏    | 38/74 [12:04<14:47, 24.65s/it]


🔄 Processing La Liga - 2020/2021...



  Matches in La Liga 2020/2021:   0%|          | 0/35 [00:00<?, ?it/s][A
  Matches in La Liga 2020/2021:  71%|███████▏  | 25/35 [00:20<00:08,  1.23it/s][A
Competitions:  53%|█████▎    | 39/74 [12:33<14:55, 25.59s/it]


🔄 Processing La Liga - 2019/2020...



  Matches in La Liga 2019/2020:   0%|          | 0/33 [00:00<?, ?it/s][A
  Matches in La Liga 2019/2020:  76%|███████▌  | 25/33 [00:20<00:06,  1.24it/s][A
Competitions:  54%|█████▍    | 40/74 [13:00<14:40, 25.91s/it]


🔄 Processing La Liga - 2018/2019...



  Matches in La Liga 2018/2019:   0%|          | 0/34 [00:00<?, ?it/s][A
  Matches in La Liga 2018/2019:  74%|███████▎  | 25/34 [00:20<00:07,  1.25it/s][A
Competitions:  55%|█████▌    | 41/74 [13:27<14:26, 26.26s/it]


🔄 Processing La Liga - 2017/2018...



  Matches in La Liga 2017/2018:   0%|          | 0/36 [00:00<?, ?it/s][A
  Matches in La Liga 2017/2018:  72%|███████▏  | 26/36 [00:20<00:07,  1.25it/s][A
Competitions:  57%|█████▋    | 42/74 [13:56<14:16, 26.78s/it]


🔄 Processing La Liga - 2016/2017...



  Matches in La Liga 2016/2017:   0%|          | 0/34 [00:00<?, ?it/s][A
  Matches in La Liga 2016/2017:  79%|███████▉  | 27/34 [00:20<00:05,  1.34it/s][A
Competitions:  58%|█████▊    | 43/74 [14:21<13:40, 26.48s/it]


🔄 Processing La Liga - 2015/2016...



  Matches in La Liga 2015/2016:   0%|          | 0/380 [00:00<?, ?it/s][A
  Matches in La Liga 2015/2016:   8%|▊         | 29/380 [00:20<04:03,  1.44it/s][A
  Matches in La Liga 2015/2016:   8%|▊         | 29/380 [00:31<04:03,  1.44it/s][A
  Matches in La Liga 2015/2016:  15%|█▌        | 57/380 [00:40<03:50,  1.40it/s][A
  Matches in La Liga 2015/2016:  15%|█▌        | 57/380 [00:51<03:50,  1.40it/s][A
  Matches in La Liga 2015/2016:  22%|██▏       | 85/380 [01:01<03:35,  1.37it/s][A
  Matches in La Liga 2015/2016:  22%|██▏       | 85/380 [01:11<03:35,  1.37it/s][A
  Matches in La Liga 2015/2016:  30%|██▉       | 113/380 [01:21<03:14,  1.37it/s][A
  Matches in La Liga 2015/2016:  30%|██▉       | 113/380 [01:31<03:14,  1.37it/s][A
  Matches in La Liga 2015/2016:  37%|███▋      | 141/380 [01:42<02:53,  1.37it/s][A
  Matches in La Liga 2015/2016:  37%|███▋      | 141/380 [02:01<02:53,  1.37it/s][A
  Matches in La Liga 2015/2016:  44%|████▍     | 169/380 [02:02<02:32,  1.38it/s


🔄 Processing La Liga - 2014/2015...



  Matches in La Liga 2014/2015:   0%|          | 0/38 [00:00<?, ?it/s][A
  Matches in La Liga 2014/2015:  68%|██████▊   | 26/38 [00:20<00:09,  1.29it/s][A
Competitions:  61%|██████    | 45/74 [19:33<37:59, 78.61s/it]


🔄 Processing La Liga - 2013/2014...



  Matches in La Liga 2013/2014:   0%|          | 0/31 [00:00<?, ?it/s][A
  Matches in La Liga 2013/2014:  81%|████████  | 25/31 [00:20<00:04,  1.23it/s][A
Competitions:  62%|██████▏   | 46/74 [19:58<29:26, 63.10s/it]


🔄 Processing La Liga - 2012/2013...



  Matches in La Liga 2012/2013:   0%|          | 0/32 [00:00<?, ?it/s][A
  Matches in La Liga 2012/2013:  78%|███████▊  | 25/32 [00:20<00:05,  1.22it/s][A
Competitions:  64%|██████▎   | 47/74 [20:24<23:33, 52.34s/it]


🔄 Processing La Liga - 2011/2012...



  Matches in La Liga 2011/2012:   0%|          | 0/37 [00:00<?, ?it/s][A
  Matches in La Liga 2011/2012:  68%|██████▊   | 25/37 [00:20<00:09,  1.23it/s][A
Competitions:  65%|██████▍   | 48/74 [20:55<19:53, 45.89s/it]


🔄 Processing La Liga - 2010/2011...



  Matches in La Liga 2010/2011:   0%|          | 0/33 [00:00<?, ?it/s][A
  Matches in La Liga 2010/2011:  79%|███████▉  | 26/33 [00:20<00:05,  1.26it/s][A
Competitions:  66%|██████▌   | 49/74 [21:21<16:44, 40.17s/it]


🔄 Processing La Liga - 2009/2010...



  Matches in La Liga 2009/2010:   0%|          | 0/35 [00:00<?, ?it/s][A
  Matches in La Liga 2009/2010:  77%|███████▋  | 27/35 [00:20<00:05,  1.33it/s][A
Competitions:  68%|██████▊   | 50/74 [21:47<14:23, 35.98s/it]


🔄 Processing La Liga - 2008/2009...



  Matches in La Liga 2008/2009:   0%|          | 0/31 [00:00<?, ?it/s][A
  Matches in La Liga 2008/2009:  84%|████████▍ | 26/31 [00:20<00:03,  1.29it/s][A
Competitions:  69%|██████▉   | 51/74 [22:11<12:22, 32.28s/it]


🔄 Processing La Liga - 2007/2008...



  Matches in La Liga 2007/2008:   0%|          | 0/28 [00:00<?, ?it/s][A
  Matches in La Liga 2007/2008: 100%|██████████| 28/28 [00:20<00:00,  1.36it/s][A
Competitions:  70%|███████   | 52/74 [22:32<10:34, 28.85s/it]


🔄 Processing La Liga - 2006/2007...



  Matches in La Liga 2006/2007:   0%|          | 0/26 [00:00<?, ?it/s][A
Competitions:  72%|███████▏  | 53/74 [22:52<09:08, 26.12s/it]


🔄 Processing La Liga - 2005/2006...



  Matches in La Liga 2005/2006:   0%|          | 0/17 [00:00<?, ?it/s][A
Competitions:  73%|███████▎  | 54/74 [23:04<07:21, 22.06s/it]


🔄 Processing La Liga - 2004/2005...



  Matches in La Liga 2004/2005:   0%|          | 0/7 [00:00<?, ?it/s][A
Competitions:  74%|███████▍  | 55/74 [23:09<05:23, 17.01s/it]


🔄 Processing La Liga - 1973/1974...



  Matches in La Liga 1973/1974:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  74%|███████▍  | 55/74 [23:10<05:23, 17.01s/it]


🔄 Processing Liga Profesional - 1997/1998...



  Matches in Liga Profesiona 1997/1998:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  74%|███████▍  | 55/74 [23:11<05:23, 17.01s/it]


🔄 Processing Liga Profesional - 1981...



  Matches in Liga Profesiona 1981:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  78%|███████▊  | 58/74 [23:12<02:07,  7.94s/it]


🔄 Processing Ligue 1 - 2022/2023...



Competitions:  78%|███████▊  | 58/74 [23:24<02:07,  7.94s/it]
  Matches in Ligue 1 2022/2023:  75%|███████▌  | 24/32 [00:20<00:06,  1.16it/s][A
Competitions:  80%|███████▉  | 59/74 [23:39<02:59, 11.99s/it]


🔄 Processing Ligue 1 - 2021/2022...



  Matches in Ligue 1 2021/2022:   0%|          | 0/26 [00:00<?, ?it/s][A
  Matches in Ligue 1 2021/2022:  92%|█████████▏| 24/26 [00:20<00:01,  1.18it/s][A
Competitions:  81%|████████  | 60/74 [24:01<03:20, 14.32s/it]


🔄 Processing Ligue 1 - 2015/2016...



  Matches in Ligue 1 2015/2016:   0%|          | 0/377 [00:00<?, ?it/s][A
  Matches in Ligue 1 2015/2016:   7%|▋         | 26/377 [00:20<04:37,  1.27it/s][A
  Matches in Ligue 1 2015/2016:   7%|▋         | 26/377 [00:33<04:37,  1.27it/s][A
  Matches in Ligue 1 2015/2016:  14%|█▍        | 53/377 [00:40<04:09,  1.30it/s][A
  Matches in Ligue 1 2015/2016:  14%|█▍        | 53/377 [00:53<04:09,  1.30it/s][A
  Matches in Ligue 1 2015/2016:  21%|██        | 79/377 [01:01<03:51,  1.28it/s][A
  Matches in Ligue 1 2015/2016:  21%|██        | 79/377 [01:13<03:51,  1.28it/s][A
  Matches in Ligue 1 2015/2016:  28%|██▊       | 105/377 [01:22<03:33,  1.28it/s][A
  Matches in Ligue 1 2015/2016:  28%|██▊       | 105/377 [01:33<03:33,  1.28it/s][A
  Matches in Ligue 1 2015/2016:  35%|███▍      | 131/377 [01:42<03:12,  1.28it/s][A
  Matches in Ligue 1 2015/2016:  35%|███▍      | 131/377 [01:53<03:12,  1.28it/s][A
  Matches in Ligue 1 2015/2016:  42%|████▏     | 157/377 [02:02<02:51,  1.28it/s


🔄 Processing Major League Soccer - 2023...



  Matches in Major League So 2023:   0%|          | 0/6 [00:00<?, ?it/s][A
Competitions:  84%|████████▍ | 62/74 [28:59<12:36, 63.05s/it]


🔄 Processing North American League - 1977...



  Matches in North American  1977:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  84%|████████▍ | 62/74 [29:00<12:36, 63.05s/it]


🔄 Processing NWSL - 2018...



Competitions:  84%|████████▍ | 62/74 [29:16<12:36, 63.05s/it]
  Matches in NWSL 2018:  83%|████████▎ | 30/36 [00:20<00:04,  1.46it/s][A
Competitions:  86%|████████▋ | 64/74 [29:25<06:56, 41.63s/it]


🔄 Processing Premier League - 2015/2016...



  Matches in Premier League 2015/2016:   0%|          | 0/380 [00:00<?, ?it/s][A
  Matches in Premier League 2015/2016:   7%|▋         | 28/380 [00:20<04:12,  1.39it/s][A
  Matches in Premier League 2015/2016:   7%|▋         | 28/380 [00:40<04:12,  1.39it/s][A
  Matches in Premier League 2015/2016:  15%|█▌        | 57/380 [00:40<03:49,  1.41it/s][A
  Matches in Premier League 2015/2016:  15%|█▌        | 57/380 [01:00<03:49,  1.41it/s][A
  Matches in Premier League 2015/2016:  22%|██▏       | 85/380 [01:01<03:34,  1.38it/s][A
  Matches in Premier League 2015/2016:  22%|██▏       | 85/380 [01:20<03:34,  1.38it/s][A
  Matches in Premier League 2015/2016:  29%|██▉       | 111/380 [01:21<03:20,  1.34it/s][A
  Matches in Premier League 2015/2016:  29%|██▉       | 111/380 [01:40<03:20,  1.34it/s][A
  Matches in Premier League 2015/2016:  37%|███▋      | 140/380 [01:42<02:56,  1.36it/s][A
  Matches in Premier League 2015/2016:  37%|███▋      | 140/380 [02:00<02:56,  1.36it/s][A
  M


🔄 Processing Premier League - 2003/2004...



  Matches in Premier League 2003/2004:   0%|          | 0/38 [00:00<?, ?it/s][A
  Matches in Premier League 2003/2004:  74%|███████▎  | 28/38 [00:20<00:07,  1.37it/s][A
Competitions:  89%|████████▉ | 66/74 [34:35<10:41, 80.23s/it]


🔄 Processing Serie A - 2015/2016...



  Matches in Serie A 2015/2016:   0%|          | 0/380 [00:00<?, ?it/s][A
  Matches in Serie A 2015/2016:   7%|▋         | 27/380 [00:20<04:34,  1.29it/s][A
  Matches in Serie A 2015/2016:   7%|▋         | 27/380 [00:31<04:34,  1.29it/s][A
  Matches in Serie A 2015/2016:  14%|█▍        | 54/380 [00:41<04:08,  1.31it/s][A
  Matches in Serie A 2015/2016:  14%|█▍        | 54/380 [01:01<04:08,  1.31it/s][A
  Matches in Serie A 2015/2016:  22%|██▏       | 82/380 [01:01<03:43,  1.34it/s][A
  Matches in Serie A 2015/2016:  22%|██▏       | 82/380 [01:21<03:43,  1.34it/s][A
  Matches in Serie A 2015/2016:  29%|██▉       | 110/380 [01:22<03:19,  1.35it/s][A
  Matches in Serie A 2015/2016:  29%|██▉       | 110/380 [01:41<03:19,  1.35it/s][A
  Matches in Serie A 2015/2016:  36%|███▌      | 137/380 [01:42<03:00,  1.35it/s][A
  Matches in Serie A 2015/2016:  36%|███▌      | 137/380 [02:01<03:00,  1.35it/s][A
  Matches in Serie A 2015/2016:  43%|████▎     | 164/380 [02:02<02:41,  1.34it/s


🔄 Processing Serie A - 1986/1987...



  Matches in Serie A 1986/1987:   0%|          | 0/1 [00:00<?, ?it/s][A
Competitions:  91%|█████████ | 67/74 [39:36<16:07, 138.18s/it]


🔄 Processing UEFA Euro - 2024...



  Matches in UEFA Euro 2024:   0%|          | 0/51 [00:00<?, ?it/s][A
  Matches in UEFA Euro 2024:  53%|█████▎    | 27/51 [00:20<00:18,  1.32it/s][A
  Matches in UEFA Euro 2024:  53%|█████▎    | 27/51 [00:30<00:18,  1.32it/s][A
Competitions:  93%|█████████▎| 69/74 [40:15<07:16, 87.36s/it]


🔄 Processing UEFA Euro - 2020...



  Matches in UEFA Euro 2020:   0%|          | 0/51 [00:00<?, ?it/s][A
  Matches in UEFA Euro 2020:  51%|█████     | 26/51 [00:20<00:19,  1.28it/s][A
  Matches in UEFA Euro 2020:  51%|█████     | 26/51 [00:32<00:19,  1.28it/s][A
Competitions:  95%|█████████▍| 70/74 [40:54<05:04, 76.04s/it]


🔄 Processing UEFA Europa League - 1988/1989...



  Matches in UEFA Europa Lea 1988/1989:   0%|          | 0/3 [00:00<?, ?it/s][A
Competitions:  96%|█████████▌| 71/74 [40:56<02:52, 57.51s/it]


🔄 Processing UEFA Women's Euro - 2022...



Competitions:  96%|█████████▌| 71/74 [41:07<02:52, 57.51s/it]
  Matches in UEFA Women's Eu 2022:  90%|█████████ | 28/31 [00:20<00:02,  1.38it/s][A
Competitions:  97%|█████████▋| 72/74 [41:19<01:36, 48.36s/it]


🔄 Processing Women's World Cup - 2023...



  Matches in Women's World C 2023:   0%|          | 0/64 [00:00<?, ?it/s][A
  Matches in Women's World C 2023:  52%|█████▏    | 33/64 [00:20<00:19,  1.61it/s][A
  Matches in Women's World C 2023:  52%|█████▏    | 33/64 [00:38<00:19,  1.61it/s][A
Competitions:  99%|█████████▊| 73/74 [41:58<00:45, 45.85s/it]


🔄 Processing Women's World Cup - 2019...



  Matches in Women's World C 2019:   0%|          | 0/52 [00:00<?, ?it/s][A
  Matches in Women's World C 2019:  54%|█████▍    | 28/52 [00:20<00:17,  1.35it/s][A
Competitions: 100%|██████████| 74/74 [42:36<00:00, 34.55s/it]



✅ All done!
📊 Final shape of shots_master_df: (87111, 49)


In [15]:
# Creating the dataset took more than 40 minutes, so it is saved permanently to a .csv that
# can always be accessed. Before saving, the columns are put into a fixed, grouped order.

ordered_columns = [
    #identifiers
    'id',
    'index',
    'match_id',

    #time information
    'period',
    'timestamp',

    #team and player columns
    'team_id',
    'team_name',
    'player_id',
    'player_name',
    'position_id',
    'position_name',

    #location columns
    'x',
    'y',
    'end_x',
    'end_y',
    'end_z',

    #shot details
    'body_part_id',
    'body_part_name',
    'sub_type_id',
    'sub_type_name',
    'technique_id',
    'technique_name',
    'shot_statsbomb_xg',
    'shot_key_pass_id',
    'under_pressure',
    'off_camera',
    'out',
    'aerial_won',

    #special attributes
    'shot_first_time',
    'shot_one_on_one',
    'shot_deflected',
    'shot_open_goal',
    'shot_redirect',
    'shot_follows_dribble',

    #outcome
    'outcome_id',
    'outcome_name',

    #rare or redundant
    'minute',
    'second',
    'duration',
    'half_start_late_video_start',
    'half_end_early_video_end',
    'pass_backheel',
    'possession',
    'possession_team_id',
    'possession_team_name',
    'play_pattern_id',
    'play_pattern_name'
]

# The selection below keeps only the columns listed in ordered_columns, so any other column
# is removed, not just moved. Report those columns so the removal is a visible step.
dropped_columns = [col for col in shots_master_df.columns if col not in ordered_columns]
if dropped_columns:
    print("Columns dropped while reordering:", dropped_columns)

# Listed columns that are missing from the data are skipped instead of raising a KeyError.
shots_master_df = shots_master_df[[col for col in ordered_columns if col in shots_master_df.columns]]

In [17]:
# Persist the reordered shots table; index=False keeps the row index out of the file.
shots_master_df.to_csv(path_or_buf="shots_master_df.csv", index=False)

In [21]:
# Show the dimensions (rows, columns) of the reordered table as the cell output.
shots_master_df.shape

(87111, 47)

In [22]:
# Read the saved shots table back from disk into its own DataFrame.
csv_shots_master_df = pd.read_csv(filepath_or_buffer="shots_master_df.csv")

# Print the dimensions, then preview the first five rows as the cell output.
print(csv_shots_master_df.shape)
csv_shots_master_df.head(5)


(87111, 47)


Unnamed: 0,id,index,match_id,period,timestamp,team_id,team_name,player_id,player_name,position_id,...,second,duration,half_start_late_video_start,half_end_early_video_end,pass_backheel,possession,possession_team_id,possession_team_name,play_pattern_id,play_pattern_name
0,c577e730-b9f5-44f2-9257-9e7730c23d7b,436,3895302,1,00:06:48.773000,176,Werder Bremen,8826.0,Leonardo Bittencourt,13.0,...,48,0.052872,,,,13,176,Werder Bremen,3,From Free Kick
1,bbc2c68d-c096-483d-abf4-32c0175a0f55,480,3895302,1,00:07:40.953000,904,Bayer Leverkusen,38004.0,Piero Martín Hincapié Reyna,8.0,...,40,0.217872,,,,14,904,Bayer Leverkusen,1,Regular Play
2,12b5206b-9ed0-4b1e-9ec3-f2028187e09f,597,3895302,1,00:11:08.471000,176,Werder Bremen,51769.0,Julián Malatini,3.0,...,8,0.445768,,,,22,176,Werder Bremen,3,From Free Kick
3,b2c3d59d-3bef-4f8a-ad86-26b69940c64e,684,3895302,1,00:13:16.073000,904,Bayer Leverkusen,8221.0,Jonathan Tah,4.0,...,16,0.085298,,,,26,904,Bayer Leverkusen,2,From Corner
4,bb53b537-1685-4019-9e8f-98f3805828eb,848,3895302,1,00:16:00.956000,904,Bayer Leverkusen,3500.0,Granit Xhaka,9.0,...,0,0.402989,,,,33,904,Bayer Leverkusen,1,Regular Play


Recovered work

In [None]:
# Goal centre in pitch coordinates. Set in this cell (same values as the
# GOAL_CENTER_X / GOAL_CENTER_Y constants used by the other feature cells) so the
# cell runs on a fresh kernel without relying on names created elsewhere.
GOAL_CENTER_X = 120
GOAL_CENTER_Y = 40

# Euclidean distance from each shot location (x, y) to the goal centre, in pitch units.
shots_master_women_world_cup_df['distance_to_goal'] = np.sqrt(
    (GOAL_CENTER_X - shots_master_women_world_cup_df['x'])**2 +
    (GOAL_CENTER_Y - shots_master_women_world_cup_df['y'])**2
)

In [None]:
# Goal geometry in StatsBomb pitch units (the same units as the shot x / y columns).
# In that coordinate system the goal mouth spans y = 36 to y = 44, i.e. it is 8 units wide.
# The real-world width of 7.32 m must not be used here: it is in metres, while every
# coordinate it is combined with is in pitch units.
GOAL_WIDTH = 8.0
GOAL_CENTER_X = 120
GOAL_CENTER_Y = 40

In [None]:
# Shot coordinates are read from the DataFrame in this cell, so it does not depend on
# x / y arrays created by a different cell.
x = shots_master_women_world_cup_df['x'].to_numpy()
y = shots_master_women_world_cup_df['y'].to_numpy()

# Euclidean distance from every shot to the goal centre, in pitch units.
distance_to_goal = np.sqrt((GOAL_CENTER_X - x)**2 + (GOAL_CENTER_Y - y)**2)
shots_master_women_world_cup_df['distance_to_goal'] = distance_to_goal

In [None]:
# Angle under which the goal mouth is seen from each shot location, computed with the
# law of cosines on the triangle (shot, first goalpost, second goalpost).

# Goalpost y-coordinates
goal_y1 = GOAL_CENTER_Y - GOAL_WIDTH / 2  # left post
goal_y2 = GOAL_CENTER_Y + GOAL_WIDTH / 2  # right post

# Shot coordinates
x = shots_master_women_world_cup_df['x'].to_numpy()
y = shots_master_women_world_cup_df['y'].to_numpy()

# Distances from shot to left and right goalposts (a, b) and between the posts (c)
a = np.sqrt((GOAL_CENTER_X - x)**2 + (goal_y1 - y)**2)
b = np.sqrt((GOAL_CENTER_X - x)**2 + (goal_y2 - y)**2)
c = GOAL_WIDTH

# Avoid divide-by-zero: a shot located exactly on a goalpost has a == 0 or b == 0, which
# would give 0/0 = NaN. Those rows keep the default cosine of 1.0 (an angle of 0).
# The `!= 0` test is used on purpose: rows with missing coordinates have a NaN
# denominator, are still divided, and therefore stay NaN instead of becoming 0.
denominator = 2 * a * b
numerator = a**2 + b**2 - c**2
cos_angle = np.divide(
    numerator,
    denominator,
    out=np.ones_like(denominator, dtype=float),
    where=denominator != 0,
)
cos_angle = np.clip(cos_angle, -1.0, 1.0)  # Ensure within valid arccos range

# Calculate angle in radians
angle_radians = np.arccos(cos_angle)

# Store in DataFrame (radians and degrees)
shots_master_women_world_cup_df['angle_to_goal'] = angle_radians
shots_master_women_world_cup_df['angle_to_goal_deg'] = np.degrees(angle_radians)

In [None]:
from mplsoccer import VerticalPitch
import matplotlib.pyplot as plt

# Columns needed for the shot-geometry plots.
test_plot = shots_master_women_world_cup_df[[
    'id', 'period', 'timestamp', 'team_id', 'team_name', 'player_id',
    'player_name', 'x', 'y', 'distance_to_goal', 'angle_to_goal',
    'angle_to_goal_deg', 'match_id', 'outcome_name'
]]

# Goals scored in one example match.
# NOTE(review): confirm that match_id 69301 belongs to the competition loaded
# into shots_master_women_world_cup_df; if it does not, goals_df is empty and
# only the goal centre is drawn.
goals_df = test_plot[
    (test_plot['match_id'] == 69301) &
    (test_plot['outcome_name'] == 'Goal')
]

# Goal centre in pitch coordinates
goal_x = 120
goal_y = 40

# Draw the attacking half of the pitch
pitch = VerticalPitch(pitch_type='statsbomb', line_color='black', half=True)
fig, ax = pitch.draw()

pitch.scatter(goal_x, goal_y, s=50, c='red', ax=ax, label='Goal Center')

for shot_number, (_, row) in enumerate(goals_df.iterrows()):
    shot_x = row['x']
    shot_y = row['y']
    distance = row['distance_to_goal']

    # Shot location; only the first marker carries a label so the legend shows
    # a single 'Goal' entry instead of one per shot.
    pitch.scatter(
        shot_x, shot_y, ax=ax, c='gold', s=50,
        label='Goal' if shot_number == 0 else None
    )

    # Line from the shot to the goal centre
    pitch.lines(shot_x, shot_y, goal_x, goal_y, ax=ax, color='green', lw=1.5)

    # Distance annotation at the midpoint of that line
    mid_x = (shot_x + goal_x) / 2
    mid_y = (shot_y + goal_y) / 2
    pitch.annotate(
        f"{distance:.1f}u",
        xy=(mid_x, mid_y),
        ax=ax,
        fontsize=8,
        ha='center',
        color='black',
        bbox=dict(facecolor='white', alpha=0.8, edgecolor='none', boxstyle='round', pad=0.2)
    )

# Title and legend
ax.set_title('Distance to Goal', fontsize=16)
ax.legend()
plt.show()

In [None]:
# Goal width in StatsBomb pitch-coordinate units (posts at y = 36 and y = 44),
# not in meters, so the posts line up with the plotted shot coordinates.
goal_width = 8.0
left_post_y = goal_y - goal_width / 2
right_post_y = goal_y + goal_width / 2

In [None]:
from mplsoccer import VerticalPitch
import matplotlib.pyplot as plt

# Goal geometry in StatsBomb pitch-coordinate units (posts at y = 36 and y = 44)
goal_x, goal_y = 120, 40
goal_width = 8.0
left_post_y = goal_y - goal_width / 2
right_post_y = goal_y + goal_width / 2

pitch = VerticalPitch(pitch_type='statsbomb', line_color='black', half=True)
fig, ax = pitch.draw()

# Count rows by position: goals_df is a filtered frame, so its index labels
# generally do not start at 0 and cannot be used to detect the first row.
for shot_number, (_, row) in enumerate(goals_df.iterrows()):
    shot_x = row['x']
    shot_y = row['y']
    angle = row['angle_to_goal_deg']

    # Only the first marker carries a label so the legend has a single entry
    pitch.scatter(
        shot_x, shot_y, ax=ax, c='gold', s=50,
        label='Goal' if shot_number == 0 else None
    )

    # Sight lines from the shot to both posts
    pitch.lines(shot_x, shot_y, goal_x, left_post_y, ax=ax, color='blue', lw=1.5, linestyle='--')
    pitch.lines(shot_x, shot_y, goal_x, right_post_y, ax=ax, color='blue', lw=1.5, linestyle='--')

    # The plotted value comes from 'angle_to_goal_deg', so it is labelled in degrees
    pitch.annotate(
        f"{angle:.2f}°",
        xy=(shot_x, shot_y),
        ax=ax,
        fontsize=8,
        ha='left',
        color='blue',
        xytext=(5, 5),
        textcoords='offset points',
        bbox=dict(facecolor='white', alpha=0.8, edgecolor='none', boxstyle='round', pad=0.2)
    )

ax.set_title('Shot Angles for Goals', fontsize=16)
ax.legend()
plt.show()

In [None]:
# Print the column dtypes and non-null counts, then show the first 50 rows as
# the cell's rich output.
shots_master_women_world_cup_df.info()
shots_master_women_world_cup_df.iloc[:50]

In [None]:
# Binary target: 1 when the shot outcome is 'Goal', otherwise 0.
shots_master_women_world_cup_df['goal'] = (
    shots_master_women_world_cup_df['outcome_name'].eq('Goal').astype(int)
)

# Keep the scatter readable by plotting only the first 200 shots.
shots_200 = shots_master_women_world_cup_df.head(200)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(
    shots_200['angle_to_goal_deg'],
    shots_200['goal'],
    color='black',
    marker='.',
    markersize=12,
    linestyle='none'
)

# Titles, labels and axis formatting
ax.set_title('Goal Outcome vs. Shot Angle (First 200 Shots)', fontsize=14)
ax.set_xlabel('Shot Angle (degrees)', fontsize=12)
ax.set_ylabel('Goal Scored', fontsize=12)
ax.set_yticks([0, 1])
ax.set_yticklabels(['No', 'Yes'])
ax.set_ylim(-0.05, 1.05)
ax.grid(True, linestyle='--', alpha=0.4)
fig.tight_layout()
plt.show()

In [None]:
# Build shots_master_df: one row per shot across every competition/season that
# parser.competition() lists, restricted to a fixed set of shot columns.

# Step 1: Get all available competitions
competitions_df = parser.competition()

# Step 2: Prepare storage (one DataFrame of shots per match)
all_shots = []

# Step 3: Shot columns to keep; columns missing from a match are added as None
shot_fields = [
    'id',
    'team_id',
    'team_name',
    'shot_key_pass_id',
    'x',
    'y',
    'end_x',
    'end_y',
    'end_z',
    'aerial_won',
    'shot_first_time',
    'shot_statsbomb_xg',
    'block_deflection',
    'technique_id',
    'technique_name',
    'body_part_id',
    'body_part_name',
    'sub_type_id',
    'sub_type_name',
    'outcome_id',
    'outcome_name'
]

# Step 4: Loop through competitions and matches
for _, comp_row in competitions_df.iterrows():
    comp_id = comp_row['competition_id']
    season_id = comp_row['season_id']

    try:
        matches_df = parser.match(competition_id=comp_id, season_id=season_id)
    except Exception as e:
        # Best effort: a competition that cannot be loaded is reported and skipped
        print(f"Skipping competition {comp_id}-{season_id} due to error: {e}")
        continue

    for match_id in matches_df['match_id']:
        try:
            df_event, df_related, df_freeze, df_tactics = parser.event(match_id)

            # Work on an explicit copy: columns are added below, and assigning
            # into a filtered view of df_event is a chained assignment.
            df_shots = df_event[df_event['type_name'] == 'Shot'].copy()

            for col in shot_fields:
                if col not in df_shots.columns:
                    df_shots[col] = None

            df_shots_clean = df_shots[shot_fields].copy()
            df_shots_clean['match_id'] = match_id
            df_shots_clean['competition_id'] = comp_id
            df_shots_clean['season_id'] = season_id

            all_shots.append(df_shots_clean)

        except Exception as e:
            # Best effort: a match that cannot be processed is reported and skipped
            print(f"Error processing match {match_id}: {e}")
            continue

# pd.concat raises an opaque ValueError on an empty list; fail with a message
# that points at the real cause instead.
if not all_shots:
    raise ValueError("No shots were loaded: every competition/match was skipped or failed.")

shots_master_df = pd.concat(all_shots, ignore_index=True)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np

In [None]:
# Feature matrix: the two engineered geometry features plus the raw shot location.
# Other shot fields (key pass id, end location, aerial_won, first-time flag,
# StatsBomb xG, block deflection, technique, body part, sub type) are
# deliberately left out of this model.
X = shots_master_women_world_cup_df.loc[
    :, ['angle_to_goal_deg', 'distance_to_goal', 'x', 'y']
]

# Target: 1 if the shot resulted in a goal, else 0.
y = shots_master_women_world_cup_df.loc[:, 'goal']

In [None]:
# Hold out 20% of the shots for evaluation. Goals are a small minority of the
# shots, so the split is stratified on y to keep the goal rate the same in the
# train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# The features are unscaled (pitch coordinates, degrees), so the solver may need
# more than the default number of iterations to converge. Warnings are silenced
# at the top of the notebook, so a non-converged fit would otherwise go
# unnoticed. A fit that already converges is unaffected by the higher limit.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out split: predicted probability of the positive class
# (column 1 of predict_proba) and the hard class predictions.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Per-class precision / recall / F1, followed by the ROC AUC of the probabilities.
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Probability of class = 1 (goal), evaluated on the held-out test set only.
# Scoring the full X would include the rows the model was trained on and
# would not match the ROC AUC reported for X_test.
y_probs = model.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')  # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import statsmodels.api as sm

# Logistic regression with statsmodels on the two geometry features, used to
# report McFadden's pseudo R-squared.
# NOTE(review): this cell rebinds X, y and model, which earlier cells use for
# the scikit-learn workflow; re-running those cells afterwards will pick up
# these objects instead.
X = shots_master_women_world_cup_df.loc[:, ['angle_to_goal', 'distance_to_goal']]
y = shots_master_women_world_cup_df.loc[:, 'goal']

# statsmodels does not add an intercept on its own
X_with_const = sm.add_constant(X)

# Fit the logit model
model = sm.Logit(y, X_with_const)
result = model.fit()

# McFadden's R² = 1 - LL(fitted model) / LL(intercept-only model)
ll_model, ll_null = result.llf, result.llnull

mcfadden_r2 = 1 - (ll_model / ll_null)
print(f"McFadden's R²: {mcfadden_r2:.4f}")