# Exploration of Shots Across All Competitions  

In the code below, all **shot events** from each competition and each season present in the StatsBomb Open Data are collected. The objective is to determine:  

- the **total number of shots** in the dataset,  
- how many of them include a **freeze frame**,  
- and the proportion of shots with and without this contextual information.  

In [None]:
from statsbombpy import sb
import pandas as pd
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")


# Retrieve all available competitions in StatsBomb Open Data
competitions = sb.competitions()

# One DataFrame of shot events per match; concatenated once after the loop
all_shots = []

# Iterate over each competition-season pair
for _, row in tqdm(competitions.iterrows(), total=len(competitions), desc="Processing competitions"):
    comp_id = row['competition_id']
    season_id = row['season_id']

    try:
        # Retrieve all matches for the given competition and season
        matches = sb.matches(comp_id, season_id)
        match_ids = matches['match_id']
    except Exception as e:
        # The match list itself is unavailable, so the whole pair is skipped.
        # tqdm.write is used (as in the metadata cell) instead of print so the
        # message does not get interleaved with the progress bars.
        tqdm.write(f"Skipping comp {comp_id}, season {season_id}: {e}")
        continue

    # Iterate through each match
    for match_id in tqdm(match_ids, desc=f"Comp {comp_id}, Season {season_id}", leave=False):
        try:
            # Retrieve all events for this match and keep only "Shot" events
            events = sb.events(match_id=match_id)
            shots = events[events['type'] == 'Shot']
        except Exception as e:
            # A single failing match must not discard the remaining matches
            # of the same competition-season, so only this match is skipped.
            tqdm.write(f"Skipping match {match_id} (comp {comp_id}, season {season_id}): {e}")
            continue

        # If there are shots in this match, append them to our global list
        if not shots.empty:
            all_shots.append(shots)

# Concatenate all shots into a single DataFrame.
# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame that still exposes the columns the following cells rely on.
if all_shots:
    shots_df = pd.concat(all_shots, ignore_index=True)
else:
    shots_df = pd.DataFrame(columns=['match_id', 'shot_freeze_frame'])

# Count statistics
total_shots = len(shots_df)
# The freeze-frame column is absent when no collected shot carries one
if 'shot_freeze_frame' in shots_df.columns:
    shots_with_ff = shots_df['shot_freeze_frame'].notna().sum()
else:
    shots_with_ff = 0
shots_without_ff = total_shots - shots_with_ff
# Avoid a division by zero when no shot could be collected
pct_with_ff = shots_with_ff / total_shots if total_shots else 0.0

# Print summary statistics
print("----------------------------------")
print(f"Total shots         : {total_shots}")
print(f"With freeze frame   : {shots_with_ff}")
print(f"Without freeze frame: {shots_without_ff}")
print(f"Percentage with FF  : {pct_with_ff:.2%}")


Processing competitions: 100%|██████████| 75/75 [42:10<00:00, 33.74s/it]  


----------------------------------
Total shots         : 88023
With freeze frame   : 86833
Without freeze frame: 1190
Percentage with FF  : 98.65%


#### Save the df with all the shots from all competitions and seasons

In [None]:
# Save the DataFrame to a CSV file
from pathlib import Path

# Define the destination path for the saved DataFrame
src_path = Path("../task1_xg/data/shots_df.csv")
print(f"Saving {src_path.name}")

# to_csv does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there)
src_path.parent.mkdir(parents=True, exist_ok=True)

# Save the DataFrame
shots_df.to_csv(src_path, index=False)


Saving shots_df.csv


## Analysis of Shots Missing Freeze Frames

This script examines all StatsBomb open-data shots and identifies matches where at least one shot has no freeze frame. It builds per-match summaries with the number and percentage of missing freeze frames, merges this with match metadata, and then reports both match-level details and global statistics restricted to these affected matches. The output highlights which matches have incomplete freeze frame data and quantifies its overall impact.


In [None]:
from statsbombpy import sb
import numpy as np
import pandas as pd
from tqdm import tqdm

# Helper: check for missing/empty freeze frame
def is_empty_ff(x):
    """Tell whether a freeze-frame value should be counted as missing.

    Returns True for None, for NA-like scalars (float NaN, pandas NA) and for
    zero-length lists / numpy arrays. Every other value (e.g. a dict, a
    non-empty list) is considered present and yields False.
    """
    # Sequence-like payloads: missing only when they hold no elements
    if isinstance(x, (list, np.ndarray)):
        return len(x) == 0

    if x is None:
        return True

    # Scalars: missing when pandas recognises them as NA; any failure of the
    # check itself is treated as "not NA"
    try:
        na_scalar = bool(pd.api.types.is_scalar(x) and pd.isna(x))
    except Exception:
        na_scalar = False
    return na_scalar


# Per-match shot counts, kept only for matches with a shot lacking a freeze frame
shots_df = shots_df.copy()  # work on a copy before adding the helper flag column
shots_df['no_ff'] = shots_df['shot_freeze_frame'].apply(is_empty_ff)

# 'size' counts every shot of the match, 'sum' counts the flagged ones
shots_by_match = shots_df.groupby('match_id', as_index=False)
match_counts = shots_by_match.agg(
    total_shots=('no_ff', 'size'),
    shots_without_ff=('no_ff', 'sum'),
)

# Drop matches where every shot has a freeze frame
has_missing = match_counts['shots_without_ff'] > 0
per_match = match_counts[has_missing].copy()

# Share of shots without freeze frame, as a percentage with two decimals
missing_share = per_match['shots_without_ff'] / per_match['total_shots']
per_match['pct_without_ff'] = (missing_share * 100).round(2)


# Collect the match tables of every competition/season pair in the open data
comps = sb.competitions()
season_frames = []

comp_season_pairs = comps[['competition_id', 'season_id']].itertuples(index=False)
for comp_id, season_id in tqdm(comp_season_pairs,
                               total=len(comps), desc="Fetching matches metadata"):
    try:
        fetched = sb.matches(competition_id=comp_id, season_id=season_id)
        has_rows = fetched is not None and not fetched.empty
    except Exception as e:
        # Report the failure and move on to the next pair
        tqdm.write(f"Warning: failed comp={comp_id}, season={season_id} -> {e}")
        continue
    if has_rows:
        season_frames.append(fetched)

# Single table with one row per match; empty frame when nothing was fetched
matches_all = (
    pd.concat(season_frames, ignore_index=True).drop_duplicates(subset='match_id')
    if season_frames
    else pd.DataFrame()
)

# Candidate metadata columns; only those present in the fetched table are kept
meta_cols = [
    'match_id',
    'competition', 'competition_name',
    'season', 'season_name',
    'match_date', 'kick_off',
    'home_team', 'away_team',
    'home_score', 'away_score',
    'stadium', 'stadium_name',
]
available_cols = set(matches_all.columns)
use_cols = [name for name in meta_cols if name in available_cols]
matches_meta = matches_all.loc[:, use_cols].copy()


# Merge and sort for reporting.
# When no metadata could be fetched, matches_meta has no 'match_id' column and
# the merge would raise a KeyError; report the per-match counts alone instead.
if 'match_id' in matches_meta.columns:
    report = per_match.merge(matches_meta, on='match_id', how='left')
else:
    report = per_match.copy()
# Worst affected matches first (by percentage, then by absolute count)
report = report.sort_values(['pct_without_ff', 'shots_without_ff'], ascending=False).reset_index(drop=True)

print("Matches with at least one shot missing freeze frame:")
# Both naming variants of the competition/season columns are listed, mirroring
# meta_cols: with only the '*_name' variants the competition and season were
# silently dropped from the printed table. Absent columns are filtered out below.
cols_to_show = [
    'match_id', 'total_shots', 'shots_without_ff', 'pct_without_ff',
    'competition', 'competition_name', 'season', 'season_name',
    'match_date', 'home_team', 'away_team'
]
cols_to_show = [c for c in cols_to_show if c in report.columns]
print(report[cols_to_show].to_string(index=False))

# Global totals restricted to matches with at least 1 missing FF
restricted_total = report['total_shots'].sum()
restricted_missing = report['shots_without_ff'].sum()
# Guard against an empty report (no affected match) to avoid dividing by zero
restricted_pct = round(restricted_missing / restricted_total * 100, 2) if restricted_total > 0 else 0

print("\nGlobal Stats for matches with ≥1 missing freeze frame:")
print(f"Total shots in those matches     : {restricted_total}")
print(f"Without freeze frame (in matches): {restricted_missing}")
print(f"Percentage missing               : {restricted_pct}%")

Fetching matches metadata: 100%|██████████| 75/75 [00:15<00:00,  4.78it/s]

Matches with at least one shot missing freeze frame:
 match_id  total_shots  shots_without_ff  pct_without_ff match_date                  home_team                   away_team
  3923880           33                13           39.39 2024-02-10               South Africa                    Congo DR
  3922240           41                16           39.02 2024-01-28                      Egypt                    Congo DR
  4018357           39                15           38.46 2025-07-19             France Women's             Germany Women's
  3869321           31                10           32.26 2022-12-09                Netherlands                   Argentina
  3922242           31                10           32.26 2024-01-29                    Senegal               Côte d'Ivoire
  3902968           58                18           31.03 2023-08-12          Australia Women's              France Women's
  3942228           26                 8           30.77 2024-07-05                  A




## Analysis of Shot StatsBomb xG

Below, the `shots_df.csv` file is loaded with the column `shot_statsbomb_xg` explicitly cast to type `float64`.  
This guarantees that numeric values are stored as floats, while invalid entries (e.g., empty strings or `"null"`) are automatically converted to `NaN`. The column data type and the total number of missing values are then displayed to verify that the column contains no missing values.

In [None]:
import pandas as pd
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

src_path = Path("../task1_xg/data/shots_df.csv")

shots_df = pd.read_csv(src_path)

# Force 'shot_statsbomb_xg' to float64. Passing dtype="float64" to read_csv
# would raise on a non-numeric entry instead of producing NaN, so the column is
# coerced explicitly: errors="coerce" turns every unparseable value into NaN.
shots_df["shot_statsbomb_xg"] = pd.to_numeric(
    shots_df["shot_statsbomb_xg"], errors="coerce"
).astype("float64")

print("Column type:", shots_df["shot_statsbomb_xg"].dtype)
print("Total NaN values:", shots_df["shot_statsbomb_xg"].isna().sum())


Column type: float64
Total NaN values: 0
