<a href="https://colab.research.google.com/github/hannesstuehrenberg/Probabilistic-Machine-Learning_lecture-PROJECTS/blob/main/projects/08-1SHXXXX_football_analytics/notebooks/03_data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 3. Data Preprocessing
- Steps taken to clean or transform the data


In [1]:
!pip install mplsoccer

Collecting mplsoccer
  Downloading mplsoccer-1.5.0-py3-none-any.whl.metadata (4.8 kB)
Downloading mplsoccer-1.5.0-py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.2/86.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mplsoccer
Successfully installed mplsoccer-1.5.0


In [8]:
#Import necessary libraries
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from mplsoccer import VerticalPitch
from mplsoccer import Sbopen
from tqdm import tqdm

#Ignore warnings
# NOTE: this filter silences every warning category for the rest of the session,
# so warnings raised by later cells are not displayed.
import warnings
warnings.filterwarnings('ignore')

#Initialize parser
# Shared Sbopen instance; later cells call parser.match / parser.event / parser.competition on it.
parser = Sbopen()

In [9]:
# Collect every 'Shot' event of the matches returned for competition 72 / season 107
# into a single DataFrame.
matches_df = parser.match(competition_id=72, season_id=107)
match_ids = matches_df['match_id'].tolist()

all_shots = []
for match_id in match_ids:
    # parser.event returns four frames; only the event frame is needed here.
    df_event, *_ = parser.event(match_id)

    # Boolean selection with .loc yields a new frame; assign() then sets match_id on it.
    shot_rows = df_event['type_name'].eq('Shot')
    all_shots.append(df_event.loc[shot_rows].assign(match_id=match_id))

shots_master_women_world_cup_df = pd.concat(all_shots, ignore_index=True)

KeyboardInterrupt: 

In [7]:
# Let's look at the example dataset from the Women's World Cup 2023 again.
# It consisted of 1680 shots, 184 of which were goals.
num_goals = int(shots_master_women_world_cup_df['outcome_name'].eq('Goal').sum())
print(f"Number of goals: {num_goals}")


Number of goals: 184


In [None]:
# Many more competitions are available, so the next step is to build shots_master_df from
# the shots of every competition/season, to enrich the dataset the models are trained on later.

# During data loading and exploration these columns were found to always be NaN for
# type_name == 'Shot', so they are not of interest for shots_master_df.
irrelevant_columns = [
    'tactics_formation', 'pass_recipient_id', 'pass_recipient_name', 'pass_length',
    'pass_angle', 'pass_height_id', 'pass_height_name', 'counterpress', 'pass_switch',
    'block_deflection', 'pass_cross', 'pass_assisted_shot_id', 'pass_shot_assist',
    'goalkeeper_position_id', 'goalkeeper_position_name', 'foul_committed_card_id',
    'foul_committed_card_name', 'ball_recovery_recovery_failure', 'foul_committed_advantage',
    'foul_won_advantage', 'foul_won_defensive', 'dribble_nutmeg', 'pass_goal_assist',
    'substitution_replacement_id', 'substitution_replacement_name', 'bad_behaviour_card_id',
    'bad_behaviour_card_name', 'pass_cut_back', 'dribble_overrun', 'ball_recovery_offensive',
    'pass_no_touch', 'pass_deflected', 'foul_committed_penalty', 'foul_won_penalty',
    'injury_stoppage_in_chain', 'foul_committed_offensive', 'block_offensive',
    'pass_miscommunication', 'block_save_block', 'dribble_no_touch', 'player_off_permanent'
]

all_shots = []

df_competitions = parser.competition()

# mininterval throttles how often each progress bar is redrawn.
for _, row in tqdm(df_competitions.iterrows(), total=len(df_competitions), desc="Competitions", mininterval=2.0):
    comp_id = row['competition_id']
    season_id = row['season_id']
    comp_name = row['competition_name']
    season_name = row['season_name']

    tqdm.write(f"\n🔄 Processing {comp_name} - {season_name}...")

    try:
        matches_df = parser.match(competition_id=comp_id, season_id=season_id)
        match_ids = matches_df['match_id'].tolist()

        for match_id in tqdm(
            match_ids,
            desc=f"  Matches in {comp_name[:15]} {season_name}",
            leave=False,
            mininterval=20,
        ):
            try:
                df_event, df_related, df_freeze, df_tactics = parser.event(match_id)
                df_shots = df_event[df_event['type_name'] == 'Shot'].copy()
                df_shots['match_id'] = match_id
                all_shots.append(df_shots)
            except Exception as e:
                # Best effort: a failing match is reported and skipped. tqdm.write is used
                # (as for the status line above) so the message does not break active bars.
                tqdm.write(f"     ❌ Error in match_id {match_id}: {e}")

    except Exception as e:
        # Best effort: a competition whose match list cannot be fetched is reported and skipped.
        tqdm.write(f"❌ Could not fetch matches for {comp_name} - {season_name}: {e}")

# pd.concat raises an unspecific ValueError on an empty list; fail with a clear message instead.
if not all_shots:
    raise ValueError("No shot events were loaded; cannot build shots_master_df.")

shots_master_df = pd.concat(all_shots, ignore_index=True)
# Only drop the columns that are actually present in the concatenated frame.
shots_master_df = shots_master_df.drop(columns=[col for col in irrelevant_columns if col in shots_master_df.columns])

print("\n✅ All done!")
print("📊 Final shape of shots_master_df:", shots_master_df.shape)


Competitions:   0%|          | 0/74 [00:00<?, ?it/s]


🔄 Processing 1. Bundesliga - 2023/2024...



  Matches in 1. Bundesliga 2023/2024:   0%|          | 0/34 [00:00<?, ?it/s][A
  Matches in 1. Bundesliga 2023/2024:  94%|█████████▍| 32/34 [00:20<00:01,  1.57it/s][A
Competitions:   1%|▏         | 1/74 [00:22<26:59, 22.19s/it]


🔄 Processing 1. Bundesliga - 2015/2016...



  Matches in 1. Bundesliga 2015/2016:   0%|          | 0/306 [00:00<?, ?it/s][A
  Matches in 1. Bundesliga 2015/2016:  12%|█▏        | 38/306 [00:20<02:23,  1.86it/s][A
  Matches in 1. Bundesliga 2015/2016:  12%|█▏        | 38/306 [00:40<02:23,  1.86it/s][A
  Matches in 1. Bundesliga 2015/2016:  24%|██▍       | 73/306 [00:40<02:09,  1.79it/s][A
  Matches in 1. Bundesliga 2015/2016:  24%|██▍       | 73/306 [00:50<02:09,  1.79it/s][A
  Matches in 1. Bundesliga 2015/2016:  37%|███▋      | 113/306 [01:00<01:43,  1.87it/s][A
  Matches in 1. Bundesliga 2015/2016:  37%|███▋      | 113/306 [01:20<01:43,  1.87it/s][A
  Matches in 1. Bundesliga 2015/2016:  50%|█████     | 154/306 [01:21<01:19,  1.91it/s][A
  Matches in 1. Bundesliga 2015/2016:  50%|█████     | 154/306 [01:40<01:19,  1.91it/s][A
  Matches in 1. Bundesliga 2015/2016:  59%|█████▉    | 182/306 [01:41<01:11,  1.72it/s][A
  Matches in 1. Bundesliga 2015/2016:  59%|█████▉    | 182/306 [02:00<01:11,  1.72it/s][A
  Matches in

Recovered work

In [None]:
import numpy as np

# Goal-centre coordinates are defined in this cell so that it runs on a fresh kernel
# without depending on constants created by other cells.
GOAL_CENTER_X = 120
GOAL_CENTER_Y = 40

# Euclidean distance from each shot location (x, y) to the goal centre.
shots_master_women_world_cup_df['distance_to_goal'] = np.sqrt(
    (GOAL_CENTER_X - shots_master_women_world_cup_df['x'])**2 +
    (GOAL_CENTER_Y - shots_master_women_world_cup_df['y'])**2
)

In [None]:
import numpy as np

# Constants, expressed in StatsBomb pitch coordinates (a 120 x 80 grid), not metres.
# The goal posts sit at y = 36 and y = 44, i.e. the goal is 8 units wide
# (the real-world 7.32 m goal width must not be mixed with these coordinates).
GOAL_WIDTH = 8.0
GOAL_CENTER_X = 120
GOAL_CENTER_Y = 40

In [None]:
# Euclidean distance from each shot to the goal centre. The coordinates are read directly
# from the DataFrame so the cell does not rely on x / y arrays created in another cell.
distance_to_goal = np.sqrt(
    (GOAL_CENTER_X - shots_master_women_world_cup_df['x'].to_numpy())**2 +
    (GOAL_CENTER_Y - shots_master_women_world_cup_df['y'].to_numpy())**2
)
shots_master_women_world_cup_df['distance_to_goal'] = distance_to_goal

In [None]:
# Opening angle of the goal as seen from each shot location, via the law of cosines
# on the triangle (shot, left post, right post).

# Goalpost y-coordinates
goal_y1 = GOAL_CENTER_Y - GOAL_WIDTH / 2  # left post
goal_y2 = GOAL_CENTER_Y + GOAL_WIDTH / 2  # right post

# Shot coordinates
x = shots_master_women_world_cup_df['x'].to_numpy()
y = shots_master_women_world_cup_df['y'].to_numpy()

# Distances from shot to left and right goalposts
a = np.sqrt((GOAL_CENTER_X - x)**2 + (goal_y1 - y)**2)
b = np.sqrt((GOAL_CENTER_X - x)**2 + (goal_y2 - y)**2)
c = GOAL_WIDTH

# Avoid divide-by-zero: a shot located exactly on a post has a == 0 or b == 0, so the
# denominator is 0. Those entries keep the prefilled value 1.0 (angle 0) instead of
# becoming NaN/inf. The `!= 0` test is still True for NaN, so missing coordinates
# continue to propagate as NaN.
numerator = a**2 + b**2 - c**2
denominator = 2 * a * b
cos_angle = np.divide(
    numerator,
    denominator,
    out=np.ones_like(denominator, dtype=float),
    where=denominator != 0,
)
cos_angle = np.clip(cos_angle, -1.0, 1.0)  # Ensure within valid arccos range

# Calculate angle in radians
angle_radians = np.arccos(cos_angle)

# Store in DataFrame (radians and degrees)
shots_master_women_world_cup_df['angle_to_goal'] = angle_radians
shots_master_women_world_cup_df['angle_to_goal_deg'] = np.degrees(angle_radians)

In [None]:
from mplsoccer import VerticalPitch
import matplotlib.pyplot as plt

# Visual check of distance_to_goal: draw the goals of one match and the line from each
# shot location to the goal centre, annotated with the computed distance.
test_plot = shots_master_women_world_cup_df[['id','period', 'timestamp', 'team_id', 'team_name', 'player_id', 'player_name', 'x', 'y', 'distance_to_goal', 'angle_to_goal', 'angle_to_goal_deg', 'match_id', 'outcome_name']]

# NOTE(review): confirm that match 69301 is part of the loaded competition/season;
# if it is not, goals_df is empty and only the goal centre is drawn.
goals_df = test_plot[
    (test_plot['match_id'] == 69301) &
    (test_plot['outcome_name'] == 'Goal')
]


##############################

goal_x = 120
goal_y = 40

# Create the pitch
pitch = VerticalPitch(pitch_type='statsbomb', line_color='black', half = True)
fig, ax = pitch.draw()

pitch.scatter(goal_x, goal_y, s=50, c='red', ax=ax, label='Goal Center')

for shot_number, (_, row) in enumerate(goals_df.iterrows()):
    shot_x = row['x']
    shot_y = row['y']
    distance = row['distance_to_goal']

    # Shot location; only the first marker carries the label so the legend
    # shows a single 'Goal' entry instead of one per shot.
    pitch.scatter(
        shot_x, shot_y, ax=ax, c='gold', s=50,
        label='Goal' if shot_number == 0 else None,
    )

    # Line to goal center
    pitch.lines(shot_x, shot_y, goal_x, goal_y, ax=ax, color='green', lw=1.5)

    # Distance label at the midpoint of the line ("u" = pitch units)
    mid_x = (shot_x + goal_x) / 2
    mid_y = (shot_y + goal_y) / 2
    pitch.annotate(
      f"{distance:.1f}u",
      xy=(mid_x, mid_y),
      ax=ax,
      fontsize=8,
      ha='center',
      color='black',
      bbox=dict(facecolor='white', alpha=0.8, edgecolor='none', boxstyle='round', pad=0.2)
    )



# Title and legend
ax.set_title('Distance to Goal', fontsize=16)
ax.legend()
plt.show()

In [None]:
# Goal width in StatsBomb pitch units (posts at y = 36 and y = 44), not metres:
# the pitch coordinates used for plotting are not metric.
goal_width = 8.0
left_post_y = goal_y - goal_width / 2
right_post_y = goal_y + goal_width / 2

In [None]:
from mplsoccer import VerticalPitch
import matplotlib.pyplot as plt

# Visual check of angle_to_goal_deg: for every goal in goals_df draw the two lines from the
# shot location to the goal posts and annotate the opening angle.
goal_x, goal_y = 120, 40
# Goal width in StatsBomb pitch units (posts at y = 36 and y = 44), not metres.
goal_width = 8.0
left_post_y = goal_y - goal_width / 2
right_post_y = goal_y + goal_width / 2

pitch = VerticalPitch(pitch_type='statsbomb', line_color='black', half=True)
fig, ax = pitch.draw()

# enumerate() gives the loop position; the DataFrame index label of a filtered frame
# rarely contains 0, so it cannot be used to pick the "first" shot for the legend.
for shot_number, (_, row) in enumerate(goals_df.iterrows()):
    shot_x = row['x']
    shot_y = row['y']
    angle = row['angle_to_goal_deg']

    pitch.scatter(shot_x, shot_y, ax=ax, c='gold', s=50, label='Goal' if shot_number == 0 else None)

    pitch.lines(shot_x, shot_y, goal_x, left_post_y, ax=ax, color='blue', lw=1.5, linestyle='--')
    pitch.lines(shot_x, shot_y, goal_x, right_post_y, ax=ax, color='blue', lw=1.5, linestyle='--')

    # The value comes from the degrees column, so it is labelled in degrees.
    pitch.annotate(
        f"{angle:.1f}°",
        xy=(shot_x, shot_y),
        ax=ax,
        fontsize=8,
        ha='left',
        color='blue',
        xytext=(5, 5),
        textcoords='offset points',
        bbox=dict(facecolor='white', alpha=0.8, edgecolor='none', boxstyle='round' ,pad=0.2)
    )

ax.set_title('Shot Angles for Goals', fontsize=16)
ax.legend()
plt.show()

In [None]:
# Print the column overview of the World Cup shots frame, then show its first 50 rows
# as the cell's output (the last expression of the cell is what gets displayed).
shots_master_women_world_cup_df.info()
shots_master_women_world_cup_df.head(50)

In [None]:
# Binary target column: 1 where the shot's outcome is 'Goal', 0 otherwise.
shots_master_women_world_cup_df['goal'] = (
    shots_master_women_world_cup_df['outcome_name'].eq('Goal').astype(int)
)

# Only the first 200 shots are plotted.
shots_200 = shots_master_women_world_cup_df.head(200)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(
    shots_200['angle_to_goal_deg'],
    shots_200['goal'],
    color='black',
    marker='.',
    markersize=12,
    linestyle='none',
)

# Axes: angle on x, binary outcome on y with readable tick labels
ax.set_xlabel('Shot Angle (degrees)', fontsize=12)
ax.set_ylabel('Goal Scored', fontsize=12)
ax.set_yticks([0, 1])
ax.set_yticklabels(['No', 'Yes'])
ax.set_ylim(-0.05, 1.05)
ax.set_title('Goal Outcome vs. Shot Angle (First 200 Shots)', fontsize=14)
ax.grid(True, linestyle='--', alpha=0.4)
fig.tight_layout()
plt.show()

In [None]:
# Build shots_master_df from the shots of every available competition/season, restricted
# to a fixed set of shot fields plus match/competition/season identifiers.
# NOTE: this rebinds all_shots and shots_master_df, which the earlier all-competitions
# cell also assigns.

# Step 1: Get all available competitions
competitions_df = parser.competition()

# Step 2: Prepare storage
all_shots = []

# Step 3: Define your desired shot fields
shot_fields = [
    'id',
    'team_id',
    'team_name',
    'shot_key_pass_id',
    'x',
    'y',
    'end_x',
    'end_y',
    'end_z',
    'aerial_won',
    'shot_first_time',
    'shot_statsbomb_xg',
    'block_deflection',
    'technique_id',
    'technique_name',
    'body_part_id',
    'body_part_name',
    'sub_type_id',
    'sub_type_name',
    'outcome_id',
    'outcome_name'
]

# Step 4: Loop through competitions and matches
for _, comp_row in competitions_df.iterrows():
    comp_id = comp_row['competition_id']
    season_id = comp_row['season_id']

    try:
        matches_df = parser.match(competition_id=comp_id, season_id=season_id)
    except Exception as e:
        # Best effort: a competition whose match list cannot be fetched is skipped.
        print(f"Skipping competition {comp_id}-{season_id} due to error: {e}")
        continue

    for match_id in matches_df['match_id']:
        try:
            df_event, df_related, df_freeze, df_tactics = parser.event(match_id)
            # .copy() gives an independent frame, so filling missing columns below does not
            # assign into a slice of df_event (chained-assignment hazard).
            df_shots = df_event[df_event['type_name'] == 'Shot'].copy()

            # Make sure every requested field exists, even if this match lacks it.
            for col in shot_fields:
                if col not in df_shots.columns:
                    df_shots[col] = None

            df_shots_clean = df_shots[shot_fields].copy()
            df_shots_clean['match_id'] = match_id
            df_shots_clean['competition_id'] = comp_id
            df_shots_clean['season_id'] = season_id

            all_shots.append(df_shots_clean)

        except Exception as e:
            # Best effort: a failing match is reported and skipped.
            print(f"Error processing match {match_id}: {e}")
            continue


# pd.concat raises an unspecific ValueError on an empty list; fail with a clear message instead.
if not all_shots:
    raise ValueError("No shot events were loaded; cannot build shots_master_df.")

shots_master_df = pd.concat(all_shots, ignore_index=True)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np

In [None]:
# Feature matrix: shot angle in degrees, distance to goal and the raw shot coordinates.
# Further candidate columns, currently not used:
#   body_part_id, body_part_name, shot_key_pass_id, end_x, end_y, end_z, aerial_won,
#   shot_first_time, shot_statsbomb_xg, block_deflection, technique_id, technique_name,
#   sub_type_id, sub_type_name
X = shots_master_women_world_cup_df.loc[
    :, ['angle_to_goal_deg', 'distance_to_goal', 'x', 'y']
]

# Target: the 'goal' column of the same frame.
y = shots_master_women_world_cup_df['goal']

In [None]:
# Hold out 20% of the shots for evaluation. Goals are a small minority of the shots, so the
# split is stratified on y to keep the goal rate the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit a scikit-learn logistic regression with default settings on the training split.
# model.fit(...) is the last expression of the cell, so its return value is displayed.
# NOTE(review): warnings are globally suppressed in this notebook, so a convergence
# warning from the solver would not be visible here — consider checking convergence.
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out split: predicted probabilities are taken from column 1 of
# predict_proba, predicted labels come from predict.
y_prob = model.predict_proba(X_test)[:, 1]  # probabilities
y_pred = model.predict(X_test)

# Per-class precision/recall/F1 report, followed by the ROC AUC of the probabilities.
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# ROC curve of the fitted classifier. It is computed on the held-out test split only:
# scoring the full data set would include the rows the model was trained on and would
# not match the test-set ROC AUC reported in the evaluation cell.

# Probability of class = 1 (goal)
y_probs = model.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')  # Diagonal line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import statsmodels.api as sm

# Logistic regression with statsmodels on angle (radians) and distance, used to report
# McFadden's pseudo R².
# NOTE: this cell rebinds X, y and model, which earlier cells use for the scikit-learn workflow.
y = shots_master_women_world_cup_df['goal']
X = shots_master_women_world_cup_df[['angle_to_goal', 'distance_to_goal']]

# statsmodels does not add an intercept on its own, so a constant column is added explicitly.
X_with_const = sm.add_constant(X)

# Build and fit the Logit model
model = sm.Logit(y, X_with_const)
result = model.fit()

# McFadden's R² = 1 - (log-likelihood of the fitted model / log-likelihood of the null model)
ll_model, ll_null = result.llf, result.llnull

mcfadden_r2 = 1 - (ll_model / ll_null)
print(f"McFadden's R²: {mcfadden_r2:.4f}")