In [1]:
from pathlib import Path
import os
import json

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Get events data

In [2]:
# Get project root
project_root = Path().absolute().parent

# Get path to event data
event_data_path = os.path.join(project_root, "data", "wyscout_data", "events_data")

# Get all JSON files in the directory
json_files = [f for f in os.listdir(event_data_path) if f.endswith('.json')]

# Add all events to a list (more efficient than concatenating on each iteration)
df_list = [
    pd.DataFrame(json.load(open(os.path.join(event_data_path, file))))
    for file in json_files
]

# Concatenate all dataframes at once
df_events = pd.concat(df_list, ignore_index=True)

In [3]:
df_events.head()

Unnamed: 0,id,matchId,matchPeriod,minute,second,matchTimestamp,videoTimestamp,relatedEventId,type,location,team,opponentTeam,player,pass,shot,groundDuel,aerialDuel,infraction,carry,possession
0,2384313747,5588197,1H,0,2,00:00:02.559,3.559115,2384314000.0,"{'primary': 'pass', 'secondary': ['back_pass',...","{'x': 52, 'y': 52}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 286831, 'name': 'D. Solanke', 'position...","{'accurate': True, 'angle': -159, 'height': No...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty..."
1,2384313748,5588197,1H,0,4,00:00:04.324,5.324929,2384314000.0,"{'primary': 'pass', 'secondary': ['lateral_pas...","{'x': 37, 'y': 42}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 413582, 'name': 'Y. Bissouma', 'positio...","{'accurate': True, 'angle': 62, 'height': None...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty..."
2,2384313771,5588197,1H,0,6,00:00:06.973,7.973209,2384314000.0,"{'primary': 'pass', 'secondary': ['lateral_pas...","{'x': 45, 'y': 65}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 551442, 'name': 'Pedro Porro', 'positio...","{'accurate': True, 'angle': -95, 'height': Non...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty..."
3,2384313772,5588197,1H,0,8,00:00:08.768,9.768278,2384314000.0,"{'primary': 'pass', 'secondary': ['back_pass',...","{'x': 44, 'y': 47}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 413582, 'name': 'Y. Bissouma', 'positio...","{'accurate': True, 'angle': -135, 'height': No...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty..."
4,2384313775,5588197,1H,0,10,00:00:10.769,11.769625,2384314000.0,"{'primary': 'pass', 'secondary': ['forward_pas...","{'x': 34, 'y': 32}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 136441, 'name': 'B. Davies', 'position'...","{'accurate': True, 'angle': 32, 'height': None...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty..."


# Explore data

In [4]:
df_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480511 entries, 0 to 480510
Data columns (total 20 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              480511 non-null  int64  
 1   matchId         480511 non-null  int64  
 2   matchPeriod     480511 non-null  object 
 3   minute          480511 non-null  int64  
 4   second          480511 non-null  int64  
 5   matchTimestamp  480511 non-null  object 
 6   videoTimestamp  480511 non-null  object 
 7   relatedEventId  452956 non-null  float64
 8   type            480511 non-null  object 
 9   location        480150 non-null  object 
 10  team            480511 non-null  object 
 11  opponentTeam    480511 non-null  object 
 12  player          480511 non-null  object 
 13  pass            274869 non-null  object 
 14  shot            6971 non-null    object 
 15  groundDuel      77664 non-null   object 
 16  aerialDuel      18450 non-null   object 
 17  infraction

## Possession column

In [5]:
# Possession collumn is a dictionary
df_events.iloc[0]["possession"]

{'id': 2384313747,
 'duration': '9.752984',
 'types': [],
 'eventsNumber': 6,
 'eventIndex': 0,
 'startLocation': {'x': 52, 'y': 52},
 'endLocation': {'x': 45, 'y': 28},
 'team': {'id': 1624, 'name': 'Tottenham Hotspur'},
 'attack': None}

In [6]:
df_events.iloc[0]["possession"]['id']

2384313747

Test what method is the fastest to get data from the object column like possession

In [7]:
import time

# Method 1: apply with lambda and None handling
start = time.time()
test1 = df_events['possession'].apply(lambda x: x['id'] if x is not None else None)
time1 = time.time() - start

# Method 2: list comprehension with None handling
start = time.time()
test2 = [possession['id'] if possession is not None else None for possession in df_events['possession']]
time2 = time.time() - start

# Method 3: Using .get() method
start = time.time()
test3 = df_events['possession'].apply(lambda x: x.get('id') if x is not None else None)
time3 = time.time() - start

# Method 4: List comprehension with .get()
start = time.time()
test4 = [possession.get('id') if possession is not None else None for possession in df_events['possession']]
time4 = time.time() - start


print(f"Apply method: {time1:.4f} seconds")
print(f"List comprehension: {time2:.4f} seconds")
print(f"Apply with .get(): {time3:.4f} seconds")
print(f"List comprehension with .get(): {time4:.4f} seconds")

Apply method: 0.3159 seconds
List comprehension: 0.1818 seconds
Apply with .get(): 0.3087 seconds
List comprehension with .get(): 0.1790 seconds


## Type column

In [8]:
df_events.iloc[22]["type"]

# Will always consist of primary and secondary type


primary_types = [
    "acceleration",
    "clearance", 
    "corner",
    "duel",
    "fairplay",
    "free_kick",
    "game_interruption",
    "goal_kick",
    "goalkeeper_exit",
    "infraction",
    "interception",
    "offside",
    "own_goal",
    "pass",
    "penalty",
    "pressing_attempt",
    "received_pass",
    "shot",
    "shot_against",
    "throw_in",
    "touch"
]

secondary_types = [
    "aerial_duel",
    "assist",
    "back_pass",
    "ball_out",
    "carry",
    "conceded_goal",
    "counterpressing_recovery",
    "cross",
    "cross_blocked",
    "deep_completed_cross",
    "deep_completion",
    "defensive_duel",
    "dribble",
    "dribbled_past_attempt",
    "forward_pass",
    "foul",
    "foul_suffered",
    "free_kick_cross",
    "free_kick_shot",
    "goal",
    "ground_duel",
    "hand_pass",
    "head_pass",
    "head_shot",
    "key_pass",
    "lateral_pass",
    "linkup_play",
    "long_pass",
    "loose_ball_duel",
    "loss",
    "offensive_duel",
    "opportunity",
    "pass_into_penalty_area",
    "pass_to_final_third",
    "penalty_conceded_goal",
    "penalty_foul",
    "penalty_goal",
    "penalty_save",
    "pressing_duel",
    "progressive_pass",
    "progressive_run",
    "recovery",
    "red_card",
    "save",
    "save_with_reflex",
    "second_assist",
    "short_or_medium_pass",
    "shot_after_corner",
    "shot_after_free_kick",
    "shot_after_throw_in",
    "shot_assist",
    "shot_block",
    "sliding_tackle",
    "smart_pass",
    "third_assist",
    "through_pass",
    "touch_in_box",
    "under_pressure",
    "whistle",
    "yellow_card"
]

## Timestamp column

In [9]:
df_match = df_events[(df_events['matchId'] == 5588197) & (df_events['matchPeriod'] == '1H')]
df_match.tail()

Unnamed: 0,id,matchId,matchPeriod,minute,second,matchTimestamp,videoTimestamp,relatedEventId,type,location,team,opponentTeam,player,pass,shot,groundDuel,aerialDuel,infraction,carry,possession
884,2384314325,5588197,1H,48,1,00:48:01.716,2882.716201,2384314000.0,"{'primary': 'interception', 'secondary': ['for...","{'x': 72, 'y': 84}","{'id': 1625, 'name': 'Manchester City'}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 658798, 'name': 'R. Lewis', 'position':...","{'accurate': True, 'angle': 0, 'height': None,...",,,,,,"{'id': 2384314310, 'duration': '35.098258', 't..."
885,2384314327,5588197,1H,48,5,00:48:05.714,2886.714375,2384314000.0,"{'primary': 'duel', 'secondary': ['dribble', '...","{'x': 90, 'y': 84}","{'id': 1625, 'name': 'Manchester City'}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 661732, 'name': 'Savinho', 'position': ...",,,"{'opponent': {'id': 14911, 'name': 'Son Heung-...",,,,"{'id': 2384314310, 'duration': '35.098258', 't..."
886,2384314611,5588197,1H,48,5,00:48:05.916,2886.916831,2384314000.0,"{'primary': 'duel', 'secondary': ['defensive_d...","{'x': 10, 'y': 16}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 14911, 'name': 'Son Heung-Min', 'positi...",,,"{'opponent': {'id': 661732, 'name': 'Savinho',...",,,,"{'id': 2384314310, 'duration': '35.098258', 't..."
887,2384314328,5588197,1H,48,7,00:48:07.201,2888.201434,,"{'primary': 'pass', 'secondary': ['cross']}","{'x': 87, 'y': 83}","{'id': 1625, 'name': 'Manchester City'}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 661732, 'name': 'Savinho', 'position': ...","{'accurate': False, 'angle': -148, 'height': '...",,,,,,"{'id': 2384314310, 'duration': '35.098258', 't..."
888,2384314612,5588197,1H,48,8,00:48:08.940,2889.940408,,"{'primary': 'interception', 'secondary': []}","{'x': 6, 'y': 51}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 326998, 'name': 'G. Vicario', 'position...",,,,,,,"{'id': 2384314310, 'duration': '35.098258', 't..."


In [10]:
df_match = df_events[(df_events['matchId'] == 5588197) & (df_events['matchPeriod'] == '2H')]
df_match.head()

Unnamed: 0,id,matchId,matchPeriod,minute,second,matchTimestamp,videoTimestamp,relatedEventId,type,location,team,opponentTeam,player,pass,shot,groundDuel,aerialDuel,infraction,carry,possession
889,2384314329,5588197,2H,45,1,00:45:01.999,2905.999753,2384314000.0,"{'primary': 'pass', 'secondary': ['back_pass',...","{'x': 51, 'y': 51}","{'id': 1625, 'name': 'Manchester City'}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 447205, 'name': 'P. Foden', 'position':...","{'accurate': True, 'angle': 180, 'height': Non...",,,,,,"{'id': 2384314329, 'duration': '24.7149565', '..."
890,2384314330,5588197,2H,45,6,00:45:06.447,2910.447573,2384314000.0,"{'primary': 'pass', 'secondary': ['lateral_pas...","{'x': 25, 'y': 51}","{'id': 1625, 'name': 'Manchester City'}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 71654, 'name': 'Ederson', 'position': '...","{'accurate': True, 'angle': 57, 'height': None...",,,,,,"{'id': 2384314329, 'duration': '24.7149565', '..."
891,2384314331,5588197,2H,45,10,00:45:10.577,2914.577382,2384314000.0,"{'primary': 'pass', 'secondary': ['lateral_pas...","{'x': 41, 'y': 89}","{'id': 1625, 'name': 'Manchester City'}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 8277, 'name': 'K. Walker', 'position': ...","{'accurate': True, 'angle': -129, 'height': No...",,,,,,"{'id': 2384314329, 'duration': '24.7149565', '..."
892,2384314334,5588197,2H,45,11,00:45:11.652,2915.652044,2384314000.0,"{'primary': 'pass', 'secondary': ['forward_pas...","{'x': 32, 'y': 74}","{'id': 1625, 'name': 'Manchester City'}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 297957, 'name': 'M. Akanji', 'position'...","{'accurate': True, 'angle': 0, 'height': None,...",,,,,,"{'id': 2384314329, 'duration': '24.7149565', '..."
893,2384314335,5588197,2H,45,12,00:45:12.472,2916.472783,2384314000.0,"{'primary': 'pass', 'secondary': ['back_pass',...","{'x': 38, 'y': 73}","{'id': 1625, 'name': 'Manchester City'}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 14808, 'name': 'İ. Gündoğan', 'position...","{'accurate': True, 'angle': -150, 'height': No...",,,,,,"{'id': 2384314329, 'duration': '24.7149565', '..."


# Prepare data

## Get possession ids from possession column

In [11]:
# Add new column for possession id with proper integer dtype (without it, it would be a float)
df_events['possession_id'] = pd.Series(
    [possession.get('id') if possession is not None else None for possession in df_events['possession']],
    dtype='Int64'  # pandas nullable integer type
)

df_events.head()

Unnamed: 0,id,matchId,matchPeriod,minute,second,matchTimestamp,videoTimestamp,relatedEventId,type,location,...,opponentTeam,player,pass,shot,groundDuel,aerialDuel,infraction,carry,possession,possession_id
0,2384313747,5588197,1H,0,2,00:00:02.559,3.559115,2384314000.0,"{'primary': 'pass', 'secondary': ['back_pass',...","{'x': 52, 'y': 52}",...,"{'id': 1625, 'name': 'Manchester City'}","{'id': 286831, 'name': 'D. Solanke', 'position...","{'accurate': True, 'angle': -159, 'height': No...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty...",2384313747
1,2384313748,5588197,1H,0,4,00:00:04.324,5.324929,2384314000.0,"{'primary': 'pass', 'secondary': ['lateral_pas...","{'x': 37, 'y': 42}",...,"{'id': 1625, 'name': 'Manchester City'}","{'id': 413582, 'name': 'Y. Bissouma', 'positio...","{'accurate': True, 'angle': 62, 'height': None...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty...",2384313747
2,2384313771,5588197,1H,0,6,00:00:06.973,7.973209,2384314000.0,"{'primary': 'pass', 'secondary': ['lateral_pas...","{'x': 45, 'y': 65}",...,"{'id': 1625, 'name': 'Manchester City'}","{'id': 551442, 'name': 'Pedro Porro', 'positio...","{'accurate': True, 'angle': -95, 'height': Non...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty...",2384313747
3,2384313772,5588197,1H,0,8,00:00:08.768,9.768278,2384314000.0,"{'primary': 'pass', 'secondary': ['back_pass',...","{'x': 44, 'y': 47}",...,"{'id': 1625, 'name': 'Manchester City'}","{'id': 413582, 'name': 'Y. Bissouma', 'positio...","{'accurate': True, 'angle': -135, 'height': No...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty...",2384313747
4,2384313775,5588197,1H,0,10,00:00:10.769,11.769625,2384314000.0,"{'primary': 'pass', 'secondary': ['forward_pas...","{'x': 34, 'y': 32}",...,"{'id': 1625, 'name': 'Manchester City'}","{'id': 136441, 'name': 'B. Davies', 'position'...","{'accurate': True, 'angle': 32, 'height': None...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty...",2384313747


## Split type column

In [12]:
df_events['primary_type'] = [type.get('primary') if type is not None else None for type in df_events['type']]
df_events['secondary_type'] = [type.get('secondary') if type is not None else None for type in df_events['type']]

df_events.head()

Unnamed: 0,id,matchId,matchPeriod,minute,second,matchTimestamp,videoTimestamp,relatedEventId,type,location,...,pass,shot,groundDuel,aerialDuel,infraction,carry,possession,possession_id,primary_type,secondary_type
0,2384313747,5588197,1H,0,2,00:00:02.559,3.559115,2384314000.0,"{'primary': 'pass', 'secondary': ['back_pass',...","{'x': 52, 'y': 52}",...,"{'accurate': True, 'angle': -159, 'height': No...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty...",2384313747,pass,"[back_pass, short_or_medium_pass]"
1,2384313748,5588197,1H,0,4,00:00:04.324,5.324929,2384314000.0,"{'primary': 'pass', 'secondary': ['lateral_pas...","{'x': 37, 'y': 42}",...,"{'accurate': True, 'angle': 62, 'height': None...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty...",2384313747,pass,"[lateral_pass, short_or_medium_pass]"
2,2384313771,5588197,1H,0,6,00:00:06.973,7.973209,2384314000.0,"{'primary': 'pass', 'secondary': ['lateral_pas...","{'x': 45, 'y': 65}",...,"{'accurate': True, 'angle': -95, 'height': Non...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty...",2384313747,pass,"[lateral_pass, short_or_medium_pass]"
3,2384313772,5588197,1H,0,8,00:00:08.768,9.768278,2384314000.0,"{'primary': 'pass', 'secondary': ['back_pass',...","{'x': 44, 'y': 47}",...,"{'accurate': True, 'angle': -135, 'height': No...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty...",2384313747,pass,"[back_pass, short_or_medium_pass]"
4,2384313775,5588197,1H,0,10,00:00:10.769,11.769625,2384314000.0,"{'primary': 'pass', 'secondary': ['forward_pas...","{'x': 34, 'y': 32}",...,"{'accurate': True, 'angle': 32, 'height': None...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty...",2384313747,pass,"[forward_pass, short_or_medium_pass]"


## Get passes

In [13]:
# Get passes and relevant columns
passes_mask = df_events["primary_type"] == "pass"
df_passes = df_events.loc[passes_mask, ["id", "possession_id", "matchTimestamp", "pass", "location", "player"]]

# Only use successful passes
df_passes["pass_completed"] = [event.get("accurate") if event is not None else None for event in df_passes["pass"]]
df_passes = df_passes[df_passes["pass_completed"] == True]

# Split the pass column (use event because pass is python keyword)
df_passes["end_location"] = [event.get("endLocation") if event is not None else None for event in df_passes["pass"]]
df_passes["pass_length"] = [event.get("length") if event is not None else None for event in df_passes["pass"]]

# Get player id for the pass
df_passes["player_id"] = [event.get("id") if event is not None else None for event in df_passes["player"]]

"""
From docs:
For angle, 0° represents a perfect forward pass (straight line towards the goal). 
Passes to the right will have positive values (90° pass is a pass strictly to the right), to the left, negative (-90° pass is a pass strictly to the left). 
Straight back passes will have the angle of 180°. 
Angle is specified in degrees, taking into account standard field dimensions.
"""
df_passes["pass_angle"] = [event.get("angle") if event is not None else None for event in df_passes["pass"]]

# Use absolute value of angle (negative values might be interpreted by the model as "worse" than positive values)
df_passes["pass_angle"] = abs(df_passes["pass_angle"])

# Split the location columns
df_passes["x0"] = [event.get("x") if event is not None else None for event in df_passes["location"]]
df_passes["y0"] = [event.get("y") if event is not None else None for event in df_passes["location"]]
df_passes["x1"] = [event.get("x") if event is not None else None for event in df_passes["end_location"]]
df_passes["y1"] = [event.get("y") if event is not None else None for event in df_passes["end_location"]]

# Drop the pass column
df_passes = df_passes.drop(columns=["pass"])
df_passes = df_passes.drop(columns=["player"])
df_passes = df_passes.drop(columns=["location"])
df_passes = df_passes.drop(columns=["end_location"])

# Rename timestamp column
df_passes = df_passes.rename(columns={"matchTimestamp": "pass_timestamp"})

df_passes.head()

Unnamed: 0,id,possession_id,pass_timestamp,pass_completed,pass_length,player_id,pass_angle,x0,y0,x1,y1
0,2384313747,2384313747,00:00:02.559,True,17,286831,159,52,52,37,42
1,2384313748,2384313747,00:00:04.324,True,18,413582,62,37,42,45,65
2,2384313771,2384313747,00:00:06.973,True,12,551442,95,45,65,44,47
3,2384313772,2384313747,00:00:08.768,True,15,413582,135,44,47,34,32
4,2384313775,2384313747,00:00:10.769,True,10,136441,32,34,32,42,39


## Convert wyscout coordinates

The [wyscout pitch](https://apidocs.wyscout.com/#section/Data-glossary-and-definitions/Pitch-coordinates) is 100 x 100. We need to convert it to 105 x 68 to have a more realistic pitch that can be plotted in mplsoccer

Mplsoccer is basically still drawing a graph but the wyscout coordinates don't match that.

We need to flip the x-axis so (0,0) is the bottom left corner of the graph and (100, 68) is the top right corner of the graph.

In [14]:
df_passes["x0"] = (100 - df_passes["x0"]) * 105/100
df_passes["x1"] = (100 - df_passes["x0"]) * 105/100
df_passes["y0"] = df_passes["y0"] * 68/100
df_passes["y1"] = df_passes["y0"] * 68/100

df_passes.head()

Unnamed: 0,id,possession_id,pass_timestamp,pass_completed,pass_length,player_id,pass_angle,x0,y0,x1,y1
0,2384313747,2384313747,00:00:02.559,True,17,286831,159,50.4,35.36,52.08,24.0448
1,2384313748,2384313747,00:00:04.324,True,18,413582,62,66.15,28.56,35.5425,19.4208
2,2384313771,2384313747,00:00:06.973,True,12,551442,95,57.75,44.2,44.3625,30.056
3,2384313772,2384313747,00:00:08.768,True,15,413582,135,58.8,31.96,43.26,21.7328
4,2384313775,2384313747,00:00:10.769,True,10,136441,32,69.3,21.76,32.235,14.7968


## Calculate distance and angle to goal

The *c* variable is the distance to the horizontal line through the middle of the pitch.

Once we have this, we can use the pythagorean theorem to [calculate the distance](https://www.youtube.com/watch?v=Qkpr30zSpiE&t=297s&ab_channel=FriendsofTracking) to the goal.

And finally we can calculate the angle to the goal using the formula from the [Geometry of Shooting article](https://soccermatics.medium.com/the-geometry-of-shooting-ae7a67fdf760).

In [15]:
df_passes["c0"] = abs(df_passes["y0"] - 34)      # 34 is the middle of the pitch after the scaling we did before
df_passes["c1"] = abs(df_passes["y1"] - 34)

# Calculate distance to goal
df_passes["d0"] = np.sqrt(df_passes["c0"]**2 + df_passes["x0"]**2)
df_passes["d1"] = np.sqrt(df_passes["c1"]**2 + df_passes["x1"]**2)

# Calculate angle to goal
df_passes["a0"] = np.where(np.arctan(7.32 * df_passes["x0"] / (df_passes["x0"]**2 + df_passes["c0"]**2 - (7.32/2)**2)) > 0, np.arctan(7.32 * df_passes["x0"] /(df_passes["x0"]**2 + df_passes["c0"]**2 - (7.32/2)**2)), np.arctan(7.32 * df_passes["x0"] /(df_passes["x0"]**2 + df_passes["c0"]**2 - (7.32/2)**2)) + np.pi)
df_passes["a1"] = np.where(np.arctan(7.32 * df_passes["x1"] / (df_passes["x1"]**2 + df_passes["c1"]**2 - (7.32/2)**2)) > 0, np.arctan(7.32 * df_passes["x1"] /(df_passes["x1"]**2 + df_passes["c1"]**2 - (7.32/2)**2)), np.arctan(7.32 * df_passes["x1"] /(df_passes["x1"]**2 + df_passes["c1"]**2 - (7.32/2)**2)) + np.pi)

df_passes.head()

Unnamed: 0,id,possession_id,pass_timestamp,pass_completed,pass_length,player_id,pass_angle,x0,y0,x1,y1,c0,c1,d0,d1,a0,a1
0,2384313747,2384313747,00:00:02.559,True,17,286831,159,50.4,35.36,52.08,24.0448,1.36,9.9552,50.418346,53.022942,0.144879,0.135414
1,2384313748,2384313747,00:00:04.324,True,18,413582,62,66.15,28.56,35.5425,19.4208,5.44,14.5792,66.373309,38.416434,0.109806,0.176062
2,2384313771,2384313747,00:00:06.973,True,12,551442,95,57.75,44.2,44.3625,30.056,10.2,3.944,58.643862,44.537473,0.122779,0.163355
3,2384313772,2384313747,00:00:08.768,True,15,413582,135,58.8,31.96,43.26,21.7328,2.04,12.2672,58.835377,44.965674,0.124181,0.156373
4,2384313775,2384313747,00:00:10.769,True,10,136441,32,69.3,21.76,32.235,14.7968,12.24,19.2032,70.372634,37.521435,0.102351,0.167624


## Get shots

We need to isolate the shots to get the xG to assign to the possession chain

In [16]:
# Get shots and relevant columns
shots_mask = df_events["primary_type"] == "shot"
df_shots = df_events.loc[shots_mask, ["id", "possession_id", "matchTimestamp", "shot"]]

# Split the shot column
df_shots["ends_in_goal"] = [shot.get("isGoal") if shot is not None else None for shot in df_shots["shot"]]
df_shots["xg"] = [shot.get("xg") if shot is not None else None for shot in df_shots["shot"]]

# Drop the shot column
df_shots = df_shots.drop(columns=["shot"])

# Rename timestamp column
df_shots = df_shots.rename(columns={"matchTimestamp": "shot_timestamp"})

df_shots.head()

Unnamed: 0,id,possession_id,shot_timestamp,ends_in_goal,xg
82,2384313616,2384313590,00:04:02.711,False,0.07799
203,2384313695,2384313689,00:10:46.954,False,0.2278
238,2384313953,2384313934,00:12:31.494,True,0.344
276,2384313741,2384313730,00:15:13.222,False,0.1021
346,2384314079,2384313992,00:17:43.599,False,0.02519


## Merge df_shots with df_passes on possession_id

In [17]:
# Merge df_passes with df_shots_selected on possession_id
df_passes_with_shots = df_passes.merge(
    df_shots, 
    on='possession_id', 
    how='left'
)

# Drop pass_id and shot_id column
df_passes_with_shots = df_passes_with_shots.drop(columns=['id_x', 'id_y'])

In [18]:
df_passes_with_shots[df_passes_with_shots['possession_id'] == 2384313590].head()

Unnamed: 0,possession_id,pass_timestamp,pass_completed,pass_length,player_id,pass_angle,x0,y0,x1,y1,c0,c1,d0,d1,a0,a1,shot_timestamp,ends_in_goal,xg
19,2384313590,00:02:55.211,True,13,297957,90,59.85,10.88,42.1575,7.3984,23.12,26.6016,64.1604,49.848771,0.106369,0.124217,00:04:02.711,False,0.07799
20,2384313590,00:02:59.460,True,30,9380,132,73.5,18.36,27.825,12.4848,15.64,21.5152,75.145589,35.172922,0.095216,0.164928,00:04:02.711,False,0.07799
21,2384313590,00:03:03.057,True,24,71654,48,93.45,40.12,6.8775,27.2816,6.12,6.7184,93.650184,9.614411,0.077957,0.567128,00:04:02.711,False,0.07799
22,2384313590,00:03:06.381,True,11,8277,90,76.65,57.8,24.5175,39.304,23.8,5.304,80.259968,25.084661,0.087062,0.283565,00:04:02.711,False,0.07799
23,2384313590,00:03:09.699,True,42,14808,94,64.05,47.6,37.7475,32.368,13.6,1.632,65.477954,37.782763,0.109261,0.192961,00:04:02.711,False,0.07799


In [19]:
df_passes_with_shots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208444 entries, 0 to 208443
Data columns (total 19 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   possession_id   208444 non-null  Int64  
 1   pass_timestamp  208444 non-null  object 
 2   pass_completed  208444 non-null  bool   
 3   pass_length     208444 non-null  int64  
 4   player_id       208444 non-null  int64  
 5   pass_angle      208444 non-null  int64  
 6   x0              208444 non-null  float64
 7   y0              208444 non-null  float64
 8   x1              208444 non-null  float64
 9   y1              208444 non-null  float64
 10  c0              208444 non-null  float64
 11  c1              208444 non-null  float64
 12  d0              208444 non-null  float64
 13  d1              208444 non-null  float64
 14  a0              208444 non-null  float64
 15  a1              208444 non-null  float64
 16  shot_timestamp  31538 non-null   object 
 17  ends_in_go

## Indicate danger passes

For this excercise the danger passes are passes that end in a shot within 15 seconds.

In [20]:
# Convert timestamps to datetime
df_passes_with_shots['pass_timestamp'] = pd.to_timedelta(df_passes_with_shots['pass_timestamp'])
df_passes_with_shots['shot_timestamp'] = pd.to_timedelta(df_passes_with_shots['shot_timestamp'])

# Identify danger passes
df_passes_with_shots['is_danger_pass'] = (
    (df_passes_with_shots['shot_timestamp'].notna()) &
    ((df_passes_with_shots['shot_timestamp'] - df_passes_with_shots['pass_timestamp']).dt.total_seconds() >= 0) &
    ((df_passes_with_shots['shot_timestamp'] - df_passes_with_shots['pass_timestamp']).dt.total_seconds() <= 15)
)

## Prepare data for model

In [21]:
# Drop columns that are not needed
df_danger_passes = df_passes_with_shots.drop(columns=["pass_timestamp", "shot_timestamp", "pass_completed", "possession_id"])

# Fill NaN values
df_danger_passes["xg"] = df_danger_passes["xg"].fillna(0)
df_danger_passes["ends_in_goal"] = df_danger_passes["ends_in_goal"].fillna(False)

df_danger_passes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208444 entries, 0 to 208443
Data columns (total 16 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   pass_length     208444 non-null  int64  
 1   player_id       208444 non-null  int64  
 2   pass_angle      208444 non-null  int64  
 3   x0              208444 non-null  float64
 4   y0              208444 non-null  float64
 5   x1              208444 non-null  float64
 6   y1              208444 non-null  float64
 7   c0              208444 non-null  float64
 8   c1              208444 non-null  float64
 9   d0              208444 non-null  float64
 10  d1              208444 non-null  float64
 11  a0              208444 non-null  float64
 12  a1              208444 non-null  float64
 13  ends_in_goal    208444 non-null  bool   
 14  xg              208444 non-null  float64
 15  is_danger_pass  208444 non-null  bool   
dtypes: bool(2), float64(11), int64(3)
memory usage: 22.7 MB


  df_danger_passes["ends_in_goal"] = df_danger_passes["ends_in_goal"].fillna(False)


In [22]:
df_danger_passes.head()

Unnamed: 0,pass_length,player_id,pass_angle,x0,y0,x1,y1,c0,c1,d0,d1,a0,a1,ends_in_goal,xg,is_danger_pass
0,17,286831,159,50.4,35.36,52.08,24.0448,1.36,9.9552,50.418346,53.022942,0.144879,0.135414,False,0.0,False
1,18,413582,62,66.15,28.56,35.5425,19.4208,5.44,14.5792,66.373309,38.416434,0.109806,0.176062,False,0.0,False
2,12,551442,95,57.75,44.2,44.3625,30.056,10.2,3.944,58.643862,44.537473,0.122779,0.163355,False,0.0,False
3,15,413582,135,58.8,31.96,43.26,21.7328,2.04,12.2672,58.835377,44.965674,0.124181,0.156373,False,0.0,False
4,10,136441,32,69.3,21.76,32.235,14.7968,12.24,19.2032,70.372634,37.521435,0.102351,0.167624,False,0.0,False


# Danger pass model

In [23]:
# Create X and y for danger pass model
X = df_danger_passes.drop(columns=['ends_in_goal', 'xg', 'is_danger_pass', 'player_id'])
y = df_danger_passes['is_danger_pass']

# Split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
X_train.head()

Unnamed: 0,pass_length,pass_angle,x0,y0,x1,y1,c0,c1,d0,d1,a0,a1
12880,12,114,55.65,63.92,46.5675,43.4656,29.92,9.4656,63.183296,47.519782,0.102028,0.150704
22221,11,73,45.15,21.08,57.5925,14.3344,12.92,19.6656,46.962207,60.857472,0.149644,0.113748
6048,11,0,27.3,62.56,76.335,42.5408,28.56,8.5408,39.509032,76.811311,0.128418,0.094639
48588,44,56,81.9,60.52,19.005,41.1536,26.52,7.1536,86.086703,20.306748,0.080865,0.335508
195329,11,169,11.55,4.08,92.8725,2.7744,29.92,31.2256,32.071933,97.981321,0.083087,0.070793


In [25]:
# Create pipeline
danger_pass_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("rf", RandomForestClassifier(random_state=42))
])

# Create parameter grid
param_grid = [
    {'rf__n_estimators': [3, 10, 30], 'rf__max_features': [4, 6, 8]},
    {'rf__bootstrap': [False], 'rf__n_estimators': [3, 10], 'rf__max_features': [2, 3, 4]},
]

# Grid search for best parameters
danger_pass_grid_search = GridSearchCV(
    danger_pass_pipeline, 
    param_grid, 
    cv=5, 
    n_jobs=-1, 
    scoring='accuracy'
)

# Fit the grid search
danger_pass_grid_search.fit(X_train, y_train)

# best estimator, params and score
print(f"Best estimator: {danger_pass_grid_search.best_estimator_}")
print(f"Best params: {danger_pass_grid_search.best_params_}")
print(f"Best score: {danger_pass_grid_search.best_score_}")

Best estimator: Pipeline(steps=[('scaler', StandardScaler()),
                ('rf',
                 RandomForestClassifier(max_features=8, n_estimators=30,
                                        random_state=42))])
Best params: {'rf__max_features': 8, 'rf__n_estimators': 30}
Best score: 0.9178795238523583


## Predict danger pass probability

In [26]:
# Predict probabilities for danger passes
danger_pass_proba = danger_pass_grid_search.best_estimator_.predict_proba(X)

# Add probabilities to df_danger_passes
df_danger_passes["danger_pass_proba"] = danger_pass_proba[:, 1]

In [29]:
df_danger_passes.head()

Unnamed: 0,pass_length,player_id,pass_angle,x0,y0,x1,y1,c0,c1,d0,d1,a0,a1,ends_in_goal,xg,is_danger_pass,danger_pass_proba
0,17,286831,159,50.4,35.36,52.08,24.0448,1.36,9.9552,50.418346,53.022942,0.144879,0.135414,False,0.0,False,0.0
1,18,413582,62,66.15,28.56,35.5425,19.4208,5.44,14.5792,66.373309,38.416434,0.109806,0.176062,False,0.0,False,0.0
2,12,551442,95,57.75,44.2,44.3625,30.056,10.2,3.944,58.643862,44.537473,0.122779,0.163355,False,0.0,False,0.0
3,15,413582,135,58.8,31.96,43.26,21.7328,2.04,12.2672,58.835377,44.965674,0.124181,0.156373,False,0.0,False,0.0
4,10,136441,32,69.3,21.76,32.235,14.7968,12.24,19.2032,70.372634,37.521435,0.102351,0.167624,False,0.0,False,0.033333
