In [12]:
import json
import os
import time
from pathlib import Path

import numpy as np
import pandas as pd

# Get events data

In [13]:
# Get project root: the parent of the current working directory
project_root = Path().absolute().parent

# Get path to event data
event_data_path = os.path.join(project_root, "data", "wyscout_data", "events_data")

# Get all JSON files in the directory. os.listdir returns entries in arbitrary
# order, so sort them to make the row order of df_events reproducible.
json_files = sorted(f for f in os.listdir(event_data_path) if f.endswith('.json'))


def _load_events_file(path):
    """Read one events JSON file into a DataFrame.

    The file is opened in a `with` block so the handle is closed as soon as the
    file has been parsed, and decoded as UTF-8 explicitly instead of relying on
    the platform's default encoding.
    """
    with open(path, encoding="utf-8") as events_file:
        return pd.DataFrame(json.load(events_file))


# Add all events to a list (more efficient than concatenating on each iteration)
df_list = [
    _load_events_file(os.path.join(event_data_path, file))
    for file in json_files
]

# Concatenate all dataframes at once
df_events = pd.concat(df_list, ignore_index=True)

In [14]:
# Preview the first rows of the combined events table
df_events.head()

Unnamed: 0,id,matchId,matchPeriod,minute,second,matchTimestamp,videoTimestamp,relatedEventId,type,location,team,opponentTeam,player,pass,shot,groundDuel,aerialDuel,infraction,carry,possession
0,2384313747,5588197,1H,0,2,00:00:02.559,3.559115,2384314000.0,"{'primary': 'pass', 'secondary': ['back_pass',...","{'x': 52, 'y': 52}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 286831, 'name': 'D. Solanke', 'position...","{'accurate': True, 'angle': -159, 'height': No...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty..."
1,2384313748,5588197,1H,0,4,00:00:04.324,5.324929,2384314000.0,"{'primary': 'pass', 'secondary': ['lateral_pas...","{'x': 37, 'y': 42}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 413582, 'name': 'Y. Bissouma', 'positio...","{'accurate': True, 'angle': 62, 'height': None...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty..."
2,2384313771,5588197,1H,0,6,00:00:06.973,7.973209,2384314000.0,"{'primary': 'pass', 'secondary': ['lateral_pas...","{'x': 45, 'y': 65}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 551442, 'name': 'Pedro Porro', 'positio...","{'accurate': True, 'angle': -95, 'height': Non...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty..."
3,2384313772,5588197,1H,0,8,00:00:08.768,9.768278,2384314000.0,"{'primary': 'pass', 'secondary': ['back_pass',...","{'x': 44, 'y': 47}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 413582, 'name': 'Y. Bissouma', 'positio...","{'accurate': True, 'angle': -135, 'height': No...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty..."
4,2384313775,5588197,1H,0,10,00:00:10.769,11.769625,2384314000.0,"{'primary': 'pass', 'secondary': ['forward_pas...","{'x': 34, 'y': 32}","{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 136441, 'name': 'B. Davies', 'position'...","{'accurate': True, 'angle': 32, 'height': None...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty..."


# Explore data

In [15]:
# Summarise column dtypes and non-null counts for the events table
df_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480511 entries, 0 to 480510
Data columns (total 20 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              480511 non-null  int64  
 1   matchId         480511 non-null  int64  
 2   matchPeriod     480511 non-null  object 
 3   minute          480511 non-null  int64  
 4   second          480511 non-null  int64  
 5   matchTimestamp  480511 non-null  object 
 6   videoTimestamp  480511 non-null  object 
 7   relatedEventId  452956 non-null  float64
 8   type            480511 non-null  object 
 9   location        480150 non-null  object 
 10  team            480511 non-null  object 
 11  opponentTeam    480511 non-null  object 
 12  player          480511 non-null  object 
 13  pass            274869 non-null  object 
 14  shot            6971 non-null    object 
 15  groundDuel      77664 non-null   object 
 16  aerialDuel      18450 non-null   object 
 17  infraction

In [16]:
# The possession column has dtype object; look at the value stored in the first row
df_events.iloc[0]["possession"]

{'id': 2384313747,
 'duration': '9.752984',
 'types': [],
 'eventsNumber': 6,
 'eventIndex': 0,
 'startLocation': {'x': 52, 'y': 52},
 'endLocation': {'x': 45, 'y': 28},
 'team': {'id': 1624, 'name': 'Tottenham Hotspur'},
 'attack': None}

In [17]:
# Read the 'id' field of the first row's possession value by key
df_events.iloc[0]["possession"]['id']

2384313747

Test which method is fastest for extracting a field from an object (dict-valued) column such as `possession`.

In [19]:
# Benchmark four ways of pulling the 'id' field out of the `possession` column.
# Each method is run once, so the numbers are a rough comparison only.


def _timed(extract):
    """Call `extract()` once and return `(result, elapsed_seconds)`."""
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    result = extract()
    return result, time.perf_counter() - start


# Method 1: apply with lambda and None handling
test1, time1 = _timed(
    lambda: df_events['possession'].apply(lambda x: x['id'] if x is not None else None)
)

# Method 2: list comprehension with None handling
test2, time2 = _timed(
    lambda: [possession['id'] if possession is not None else None for possession in df_events['possession']]
)

# Method 3: Using .get() method
test3, time3 = _timed(
    lambda: df_events['possession'].apply(lambda x: x.get('id') if x is not None else None)
)

# Method 4: List comprehension with .get()
test4, time4 = _timed(
    lambda: [possession.get('id') if possession is not None else None for possession in df_events['possession']]
)


print(f"Apply method: {time1:.4f} seconds")
print(f"List comprehension: {time2:.4f} seconds")
print(f"Apply with .get(): {time3:.4f} seconds")
print(f"List comprehension with .get(): {time4:.4f} seconds")

Apply method: 0.5904 seconds
List comprehension: 0.3168 seconds
Apply with .get(): 0.4941 seconds
List comprehension with .get(): 0.2985 seconds


# Prepare data

In [None]:
def _possession_id(possession):
    """Return the possession's id as an int, or None when there is no usable id."""
    # A missing possession may be None or a NaN float placeholder, so check for
    # a dict instead of `is not None` before calling .get().
    if not isinstance(possession, dict):
        return None
    raw_id = possession.get('id')
    # .get() yields None when the key is absent; int(None) would raise TypeError.
    return None if raw_id is None else int(raw_id)


# Add new column for possession id with proper integer dtype (without it, it would be a float)
df_events['possession_id'] = pd.Series(
    [_possession_id(possession) for possession in df_events['possession']],
    # Column assignment aligns on index labels, so reuse the frame's own index
    # rather than relying on it being a default RangeIndex.
    index=df_events.index,
    dtype='Int64',  # pandas nullable integer type
)

df_events.head()

Unnamed: 0,id,matchId,matchPeriod,minute,second,matchTimestamp,videoTimestamp,relatedEventId,type,location,...,opponentTeam,player,pass,shot,groundDuel,aerialDuel,infraction,carry,possession,possession_id
0,2384313747,5588197,1H,0,2,00:00:02.559,3.559115,2384314000.0,"{'primary': 'pass', 'secondary': ['back_pass',...","{'x': 52, 'y': 52}",...,"{'id': 1625, 'name': 'Manchester City'}","{'id': 286831, 'name': 'D. Solanke', 'position...","{'accurate': True, 'angle': -159, 'height': No...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty...",2384313747
1,2384313748,5588197,1H,0,4,00:00:04.324,5.324929,2384314000.0,"{'primary': 'pass', 'secondary': ['lateral_pas...","{'x': 37, 'y': 42}",...,"{'id': 1625, 'name': 'Manchester City'}","{'id': 413582, 'name': 'Y. Bissouma', 'positio...","{'accurate': True, 'angle': 62, 'height': None...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty...",2384313747
2,2384313771,5588197,1H,0,6,00:00:06.973,7.973209,2384314000.0,"{'primary': 'pass', 'secondary': ['lateral_pas...","{'x': 45, 'y': 65}",...,"{'id': 1625, 'name': 'Manchester City'}","{'id': 551442, 'name': 'Pedro Porro', 'positio...","{'accurate': True, 'angle': -95, 'height': Non...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty...",2384313747
3,2384313772,5588197,1H,0,8,00:00:08.768,9.768278,2384314000.0,"{'primary': 'pass', 'secondary': ['back_pass',...","{'x': 44, 'y': 47}",...,"{'id': 1625, 'name': 'Manchester City'}","{'id': 413582, 'name': 'Y. Bissouma', 'positio...","{'accurate': True, 'angle': -135, 'height': No...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty...",2384313747
4,2384313775,5588197,1H,0,10,00:00:10.769,11.769625,2384314000.0,"{'primary': 'pass', 'secondary': ['forward_pas...","{'x': 34, 'y': 32}",...,"{'id': 1625, 'name': 'Manchester City'}","{'id': 136441, 'name': 'B. Davies', 'position'...","{'accurate': True, 'angle': 32, 'height': None...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty...",2384313747
