In [25]:
import pandas as pd
import numpy as np
import json
import os
import pathlib
import warnings
from joblib import load

import statsmodels.api as sm
import statsmodels.formula.api as smf
from itertools import combinations_with_replacement
from sklearn.linear_model import LinearRegression

from mplsoccer import Pitch
import matplotlib.pyplot as plt

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the imported libraries — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [26]:
# Locate the event exports relative to the notebook's working directory.
notebook_dir = pathlib.Path().absolute()
events_path = os.path.join(notebook_dir, "wyscout_data", "events_data")

# Get all JSON files in the directory, in sorted order. os.listdir() returns
# entries in arbitrary order, and row order matters because features derived
# from adjacent rows (e.g. "next event") depend on it, so sort for reproducibility.
json_files = sorted(f for f in os.listdir(events_path) if f.endswith('.json'))

# Parse each file into its own frame and concatenate once at the end.
# Calling pd.concat inside the loop re-copies all previously loaded rows on
# every iteration, which is quadratic in the number of events.
frames = []
for file_name in json_files:
    path = os.path.join(events_path, file_name)
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, which can corrupt non-ASCII text on some systems.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    frames.append(pd.DataFrame(data))

# pd.concat rejects an empty list, so an empty directory yields an empty frame.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Materialise the positional row number as an explicit "index" column.
df = df.reset_index()
df.head()

Unnamed: 0,index,id,matchId,matchPeriod,minute,second,matchTimestamp,videoTimestamp,relatedEventId,type,...,team,opponentTeam,player,pass,shot,groundDuel,aerialDuel,infraction,carry,possession
0,0,2384313747,5588197,1H,0,2,00:00:02.559,3.559115,2384314000.0,"{'primary': 'pass', 'secondary': ['back_pass',...",...,"{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 286831, 'name': 'D. Solanke', 'position...","{'accurate': True, 'angle': -159, 'height': No...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty..."
1,1,2384313748,5588197,1H,0,4,00:00:04.324,5.324929,2384314000.0,"{'primary': 'pass', 'secondary': ['lateral_pas...",...,"{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 413582, 'name': 'Y. Bissouma', 'positio...","{'accurate': True, 'angle': 62, 'height': None...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty..."
2,2,2384313771,5588197,1H,0,6,00:00:06.973,7.973209,2384314000.0,"{'primary': 'pass', 'secondary': ['lateral_pas...",...,"{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 551442, 'name': 'Pedro Porro', 'positio...","{'accurate': True, 'angle': -95, 'height': Non...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty..."
3,3,2384313772,5588197,1H,0,8,00:00:08.768,9.768278,2384314000.0,"{'primary': 'pass', 'secondary': ['back_pass',...",...,"{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 413582, 'name': 'Y. Bissouma', 'positio...","{'accurate': True, 'angle': -135, 'height': No...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty..."
4,4,2384313775,5588197,1H,0,10,00:00:10.769,11.769625,2384314000.0,"{'primary': 'pass', 'secondary': ['forward_pas...",...,"{'id': 1624, 'name': 'Tottenham Hotspur'}","{'id': 1625, 'name': 'Manchester City'}","{'id': 136441, 'name': 'B. Davies', 'position'...","{'accurate': True, 'angle': 32, 'height': None...",,,,,,"{'id': 2384313747, 'duration': '9.752984', 'ty..."


In [27]:
# Summarise the loaded events frame: row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480511 entries, 0 to 480510
Data columns (total 21 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   index           480511 non-null  int64  
 1   id              480511 non-null  int64  
 2   matchId         480511 non-null  int64  
 3   matchPeriod     480511 non-null  object 
 4   minute          480511 non-null  int64  
 5   second          480511 non-null  int64  
 6   matchTimestamp  480511 non-null  object 
 7   videoTimestamp  480511 non-null  object 
 8   relatedEventId  452956 non-null  float64
 9   type            480511 non-null  object 
 10  location        480150 non-null  object 
 11  team            480511 non-null  object 
 12  opponentTeam    480511 non-null  object 
 13  player          480511 non-null  object 
 14  pass            274869 non-null  object 
 15  shot            6971 non-null    object 
 16  groundDuel      77664 non-null   object 
 17  aerialDuel

# Preparing the data


In [28]:
# NOTE: this cell filters `df` in place, so it is not idempotent — re-running it
# on the already-filtered frame recomputes "next event" on the survivors and can
# drop further rows. Re-run from the loading cell instead.

# Split the nested "type" dict into its primary label and its list of secondary labels
df["primaryType"] = df["type"].apply(lambda x: x["primary"] if isinstance(x, dict) else None)
df["secondaryType"] = df["type"].apply(lambda x: x["secondary"] if isinstance(x, dict) else [])

# Look at next event (row i of `next_event` holds the values of row i + 1;
# the final row is filled with 0)
next_event = df.shift(-1, fill_value=0)

# Only trust the following row when it belongs to the same match. A plain
# shift pairs the last event of one match with the first event of the next,
# so those rows get the same 0 sentinel the very last row already receives.
same_match = next_event["matchId"].eq(df["matchId"])
df["nextPrimaryType"] = next_event["primaryType"].where(same_match, 0)
df["nextSecondaryType"] = next_event["secondaryType"].where(same_match, 0)

# An action is "kicked out" when the event right after it is tagged 'ball_out';
# the isinstance guard turns the 0 sentinel (no next event) into False.
df["kickedOut"] = df["nextSecondaryType"].apply(lambda x: 'ball_out' in x if isinstance(x, list) else False)

# Filter out actions that ended with the ball going out
df = df[~df["kickedOut"]]
df.info()

# TODO: Only keep passes and shots (expand later)
#df = df[df["primaryType"].isin(["pass", "shot", "penalty"])]
#df = df[df["primaryType"].isin(["pass", "shot", "free_kick", "clearance", "corner", "goal_kick", "penalty"])]

<class 'pandas.core.frame.DataFrame'>
Index: 463875 entries, 0 to 480510
Data columns (total 26 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   index              463875 non-null  int64  
 1   id                 463875 non-null  int64  
 2   matchId            463875 non-null  int64  
 3   matchPeriod        463875 non-null  object 
 4   minute             463875 non-null  int64  
 5   second             463875 non-null  int64  
 6   matchTimestamp     463875 non-null  object 
 7   videoTimestamp     463875 non-null  object 
 8   relatedEventId     438570 non-null  float64
 9   type               463875 non-null  object 
 10  location           463515 non-null  object 
 11  team               463875 non-null  object 
 12  opponentTeam       463875 non-null  object 
 13  player             463875 non-null  object 
 14  pass               271709 non-null  object 
 15  shot               4834 non-null    object 
 16  groundD