In [31]:
import pandas as pd
import numpy as np
import json
# system utilities, statistical modelling and plotting
import os
import pathlib
import warnings
import statsmodels.api as sm
import statsmodels.formula.api as smf
from mplsoccer import Pitch
import matplotlib.pyplot as plt

pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore')

# Get the data

In [32]:
# Locate the events file relative to the notebook's working directory:
# <parent of cwd>/wyscout-data/events/events_England.json
notebook_dir = pathlib.Path().absolute()
events_path = os.path.join(notebook_dir.parent, "wyscout-data", "events", "events_England.json")

# Read the JSON with an explicit encoding. Without it open() falls back to the
# platform locale (e.g. cp1252 on Windows), which can fail or mis-decode text.
with open(events_path, encoding="utf-8") as f:
    events = json.load(f)

# Build the event table and show its columns / dtypes
df = pd.DataFrame(events)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 643150 entries, 0 to 643149
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   eventId       643150 non-null  int64  
 1   subEventName  643150 non-null  object 
 2   tags          643150 non-null  object 
 3   playerId      643150 non-null  int64  
 4   positions     643150 non-null  object 
 5   matchId       643150 non-null  int64  
 6   eventName     643150 non-null  object 
 7   teamId        643150 non-null  int64  
 8   matchPeriod   643150 non-null  object 
 9   eventSec      643150 non-null  float64
 10  subEventId    643150 non-null  object 
 11  id            643150 non-null  int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 58.9+ MB


# Preparing the data

In [33]:
# NOTE: this cell overwrites `df`, so it is not idempotent. Re-run the cell
# that loads the events before running this one again; otherwise `kickedOut`
# is recomputed after the "Ball out of the field" rows were already dropped.

# Get the next event (taken before any rows are dropped)
next_event = df.shift(-1, fill_value=0)

# Set nextEvent and kickedOut (1 when the following event is the ball leaving
# the field). Vectorised comparison instead of a row-wise apply.
df["nextEvent"] = next_event["subEventName"]
df["kickedOut"] = (df["nextEvent"] == "Ball out of the field").astype("int64")

# Drop interruptions
interruptions = df[df["eventName"] == "Interruption"]
df = df.drop(interruptions.index)

# Drop non-accurate duels (wyscout saves two: attacking and defending).
# A duel is dropped when its tag list contains {"id": 1802}; mapping over the
# `tags` column avoids building a Series per row and keeps the mask boolean
# even when there are no duels at all.
lost_duels = df[df["eventName"] == "Duel"]
lost_duels = lost_duels[lost_duels["tags"].map(lambda tags: {"id": 1802} in tags).astype(bool)]
df = df.drop(lost_duels.index)

# Drop ball out of field (the information is kept in `kickedOut`)
out_of_fields = df[df["subEventName"] == "Ball out of the field"]
df = df.drop(out_of_fields.index)

# Drop save attempts
save_attempts = df[df["subEventName"].isin(["Goalkeeper leaving line", "Save attempt", "Reflexes"])]
df = df.drop(save_attempts.index)

df.shape

(542524, 14)

# Isolating possession chains

In [37]:
def isolateChains(df):
    """
    Takes a dataframe with Wyscout event data.

    Returns a dataframe with isolated possession chains

    The frame must provide the columns ``teamId``, ``matchPeriod``,
    ``eventName``, ``tags`` and ``kickedOut``; events are processed in row
    order. The input frame is modified in place and returned with three
    added columns:

    * ``nextTeamId`` - ``teamId`` of the following row (0 for the last row)
    * ``possession_chain`` - running chain number, starting at 0
    * ``possession_chain_team`` - team the chain is attributed to

    A chain ends when the accumulated stop criterion reaches 2 or when
    ``matchPeriod`` changes. Chain numbers are not guaranteed to be
    consecutive: a period change directly after a chain-ending event
    advances the counter twice.

    An empty frame is returned with the three columns added and no rows.
    """
    # Add new column with team id's of the next event (only this column is
    # shifted; shifting the whole frame is not needed)
    df["nextTeamId"] = df["teamId"].shift(-1, fill_value=0)

    # Nothing to label: the original first-row lookup would raise IndexError
    if len(df) == 0:
        df["possession_chain"] = 0
        df["possession_chain_team"] = 0
        return df

    # Pull the needed columns out once; iterating plain lists avoids creating
    # a Series per row (iterrows) and a scalar write per row (df.at).
    team_ids = df["teamId"].tolist()
    next_team_ids = df["nextTeamId"].tolist()
    event_names = df["eventName"].tolist()
    match_periods = df["matchPeriod"].tolist()
    tag_lists = df["tags"].tolist()
    kicked_out_flags = df["kickedOut"].tolist()

    # Init variables
    chain = 0
    chain_team = team_ids[0]
    period = match_periods[0]
    stop_criterion = 0
    chain_ids = []
    chain_teams = []

    rows = zip(team_ids, next_team_ids, event_names, match_periods, tag_lists, kicked_out_flags)
    for team_id, next_team_id, event_name, match_period, tags, kicked_out in rows:
        # If period ended, start a new chain owned by the team of this event.
        # NOTE(review): as in the original implementation, the first event of
        # a new period does not contribute to the stop criterion.
        if match_period != period:
            chain += 1
            stop_criterion = 0
            chain_team = team_id
            period = match_period
            chain_ids.append(chain)
            chain_teams.append(chain_team)
            continue

        # Set the chain number and team
        chain_ids.append(chain)
        chain_teams.append(chain_team)

        # Pass or duel: add 1 when the chain's team has tag 1802 (not accurate /
        # lost) or when the other team has tag 1801
        if event_name in ("Pass", "Duel"):
            if team_id == chain_team and {"id": 1802} in tags:
                stop_criterion += 1
            if team_id != chain_team and {"id": 1801} in tags:
                stop_criterion += 1

        # "Others on the ball" followed by an event of the same team: add 2
        if event_name == "Others on the ball" and team_id == next_team_id:
            stop_criterion += 2

        # If action is over thanks to a shot, foul or offside: add 2
        if event_name in ("Shot", "Foul", "Offside"):
            stop_criterion += 2

        # If ball is out of field, add 2 to stop criterion
        if kicked_out == 1:
            stop_criterion += 2

        # If stop criterion is 2 or more, the next event starts a new chain
        # that belongs to the team of that next event
        if stop_criterion >= 2:
            chain += 1
            stop_criterion = 0
            chain_team = next_team_id

    # Positional assignment, so a non-contiguous index (rows were dropped
    # earlier) is handled correctly
    df["possession_chain"] = chain_ids
    df["possession_chain_team"] = chain_teams

    return df

# Label every remaining event with its possession chain; this adds the
# nextTeamId, possession_chain and possession_chain_team columns to df.
df = isolateChains(df)

In [38]:
# Sanity check: list the events that were labelled as possession chain 4
df.loc[df["possession_chain"].eq(4), ["eventName", "possession_chain"]]

Unnamed: 0,eventName,possession_chain
36,Free Kick,4
37,Pass,4
38,Duel,4
40,Duel,4
42,Pass,4
43,Pass,4
44,Pass,4
45,Pass,4
46,Shot,4
