In [16]:
%load_ext autoreload
%autoreload 2

import os
import tqdm
import numpy as np
import pandas as pd
import softclustering as sc
import matplotlib.pyplot as plt
import socceraction.spadl as spadl


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Concatenate actions of all games in one DataFrame.

In [17]:
# Location of the SPADL-format FIFA World Cup 2018 store.
datafolder = "data"
fifa2018h5 = os.path.join(datafolder, "spadl-fifa2018.h5")
games = pd.read_hdf(fifa2018h5, key="games")

# Collect one DataFrame per game and concatenate once at the end.
game_actions = []
# mode="r": the store is only read here; the default mode ("a") would open it
# for writing and can fail on read-only storage.
with pd.HDFStore(fifa2018h5, mode="r") as store:
    # itertuples() has no length, so pass `total` to get a real progress bar.
    for game in tqdm.tqdm(games.itertuples(), total=len(games)):
        game_action = store[f"actions/game_{game.game_id}"]
        # Presumably mirrors coordinates so every team attacks left-to-right
        # (see socceraction docs) -- TODO confirm.
        game_action = spadl.play_left_to_right(game_action, game.home_team_id)
        game_action["is_home"] = game_action["team_id"] == game.home_team_id
        game_actions.append(game_action)

# The store is no longer needed once every game has been read.
actions = pd.concat(game_actions).drop(columns="original_event_id")
# No `on=` is given, so the join uses the columns both frames share
# (presumably `type_id`); it attaches the action type names used downstream.
actions = pd.merge(actions, spadl.config.actiontypes_df(), how="left")

64it [00:00, 209.67it/s]


In [18]:
def consolidate(actions, penalty_spot=(94.5, 34)):
    """Merge fine-grained action types and pin penalties to the penalty spot.

    Every ``type_name`` containing one of the substrings below is replaced by
    a single merged label:

    * ``"corner"``   -> ``"corner"``        (corner_short, corner_crossed)
    * ``"freekick"`` -> ``"freekick"``      (freekick_short, freekick_crossed,
      shot_freekick)
    * ``"keeper"``   -> ``"keeper_action"`` (keeper_claim, keeper_punch,
      keeper_save, keeper_pick_up)

    Rows whose ``type_name`` is ``"shot_penalty"`` get their start location
    overwritten with ``penalty_spot``.

    Parameters
    ----------
    actions : pandas.DataFrame
        Must contain the columns ``type_name``, ``start_x`` and ``start_y``.
        The frame is modified in place.
    penalty_spot : tuple of (x, y), optional
        Start coordinates assigned to penalty shots. Defaults to
        ``(94.5, 34)``.

    Returns
    -------
    pandas.DataFrame
        The same ``actions`` object, for call chaining.
    """
    # Substring to look for -> merged label. Applied in this order.
    merged_labels = {
        "corner": "corner",
        "freekick": "freekick",
        "keeper": "keeper_action",
    }
    for pattern, label in merged_labels.items():
        # na=False: a missing type_name counts as "no match" instead of
        # putting NaN into the boolean mask, which `.mask` cannot handle.
        # regex=False: the patterns are plain substrings.
        matches = actions["type_name"].str.contains(pattern, regex=False, na=False)
        actions["type_name"] = actions["type_name"].mask(matches, label)

    # Every penalty starts from the same fixed location.
    spot_x, spot_y = penalty_spot
    is_penalty = actions["type_name"] == "shot_penalty"
    actions["start_x"] = actions["start_x"].mask(is_penalty, spot_x)
    actions["start_y"] = actions["start_y"].mask(is_penalty, spot_y)

    return actions


# consolidate() modifies `actions` in place and returns the same frame.
# Re-running this cell is harmless: the merged labels still match their own patterns.
actions = consolidate(actions)

In [19]:
# Number of actions performed by team 771 (presumably France -- confirm against the teams table).
# This counts that team's own actions only, not every action in its matches.
len(actions[actions["team_id"] == 771])

6829

In [20]:
# Number of actions per (consolidated) action type.
actions.groupby("type_name").size()

type_name
bad_touch         1547
clearance         2074
corner             558
cross             1305
dribble          52731
foul              1876
freekick          1272
goalkick           677
interception      1681
keeper_action      584
pass             56438
shot              1556
shot_penalty        68
tackle            1830
take_on           2109
throw_in          2178
dtype: int64

As suggested in SoccerMix, add noise to the start and end locations, but only for those action types whose locations visibly follow a predefined pattern.
* *Add noise in both start and end locations*:
    * Cross
    * Shot
    * Keeper_action
* *Only on start locations*:
    * Clearance
    * Goal kick
* *Only on end locations*:
    * Corner
    * Freekick
    * Shot_penalty

In [21]:
def add_noise(actions, scale=0.5, rng=None):
    """Jitter the start/end locations of selected action types with Gaussian noise.

    Zero-mean normal noise with standard deviation ``scale`` is added to:

    * ``start_x``/``start_y`` of cross, shot, keeper_action, clearance and
      goalkick actions;
    * ``end_x``/``end_y`` of cross, shot, keeper_action, corner, freekick and
      shot_penalty actions.

    Parameters
    ----------
    actions : pandas.DataFrame
        Must contain ``type_name``, ``start_x``, ``start_y``, ``end_x`` and
        ``end_y``. The frame is modified in place, so calling this twice on
        the same frame adds noise twice.
    scale : float, optional
        Standard deviation of the noise. Defaults to ``0.5``.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. Pass e.g. ``np.random.default_rng(42)`` for
        reproducible noise. When omitted, NumPy's global random state is used
        (which ``np.random.seed`` controls).

    Returns
    -------
    pandas.DataFrame
        The same ``actions`` object, for call chaining.
    """
    if rng is None:
        # The module-level functions draw from the global RandomState.
        rng = np.random

    # (action types, coordinate columns) pairs. Start locations are drawn
    # before end locations.
    noise_targets = (
        (
            ["cross", "shot", "keeper_action", "clearance", "goalkick"],
            ["start_x", "start_y"],
        ),
        (
            ["cross", "shot", "keeper_action", "corner", "freekick", "shot_penalty"],
            ["end_x", "end_y"],
        ),
    )
    for type_names, columns in noise_targets:
        mask = actions["type_name"].isin(type_names)
        # One independent draw per selected row and coordinate.
        noise = rng.normal(0, scale, size=(int(mask.sum()), len(columns)))
        actions.loc[mask, columns] += noise

    return actions


# NOTE: add_noise() modifies `actions` in place, so every re-run of this cell
# stacks another layer of noise on the already-noisy coordinates.
actions = add_noise(actions)

In [22]:
# # display event locations with noise
# corrected_actions = ["cross", "shot", "keeper_action", "clearance", "goalkick","corner", "freekick", "shot_penalty"]
# for actiontype in corrected_actions:
#     actions[actions.type_name == actiontype].plot.scatter(
#         x="start_x",
#         y="start_y",
#         title = f"Start Location: {actiontype}",
#         figsize = (6,4)
#     )
#     plt.show()
#     actions[actions.type_name == actiontype].plot.scatter(
#         x="end_x",
#         y="end_y",
#         title = f"End Location: {actiontype}",
#         figsize = (6,4)
#     )
#     plt.show()

Compute the angle $\theta$ of each action's direction with respect to the x-axis (the pitch's length), given by
$$\tan \theta = \frac{y_{end} - y_{start}}{x_{end} - x_{start}}$$

In [23]:
# Direction of each action in radians, measured from the positive x-axis.
# arctan2 takes (dy, dx) and resolves the quadrant, so values span [-pi, pi];
# an action that does not move (dx == dy == 0) gets angle 0.
actions["angle"] = np.arctan2(actions.end_y - actions.start_y, actions.end_x - actions.start_x)
actions["angle"].describe()

count    128484.000000
mean          0.061403
std           1.464570
min          -3.140993
25%          -0.969342
50%           0.000000
75%           1.074274
max           3.141593
Name: angle, dtype: float64

In [50]:
# Start coordinates of every throw-in taken by team 771.
mask = actions["type_name"].eq("throw_in") & actions["team_id"].eq(771)
data = actions.loc[mask, ["start_x", "start_y"]]
data.describe()

Unnamed: 0,start_x,start_y
count,116.0,116.0
mean,60.918103,38.030172
std,26.507723,33.145853
min,9.1875,0.425
25%,38.9375,0.425
50%,64.75,66.725
75%,85.53125,67.575
max,101.9375,67.575


In [70]:
# Fit a mixture of K_gauss bivariate Gaussians to `data`
# (the start locations selected in a previous cell).
K_gauss = 4
gauss_clusters = [sc.MultivariateGaussian() for j in range(K_gauss)]
gaussian_model = sc.MixtureModel(gauss_clusters)
# Shows the mixture weights before fitting (prints None in the recorded output).
print(gaussian_model.weights)
# verbose=True logs the data log-likelihood at every EM iteration.
# NOTE(review): no random seed is set in this notebook; if the EM initialisation
# is random, the fitted clusters may differ between runs -- confirm.
_ = gaussian_model.fit_classical_EM(data, verbose=True)

None
Data log-likelihood at iter 0: -668.40
Data log-likelihood at iter 1: -667.32
Data log-likelihood at iter 2: -667.02
Data log-likelihood at iter 3: -666.87
Data log-likelihood at iter 4: -666.76
Data log-likelihood at iter 5: -666.67
Data log-likelihood at iter 6: -666.59
Data log-likelihood at iter 7: -666.51
Data log-likelihood at iter 8: -666.44
Data log-likelihood at iter 9: -666.36
Data log-likelihood at iter 10: -666.28
Data log-likelihood at iter 11: -666.19
Data log-likelihood at iter 12: -666.10
Data log-likelihood at iter 13: -666.00
Data log-likelihood at iter 14: -665.89
Data log-likelihood at iter 15: -665.76
Data log-likelihood at iter 16: -665.61
Data log-likelihood at iter 17: -665.44
Data log-likelihood at iter 18: -665.24
Data log-likelihood at iter 19: -664.99
Data log-likelihood at iter 20: -664.70
Data log-likelihood at iter 21: -664.35
Data log-likelihood at iter 22: -663.95
Data log-likelihood at iter 23: -663.53
Data log-likelihood at iter 24: -663.12
Data 

In [71]:
# Print each fitted mixture component, one per line.
for cluster in gaussian_model.components:
    print(cluster)

MultivariateGaussian(d=2, mean=[59.64583333  1.275     ], cov=[[5.10295139e+02 6.65601389e-31]
 [6.78318640e-31 1.00000000e-09]])
MultivariateGaussian(d=2, mean=[50.60416667 66.725     ], cov=[[6.38828993e+02 7.25752033e-29]
 [7.20890130e-29 1.00000000e-09]])
MultivariateGaussian(d=2, mean=[64.98333333  0.425     ], cov=[[ 6.78174462e+02 -3.69778549e-32]
 [-6.76503222e-32  1.00000000e-09]])
MultivariateGaussian(d=2, mean=[64.63262195 67.575     ], cov=[[7.50340738e+02 9.86076132e-29]
 [9.85953459e-29 1.00000000e-09]])
