In [27]:
%load_ext autoreload
%autoreload 2

import os
import tqdm
import numpy as np
import pandas as pd
import softclustering as sc
import matplotlib.pyplot as plt
import socceraction.spadl as spadl


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Concatenate actions of all games in one DataFrame.

In [28]:
datafolder = "data"
fifa2018h5 = os.path.join(datafolder, "spadl-fifa2018.h5")
games = pd.read_hdf(fifa2018h5, key="games")
with pd.HDFStore(fifa2018h5) as store:
    actions = []  #list of DataFrames
    for game in tqdm.tqdm(games.itertuples()):
        game_action = store[f"actions/game_{game.game_id}"]
        game_action = spadl.play_left_to_right(game_action, game.home_team_id)
        game_action["is_home"] = game_action["team_id"] == game.home_team_id
        actions.append(game_action)
    actions = pd.concat(actions)
    actions.drop("original_event_id", axis=1, inplace=True)
    actions = pd.merge(actions, spadl.config.actiontypes_df(), how="left")

64it [00:00, 155.61it/s]


In [29]:
def consolidate(actions):
    #actions.fillna(0, inplace=True)

    #Consolidate corner_short and corner_crossed
    corner_idx = actions.type_name.str.contains("corner")
    actions["type_name"] = actions["type_name"].mask(corner_idx, "corner")

    #Consolidate freekick_short, freekick_crossed, and shot_freekick
    freekick_idx = actions.type_name.str.contains("freekick")
    actions["type_name"] = actions["type_name"].mask(freekick_idx, "freekick")

    #Consolidate keeper_claim, keeper_punch, keeper_save, keeper_pick_up
    keeper_idx = actions.type_name.str.contains("keeper")
    actions["type_name"] = actions["type_name"].mask(keeper_idx, "keeper_action")

    actions["start_x"] = actions["start_x"].mask(actions.type_name == "shot_penalty", 94.5)
    actions["start_y"] = actions["start_y"].mask(actions.type_name == "shot_penalty", 34)

    return actions


actions = consolidate(actions)

In [30]:
#Actions of Team France matches.
len(actions[actions["team_id"] == 771])

6829

In [31]:
actions.groupby("type_name").size()

type_name
bad_touch         1547
clearance         2074
corner             558
cross             1305
dribble          52731
foul              1876
freekick          1272
goalkick           677
interception      1681
keeper_action      584
pass             56438
shot              1556
shot_penalty        68
tackle            1830
take_on           2109
throw_in          2178
dtype: int64

As suggested in SoccerMix, add noise on the starting and ending locations, but only on those actions that we can visually note a predefined pattern.
* *Add noise in both start and end locations*:
    * Cross
    * Shot
    * Keeper_action
* *Only on start locations*:
    * Clearance
    * Goal kick
* *Only on end locations*:
    * Corner
    * Freekick
    * Shot_penalty

In [32]:
def add_noise(actions):
    # Start locations
    start_list = ["cross", "shot", "keeper_action", "clearance", "goalkick"]
    mask = actions["type_name"].isin(start_list)
    noise = np.random.normal(0, 0.5, size=actions.loc[mask, ["start_x", "start_y"]].shape)
    actions.loc[mask, ["start_x", "start_y"]] += noise

    # End locations
    end_list = ["cross", "shot", "keeper_action", "corner", "freekick", "shot_penalty"]
    mask = actions["type_name"].isin(end_list)
    noise = np.random.normal(0, 0.5, size=actions.loc[mask, ["end_x", "end_y"]].shape)
    actions.loc[mask, ["end_x", "end_y"]] += noise

    return actions


actions = add_noise(actions)

In [33]:
# # display event locations with noise
# corrected_actions = ["cross", "shot", "keeper_action", "clearance", "goalkick","corner", "freekick", "shot_penalty"]
# for actiontype in corrected_actions:
#     actions[actions.type_name == actiontype].plot.scatter(
#         x="start_x",
#         y="start_y",
#         title = f"Start Location: {actiontype}",
#         figsize = (6,4)
#     )
#     plt.show()
#     actions[actions.type_name == actiontype].plot.scatter(
#         x="end_x",
#         y="end_y",
#         title = f"End Location: {actiontype}",
#         figsize = (6,4)
#     )
#     plt.show()

Compute the angle of the direction of the action with respect with the x-axis (pitch's length) a
$$\tan \theta = \frac{y_{end} - y_{start}}{x_{end} - x_{start}}$$

In [34]:
actions["angle"] = np.arctan2(actions.end_y - actions.start_y, actions.end_x - actions.start_x)
actions["cos_angle"] = np.cos(actions["angle"])
actions["sin_angle"] = np.sin(actions["angle"])
actions[["angle", "cos_angle", "sin_angle"]].describe()

Unnamed: 0,angle,cos_angle,sin_angle
count,128484.0,128484.0,128484.0
mean,0.061743,0.313585,-0.00597
std,1.464672,0.678448,0.664337
min,-3.140397,-1.0,-1.0
25%,-0.969342,-0.183971,-0.647648
50%,0.0,0.525493,0.0
75%,1.076271,0.954427,0.624695
max,3.141593,1.0,1.0


In [35]:
mask = (actions["type_name"]=="throw_in") & (actions["team_id"]==771)
data_loc = actions[mask][["start_x", "start_y"]]
data_loc.describe()

Unnamed: 0,start_x,start_y
count,116.0,116.0
mean,60.918103,38.030172
std,26.507723,33.145853
min,9.1875,0.425
25%,38.9375,0.425
50%,64.75,66.725
75%,85.53125,67.575
max,101.9375,67.575


In [56]:
k_gauss = 6
gauss_clusters = [sc.MultivariateGaussian() for j in range(k_gauss)]
loc_em_model = sc.MixtureModel(gauss_clusters)
loc_bregman_model = sc.MixtureModel(gauss_clusters)
_ = loc_em_model.fit_em_classic(data_loc, verbose=False)
_ = loc_bregman_model.fit_em_bregman(data_loc, verbose=False)

for i in range(k_gauss):
    print(f"Cluster {i+1}.\n Classical EM model")
    print(loc_em_model.components[i])
    print(f" Bregman EM model")
    print(loc_bregman_model.components[i])
    print("")

Cluster 1.
 Classical EM model
MultivariateGaussian(d=2, mean=[20.79851757  1.275     ], cov=[[3.22870946e+01 0.00000000e+00]
 [0.00000000e+00 9.99999556e-10]])
 Bregman EM model
MultivariateGaussian(d=2, mean=[20.79851757  1.275     ], cov=[[3.22870946e+01 0.00000000e+00]
 [0.00000000e+00 9.99999556e-10]])

Cluster 2.
 Classical EM model
MultivariateGaussian(d=2, mean=[56.52770319 67.05891077], cov=[[60.89060947 -2.50906618]
 [-2.50906618  0.17232775]])
 Bregman EM model
MultivariateGaussian(d=2, mean=[56.52770319 67.05891077], cov=[[60.89060947 -2.50906618]
 [-2.50906618  0.17232775]])

Cluster 3.
 Classical EM model
MultivariateGaussian(d=2, mean=[64.98333333  0.425     ], cov=[[6.78174462e+02 0.00000000e+00]
 [0.00000000e+00 1.00000006e-09]])
 Bregman EM model
MultivariateGaussian(d=2, mean=[64.98333333  0.425     ], cov=[[6.78174462e+02 0.00000000e+00]
 [0.00000000e+00 1.00000006e-09]])

Cluster 4.
 Classical EM model
MultivariateGaussian(d=2, mean=[83.35255752 67.439553  ], cov=[

In [64]:
import time

data_dir = actions[mask][["cos_angle", "sin_angle"]].to_numpy()
k_vm = 4

# Create three fresh sets of VonMises components (must be different instances)
vm_clusters_1 = [sc.VonMises() for _ in range(k_vm)]
vm_clusters_2 = [sc.VonMises() for _ in range(k_vm)]
vm_clusters_3 = [sc.VonMises() for _ in range(k_vm)]

# Initialize mixture models
dir_em_model = sc.MixtureModel(vm_clusters_1)
dir_proxy_model = sc.MixtureModel(vm_clusters_2)
dir_bregman_model = sc.MixtureModel(vm_clusters_3)

# Classic EM
start = time.time()
res_em = dir_em_model.fit_em_classic(data_dir, verbose=False)
t_em = time.time() - start

# Proxy EM for Von Mises
start = time.time()
res_proxy = dir_proxy_model.fit_em_vonmises_approx(data_dir, verbose=False)
t_proxy = time.time() - start

# Bregman EM (only if implemented!)
start = time.time()
res_bregman = dir_bregman_model.fit_em_bregman(data_dir, verbose=False)
t_bregman = time.time() - start

# Results
print(f"Final log-likelihoods:\nEM: {res_em[-1]:.4f}\nProxy: {res_proxy[-1]:.4f}\nBregman: {res_bregman[-1]:.4f}")
print(f"Timings:\nEM: {t_em:.4f}s\nProxy: {t_proxy:.4f}s\nBregman: {t_bregman:.4f}s")


Final log-likelihoods:
EM: -182.8626
Proxy: -183.0771
Bregman: -182.8626
Timings:
EM: 0.7876s
Proxy: 0.0205s
Bregman: 0.0332s


In [62]:
for i in range(k_vm):
    print(f"Cluster {i+1}.\n Classical EM model")
    print(dir_em_model.components[i])
    print(f" EM with approximation model")
    print(dir_proxy_model.components[i])
    print(f" Bregman EM model")
    print(dir_bregman_model.components[i])
    print("")

Cluster 1.
 Classical EM model
VonMises(loc=-5.7º, kappa=12.692)
 EM with approximation model
VonMises(loc=-5.7º, kappa=12.692)
 Bregman EM model
VonMises(loc=-5.7º, kappa=12.692)

Cluster 2.
 Classical EM model
VonMises(loc=173.4º, kappa=12.948)
 EM with approximation model
VonMises(loc=173.4º, kappa=12.948)
 Bregman EM model
VonMises(loc=173.4º, kappa=12.948)

Cluster 3.
 Classical EM model
VonMises(loc=-103.3º, kappa=4.191)
 EM with approximation model
VonMises(loc=-103.3º, kappa=4.191)
 Bregman EM model
VonMises(loc=-103.3º, kappa=4.191)

Cluster 4.
 Classical EM model
VonMises(loc=93.3º, kappa=2.296)
 EM with approximation model
VonMises(loc=93.3º, kappa=2.296)
 Bregman EM model
VonMises(loc=93.3º, kappa=2.296)

