In [1]:
%load_ext autoreload
%autoreload 2
import os; import sys;sys.path.append("..")
import socceraction.classification.features as fs
import socceraction.classification.labels as lab
import pandas as pd
import tqdm

In [2]:
# Read the game index and the three tables that are merged into each game's
# actions further down, all from the same HDF5 store.
games, actiontypes, bodyparts, results = (
    pd.read_hdf("../data/spadl.h5", table)
    for table in ("games", "actiontypes", "bodyparts", "results")
)

  return f(*args, **kwds)


# Generate all features

In [3]:
# Feature generators applied to every game. Each one is called on the game
# states and returns a block of columns; the blocks are concatenated side by side.
xfns = [
    fs.actiontype,
    fs.actiontype_onehot,
    fs.bodypart,
    fs.bodypart_onehot,
    fs.result,
    fs.result_onehot,
    fs.goalscore,
    fs.startlocation,
    fs.endlocation,
    fs.movement,
    fs.space_delta,
    fs.startpolar,
    fs.endpolar,
    fs.team,
    fs.time,
    fs.time_delta,
    # fs.actiontype_result_onehot  (disabled)
]

for game in tqdm.tqdm(list(games.itertuples())):
    actions = pd.read_hdf("../data/spadl.h5", f"actions/game_{game.game_id}")
    # Attach the actiontype/result/bodypart tables, then put the actions in
    # chronological order. NOTE: DataFrame.merge defaults to an inner join on
    # the shared column names, so an action whose id is missing from one of
    # the tables would be dropped here.
    actions = (
        actions.merge(actiontypes)
        .merge(results)
        .merge(bodyparts)
        .sort_values(["period_id", "time_seconds", "timestamp"])
        .reset_index(drop=True)
    )
    # NOTE(review): 3 is presumably the number of consecutive actions that
    # make up one game state - confirm against socceraction's fs.gamestates.
    gamestates = fs.gamestates(actions, 3)
    gamestates = fs.play_left_to_right(gamestates, game.home_team_id)

    X = pd.concat([fn(gamestates) for fn in xfns], axis=1)
    # Pass `key` by keyword: positional use is deprecated since pandas 2.1
    # and becomes keyword-only in pandas 3.0.
    X.to_hdf("../data/features.h5", key=f"game_{game.game_id}")

100%|██████████| 45/45 [00:14<00:00,  3.05it/s]


In [4]:
# Number of feature columns; X is the frame left over from the last game of the loop.
len(X.columns)

154

# Generate all labels

In [5]:
# Label generators; each one is called on a game's ordered actions and
# returns one block of label columns.
yfns = [lab.scores, lab.concedes, lab.goal_from_shot]

for game in tqdm.tqdm(list(games.itertuples())):
    actions = pd.read_hdf("../data/spadl.h5", f"actions/game_{game.game_id}")
    # Same preparation as for the features (merge the lookup tables, sort
    # chronologically, renumber rows) so label rows line up with feature rows.
    actions = (
        actions.merge(actiontypes)
        .merge(results)
        .merge(bodyparts)
        .sort_values(["period_id", "time_seconds", "timestamp"])
        .reset_index(drop=True)
    )

    Y = pd.concat([fn(actions) for fn in yfns], axis=1)
    # Pass `key` by keyword: positional use is deprecated since pandas 2.1
    # and becomes keyword-only in pandas 3.0.
    Y.to_hdf("../data/labels.h5", key=f"game_{game.game_id}")

100%|██████████| 45/45 [00:07<00:00,  5.76it/s]


# Solve a classification problem
1. select feature set X 
2. select labels Y
3. train a classifier F(X) = Y

In [6]:
%%time

# Feature generators used for this model; the commented-out entries are
# switched off.
xfns = [
    # fs.actiontype,
    # fs.actiontype_onehot,
    # fs.bodypart,
    # fs.bodypart_onehot,
    # fs.result,
    fs.result_onehot,
    fs.goalscore,
    # fs.startlocation,
    # fs.endlocation,
    # fs.movement,
    # fs.space_delta,
    # fs.startpolar,
    fs.endpolar,
    # fs.team,
    fs.time,
    # fs.time_delta,
    # fs.actiontype_result_onehot
]
nb_prev_actions = 1

# Column names produced by the selected generators for this many actions.
cols = fs.feature_column_names(xfns, nb_prev_actions)

# Stack the selected feature columns of every game into one frame.
X = pd.concat(
    [
        pd.read_hdf("../data/features.h5", f"game_{game_id}")[cols]
        for game_id in tqdm.tqdm(games.game_id)
    ]
)

# Stack the labels of every game in the same game order.
Y = pd.concat(
    [
        pd.read_hdf("../data/labels.h5", f"game_{game_id}")
        for game_id in tqdm.tqdm(games.game_id)
    ]
)

import xgboost
y = Y.scores
model = xgboost.XGBClassifier()
model.fit(X, y)
# Column 1 of predict_proba is the probability of the second class (label 1);
# slice it directly instead of looping over the rows in Python.
pred_y = model.predict_proba(X)[:, 1]

from sklearn.metrics import brier_score_loss, roc_auc_score
# NOTE: the model is scored on the same rows it was fitted on, so these are
# in-sample figures, not an estimate of out-of-sample performance.
print(brier_score_loss(y, pred_y), roc_auc_score(y, pred_y))

100%|██████████| 45/45 [00:00<00:00, 78.99it/s]
100%|██████████| 45/45 [00:00<00:00, 150.74it/s]


0.010262954907908052 0.8614648866512042
CPU times: user 4.93 s, sys: 412 ms, total: 5.34 s
Wall time: 5.22 s


In [7]:
import numpy as np
import matplotlib.pyplot as plt
# Plot the importance that the fitted model assigns to each feature.
xgboost.plot_importance(model)
# Keep only the columns whose importance in the fitted model is strictly positive.
used_mask = model.feature_importances_ > 0
relevant_columns = list(np.array(cols)[used_mask])
# Optional figure resize, currently disabled:
# plt.gcf().set_size_inches(len(relevant_columns)/10,10)

# Repeat classification with fewer features

In [8]:
%%time
# Restrict the model inputs to the columns stored in relevant_columns.
cols = relevant_columns

# Reload the per-game feature tables, keeping only the selected columns.
feature_frames = [
    pd.read_hdf("../data/features.h5",f"game_{game_id}")[cols]
    for game_id in tqdm.tqdm(games.game_id)
]
X = pd.concat(feature_frames)

# Reload the per-game label tables in the same game order.
label_frames = [
    pd.read_hdf("../data/labels.h5",f"game_{game_id}")
    for game_id in tqdm.tqdm(games.game_id)
]
Y = pd.concat(label_frames)

import xgboost
from sklearn.metrics import brier_score_loss, roc_auc_score

# Fit one classifier per label and store its second-class probability for
# every row under the label's name.
Y_pred = pd.DataFrame()
for label in ["scores","concedes"]:
    model = xgboost.XGBClassifier()
    model.fit(X,Y[label])
    class_probs = model.predict_proba(X)
    Y_pred[label] = [row[1] for row in class_probs]
    truth, estimate = Y[label], Y_pred[label]
    # Scored on the rows the model was fitted on.
    print(brier_score_loss(truth,estimate),roc_auc_score(truth,estimate))

100%|██████████| 45/45 [00:00<00:00, 79.74it/s]
100%|██████████| 45/45 [00:00<00:00, 150.90it/s]


0.010262954907908052 0.8614648866512042
0.0033288088491676255 0.8950646847270272
CPU times: user 6.61 s, sys: 28 ms, total: 6.64 s
Wall time: 6.63 s


# Save predictions

In [9]:
# One row per action holding only its game_id, stacked game by game and
# renumbered 0..n-1; used afterwards to split the predictions per game.
A = pd.concat(
    [
        pd.read_hdf("../data/spadl.h5",f"actions/game_{game_id}")[["game_id"]]
        for game_id in tqdm.tqdm(games.game_id)
    ]
).reset_index(drop=True)

100%|██████████| 45/45 [00:00<00:00, 83.85it/s]


In [10]:
import pandas as pd
# Join each prediction row to its game_id (A and Y_pred are aligned on their
# shared 0..n-1 index), then write one predictions table per game.
for k,df in pd.concat([A,Y_pred],axis=1).groupby("game_id"):
    df = df.reset_index(drop=True)
    # Pass `key` by keyword: positional use is deprecated since pandas 2.1
    # and becomes keyword-only in pandas 3.0.
    df[Y_pred.columns].to_hdf("../data/predictions.h5",key=f"game_{int(k)}")