# Training models

This notebook shows how to train a pass success model using XGBoost.

In [3]:
from pathlib import Path
from functools import partial

import matplotlib.pyplot as plt

import mlflow
from xgboost import XGBClassifier, XGBRanker

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
from unxpass.databases import SQLiteDatabase
from unxpass.datasets import PassesDataset
from unxpass.components import pass_success, pass_selection_custom
from unxpass.components.utils import log_model, load_model
from unxpass.visualization import plot_action

## Load dataset

We assume you've already generated a training set and a test set with all required features and labels. If not, you can do this from the command line with:

```
unxpass create-dataset \
  sqlite://$(pwd)/stores/database.sqlite \
  $(pwd)/stores/datasets/default/train \
  $(pwd)/config/dataset/train.yaml
```

```
unxpass create-dataset \
  sqlite://$(pwd)/stores/database.sqlite \
  $(pwd)/stores/datasets/default/test \
  $(pwd)/config/dataset/test.yaml
```

Alternatively, the notebook titled "2-computing-and-storing-features" explains how to do this using the API.

In [10]:
# Root folder holding the SQLite database and the pre-computed datasets.
STORES_FP = Path("../stores")

# Handle on the database file inside the stores folder.
db = SQLiteDatabase(STORES_FP.joinpath("database.sqlite"))

# Each name below is a factory: calling it builds a PassesDataset bound to the
# given folder, so nothing is constructed until a component asks for it.
# NOTE(review): the command-line example above writes to
# stores/datasets/default/..., while these paths point at .../euro2020/... —
# confirm which dataset folder is intended.
dataset_train = partial(
    PassesDataset,
    path=STORES_FP.joinpath("datasets", "euro2020", "train"),
)
dataset_test = partial(
    PassesDataset,
    path=STORES_FP.joinpath("datasets", "euro2020", "test"),
)
dataset_train_pass = partial(
    PassesDataset,
    path=STORES_FP.joinpath("datasets_pass", "euro2020", "train"),
)
dataset_test_pass = partial(
    PassesDataset,
    path=STORES_FP.joinpath("datasets_pass", "euro2020", "test"),
)

## Configure model

Now we have to define the hyperparameters of the XGBoost model and the features to use. The features should be specified as a dictionary, where the key is the name of the feature generator and the values are the features that should be included (a generator can generate multiple features). The docstring of each feature generator provides details about the features it generates. Note that the suffix "a0" indicates that a feature is computed for the pass action, "a1" is the action before the pass, and so on.

In [13]:
# Feature selection for the pass-success model: each key is a feature
# generator, each value lists the generated columns to keep. The "_a0" suffix
# marks features computed on the pass action itself.
pass_success_features = {
    "startpolar": ["start_dist_to_goal_a0", "start_angle_to_goal_a0"],
    "relative_startlocation": ["start_dist_goalline_a0", "start_dist_sideline_a0"],
    "endpolar": ["end_dist_to_goal_a0", "end_angle_to_goal_a0"],
    "relative_endlocation": ["end_dist_goalline_a0", "end_dist_sideline_a0"],
    "movement": ["movement_a0", "dx_a0", "dy_a0"],
    "angle": ["angle_a0"],
    "ball_height_onehot": [
        "ball_height_ground_a0",
        "ball_height_low_a0",
        "ball_height_high_a0",
    ],
    "under_pressure": ["under_pressure_a0"],
    "dist_defender": [
        "dist_defender_start_a0",
        "dist_defender_end_a0",
        "dist_defender_action_a0",
    ],
    "nb_opp_in_path": ["nb_opp_in_path_a0"],
}

# Binary classifier scored with AUC during training.
# You probably want to do some hyperparameter tuning here to get a good model.
pass_success_classifier = XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
)

# Wrap the classifier and feature selection in a component and fit it on the
# training dataset.
model = pass_success.XGBoostComponent(
    model=pass_success_classifier,
    features=pass_success_features,
)
model.train(dataset_train)



[0]	validation_0-auc:0.90251
[1]	validation_0-auc:0.91377
[2]	validation_0-auc:0.91306
[3]	validation_0-auc:0.92050
[4]	validation_0-auc:0.92302
[5]	validation_0-auc:0.92284
[6]	validation_0-auc:0.92559
[7]	validation_0-auc:0.92742
[8]	validation_0-auc:0.92903
[9]	validation_0-auc:0.92985
[10]	validation_0-auc:0.92957
[11]	validation_0-auc:0.93091
[12]	validation_0-auc:0.93115
[13]	validation_0-auc:0.93180
[14]	validation_0-auc:0.93127
[15]	validation_0-auc:0.93117
[16]	validation_0-auc:0.93135
[17]	validation_0-auc:0.93166
[18]	validation_0-auc:0.93160
[19]	validation_0-auc:0.93184
[20]	validation_0-auc:0.93179
[21]	validation_0-auc:0.93176
[22]	validation_0-auc:0.93155
[23]	validation_0-auc:0.93152
[24]	validation_0-auc:0.93184
[25]	validation_0-auc:0.93174
[26]	validation_0-auc:0.93161
[27]	validation_0-auc:0.93158
[28]	validation_0-auc:0.93172
[29]	validation_0-auc:0.93169
[30]	validation_0-auc:0.93167
[31]	validation_0-auc:0.93143
[32]	validation_0-auc:0.93132
[33]	validation_0-au

In [14]:
# Log the trained component with MLflow under the "pass_success/xgb"
# experiment and print the URI it was stored under.
mlflow.set_experiment(experiment_name="pass_success/xgb")
modelinfo = log_model(model, artifact_path="component")
print(f"Model saved as {modelinfo.model_uri}")
# Reload the component from the URI that was just logged.
# NOTE(review): presumably an earlier run can be loaded instead by passing its
# URI (e.g. "runs:/988246f857f54c87ba8c2ac33555b4cb/component") — confirm.
model = load_model(modelinfo.model_uri)

Model saved as runs:/f977aaf2f5a0497cb51f5e730ae64609/component


## Train and test model

In [None]:
# Fit the component on the training dataset.
# NOTE(review): `model` was already trained in the configuration cell and then
# reloaded from MLflow, so this call fits it again — confirm it is needed.
model.train(dataset_train)

In [1]:
from pathlib import Path
from functools import partial

import matplotlib.pyplot as plt

import mlflow
from xgboost import XGBClassifier, XGBRanker
from unxpass.databases import SQLiteDatabase
from unxpass.datasets import PassesDataset
from unxpass.components import pass_success, pass_selection_custom
from unxpass.components.utils import log_model, load_model
from unxpass.visualization import plot_action
# Root folder holding the SQLite database and the pre-computed datasets.
STORES_FP = Path("../stores")

# Handle on the database file inside the stores folder.
db = SQLiteDatabase(Path(STORES_FP, "database.sqlite"))

# Dataset factories: each partial builds a PassesDataset for one folder when
# it is called. The feature datasets live under "datasets/default", the
# pass-option datasets under "datasets_pass/euro2020".
dataset_train = partial(
    PassesDataset,
    path=Path(STORES_FP, "datasets", "default", "train"),
)
dataset_test = partial(
    PassesDataset,
    path=Path(STORES_FP, "datasets", "default", "test"),
)
dataset_train_pass = partial(
    PassesDataset,
    path=Path(STORES_FP, "datasets_pass", "euro2020", "train"),
)
dataset_test_pass = partial(
    PassesDataset,
    path=Path(STORES_FP, "datasets_pass", "euro2020", "test"),
)
# Columns taken from the "pass_options" feature generator.
pass_option_features = [
    "origin_x",
    "origin_y",
    "destination_x",
    "destination_y",
    "distance",
    "angle",
    "origin_angle_to_goal",
    "destination_angle_to_goal",
    "pass_distance_defender",
]

# Pass-selection component: an XGBoost classifier scored with AUC, trained on
# the pass-options dataset.
# NOTE(review): this rebinds `model`, which earlier in the notebook refers to
# the pass_success component — later cells that use `model` get this one.
model = pass_selection_custom.XGBoostComponent(
    model=XGBClassifier(eval_metric="auc"),
    features={"pass_options": pass_option_features},
)
model.train(dataset_train_pass)



[0]	validation_0-auc:0.78582
[1]	validation_0-auc:0.79159
[2]	validation_0-auc:0.79438
[3]	validation_0-auc:0.79539
[4]	validation_0-auc:0.79643
[5]	validation_0-auc:0.79779
[6]	validation_0-auc:0.79874
[7]	validation_0-auc:0.80001
[8]	validation_0-auc:0.80047
[9]	validation_0-auc:0.80132
[10]	validation_0-auc:0.80181
[11]	validation_0-auc:0.80237
[12]	validation_0-auc:0.80330
[13]	validation_0-auc:0.80357
[14]	validation_0-auc:0.80425
[15]	validation_0-auc:0.80477
[16]	validation_0-auc:0.80477
[17]	validation_0-auc:0.80520
[18]	validation_0-auc:0.80523
[19]	validation_0-auc:0.80535
[20]	validation_0-auc:0.80560
[21]	validation_0-auc:0.80568
[22]	validation_0-auc:0.80574
[23]	validation_0-auc:0.80569
[24]	validation_0-auc:0.80565
[25]	validation_0-auc:0.80560
[26]	validation_0-auc:0.80562
[27]	validation_0-auc:0.80560
[28]	validation_0-auc:0.80573
[29]	validation_0-auc:0.80583
[30]	validation_0-auc:0.80582
[31]	validation_0-auc:0.80575
[32]	validation_0-auc:0.80588
[33]	validation_0-au



In [2]:
# Log the trained component with MLflow under the
# "pass_selection_custom/threesixty" experiment and print the URI it was
# stored under.
mlflow.set_experiment(experiment_name="pass_selection_custom/threesixty")
modelinfo = log_model(model, artifact_path="component")
print(f"Model saved as {modelinfo.model_uri}")
# Reload the component from the URI that was just logged.
# NOTE(review): presumably an earlier run can be loaded instead by passing its
# URI (e.g. "runs:/988246f857f54c87ba8c2ac33555b4cb/component") — confirm.
model = load_model(modelinfo.model_uri)

Model saved as runs:/5a13feeb1f8b45078e40aaa944b17979/component


In [3]:
# Next, evaluate how the model performs on the pass-options test set.
# The call returns a dict of metrics (the recorded output shows an 'acc' entry).
model.test(dataset_test_pass)

{'acc': 0.9401273885350319}

In [4]:
# Run the model on the pass-options test set and keep the predictions for export.
t_class = model.predict(dataset_test_pass)


In [5]:
# Write the predictions to "classtest.csv" in the current working directory,
# including the index.
# NOTE(review): assumes `t_class` is a pandas object — confirm.
t_class.to_csv("classtest.csv",index = True)

## Making predictions

Once trained, the model can be used to estimate the success probability of each pass in a dataset.

In [None]:
# Predict on the feature test set; the cells below treat the result as a
# per-pass success probability indexed by (game_id, action_id).
# NOTE(review): when the notebook is run top to bottom, `model` was last bound
# to the pass_selection_custom component (trained on `dataset_train_pass`), not
# the pass_success model — confirm which model is intended here.
p_success = model.predict(dataset_test)
p_success

Let's visualize what passes with a high and a low success probability look like.

In [None]:
# (game_id, action_id) keys of two example passes from the same game.
easy_pass = (3795506, 4)
hard_pass = (3795506, 2791)
df_actions = db.actions(game_id=3795506)

# One panel per pass, each titled with its predicted success probability.
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
for panel, pass_key in zip(ax, (easy_pass, hard_pass)):
    plot_action(df_actions.loc[pass_key], ax=panel)
    panel.set_title(f"P(success) = {p_success.loc[pass_key]:.2f}")
plt.show()

Instead of predicting the success probability of the actual pass, we can also estimate the success probability of a pass towards every other location on the pitch.

In [None]:
# Compute prediction surfaces for game 3795506; the plotting cell below looks
# them up by "action_<action_id>" keys.
# NOTE(review): x_bins / y_bins presumably set the grid resolution along the
# pitch length and width — confirm against predict_surface's docstring.
p_success_surfaces = model.predict_surface(dataset_test, game_id=3795506, db=db, x_bins=52, y_bins=34)

In [None]:
# (game_id, action_id) of the pass whose surface is drawn.
sample = (3795506, 4)
df_actions = db.actions(game_id=3795506)

# Draw the pass on top of its predicted surface, with the colour scale fixed
# to the [0, 1] range.
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
plot_action(
    df_actions.loc[sample],
    surface=p_success_surfaces[f"action_{sample[1]}"],
    ax=ax,
    surface_kwargs=dict(cmap="magma", vmin=0, vmax=1, interpolation="bilinear"),
)
plt.show()

In [None]:
# Close the database handle created at the top of the notebook.
db.close()