In [1]:
%load_ext autoreload
%autoreload 2

In [49]:
import pandas as pd
from pycaret.classification import ClassificationExperiment
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from expected_tackling.data.mott_features import sample_training_data

## Import Data

In [7]:
mott_features_data = pd.read_csv("../data/features_mott.csv", index_col=[0,1,2,3,4])
mott_features_data.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,ott,mean_distance_to_ball_carrier_from_peak,ball_carrier_distance_won_to_last_frame,tackle_or_assist,pff_missedTackle
gameId,playId,nflId,opportunityId,frameId,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022090800,56,38577.0,0,17.0,0.191902,1.77035,0.0,0.0,0.0
2022090800,56,41239.0,0,7.0,0.00086,13.913438,0.36,0.0,0.0
2022090800,56,42816.0,0,11.0,0.001351,26.450194,0.0,0.0,0.0
2022090800,56,43294.0,0,18.0,1.031626,0.670298,0.0,1.0,0.0
2022090800,56,43298.0,0,2.0,0.000844,12.350122,1.15,0.0,0.0


In [5]:
mott_features_data.describe()

Unnamed: 0,ott,mean_distance_to_ball_carrier_from_peak,ball_carrier_distance_won_to_last_frame,tackle_or_assist,pff_missedTackle
count,129395.0,129395.0,129395.0,129395.0,129395.0
mean,inf,10.030606,4.142201,0.119804,0.01616
std,,9.823902,6.269879,0.324733,0.12609
min,3.48729e-11,0.0,0.0,0.0,0.0
25%,0.002555156,2.072921,0.06,0.0,0.0
50%,0.03033201,6.639037,1.8,0.0,0.0
75%,0.271574,15.611664,5.94,0.0,0.0
max,inf,80.745186,84.81,1.0,1.0


In [6]:
import numpy as np
mott_features_data[mott_features_data['ott']!=np.inf].describe()

Unnamed: 0,ott,mean_distance_to_ball_carrier_from_peak,ball_carrier_distance_won_to_last_frame,tackle_or_assist,pff_missedTackle
count,129391.0,129391.0,129391.0,129391.0,129391.0
mean,0.608403,10.030907,4.142308,0.119792,0.01616
std,2.399223,9.823905,6.269945,0.324719,0.126092
min,3.48729e-11,0.01,0.0,0.0,0.0
25%,0.002555045,2.073137,0.06,0.0,0.0
50%,0.03032951,6.639232,1.8,0.0,0.0
75%,0.2714658,15.61172,5.94,0.0,0.0
max,99.46929,80.745186,84.81,1.0,1.0


In [59]:
sample_mott_features_data = sample_training_data(mott_features_data, negatives_multplier=5)

In [33]:
sample_mott_features_data.describe()

Unnamed: 0,ott,mean_distance_to_ball_carrier_from_peak,ball_carrier_distance_won_to_last_frame,tackle_or_assist,pff_missedTackle
count,6217.0,6217.0,6217.0,6217.0,6217.0
mean,0.8659879,8.026642,5.285955,0.093453,0.327328
std,2.512629,8.555547,7.232869,0.29109,0.469276
min,3.609792e-09,0.02,0.0,0.0,0.0
25%,0.009929247,2.092384,0.4,0.0,0.0
50%,0.188636,4.549821,3.14,0.0,0.0
75%,1.075057,11.350441,7.33,0.0,1.0
max,98.61501,61.114931,73.76,1.0,1.0


## Classification

In [60]:
X_train, X_test, y_train, y_test = train_test_split(
    sample_mott_features_data.drop(columns=["pff_missedTackle"]), 
    sample_mott_features_data["pff_missedTackle"], 
    test_size=0.2, 
    random_state=42
)

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)    
X_test_scaled = scaler.transform(X_test)

X_train_weights = compute_sample_weight(class_weight='balanced', y=y_train)

### Catboost

negatives_multplier=10

In [40]:
model = CatBoostClassifier(max_depth=2, scale_pos_weight=5, verbose=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

         0.0       0.99      0.97      0.98      4224
         1.0       0.71      0.87      0.78       365

    accuracy                           0.96      4589
   macro avg       0.85      0.92      0.88      4589
weighted avg       0.97      0.96      0.96      4589



array([[4091,  133],
       [  47,  318]], dtype=int64)

negatives_multplier=5

In [35]:
model = CatBoostClassifier(max_depth=2, scale_pos_weight=5, verbose=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

         0.0       0.98      0.90      0.94       820
         1.0       0.83      0.96      0.89       424

    accuracy                           0.92      1244
   macro avg       0.91      0.93      0.92      1244
weighted avg       0.93      0.92      0.92      1244



array([[739,  81],
       [ 16, 408]], dtype=int64)

In [30]:
model = CatBoostClassifier(max_depth=2, scale_pos_weight=2, verbose=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

         0.0       0.98      0.97      0.98      2084
         1.0       0.87      0.89      0.88       414

    accuracy                           0.96      2498
   macro avg       0.92      0.93      0.93      2498
weighted avg       0.96      0.96      0.96      2498



array([[2029,   55],
       [  46,  368]], dtype=int64)

### XGBoost

negatives_multplier=10

In [54]:
model = XGBClassifier(max_depth=2, scale_pos_weight=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

         0.0       0.99      0.98      0.98      4224
         1.0       0.77      0.83      0.80       365

    accuracy                           0.97      4589
   macro avg       0.88      0.90      0.89      4589
weighted avg       0.97      0.97      0.97      4589



array([[4132,   92],
       [  62,  303]], dtype=int64)

negatives_multplier=5

In [64]:
model = XGBClassifier(max_depth=2, scale_pos_weight=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

         0.0       0.98      0.97      0.98      2084
         1.0       0.87      0.89      0.88       414

    accuracy                           0.96      2498
   macro avg       0.92      0.93      0.93      2498
weighted avg       0.96      0.96      0.96      2498



array([[2029,   55],
       [  44,  370]], dtype=int64)

negatives_multplier=2

In [37]:
model = XGBClassifier(max_depth=2, scale_pos_weight=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95       820
         1.0       0.90      0.91      0.91       424

    accuracy                           0.93      1244
   macro avg       0.93      0.93      0.93      1244
weighted avg       0.94      0.93      0.93      1244



array([[777,  43],
       [ 38, 386]], dtype=int64)

### Gradient Boosting

negatives_multplier=20

In [58]:
model = GradientBoostingClassifier()
model.fit(X_train, y_train, sample_weight=X_train_weights)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

         0.0       1.00      0.94      0.97      8343
         1.0       0.45      0.91      0.61       428

    accuracy                           0.94      8771
   macro avg       0.72      0.93      0.79      8771
weighted avg       0.97      0.94      0.95      8771



array([[7875,  468],
       [  38,  390]], dtype=int64)

negatives_multplier=10

In [55]:
model = GradientBoostingClassifier()
model.fit(X_train, y_train, sample_weight=X_train_weights)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

         0.0       0.99      0.95      0.97      4224
         1.0       0.62      0.91      0.74       365

    accuracy                           0.95      4589
   macro avg       0.81      0.93      0.85      4589
weighted avg       0.96      0.95      0.95      4589



array([[4019,  205],
       [  33,  332]], dtype=int64)

negatives_multplier=5

In [61]:
model = GradientBoostingClassifier()
model.fit(X_train, y_train, sample_weight=X_train_weights)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

         0.0       0.98      0.94      0.96      2084
         1.0       0.75      0.92      0.83       414

    accuracy                           0.94      2498
   macro avg       0.87      0.93      0.89      2498
weighted avg       0.95      0.94      0.94      2498



array([[1958,  126],
       [  32,  382]], dtype=int64)

### PyCaret

In [42]:
s = ClassificationExperiment()
s.setup(sample_mott_features_data, target = 'pff_missedTackle', session_id = 123, normalize = True)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,pff_missedTackle
2,Target type,Binary
3,Original data shape,"(22945, 5)"
4,Transformed data shape,"(22945, 5)"
5,Transformed train set shape,"(16061, 5)"
6,Transformed test set shape,"(6884, 5)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x206c9ca8d50>

In [43]:
best = s.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.9683,0.9825,0.8076,0.8304,0.8186,0.8013,0.8015,6.032
gbc,Gradient Boosting Classifier,0.9679,0.9824,0.8048,0.8288,0.8164,0.7989,0.7991,0.648
rf,Random Forest Classifier,0.9668,0.9695,0.7907,0.8269,0.8082,0.79,0.7904,0.472
lightgbm,Light Gradient Boosting Machine,0.9665,0.9823,0.7921,0.8235,0.8072,0.7888,0.7892,0.152
ada,Ada Boost Classifier,0.9663,0.9815,0.7823,0.8281,0.8041,0.7856,0.7863,0.194
et,Extra Trees Classifier,0.9652,0.9688,0.7781,0.8206,0.7985,0.7795,0.7799,0.204
xgboost,Extreme Gradient Boosting,0.9644,0.9802,0.7822,0.8098,0.7953,0.7758,0.7762,0.065
knn,K Neighbors Classifier,0.9613,0.9377,0.7247,0.8194,0.7688,0.7478,0.7497,0.07
dt,Decision Tree Classifier,0.9532,0.8564,0.7387,0.7349,0.7367,0.711,0.7111,0.023
dummy,Dummy Classifier,0.9113,0.5,0.0,0.0,0.0,0.0,0.0,0.019


In [25]:
s.evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

## Aanlysis

In [21]:
import pickle
pickle.dump(model, open("../models/model_mott.pkl", 'wb'))

In [23]:
result_data = mott_features_data.copy()
result_data["prediction"] = model.predict(result_data.drop(columns=["pff_missedTackle"]))

In [24]:
result_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ott,mean_distance_to_ball_carrier_from_peak,ball_carrier_distance_won_to_last_frame,tackle_or_assist,pff_missedTackle,prediction
gameId,playId,nflId,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022090800,56,38577.0,0.158333,1.971121,0.00,0.0,0.0,0.0
2022090800,56,41239.0,0.000607,13.907379,0.00,0.0,0.0,0.0
2022090800,56,42816.0,0.001262,27.279112,0.52,0.0,0.0,0.0
2022090800,56,43294.0,1.046483,0.670298,0.00,1.0,0.0,0.0
2022090800,56,43298.0,0.000308,12.350122,1.15,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
2022110700,3787,52627.0,0.012055,13.470441,5.57,0.0,0.0,0.0
2022110700,3787,53460.0,0.115559,4.405723,6.54,0.0,0.0,0.0
2022110700,3787,53533.0,0.001109,19.715603,0.00,0.0,0.0,0.0
2022110700,3787,54541.0,1.136388,1.121133,1.95,0.0,0.0,1.0


In [25]:
result_data.loc[(2022100911,2915)]

Unnamed: 0_level_0,ott,mean_distance_to_ball_carrier_from_peak,ball_carrier_distance_won_to_last_frame,tackle_or_assist,pff_missedTackle,prediction
nflId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42401.0,0.000421,19.369486,7.07,0.0,0.0,0.0
43986.0,0.795232,4.820028,6.44,0.0,1.0,1.0
44848.0,0.018716,8.442979,1.66,0.0,0.0,0.0
47816.0,1.076127,3.278709,5.02,1.0,0.0,1.0
47848.0,0.001377,16.561179,7.89,0.0,0.0,0.0
49410.0,0.012372,9.991531,3.07,0.0,0.0,0.0
52416.0,0.605927,2.162725,3.48,0.0,0.0,1.0
52522.0,0.00344,10.140932,7.43,0.0,0.0,0.0
53445.0,0.032192,5.606291,0.0,0.0,0.0,0.0
53565.0,0.000405,21.414975,7.59,0.0,0.0,0.0
