In [1]:
import warnings
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import ast
import models
import eventstox

%matplotlib inline
plt.rcParams['figure.figsize'] = [6, 4]

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

os.chdir("./goal_scoring_both_teams/")

In [2]:
import joblib

lgb = joblib.load("lgb.joblib")
lstm = joblib.load("lstm.joblib")

2024-01-19 19:07:02.277487: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-19 19:07:03.248338: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-19 19:07:03.248507: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-19 19:07:03.449964: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-19 19:07:03.870425: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-19 19:07:03.876794: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [3]:
df_1819 = pd.read_csv("df_1819.csv")
df_1920 = pd.read_csv("df_1920.csv")
df_2021 = pd.read_csv("df_2021.csv")

df = pd.concat([df_1819, df_1920], axis=0).reset_index(drop=True)

In [4]:
from sklearn.preprocessing import MinMaxScaler


def get_X_lstm_cols(X):
    X_lstm = X.copy()

    cols = X_lstm.columns
    X_lstm = X_lstm.drop(columns=[col for col in cols if 'type' in col])
    X_lstm = X_lstm.drop(
        columns=['location_x_10', 'location_y_10', 'shot_angle'])

    return X_lstm


def minmax_scale(X_lstm: pd.DataFrame, scaler=None):

    X_binary = X_lstm[[col for col in X_lstm.columns if (
        ('team' in col) | ('outcome' in col))]]
    X_numerical = X_lstm.drop(columns=X_binary.columns)

    if scaler is None:
        scaler = MinMaxScaler((0, 1))
        scaler.fit(X_numerical)

    X_numerical_scaled = pd.DataFrame(
        scaler.transform(X_numerical),
        columns=X_numerical.columns,
    )
    X_binary = X_binary.reset_index(drop=True)

    X_scaled = pd.concat(
        [X_numerical_scaled, X_binary],
        axis=1
    )
    return X_scaled, scaler


def reshape_X_lstm(X_scaled):

    X_arr = np.zeros((X_scaled.shape[0], 10, 6))

    # List of features
    features = ["team", "outcome", "location_x",
                "location_y", "end_location_x", "end_location_y"]

    # Iterate over each feature and timestamp to fill the array
    for i, feature in enumerate(features):
        for timestamp in range(10):
            column_name = f"{feature}_{timestamp}"
            X_arr[:, timestamp, i] = X_scaled[column_name]

    return X_arr


def get_X_lstm(X, scaler=None):

    X_lstm = get_X_lstm_cols(X)
    X_scaled, scaler = minmax_scale(X_lstm, scaler=scaler)
    X_arr = reshape_X_lstm(X_scaled)

    # Return the scaler to fit the test set
    return X_arr, scaler

In [5]:
from sklearn.model_selection import train_test_split
from eventstox import df_to_X_y
from models import process_X
from imblearn.over_sampling import RandomOverSampler

X, y = df_to_X_y(df_2021)
X_test, X_val, y_test, y_val = train_test_split(X, y, test_size=0.2)

X_lstm_test, scaler = get_X_lstm(X_test)
X_lstm_val, _ = get_X_lstm(X_val, scaler)

X_lgb_test = process_X(X_test)
X_lgb_val = process_X(X_val)

In [19]:
def proba_to_label(y_score, threshold=0.5):

    labels = y_score.copy()
    labels[labels >= threshold] = 1
    labels[labels < threshold] = 0

    return labels

In [77]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score

preds_lstm_val = lstm.predict(X_lstm_val)[:, 0]
preds_lgb_val = lgb.predict(X_lgb_val)

X_val_meta = np.stack(
    [preds_lstm_val, 
     preds_lgb_val,
     proba_to_label(preds_lstm_val),
     proba_to_label(preds_lgb_val, threshold=0.6)
     #  preds_lgb_val - preds_lstm_val
     ],
    axis=1
)

C_values = [i for i in range(1, 20, 1)]
logistic = LogisticRegressionCV(
    cv=10, scoring='roc_auc_ovr',
    Cs=C_values,
)
logistic.fit(X_val_meta, y_val)

logistic_cv_scores = pd.DataFrame(logistic.scores_[1.0], columns=C_values)
logistic_cv_scores.mean().max()



0.7719619666048239

In [48]:
from sklearn.metrics import roc_auc_score

preds_lstm_test = lstm.predict(X_lstm_test)[:, 0]
preds_lgb_test = lgb.predict(X_lgb_test)

X_test_meta = np.stack(
    [preds_lstm_test, 
     preds_lgb_test,
     proba_to_label(preds_lstm_test),
     proba_to_label(preds_lgb_test, threshold=0.6)
     #  preds_lgb_test - preds_lstm_test
     ],
    axis=1
)
preds_logistic_test = logistic.predict_proba(X_test_meta)[:, 1]



In [60]:
print(f"baseline f1: {f1_score(y_true=y_test, y_pred=np.ones_like(y_test))}")
print(f"logistic f1: {f1_score(y_true=y_test, y_pred=proba_to_label(preds_logistic_test, threshold=0.25))}")

baseline f1: 0.21529745042492918
logistic f1: 0.3981623277182236
