In [1]:
import warnings
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import ast
import models
import eventstox

# Notebook-wide configuration: plotting backend, display limits, warning
# policy and module auto-reloading.

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height) in inches.
plt.rcParams['figure.figsize'] = [6, 4]

# Remove pandas' row/column display caps so frames are shown in full.
# NOTE(review): displaying a large frame will then print every row.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from third-party libraries.
warnings.filterwarnings('ignore')

# Re-import edited modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2
# Ask the inline backend for high-resolution ('retina') figure output.
%config InlineBackend.figure_format = 'retina'


In [2]:
# Read the three per-season CSV files, one DataFrame each (same order as listed).
season_files = ('df_1819.csv', 'df_1920.csv', 'df_2021.csv')
df_1819, df_1920, df_2021 = map(pd.read_csv, season_files)

# Convert every season frame into an (X, y) pair with the project helper.
(X_1819, y_1819), (X_1920, y_1920), (X_2021, y_2021) = (
    eventstox.df_to_X_y(season_df)
    for season_df in (df_1819, df_1920, df_2021)
)

In [57]:
from sklearn.preprocessing import MinMaxScaler

# Pool the 18/19 and 19/20 seasons into a single table. The index is reset to
# 0..n-1 so that the column-wise concat at the end of this cell lines up rows.
X = pd.concat([X_1819, X_1920], axis=0).reset_index(drop=True)
y = np.concatenate([y_1819, y_1920], axis=0)

# Drop unnecessary columns: every column whose name contains 'type', plus
# three explicitly listed ones.
X = X.drop(columns=[col for col in X.columns if 'type' in col])
X = X.drop(columns=['location_x_10', 'location_y_10', 'shot_angle'])

# Columns the notebook treats as binary (names containing 'team' or
# 'outcome'); these are passed through unscaled.
X_binary = X[[col for col in X.columns if 'team' in col or 'outcome' in col]]

# Every remaining column is treated as numerical and min-max scaled to [0, 1].
X_numerical = X.drop(columns=X_binary.columns)

# NOTE(review): the scaler is fitted on all rows, i.e. before any
# cross-validation split below, so validation folds contribute to the
# min/max used for scaling. Fit per training fold if that matters.
scaler = MinMaxScaler((0, 1))
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_numerical),
    columns=X_numerical.columns,
    index=X_numerical.index,  # keep rows aligned with X_binary for the concat
)

# Final feature table: binary columns first, then the scaled numerical ones.
X_scaled = pd.concat([X_binary, X_scaled], axis=1)

In [66]:
def get_lstm_X(X_scaled, n_timesteps=10, features=None):
    """Reshape a wide feature table into a 3-D array for sequence models.

    The table is expected to hold one column per (feature, timestep) pair,
    named ``f"{feature}_{timestep}"`` with timesteps ``0 .. n_timesteps - 1``.

    Parameters
    ----------
    X_scaled : pandas.DataFrame
        Wide table containing at least the required columns.
    n_timesteps : int, default 10
        Number of timesteps per sample.
    features : sequence of str, optional
        Feature name prefixes, in the order they should appear on the last
        axis. Defaults to team, outcome, location_x, location_y,
        end_location_x, end_location_y.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(X_scaled), n_timesteps, len(features))``
        where ``out[r, t, i]`` is row ``r`` of column ``f"{features[i]}_{t}"``.

    Raises
    ------
    KeyError
        If any required column is absent; the message lists all of them.
    """
    if features is None:
        features = ["team", "outcome", "location_x",
                    "location_y", "end_location_x", "end_location_y"]

    # Check up front so that a schema mismatch reports every missing column
    # at once instead of failing on the first lookup.
    missing = [
        f"{feature}_{timestamp}"
        for feature in features
        for timestamp in range(n_timesteps)
        if f"{feature}_{timestamp}" not in X_scaled.columns
    ]
    if missing:
        raise KeyError(f"X_scaled is missing expected columns: {missing}")

    X_arr = np.zeros((X_scaled.shape[0], n_timesteps, len(features)))

    # Fill one (timestep, feature) slice per source column.
    for i, feature in enumerate(features):
        for timestamp in range(n_timesteps):
            X_arr[:, timestamp, i] = X_scaled[f"{feature}_{timestamp}"]

    return X_arr

In [95]:
import numpy as np
from sklearn.model_selection import KFold
from imblearn.over_sampling import RandomOverSampler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from joblib import Parallel, delayed
import tensorflow as tf
import os

# Function to create a new LSTM model


def create_model(lstm_units=50, input_shape=(10, 6)):
    """Build and compile a bidirectional-LSTM binary classifier.

    Parameters
    ----------
    lstm_units : int, default 50
        Units of the LSTM wrapped by ``Bidirectional`` (applied per direction).
    input_shape : tuple, default (10, 6)
        Shape of one sample, passed to ``tf.keras.Input``. The default matches
        the ``(10, 6)`` arrays produced by ``get_lstm_X``.

    Returns
    -------
    A compiled ``Sequential`` model using the Adam optimizer, binary
    cross-entropy loss and the accuracy metric.
    """
    model = Sequential()
    # Declare the input on the model itself instead of passing `input_shape`
    # to the LSTM nested inside the Bidirectional wrapper, so the model's
    # input specification does not depend on an argument of a wrapped layer.
    model.add(tf.keras.Input(shape=input_shape))
    # One bidirectional LSTM layer (forward and backward pass over the sequence)
    model.add(Bidirectional(LSTM(lstm_units)))
    # Output layer with sigmoid activation for binary classification
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


def train_evaluate_model(X_train, y_train, X_val, y_val):
    """Train a fresh model on one fold and report its losses.

    The training fold is balanced with ``RandomOverSampler(random_state=0)``,
    both folds are reshaped with ``get_lstm_X``, and a new model from
    ``create_model`` is fitted for 50 epochs with batch size 64.

    Returns
    -------
    tuple
        ``(final_training_loss, validation_loss)``: the loss recorded for the
        last training epoch and the loss reported by ``model.evaluate`` on
        the validation fold.
    """
    # Request minimal TensorFlow C++ logging through the environment.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    # Balance the classes of the training fold only; validation stays as-is.
    balanced_X, balanced_y = RandomOverSampler(random_state=0).fit_resample(
        X_train, y_train)

    # Wide tables -> (samples, 10, 6) sequence arrays.
    train_sequences = get_lstm_X(balanced_X)
    val_sequences = get_lstm_X(X_val)

    net = create_model()
    fit_log = net.fit(train_sequences, balanced_y,
                      epochs=50, batch_size=64, verbose=0)

    # Loss of the last epoch, measured on the oversampled training data.
    last_epoch_loss = fit_log.history['loss'][-1]

    # evaluate() returns (loss, accuracy); only the loss is reported.
    val_loss, _val_accuracy = net.evaluate(val_sequences, y_val, verbose=0)

    return last_epoch_loss, val_loss


# 5-fold cross-validation over the pooled, scaled feature table
kf = KFold(n_splits=5)

# One delayed training task per fold, each receiving its own train/validation
# slices of the features and labels.
fold_jobs = [
    delayed(train_evaluate_model)(
        X_scaled.iloc[train_index], y[train_index],
        X_scaled.iloc[val_index], y[val_index],
    )
    for train_index, val_index in kf.split(X_scaled)
]

# Run the folds in parallel with joblib (n_jobs=-1)
results = Parallel(n_jobs=-1)(fold_jobs)

# Each result is (final_training_loss, validation_loss); split them apart
training_losses = tuple(fold_result[0] for fold_result in results)
validation_losses = tuple(fold_result[1] for fold_result in results)

print("Average Training Loss:", np.mean(training_losses))
print("Average Validation Loss:", np.mean(validation_losses))

Average Training Loss: 0.22705406248569487
Average Validation Loss: 0.6288111865520477


In [None]:
import optuna
from sklearn.metrics import mean_squared_error


def objective(trial):
    """Optuna objective: mean 5-fold validation loss of a two-layer LSTM.

    For the hyperparameters sampled from ``trial`` a fresh model is trained
    on each of the 5 ``KFold`` splits of ``X_scaled`` / ``y`` (training fold
    randomly oversampled, 10 epochs, batch size 32) and evaluated on the
    held-out fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``lstm_units``, ``dropout_rate``,
        ``learning_rate`` and ``optimizer``.

    Returns
    -------
    float
        Mean binary cross-entropy loss over the validation folds (the study
        minimises this value).
    """
    # Suggested hyperparameters. `suggest_float` replaces the deprecated
    # `suggest_uniform` / `suggest_loguniform`; `log=True` keeps the
    # learning rate sampled on a log scale.
    lstm_units = trial.suggest_categorical('lstm_units', [20, 50, 100])
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    optimizer = trial.suggest_categorical('optimizer', ['adam', 'rmsprop'])

    def build_trial_model():
        """Build and compile the stacked LSTM for this trial's hyperparameters."""
        model = Sequential()
        model.add(LSTM(lstm_units, return_sequences=True, input_shape=(10, 6)))
        model.add(Dropout(dropout_rate))
        model.add(LSTM(lstm_units))
        model.add(Dropout(dropout_rate))
        model.add(Dense(1, activation='sigmoid'))

        if optimizer == 'adam':
            opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        else:
            opt = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)

        model.compile(optimizer=opt, loss='binary_crossentropy',
                      metrics=['accuracy'])
        return model

    # Cross-validation
    kf = KFold(n_splits=5)
    losses = []

    for train_index, val_index in kf.split(X_scaled):
        # A study builds trials x folds models in one process; release the
        # previous model's Keras state so memory does not keep growing.
        tf.keras.backend.clear_session()

        X_train, X_val = X_scaled.iloc[train_index], X_scaled.iloc[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # Seeded like train_evaluate_model so that every trial sees the same
        # resampled training folds and trials are compared on equal data.
        X_train, y_train = RandomOverSampler(random_state=0).fit_resample(
            X_train, y_train)
        X_train = get_lstm_X(X_train)
        X_val = get_lstm_X(X_val)

        model = build_trial_model()
        model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
        # evaluate() returns (loss, accuracy); only the loss is optimised.
        loss, _accuracy = model.evaluate(X_val, y_val, verbose=0)
        losses.append(loss)

    return np.mean(losses)


# Hyperparameter search: minimise the objective's return value over a fixed
# budget of trials.
N_TRIALS = 50
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=N_TRIALS)

# Show the parameter set of the best trial
best_params = study.best_params
print(best_params)