In [1]:
import warnings
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import ast
import models
import eventstox

%matplotlib inline
plt.rcParams['figure.figsize'] = [6, 4]

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'


In [2]:
df_1819 = pd.read_csv('df_1819.csv')
df_1920 = pd.read_csv('df_1920.csv')
df_2021 = pd.read_csv('df_2021.csv')

X_1819, y_1819 = eventstox.df_to_X_y(df_1819)
X_1920, y_1920 = eventstox.df_to_X_y(df_1920)
X_2021, y_2021 = eventstox.df_to_X_y(df_2021)

In [3]:
from sklearn.preprocessing import MinMaxScaler

def preprocessing(X, y):

    numeric_features = [
        f"{feature}_{i}" for i in range(10) for feature in [
        'location_x', 'location_y', 'end_location_x', 'end_location_y'
        ]
        ]
    X_numeric = X[numeric_features]

    binary_features = [f"{feature}_{i}" for i in range(10) for feature in [
        'outcome', 'team']]
    X_binary = X[binary_features]

    scaler = MinMaxScaler()
    X_numeric_scaled = pd.DataFrame(
        scaler.fit_transform(X_numeric), columns=X_numeric.columns)

    X_scaled = pd.concat([X_numeric_scaled, X_binary], axis=1)


    data = pd.concat([X_scaled, pd.DataFrame({'y': y})], axis=1)
    data_cleaned = data.dropna()

    X_cleaned = data_cleaned.drop('y', axis=1)
    y_cleaned = data_cleaned['y']

    return X_cleaned, y_cleaned


X = pd.concat([X_1819, X_1920], axis=0).reset_index(drop=True)
y = np.concatenate([y_1819, y_1920], axis=0)

X_scaled, y = preprocessing(X, y)

In [4]:
features = ['location_x', 'location_y', 'end_location_x',
            'end_location_y', 'outcome', 'team']
feature_columns = [
    f"{feature}_{i}" for i in range(10)
    for feature in features
]


def get_lstm_arr(X: pd.DataFrame):

    # Extract features for each timestamp
    lstm_arr = X[feature_columns].to_numpy().reshape(
        len(X), 10, len(features)
    )

    return lstm_arr

X_lstm = get_lstm_arr(X)

In [5]:
# import matplotlib.pyplot as plt
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense

# # Define the model
# model = Sequential()

# # Add an LSTM layer with 50 units and input shape (10, 6)
# model.add(LSTM(50, input_shape=(10, 6)))

# # Add a Dense layer with one neuron for binary classification (sigmoid activation)
# model.add(Dense(1, activation='sigmoid'))

# # Compile the model
# model.compile(optimizer='adam', loss='binary_crossentropy',
#               metrics=['accuracy'])

# # Display the model summary
# model.summary()


# # Fit the model and store the training history
# history = model.fit(
#     X_lstm,
#     y,
#     epochs=300,
#     batch_size=32
# )

# # Access the training history
# print(history.history.keys())

# # Plot training and validation accuracy values

# # Plot training and validation loss values
# plt.plot(history.history['loss'])
# plt.title('Model Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend(['Train', 'Validation'], loc='upper left')
# plt.show()

In [9]:
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from joblib import Parallel, delayed

# Assuming X_lstm and y are your data and labels
# X_lstm shape: (n_samples, 10, 6)
# y shape: (n_samples,)

# Define the model


def create_model():
    model = Sequential()
    model.add(LSTM(50, input_shape=(10, 6)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Function to train and evaluate the model for a given fold


def train_and_evaluate(X_train, y_train, X_val, y_val):
    # Create the model
    model = create_model()

    # Train the model
    history = model.fit(X_train, y_train, epochs=200,
                        batch_size=32, validation_data=(X_val, y_val), verbose=0)

    # Evaluate the model on the validation set
    val_scores = model.evaluate(X_val, y_val, verbose=0)

    # Predict probabilities for ROC AUC
    y_val_pred = model.predict(X_val)

    # Calculate F1 score, AUC, precision, and recall
    f1 = f1_score(y_val, np.round(y_val_pred))
    auc = roc_auc_score(y_val, y_val_pred)
    precision = precision_score(y_val, np.round(y_val_pred))
    recall = recall_score(y_val, np.round(y_val_pred))

    return val_scores[1] * 100, val_scores[0], f1, auc, precision, recall


kfold = StratifiedKFold(5)

# Perform 5-fold cross-validation in parallel
results = Parallel(n_jobs=-1)(
    delayed(train_and_evaluate)(
        X_lstm[train_indices], y[train_indices],
        X_lstm[val_indices], y[val_indices]
    ) for fold, (train_indices, val_indices) in enumerate(kfold.split(X_lstm, y), 1)
)

# Unpack the results
acc_per_fold, loss_per_fold, f1_per_fold, auc_per_fold, precision_per_fold, recall_per_fold = zip(
    *results)

# Print the average results across all folds
print('Average validation accuracy: {:.2f}% (+/- {:.2f}%)'.format(
    np.mean(acc_per_fold), np.std(acc_per_fold)))
print('Average validation loss: {:.4f} (+/- {:.4f})'.format(
    np.mean(loss_per_fold), np.std(loss_per_fold)))
print('Average F1 Score: {:.4f} (+/- {:.4f})'.format(
    np.mean(f1_per_fold), np.std(f1_per_fold)))
print(
    'Average AUC: {:.4f} (+/- {:.4f})'.format(np.mean(auc_per_fold), np.std(auc_per_fold)))
print('Average Precision: {:.4f} (+/- {:.4f})'.format(
    np.mean(precision_per_fold), np.std(precision_per_fold)))
print('Average Recall: {:.4f} (+/- {:.4f})'.format(
    np.mean(recall_per_fold), np.std(recall_per_fold)))

2023-12-18 21:31:23.169099: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 21:31:23.191652: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 21:31:23.193926: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 21:31:23.238194: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 21:31:23.242736: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 21:31:23.245641: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-18 21:31:23.245711: E external/local_xla/xla/stream_executor/

Average validation accuracy: 88.34% (+/- 0.40%)
Average validation loss: 0.3577 (+/- 0.0278)
Average F1 Score: 0.0259 (+/- 0.0187)
Average AUC: 0.6781 (+/- 0.0451)
Average Precision: 0.1779 (+/- 0.0964)
Average Recall: 0.0143 (+/- 0.0107)
