# NBA AI - Deep Learning Base Model - PyTorch

## Table of Contents

* [Data Setup](#data-setup)
* [MLP Regression](#mlp-regression)
* [MLP Classification](#mlp-classification)

### Imports and Global Settings

In [1]:
import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    accuracy_score,
    precision_score,
)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Pandas Settings
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
pd.options.display.max_info_columns = 200
pd.options.display.precision = 5

### Load Data

In [2]:
df_2021_2022 = pd.read_csv("../data/nba_ai/cleaned_data_2021-2022.csv")
df_2022_2023 = pd.read_csv("../data/nba_ai/cleaned_data_2022-2023.csv")

<a name="data-setup"></a>

## Data Preparation

### Train Test Split

In [3]:
def prepare_datasets(train_df, cls_target, reg_target, test_df=None, test_size=0.3):
    """
    Prepares datasets for training and testing for both classification and regression targets,
    ensuring time-sensitive splitting based on a 'date' column.

    Parameters:
    train_df (DataFrame): The training dataframe.
    cls_target (str): The name of the classification target column.
    reg_target (str): The name of the regression target column.
    test_df (DataFrame, optional): An optional testing dataframe. If not provided, a portion of the training data is used.
    test_size (float, optional): The proportion of the dataset to include in the test split (if test_df is not provided).

    Returns:
    tuple: A tuple containing six dataframes:
           (X_train, X_test, y_train_cls, y_test_cls, y_train_reg, y_test_reg).
    """

    # Sort the dataframe based on the 'date' column
    train_df = train_df.sort_values(by="date")

    # If a test dataframe is not provided, split the training dataframe
    if test_df is None:
        X_train, X_test, y_train, y_test = train_test_split(
            train_df.drop([cls_target, reg_target], axis=1),
            train_df[[cls_target, reg_target]],
            test_size=test_size,
            shuffle=False,  # Important to maintain time order
        )
    else:
        # If a test dataframe is provided, ensure it is also sorted by date
        test_df = test_df.sort_values(by="date")

        # Use provided test dataframe and separate features and targets
        X_train = train_df.drop([cls_target, reg_target], axis=1)
        y_train = train_df[[cls_target, reg_target]]
        X_test = test_df.drop([cls_target, reg_target], axis=1)
        y_test = test_df[[cls_target, reg_target]]

    # Separate classification and regression targets
    y_train_cls = y_train[[cls_target]]
    y_train_reg = y_train[[reg_target]]
    y_test_cls = y_test[[cls_target]]
    y_test_reg = y_test[[reg_target]]

    return X_train, X_test, y_train_cls, y_test_cls, y_train_reg, y_test_reg

In [4]:
X_train, X_test, y_train_cls, y_test_cls, y_train_reg, y_test_reg = prepare_datasets(
    df_2021_2022, "CLS_TARGET", "REG_TARGET", test_df=df_2022_2023
)

### Features

In [5]:
betting_feature_set = [
    "home_opening_spread",
    "road_opening_spread",
    "opening_total",
    "home_closing_spread",
    "road_closing_spread",
    "closing_total",
    "home_moneyline",
    "road_moneyline",
]

base_feature_set = [
    "day_of_season",
    "home_win_pct",
    "road_win_pct",
    "home_win_pct_l2w",
    "road_win_pct_l2w",
    "home_avg_pts",
    "road_avg_pts",
    "home_avg_pts_l2w",
    "road_avg_pts_l2w",
    "home_avg_oeff",
    "road_avg_oeff",
    "home_avg_oeff_l2w",
    "road_avg_oeff_l2w",
    "home_avg_deff",
    "road_avg_deff",
    "home_avg_deff_l2w",
    "road_avg_deff_l2w",
    "home_avg_eFG%",
    "road_avg_eFG%",
    "home_avg_eFG%_l2w",
    "road_avg_eFG%_l2w",
    "home_avg_TOV%",
    "road_avg_TOV%",
    "home_avg_TOV%_l2w",
    "road_avg_TOV%_l2w",
    "home_avg_ORB%",
    "road_avg_ORB%",
    "home_avg_ORB%_l2w",
    "road_avg_ORB%_l2w",
    "home_avg_FT%",
    "road_avg_FT%",
    "home_avg_FT%_l2w",
    "road_avg_FT%_l2w",
    "home_avg_pts_allowed",
    "road_avg_pts_allowed",
    "home_avg_pts_allowed_l2w",
    "road_avg_pts_allowed_l2w",
]

features_to_prepare = [
    "home_team",
    "road_team",
    "home_team_rest_days",
    "road_team_rest_days",
    "home_team_starting_lineup",
    "road_team_starting_lineup",
]

In [6]:
X_train_all = X_train.copy()
X_test_all = X_test.copy()

X_train = X_train[base_feature_set]
X_test = X_test[base_feature_set]

<a name="mlp-regression"></a>

## Multi-Layer Perceptron (MLP) - Regression

### Data Conversion

In [7]:
# Convert Pandas DataFrames to PyTorch Tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_reg.values, dtype=torch.float32)

# Create a TensorDataset - this wraps tensors into a dataset
train_data = TensorDataset(X_train_tensor, y_train_tensor)

# DataLoader for batching and shuffling
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

# Note: Shuffle is set to True for training data. For time-series data, consider the impact of shuffling.

### Model Definition


In [8]:
# Define a simple regression neural network
class RegressionModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RegressionModel, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.layer2(x)
        return x

In [9]:
# Instantiate the model
reg_mlp_model = RegressionModel(
    input_size=X_train.shape[1], hidden_size=5, output_size=1
)

### Model Training


In [10]:
# Define loss function and optimizer
loss_function = nn.MSELoss()
optimizer = optim.Adam(reg_mlp_model.parameters(), lr=0.001)

# Training loop
num_epochs = 100  # Set the number of epochs
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()  # Zero the gradient buffers
        outputs = reg_mlp_model(inputs)
        loss = loss_function(outputs, targets)
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights

    # Optional: Print the loss every few epochs
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

Epoch 0, Loss: 74.28823852539062
Epoch 10, Loss: 96.0931396484375
Epoch 20, Loss: 137.65530395507812
Epoch 30, Loss: 195.98065185546875
Epoch 40, Loss: 92.8561019897461
Epoch 50, Loss: 105.47020721435547
Epoch 60, Loss: 335.4338684082031
Epoch 70, Loss: 134.19989013671875
Epoch 80, Loss: 315.7101135253906
Epoch 90, Loss: 275.2518005371094


### Model Evaluation and Prediction

In [11]:
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

# Disable gradient computation for evaluation and prediction
with torch.no_grad():
    train_predictions_reg = reg_mlp_model(X_train_tensor)
    test_predictions_reg = reg_mlp_model(X_test_tensor)

# Convert predictions to a NumPy array or Pandas Series for further evaluation
train_predictions_reg_np = train_predictions_reg.numpy()
test_predictions_reg_np = test_predictions_reg.numpy()

In [12]:
train_mae = mean_absolute_error(train_predictions_reg_np, y_train_reg)
train_r2 = r2_score(train_predictions_reg_np, y_train_reg)

test_mae = mean_absolute_error(test_predictions_reg_np, y_test_reg)
test_r2 = r2_score(test_predictions_reg_np, y_test_reg)

In [13]:
print(f"Train MAE: {train_mae:.2f}")
print(f"Train R2: {train_r2:.2f}")
print(f"Test MAE: {test_mae:.2f}")
print(f"Test R2: {test_r2:.2f}")

Train MAE: 11.81
Train R2: -30.65
Test MAE: 10.73
Test R2: -38.40


### Model Saving and Loading

In [14]:
problem_type = "Regression"
base_model = "MLP"
train_performance = round(train_mae, 2)
test_performance = round(test_mae, 2)

model_id = f"{problem_type}_{base_model}_{train_performance}_{test_performance}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

model_id

'Regression_MLP_11.81_10.73_2024-01-03_15-50-13'

In [15]:
# Save the model state
torch.save(reg_mlp_model.state_dict(), f"../models/{model_id}.pth")

In [16]:
# To load the model, first initialize the model structure, then load the state
# reg_mlp_model = RegressionModel(input_size=10, hidden_size=5, output_size=1)
# reg_mlp_model.load_state_dict(torch.load(f"../models/{model_id}.pth"))
# reg_mlp_model.eval()  # Set the model to evaluation mode

<a name="mlp-classification"></a>

## Multi-Layer Perceptron (MLP) - Classification

### Data Conversion

In [17]:
# Convert Pandas DataFrames to PyTorch Tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_cls.values, dtype=torch.float32)

# Create a TensorDataset - this wraps tensors into a dataset
train_data = TensorDataset(X_train_tensor, y_train_tensor)

# DataLoader for batching and shuffling
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

# Note: Shuffle is set to True for training data. For time-series data, consider the impact of shuffling.

### Model Definition


In [18]:
class ClassificationModel(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(ClassificationModel, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(
            hidden_size, 1
        )  # Output size is 1 for binary classification

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = torch.sigmoid(
            self.layer2(x)
        )  # Sigmoid activation for binary classification
        return x

In [19]:
# Instantiate the model
cls_mlp_model = ClassificationModel(input_size=X_train.shape[1], hidden_size=5)

### Model Training


In [20]:
# Define loss function and optimizer for binary classification
loss_function = nn.BCELoss()
optimizer = optim.Adam(cls_mlp_model.parameters(), lr=0.001)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = cls_mlp_model(inputs)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

Epoch 0, Loss: 0.7315127849578857
Epoch 10, Loss: 0.6890249848365784
Epoch 20, Loss: 0.690775990486145
Epoch 30, Loss: 0.6824485063552856
Epoch 40, Loss: 0.704181969165802
Epoch 50, Loss: 0.6825932860374451
Epoch 60, Loss: 0.7145680785179138
Epoch 70, Loss: 0.6818594336509705
Epoch 80, Loss: 0.6659519672393799
Epoch 90, Loss: 0.6734258532524109


### Model Evaluation and Prediction

In [21]:
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

# Disable gradient computation for evaluation and prediction
with torch.no_grad():
    train_predictions_cls = cls_mlp_model(X_train_tensor)
    test_predictions_cls = cls_mlp_model(X_test_tensor)

In [22]:
# Assuming your model outputs probabilities for class 1
threshold = 0.5

# Convert probabilities to class labels based on the threshold
train_predictions_cls_np = train_predictions_cls.numpy() > threshold
test_predictions_cls_np = test_predictions_cls.numpy() > threshold

In [23]:
train_accuracy = accuracy_score(train_predictions_cls_np, y_train_cls)
train_precision = precision_score(train_predictions_cls_np, y_train_cls)

test_accuracy = accuracy_score(test_predictions_cls_np, y_test_cls)
test_precision = precision_score(test_predictions_cls_np, y_test_cls)

In [24]:
print(f"Train Accuracy: {train_accuracy:.2f}")
print(f"Train Precision: {train_precision:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test Precision: {test_precision:.2f}")

Train Accuracy: 0.52
Train Precision: 0.00
Test Accuracy: 0.48
Test Precision: 0.00


### Model Saving and Loading

In [25]:
problem_type = "Classification"
base_model = "MLP"
train_performance = round(train_mae, 2)
test_performance = round(test_mae, 2)

model_id = f"{problem_type}_{base_model}_{train_performance}_{test_performance}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

model_id

'Classification_MLP_11.81_10.73_2024-01-03_15-50-20'

In [26]:
# Save the model state
torch.save(cls_mlp_model.state_dict(), f"../models/{model_id}.pth")

In [27]:
# To load the model, first initialize the model structure, then load the state
# cls_mlp_model = ClassificationModel(input_size=10, hidden_size=5, output_size=1)
# cls_mlp_model.load_state_dict(torch.load(f"../models/{model_id}.pth"))
# cls_mlp_model.eval()  # Set the model to evaluation mode