# NBA AI - Currently in development

## Table of Contents

* [Data Setup](#data-setup)



### Imports and Global Settings

In [None]:
import datetime
import pandas as pd
import numpy as np
from pycaret.classification import ClassificationExperiment
from pycaret.regression import RegressionExperiment
import autokeras as ak
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    accuracy_score,
    precision_score,
)

# Pandas Settings
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
pd.options.display.max_info_columns = 200
pd.options.display.precision = 5

Using TensorFlow backend


2024-01-25 18:40:51.184542: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-25 18:40:51.237263: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-25 18:40:51.238047: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Load Data

In [None]:
df_2021_2022 = pd.read_csv("../data/nba_ai/cleaned_data_2021-2022.csv")
df_2022_2023 = pd.read_csv("../data/nba_ai/cleaned_data_2022-2023.csv")

<a name="data-setup"></a>

## Data Preparation

### Train Test Split

In [None]:
def prepare_datasets(train_df, cls_target, reg_target, test_df=None, test_size=0.3):
    """
    Prepares datasets for training and testing for both classification and regression targets,
    ensuring time-sensitive splitting based on a 'date' column.

    Parameters:
    train_df (DataFrame): The training dataframe.
    cls_target (str): The name of the classification target column.
    reg_target (str): The name of the regression target column.
    test_df (DataFrame, optional): An optional testing dataframe. If not provided, a portion of the training data is used.
    test_size (float, optional): The proportion of the dataset to include in the test split (if test_df is not provided).

    Returns:
    tuple: A tuple containing six dataframes:
           (X_train, X_test, y_train_cls, y_test_cls, y_train_reg, y_test_reg).
    """

    # Sort the dataframe based on the 'date' column
    train_df = train_df.sort_values(by="date")

    # If a test dataframe is not provided, split the training dataframe
    if test_df is None:
        X_train, X_test, y_train, y_test = train_test_split(
            train_df.drop([cls_target, reg_target], axis=1),
            train_df[[cls_target, reg_target]],
            test_size=test_size,
            shuffle=False,  # Important to maintain time order
        )
    else:
        # If a test dataframe is provided, ensure it is also sorted by date
        test_df = test_df.sort_values(by="date")

        # Use provided test dataframe and separate features and targets
        X_train = train_df.drop([cls_target, reg_target], axis=1)
        y_train = train_df[[cls_target, reg_target]]
        X_test = test_df.drop([cls_target, reg_target], axis=1)
        y_test = test_df[[cls_target, reg_target]]

    # Separate classification and regression targets
    y_train_cls = y_train[[cls_target]]
    y_train_reg = y_train[[reg_target]]
    y_test_cls = y_test[[cls_target]]
    y_test_reg = y_test[[reg_target]]

    return X_train, X_test, y_train_cls, y_test_cls, y_train_reg, y_test_reg

In [None]:
X_train, X_test, y_train_cls, y_test_cls, y_train_reg, y_test_reg = prepare_datasets(
    df_2021_2022, "CLS_TARGET", "REG_TARGET", test_df=df_2022_2023
)

### Features

In [None]:
betting_feature_set = [
    "home_opening_spread",
    "opening_total",
    "home_moneyline",
    "road_moneyline",
]

base_feature_set = [
    "day_of_season",
    "home_team_rest",
    "road_team_rest",
    "home_win_pct",
    "road_win_pct",
    "home_win_pct_l2w",
    "road_win_pct_l2w",
    "home_avg_pts",
    "road_avg_pts",
    "home_avg_pts_l2w",
    "road_avg_pts_l2w",
    "home_avg_oeff",
    "road_avg_oeff",
    "home_avg_oeff_l2w",
    "road_avg_oeff_l2w",
    "home_avg_deff",
    "road_avg_deff",
    "home_avg_deff_l2w",
    "road_avg_deff_l2w",
    "home_avg_eFG%",
    "road_avg_eFG%",
    "home_avg_eFG%_l2w",
    "road_avg_eFG%_l2w",
    "home_avg_TOV%",
    "road_avg_TOV%",
    "home_avg_TOV%_l2w",
    "road_avg_TOV%_l2w",
    "home_avg_ORB%",
    "road_avg_ORB%",
    "home_avg_ORB%_l2w",
    "road_avg_ORB%_l2w",
    "home_avg_FT%",
    "road_avg_FT%",
    "home_avg_FT%_l2w",
    "road_avg_FT%_l2w",
    "home_avg_pts_allowed",
    "road_avg_pts_allowed",
    "home_avg_pts_allowed_l2w",
    "road_avg_pts_allowed_l2w",
]

lineup_vectors = ["home_lineup_vector", "road_lineup_vector"]

In [None]:
features = base_feature_set

In [None]:
def flatten_vector_columns(df, vector_columns):
    """
    Flatten vector columns into separate feature columns.

    This function takes a DataFrame and a list of column names that store vector data as strings
    (typically after being read from a CSV file), and returns a new DataFrame where the vectors
    have been flattened into separate feature columns.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.
    vector_columns (list): A list of column names in df that store vector data as strings.

    Returns:
    pandas.DataFrame: The DataFrame with vector columns flattened.
    """
    for column in vector_columns:
        if column not in df.columns:
            continue
        # Convert the string representation of the vector into a numpy array
        df[column] = df[column].apply(
            lambda x: np.array(x.strip("[]").replace("\n", " ").split(), dtype=float)
        )

        # Flatten the numpy array into separate columns
        vector_df = pd.DataFrame(df[column].tolist(), index=df.index)
        vector_df.columns = [f"{column}_{i}" for i in range(vector_df.shape[1])]

        # Drop the original vector column and concatenate the new DataFrame
        df = df.drop(column, axis=1)
        df = pd.concat([df, vector_df], axis=1)

    return df

In [None]:
X_train = X_train[features]
X_test = X_test[features]

In [None]:
# Flatten lineup vectors
X_train = flatten_vector_columns(X_train, lineup_vectors)
X_test = flatten_vector_columns(X_test, lineup_vectors)

### Combined Data

In [None]:
combined_train_df = pd.concat([X_train, y_train_cls, y_train_reg], axis=1)
combined_test_df = pd.concat([X_test, y_test_cls, y_test_reg], axis=1)

In [None]:
combined_train_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1323 entries, 0 to 1322
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   day_of_season             1323 non-null   int64  
 1   home_team_rest            1323 non-null   int64  
 2   road_team_rest            1323 non-null   int64  
 3   home_win_pct              1323 non-null   float64
 4   road_win_pct              1323 non-null   float64
 5   home_win_pct_l2w          1323 non-null   float64
 6   road_win_pct_l2w          1323 non-null   float64
 7   home_avg_pts              1323 non-null   float64
 8   road_avg_pts              1323 non-null   float64
 9   home_avg_pts_l2w          1323 non-null   float64
 10  road_avg_pts_l2w          1323 non-null   float64
 11  home_avg_oeff             1323 non-null   float64
 12  road_avg_oeff             1323 non-null   float64
 13  home_avg_oeff_l2w         1323 non-null   float64
 14  road_avg

In [None]:
combined_test_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1320 entries, 0 to 1319
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   day_of_season             1320 non-null   int64  
 1   home_team_rest            1320 non-null   int64  
 2   road_team_rest            1320 non-null   int64  
 3   home_win_pct              1320 non-null   float64
 4   road_win_pct              1320 non-null   float64
 5   home_win_pct_l2w          1320 non-null   float64
 6   road_win_pct_l2w          1320 non-null   float64
 7   home_avg_pts              1320 non-null   float64
 8   road_avg_pts              1320 non-null   float64
 9   home_avg_pts_l2w          1320 non-null   float64
 10  road_avg_pts_l2w          1320 non-null   float64
 11  home_avg_oeff             1320 non-null   float64
 12  road_avg_oeff             1320 non-null   float64
 13  home_avg_oeff_l2w         1320 non-null   float64
 14  road_avg

## Model

### Model Evaluation and Prediction

In [12]:
train_mae = mean_absolute_error(train_predictions_reg, y_train_reg)
train_r2 = r2_score(train_predictions_reg, y_train_reg)

test_mae = mean_absolute_error(test_predictions_reg, y_test_reg)
test_r2 = r2_score(test_predictions_reg, y_test_reg)

In [13]:
print(f"Train MAE: {train_mae:.2f}")
print(f"Train R2: {train_r2:.2f}")
print(f"Test MAE: {test_mae:.2f}")
print(f"Test R2: {test_r2:.2f}")

Train MAE: 11.81
Train R2: -30.65
Test MAE: 10.73
Test R2: -38.40


### Model Saving and Loading

In [14]:
problem_type = "Regression"
base_model = "MLP"
train_performance = round(train_mae, 2)
test_performance = round(test_mae, 2)

model_id = f"{problem_type}_{base_model}_{train_performance}_{test_performance}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

model_id

'Regression_MLP_11.81_10.73_2024-01-03_15-50-13'