# NBA AI - AutoML and AutoDL

PyCaret
* Main Site - https://pycaret.org/
* Docs - https://pycaret.readthedocs.io/en/latest/

AutoKeras
* Main Site - https://autokeras.com/

## Table of Contents

* [Data Setup](#data-setup)
* [AutoML Classification](#automl-classification)
* [AutoML Regression](#automl-regression)
* [AutoDL Classification](#autodl-classification)
* [AutoDL Regression](#autodl-regression)

### Imports and Global Settings

In [1]:
import datetime
import pandas as pd
import numpy as np
from pycaret.classification import ClassificationExperiment
from pycaret.regression import RegressionExperiment
import autokeras as ak
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    accuracy_score,
    precision_score,
)

# Pandas Settings
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
pd.options.display.max_info_columns = 200
pd.options.display.precision = 5

Using TensorFlow backend


2024-01-25 22:03:55.401063: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-25 22:03:55.456585: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-25 22:03:55.457907: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Load Data

In [2]:
df_2021_2022 = pd.read_csv("../data/nba_ai/cleaned_data_2021-2022.csv")
df_2022_2023 = pd.read_csv("../data/nba_ai/cleaned_data_2022-2023.csv")

<a name="data-setup"></a>

## Data Preparation

### Train Test Split

In [3]:
def prepare_datasets(train_df, cls_target, reg_target, test_df=None, test_size=0.3):
    """
    Prepares datasets for training and testing for both classification and regression targets,
    ensuring time-sensitive splitting based on a 'date' column.

    Parameters:
    train_df (DataFrame): The training dataframe.
    cls_target (str): The name of the classification target column.
    reg_target (str): The name of the regression target column.
    test_df (DataFrame, optional): An optional testing dataframe. If not provided, a portion of the training data is used.
    test_size (float, optional): The proportion of the dataset to include in the test split (if test_df is not provided).

    Returns:
    tuple: A tuple containing six dataframes:
           (X_train, X_test, y_train_cls, y_test_cls, y_train_reg, y_test_reg).
    """

    # Sort the dataframe based on the 'date' column
    train_df = train_df.sort_values(by="date")

    # If a test dataframe is not provided, split the training dataframe
    if test_df is None:
        X_train, X_test, y_train, y_test = train_test_split(
            train_df.drop([cls_target, reg_target], axis=1),
            train_df[[cls_target, reg_target]],
            test_size=test_size,
            shuffle=False,  # Important to maintain time order
        )
    else:
        # If a test dataframe is provided, ensure it is also sorted by date
        test_df = test_df.sort_values(by="date")

        # Use provided test dataframe and separate features and targets
        X_train = train_df.drop([cls_target, reg_target], axis=1)
        y_train = train_df[[cls_target, reg_target]]
        X_test = test_df.drop([cls_target, reg_target], axis=1)
        y_test = test_df[[cls_target, reg_target]]

    # Separate classification and regression targets
    y_train_cls = y_train[[cls_target]]
    y_train_reg = y_train[[reg_target]]
    y_test_cls = y_test[[cls_target]]
    y_test_reg = y_test[[reg_target]]

    return X_train, X_test, y_train_cls, y_test_cls, y_train_reg, y_test_reg

In [4]:
X_train, X_test, y_train_cls, y_test_cls, y_train_reg, y_test_reg = prepare_datasets(
    df_2021_2022, "CLS_TARGET", "REG_TARGET", test_df=df_2022_2023
)

### Features

In [5]:
betting_feature_set = [
    "home_opening_spread",
    "opening_total",
    "home_moneyline",
    "road_moneyline",
]

base_feature_set = [
    "day_of_season",
    "home_team_rest",
    "road_team_rest",
    "home_win_pct",
    "road_win_pct",
    "home_win_pct_l2w",
    "road_win_pct_l2w",
    "home_avg_pts",
    "road_avg_pts",
    "home_avg_pts_l2w",
    "road_avg_pts_l2w",
    "home_avg_oeff",
    "road_avg_oeff",
    "home_avg_oeff_l2w",
    "road_avg_oeff_l2w",
    "home_avg_deff",
    "road_avg_deff",
    "home_avg_deff_l2w",
    "road_avg_deff_l2w",
    "home_avg_eFG%",
    "road_avg_eFG%",
    "home_avg_eFG%_l2w",
    "road_avg_eFG%_l2w",
    "home_avg_TOV%",
    "road_avg_TOV%",
    "home_avg_TOV%_l2w",
    "road_avg_TOV%_l2w",
    "home_avg_ORB%",
    "road_avg_ORB%",
    "home_avg_ORB%_l2w",
    "road_avg_ORB%_l2w",
    "home_avg_FT%",
    "road_avg_FT%",
    "home_avg_FT%_l2w",
    "road_avg_FT%_l2w",
    "home_avg_pts_allowed",
    "road_avg_pts_allowed",
    "home_avg_pts_allowed_l2w",
    "road_avg_pts_allowed_l2w",
]

lineup_vectors = ["home_lineup_vector", "road_lineup_vector"]

In [6]:
features = base_feature_set

In [7]:
def flatten_vector_columns(df, vector_columns):
    """
    Flatten vector columns into separate feature columns.

    This function takes a DataFrame and a list of column names that store vector data as strings
    (typically after being read from a CSV file), and returns a new DataFrame where the vectors
    have been flattened into separate feature columns.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.
    vector_columns (list): A list of column names in df that store vector data as strings.

    Returns:
    pandas.DataFrame: The DataFrame with vector columns flattened.
    """
    for column in vector_columns:
        if column not in df.columns:
            continue
        # Convert the string representation of the vector into a numpy array
        df[column] = df[column].apply(
            lambda x: np.array(x.strip("[]").replace("\n", " ").split(), dtype=float)
        )

        # Flatten the numpy array into separate columns
        vector_df = pd.DataFrame(df[column].tolist(), index=df.index)
        vector_df.columns = [f"{column}_{i}" for i in range(vector_df.shape[1])]

        # Drop the original vector column and concatenate the new DataFrame
        df = df.drop(column, axis=1)
        df = pd.concat([df, vector_df], axis=1)

    return df

In [8]:
X_train = X_train[features]
X_test = X_test[features]

In [9]:
# Flatten lineup vectors
X_train = flatten_vector_columns(X_train, lineup_vectors)
X_test = flatten_vector_columns(X_test, lineup_vectors)

### Combined Data

In [10]:
combined_train_df = pd.concat([X_train, y_train_cls, y_train_reg], axis=1)
combined_test_df = pd.concat([X_test, y_test_cls, y_test_reg], axis=1)

In [11]:
combined_train_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1323 entries, 0 to 1322
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   day_of_season             1323 non-null   int64  
 1   home_team_rest            1323 non-null   int64  
 2   road_team_rest            1323 non-null   int64  
 3   home_win_pct              1323 non-null   float64
 4   road_win_pct              1323 non-null   float64
 5   home_win_pct_l2w          1323 non-null   float64
 6   road_win_pct_l2w          1323 non-null   float64
 7   home_avg_pts              1323 non-null   float64
 8   road_avg_pts              1323 non-null   float64
 9   home_avg_pts_l2w          1323 non-null   float64
 10  road_avg_pts_l2w          1323 non-null   float64
 11  home_avg_oeff             1323 non-null   float64
 12  road_avg_oeff             1323 non-null   float64
 13  home_avg_oeff_l2w         1323 non-null   float64
 14  road_avg

In [12]:
combined_test_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1320 entries, 0 to 1319
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   day_of_season             1320 non-null   int64  
 1   home_team_rest            1320 non-null   int64  
 2   road_team_rest            1320 non-null   int64  
 3   home_win_pct              1320 non-null   float64
 4   road_win_pct              1320 non-null   float64
 5   home_win_pct_l2w          1320 non-null   float64
 6   road_win_pct_l2w          1320 non-null   float64
 7   home_avg_pts              1320 non-null   float64
 8   road_avg_pts              1320 non-null   float64
 9   home_avg_pts_l2w          1320 non-null   float64
 10  road_avg_pts_l2w          1320 non-null   float64
 11  home_avg_oeff             1320 non-null   float64
 12  road_avg_oeff             1320 non-null   float64
 13  home_avg_oeff_l2w         1320 non-null   float64
 14  road_avg

<a name="automl-classification"></a>

## AutoML Classification

### Setup and Preprocessing

In [13]:
py_cls = ClassificationExperiment()

In [14]:
setup_params_cls = {
    "data": combined_train_df,
    "test_data": combined_test_df,
    "target": "CLS_TARGET",
    "ignore_features": ["REG_TARGET"],
    "index": False,
    "session_id": 42,
}

In [15]:
py_cls.setup(**setup_params_cls)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,CLS_TARGET
2,Target type,Binary
3,Original data shape,"(2643, 41)"
4,Transformed data shape,"(2643, 40)"
5,Transformed train set shape,"(1323, 40)"
6,Transformed test set shape,"(1320, 40)"
7,Ignore features,1
8,Numeric features,39
9,Preprocess,True


<pycaret.classification.oop.ClassificationExperiment at 0x7fd13b20b8e0>

### Compare Models

In [16]:
best_model_cls = py_cls.compare_models(
    turbo=False, sort="Accuracy", exclude=["catboost"]
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.5405,0.5356,0.4241,0.5234,0.4579,0.0715,0.0749,0.549
lightgbm,Light Gradient Boosting Machine,0.536,0.5208,0.4668,0.5208,0.4831,0.0664,0.0676,0.501
ridge,Ridge Classifier,0.5344,0.0,0.4321,0.5177,0.4628,0.0607,0.0631,0.407
xgboost,Extreme Gradient Boosting,0.5329,0.5416,0.467,0.5125,0.4744,0.0605,0.0617,0.425
lda,Linear Discriminant Analysis,0.5306,0.5416,0.4321,0.5145,0.465,0.0536,0.0556,0.442
et,Extra Trees Classifier,0.5276,0.5365,0.4225,0.5087,0.456,0.0468,0.0488,0.592
gbc,Gradient Boosting Classifier,0.5268,0.5379,0.5094,0.5189,0.4818,0.0524,0.0615,0.475
lr,Logistic Regression,0.5238,0.5391,0.4223,0.4995,0.4522,0.0391,0.0395,0.727
dummy,Dummy Classifier,0.5223,0.5,0.0,0.0,0.0,0.0,0.0,0.455
dt,Decision Tree Classifier,0.5223,0.5199,0.4639,0.5022,0.4724,0.0401,0.0408,0.407


Processing:   0%|          | 0/77 [00:00<?, ?it/s]

In [17]:
print(best_model_cls)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=-1, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)


### Create Selected Model

In [18]:
model_cls = py_cls.create_model("rf")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5338,0.502,0.25,0.5333,0.3404,0.048,0.0563
1,0.5639,0.5248,0.5469,0.5469,0.5469,0.1266,0.1266
2,0.4737,0.4799,0.4127,0.4407,0.4262,-0.0589,-0.059
3,0.5303,0.5545,0.4127,0.5098,0.4561,0.0508,0.0517
4,0.5682,0.5757,0.4444,0.56,0.4956,0.1267,0.1293
5,0.5682,0.5968,0.381,0.5714,0.4571,0.1218,0.1288
6,0.5303,0.4967,0.3016,0.5135,0.38,0.0415,0.0453
7,0.5,0.455,0.2698,0.4595,0.34,-0.0204,-0.0223
8,0.5833,0.5855,0.4762,0.5769,0.5217,0.1586,0.1608
9,0.553,0.5853,0.746,0.5222,0.6144,0.1206,0.1317


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

### Tune Selected Model

In [19]:
tuned_model_cls = py_cls.tune_model(model_cls)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5113,0.544,0.0,0.0,0.0,-0.015,-0.0838
1,0.5113,0.4774,0.5625,0.4932,0.5255,0.0261,0.0264
2,0.4737,0.4857,0.5714,0.4557,0.507,-0.0423,-0.0436
3,0.6061,0.6084,0.6984,0.5714,0.6286,0.218,0.223
4,0.5682,0.61,0.5238,0.55,0.5366,0.1328,0.1329
5,0.5758,0.6043,0.5714,0.5538,0.5625,0.1509,0.151
6,0.4848,0.4964,0.4603,0.4603,0.4603,-0.0324,-0.0324
7,0.5227,0.5052,0.5238,0.5,0.5116,0.0455,0.0455
8,0.5985,0.5824,0.6825,0.5658,0.6187,0.2025,0.2065
9,0.5455,0.5794,0.7778,0.5158,0.6203,0.1087,0.1236


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [20]:
print(tuned_model_cls)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=-1, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)


### Evaluate Model

In [21]:
py_cls.evaluate_model(tuned_model_cls)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [22]:
# py_cls.interpret_model(tuned_model_cls)

In [23]:
train_predictions_cls = py_cls.predict_model(tuned_model_cls, data=X_train)

In [24]:
test_predictions_cls = py_cls.predict_model(tuned_model_cls, data=X_test)

In [25]:
train_accuracy = accuracy_score(train_predictions_cls["prediction_label"], y_train_cls)
train_precision = precision_score(
    train_predictions_cls["prediction_label"], y_train_cls
)

test_accuracy = accuracy_score(test_predictions_cls["prediction_label"], y_test_cls)
test_precision = precision_score(test_predictions_cls["prediction_label"], y_test_cls)

In [26]:
print(f"Train Accuracy: {train_accuracy:.2f}")
print(f"Train Precision: {train_precision:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test Precision: {test_precision:.2f}")

Train Accuracy: 1.00
Train Precision: 0.99
Test Accuracy: 0.50
Test Precision: 0.39


### Model Finalization and Storage

In [27]:
final_model_cls = py_cls.finalize_model(tuned_model_cls)

In [28]:
problem_type = "Classification"
base_model = "RandomForest"
train_performance = round(train_accuracy * 100)
test_performance = round(test_accuracy * 100)

model_id = f"{problem_type}_{base_model}_{train_performance}_{test_performance}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

model_id

'Classification_RandomForest_100_50_2024-01-25_22-09-20'

In [29]:
py_cls.save_model(final_model_cls, f"../models/AutoML/{model_id}")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['day_of_season', 'home_team_rest',
                                              'road_team_rest', 'home_win_pct',
                                              'road_win_pct', 'home_win_pct_l2w',
                                              'road_win_pct_l2w', 'home_avg_pts',
                                              'road_avg_pts', 'home_avg_pts_l2w',
                                              'road_avg_pts_l2w',
                                              'home_avg_oeff', 'road_avg_oeff',
                                              'home_avg_oeff...
                  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                         class_weight=None, criterion='gini',
                                         max_depth=None, max_features='sqrt',
              

<a name="automl-regression"></a>

## AutoML Regression

### Setup and Preprocessing

In [30]:
py_reg = RegressionExperiment()

In [31]:
setup_params_reg = {
    "data": combined_train_df,
    "test_data": combined_test_df,
    "target": "REG_TARGET",
    "ignore_features": ["CLS_TARGET"],
    "index": False,
    "session_id": 42,
}

In [32]:
py_reg.setup(**setup_params_reg)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,REG_TARGET
2,Target type,Regression
3,Original data shape,"(2643, 41)"
4,Transformed data shape,"(2643, 40)"
5,Transformed train set shape,"(1323, 40)"
6,Transformed test set shape,"(1320, 40)"
7,Ignore features,1
8,Numeric features,39
9,Preprocess,True


<pycaret.regression.oop.RegressionExperiment at 0x7fd118635a50>

### Compare Models

In [33]:
best_model_reg = py_reg.compare_models(
    turbo=True, sort="MAE", exclude=["catboost", "lar"]
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,11.6363,216.4178,14.6855,0.0669,1.2384,1.2031,0.639
rf,Random Forest Regressor,11.6487,217.4325,14.7174,0.0611,1.2392,1.2005,0.453
ridge,Ridge Regression,11.7438,220.8187,14.8102,0.0463,1.293,1.2187,0.332
br,Bayesian Ridge,11.7699,221.8191,14.8413,0.0428,1.3665,1.1629,0.381
en,Elastic Net,11.7846,223.3422,14.8815,0.0356,1.3645,1.1781,0.365
huber,Huber Regressor,11.8,223.6563,14.9016,0.033,1.2828,1.2074,0.36
lasso,Lasso Regression,11.8559,225.8953,14.9581,0.0243,1.3769,1.1817,0.334
llar,Lasso Least Angle Regression,11.8567,225.9152,14.9587,0.0243,1.3769,1.1818,0.379
lr,Linear Regression,11.8617,227.7831,15.0196,0.0149,1.2671,1.2597,0.372
ada,AdaBoost Regressor,11.9364,225.918,15.0131,0.0224,1.1434,1.2729,0.548


Processing:   0%|          | 0/77 [00:00<?, ?it/s]

In [34]:
print(best_model_reg)

ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='squared_error',
                    max_depth=None, max_features=1.0, max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                    oob_score=False, random_state=42, verbose=0,
                    warm_start=False)


### Create Selected Model

In [35]:
model_reg = py_reg.create_model("rf")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,12.2211,241.7784,15.5492,-0.0993,1.3352,1.351
1,11.7215,200.6927,14.1666,-0.0109,1.2284,1.1547
2,12.6383,256.3479,16.0109,-0.0383,1.3173,1.3369
3,10.8538,180.3011,13.4276,0.1138,1.2011,1.1625
4,11.3572,211.0911,14.529,0.0632,1.299,1.0093
5,9.9938,170.209,13.0464,0.1083,1.1746,1.1831
6,12.0451,239.3228,15.4701,0.0902,1.272,1.3455
7,11.5034,214.8352,14.6573,0.1628,1.1753,1.0818
8,11.5808,220.7306,14.857,0.0723,1.1057,1.1833
9,12.5722,239.0163,15.4602,0.1489,1.2834,1.1973


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

### Tune Selected Model

In [36]:
tuned_model_reg = py_reg.tune_model(model_reg)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,12.1072,234.844,15.3246,-0.0677,1.3292,1.3184
1,11.2822,191.9621,13.855,0.0331,1.2211,1.0409
2,12.4648,244.6708,15.642,0.009,1.3813,1.2765
3,10.7754,175.4239,13.2448,0.1377,1.245,1.1462
4,11.3832,207.3846,14.4009,0.0796,1.3178,1.0169
5,9.7389,164.7418,12.8352,0.137,1.189,1.0654
6,11.8156,230.9354,15.1966,0.1221,1.3116,1.214
7,11.6055,222.3423,14.9111,0.1336,1.2322,1.082
8,11.5662,211.0827,14.5287,0.1128,1.1775,1.1118
9,12.684,244.3433,15.6315,0.1299,1.395,1.0785


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [37]:
print(tuned_model_reg)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='squared_error',
                      max_depth=5, max_features=1.0, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.01,
                      min_samples_leaf=2, min_samples_split=7,
                      min_weight_fraction_leaf=0.0, n_estimators=280, n_jobs=-1,
                      oob_score=False, random_state=42, verbose=0,
                      warm_start=False)


### Evaluate Model

In [38]:
py_reg.evaluate_model(tuned_model_reg)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [39]:
# py_reg.interpret_model(tuned_model_reg)

In [40]:
train_predictions_reg = py_reg.predict_model(tuned_model_reg, data=X_train)

In [41]:
test_predictions_reg = py_reg.predict_model(tuned_model_reg, data=X_test)

In [42]:
train_mae = mean_absolute_error(train_predictions_reg["prediction_label"], y_train_reg)
train_r2 = r2_score(train_predictions_reg["prediction_label"], y_train_reg)

test_mae = mean_absolute_error(test_predictions_reg["prediction_label"], y_test_reg)
test_r2 = r2_score(test_predictions_reg["prediction_label"], y_test_reg)

In [43]:
print(f"Train MAE: {train_mae:.2f}")
print(f"Train R2: {train_r2:.2f}")
print(f"Test MAE: {test_mae:.2f}")
print(f"Test R2: {test_r2:.2f}")

Train MAE: 9.96
Train R2: -4.62
Test MAE: 10.52
Test R2: -10.19


### Model Finalization and Storage

In [44]:
final_model_reg = py_reg.finalize_model(tuned_model_reg)

In [45]:
problem_type = "Regression"
base_model = "RandomForest"
train_performance = round(train_mae, 2)
test_performance = round(test_mae, 2)

model_id = f"{problem_type}_{base_model}_{train_performance}_{test_performance}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

model_id

'Regression_RandomForest_9.96_10.52_2024-01-25_22-12-33'

In [46]:
py_reg.save_model(final_model_reg, f"../models/AutoML/{model_id}")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['day_of_season', 'home_team_rest',
                                              'road_team_rest', 'home_win_pct',
                                              'road_win_pct', 'home_win_pct_l2w',
                                              'road_win_pct_l2w', 'home_avg_pts',
                                              'road_avg_pts', 'home_avg_pts_l2w',
                                              'road_avg_pts_l2w',
                                              'home_avg_oeff', 'road_avg_oeff',
                                              'home_avg_oeff...
                 ('actual_estimator',
                  RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                        criterion='squared_error', max_depth=5,
                                        max_features=

<a name="autodl-classification"></a>

## AutoDL Classification

In [47]:
ak_cls = ak.StructuredDataClassifier(
    max_trials=10,
    overwrite=True,
    loss="accuracy",
    seed=42,
)

In [48]:
ak_cls.fit(X_train, y_train_cls)

Trial 10 Complete [00h 00m 06s]
val_accuracy: 0.5872340202331543

Best val_accuracy So Far: 0.5872340202331543
Total elapsed time: 00h 00m 56s
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets


INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets


<keras.src.callbacks.History at 0x7fd1a4ff4c40>

In [49]:
# Evaluate the best model with testing data.
print(ak_cls.evaluate(X_test, y_test_cls))

[0.7032389640808105, 0.49318182468414307]


In [50]:
ak_cls_model = ak_cls.export_model()
ak_cls_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 39)]              0         
                                                                 
 multi_category_encoding (M  (None, 39)                0         
 ultiCategoryEncoding)                                           
                                                                 
 normalization (Normalizati  (None, 39)                79        
 on)                                                             
                                                                 
 dense (Dense)               (None, 32)                1280      
                                                                 
 re_lu (ReLU)                (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                1056  

<a id=evaluate></a>

### Evaluate Model

In [51]:
train_pred = ak_cls_model.predict(X_train)
test_pred = ak_cls_model.predict(X_test)



In [52]:
train_pred = train_pred.flatten()
train_pred_labels = [True if x > 0.5 else False for x in train_pred]

In [53]:
test_pred = test_pred.flatten()
test_pred_labels = [True if x > 0.5 else False for x in test_pred]

In [54]:
train_accuracy = accuracy_score(y_train_cls, train_pred_labels)
train_precision = precision_score(y_train_cls, train_pred_labels)
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Training Precision: {train_precision:.2f}")

Training Accuracy: 0.60
Training Precision: 0.62


In [55]:
test_accuracy = accuracy_score(y_test_cls, test_pred_labels)
test_auc = precision_score(y_test_cls, test_pred_labels)
print(f"Testing Accuracy: {test_accuracy:.2f}")
print(f"Testing Precision: {test_precision:.2f}")

Testing Accuracy: 0.49
Testing Precision: 0.39


### Model Storage

In [56]:
problem_type = "Classification"
base_model = "AutoKeras"
train_performance = round(train_accuracy * 100)
test_performance = round(test_accuracy * 100)

model_id = f"{problem_type}_{base_model}_{train_performance}_{test_performance}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

model_id

'Classification_AutoKeras_60_49_2024-01-25_22-13-39'

In [57]:
ak_cls_model.save(f"../models/AutoDL/{model_id}", save_format="tf")

INFO:tensorflow:Assets written to: ../models/AutoDL/Classification_AutoKeras_60_49_2024-01-25_22-13-39/assets


INFO:tensorflow:Assets written to: ../models/AutoDL/Classification_AutoKeras_60_49_2024-01-25_22-13-39/assets


<a name="autodl-regression"></a>

## AutoDL Regression

In [58]:
ak_reg = ak.StructuredDataRegressor(
    max_trials=10,
    overwrite=True,
    loss="mae",
    seed=42,
)

In [59]:
ak_reg.fit(X_train, y_train_reg)

Trial 10 Complete [00h 00m 06s]
val_loss: 12.362330436706543

Best val_loss So Far: 12.128240585327148
Total elapsed time: 00h 01m 00s
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
INFO:tensorflow:Assets written to: ./structured_data_regressor/best_model/assets


INFO:tensorflow:Assets written to: ./structured_data_regressor/best_model/assets


<keras.src.callbacks.History at 0x7fd0f4c9fca0>

In [60]:
print(ak_reg.evaluate(X_test, y_test_reg))

[10.816460609436035, 189.7711944580078]


In [61]:
ak_reg_model = ak_reg.export_model()
ak_reg_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 39)]              0         
                                                                 
 multi_category_encoding (M  (None, 39)                0         
 ultiCategoryEncoding)                                           
                                                                 
 normalization (Normalizati  (None, 39)                79        
 on)                                                             
                                                                 
 dense (Dense)               (None, 32)                1280      
                                                                 
 re_lu (ReLU)                (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 128)               4224  

<a id=evaluate></a>

### Evaluate Model

In [62]:
train_pred = ak_reg_model.predict(X_train)
test_pred = ak_reg_model.predict(X_test)



In [63]:
train_mae = mean_absolute_error(y_train_reg, train_pred)
train_r2 = r2_score(y_train_reg, train_pred)
print(f"Training MAE: {train_mae:.2f}")
print(f"Training R2: {train_r2:.2f}")

Training MAE: 10.98
Training R2: 0.13


In [64]:
test_mae = mean_absolute_error(y_test_reg, test_pred)
test_r2 = r2_score(y_test_reg, test_pred)
print(f"Testing MAE: {test_mae:.2f}")
print(f"Testing R2: {test_r2:.2f}")

Testing MAE: 10.82
Testing R2: -0.02


### Model Storage

In [65]:
problem_type = "Regression"
base_model = "AutoKeras"
train_performance = round(train_mae, 2)
test_performance = round(test_mae, 2)

model_id = f"{problem_type}_{base_model}_{train_performance}_{test_performance}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

model_id

'Regression_AutoKeras_10.98_10.82_2024-01-25_22-14-51'

In [66]:
ak_reg_model.save(f"../models/AutoDL/{model_id}", save_format="tf")

INFO:tensorflow:Assets written to: ../models/AutoDL/Regression_AutoKeras_10.98_10.82_2024-01-25_22-14-51/assets


INFO:tensorflow:Assets written to: ../models/AutoDL/Regression_AutoKeras_10.98_10.82_2024-01-25_22-14-51/assets
