# NBA AI - AutoML and AutoDL

PyCaret
* Main Site - https://pycaret.org/
* Docs - https://pycaret.readthedocs.io/en/latest/

AutoKeras
* Main Site - https://autokeras.com/

## Table of Contents

* [Data Setup](#data-setup)
* [AutoML Classification](#automl-classification)
* [AutoML Regression](#automl-regression)
* [AutoDL Classification](#autodl-classification)
* [AutoDL Regression](#autodl-regression)

### Imports and Global Settings

In [1]:
import datetime
import pandas as pd
from pycaret.classification import ClassificationExperiment
from pycaret.regression import RegressionExperiment
import autokeras as ak
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    accuracy_score,
    precision_score,
)

# Pandas Settings
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
pd.options.display.max_info_columns = 200
pd.options.display.precision = 5

Using TensorFlow backend


2024-01-01 19:27:41.891577: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-01 19:27:41.940890: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-01 19:27:41.941785: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Load Data

In [2]:
df_2021_2022 = pd.read_csv("../data/nba_ai/cleaned_data_2021-2022.csv")

<a name="data-setup"></a>

## Data Preparation

### Train Test Split

In [3]:
def prepare_datasets(train_df, cls_target, reg_target, test_df=None, test_size=0.3):
    """
    Prepares datasets for training and testing for both classification and regression targets,
    ensuring time-sensitive splitting based on a 'date' column.

    Parameters:
    train_df (DataFrame): The training dataframe.
    cls_target (str): The name of the classification target column.
    reg_target (str): The name of the regression target column.
    test_df (DataFrame, optional): An optional testing dataframe. If not provided, a portion of the training data is used.
    test_size (float, optional): The proportion of the dataset to include in the test split (if test_df is not provided).

    Returns:
    tuple: A tuple containing six dataframes:
           (X_train, X_test, y_train_cls, y_test_cls, y_train_reg, y_test_reg).
    """

    # Sort the dataframe based on the 'date' column
    train_df = train_df.sort_values(by="date")

    # If a test dataframe is not provided, split the training dataframe
    if test_df is None:
        X_train, X_test, y_train, y_test = train_test_split(
            train_df.drop([cls_target, reg_target], axis=1),
            train_df[[cls_target, reg_target]],
            test_size=test_size,
            shuffle=False,  # Important to maintain time order
        )
    else:
        # If a test dataframe is provided, ensure it is also sorted by date
        test_df = test_df.sort_values(by="date")

        # Use provided test dataframe and separate features and targets
        X_train = train_df.drop([cls_target, reg_target], axis=1)
        y_train = train_df[[cls_target, reg_target]]
        X_test = test_df.drop([cls_target, reg_target], axis=1)
        y_test = test_df[[cls_target, reg_target]]

    # Separate classification and regression targets
    y_train_cls = y_train[[cls_target]]
    y_train_reg = y_train[[reg_target]]
    y_test_cls = y_test[[cls_target]]
    y_test_reg = y_test[[reg_target]]

    return X_train, X_test, y_train_cls, y_test_cls, y_train_reg, y_test_reg

In [4]:
X_train, X_test, y_train_cls, y_test_cls, y_train_reg, y_test_reg = prepare_datasets(
    df_2021_2022, "CLS_TARGET", "REG_TARGET", test_df=None
)

### Features

In [5]:
betting_feature_set = [
    "home_opening_spread",
    "road_opening_spread",
    "opening_total",
    "home_closing_spread",
    "road_closing_spread",
    "closing_total",
    "home_moneyline",
    "road_moneyline",
]

base_feature_set = [
    "day_of_season",
    "home_win_pct",
    "road_win_pct",
    "home_win_pct_l2w",
    "road_win_pct_l2w",
    "home_avg_pts",
    "road_avg_pts",
    "home_avg_pts_l2w",
    "road_avg_pts_l2w",
    "home_avg_oeff",
    "road_avg_oeff",
    "home_avg_oeff_l2w",
    "road_avg_oeff_l2w",
    "home_avg_deff",
    "road_avg_deff",
    "home_avg_deff_l2w",
    "road_avg_deff_l2w",
    "home_avg_eFG%",
    "road_avg_eFG%",
    "home_avg_eFG%_l2w",
    "road_avg_eFG%_l2w",
    "home_avg_TOV%",
    "road_avg_TOV%",
    "home_avg_TOV%_l2w",
    "road_avg_TOV%_l2w",
    "home_avg_ORB%",
    "road_avg_ORB%",
    "home_avg_ORB%_l2w",
    "road_avg_ORB%_l2w",
    "home_avg_FT%",
    "road_avg_FT%",
    "home_avg_FT%_l2w",
    "road_avg_FT%_l2w",
    "home_avg_pts_allowed",
    "road_avg_pts_allowed",
    "home_avg_pts_allowed_l2w",
    "road_avg_pts_allowed_l2w",
]

features_to_prepare = [
    "home_team",
    "road_team",
    "home_team_rest_days",
    "road_team_rest_days",
    "home_team_starting_lineup",
    "road_team_starting_lineup",
]

In [6]:
X_train = X_train[base_feature_set]
X_test = X_test[base_feature_set]

### Combined Data

In [7]:
combined_train_df = pd.concat([X_train, y_train_cls, y_train_reg], axis=1)
combined_test_df = pd.concat([X_test, y_test_cls, y_test_reg], axis=1)

In [8]:
combined_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 926 entries, 0 to 925
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   day_of_season             926 non-null    int64  
 1   home_win_pct              926 non-null    float64
 2   road_win_pct              926 non-null    float64
 3   home_win_pct_l2w          926 non-null    float64
 4   road_win_pct_l2w          926 non-null    float64
 5   home_avg_pts              926 non-null    float64
 6   road_avg_pts              926 non-null    float64
 7   home_avg_pts_l2w          926 non-null    float64
 8   road_avg_pts_l2w          926 non-null    float64
 9   home_avg_oeff             926 non-null    float64
 10  road_avg_oeff             926 non-null    float64
 11  home_avg_oeff_l2w         926 non-null    float64
 12  road_avg_oeff_l2w         926 non-null    float64
 13  home_avg_deff             926 non-null    float64
 14  road_avg_d

In [9]:
combined_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 397 entries, 926 to 1322
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   day_of_season             397 non-null    int64  
 1   home_win_pct              397 non-null    float64
 2   road_win_pct              397 non-null    float64
 3   home_win_pct_l2w          397 non-null    float64
 4   road_win_pct_l2w          397 non-null    float64
 5   home_avg_pts              397 non-null    float64
 6   road_avg_pts              397 non-null    float64
 7   home_avg_pts_l2w          397 non-null    float64
 8   road_avg_pts_l2w          397 non-null    float64
 9   home_avg_oeff             397 non-null    float64
 10  road_avg_oeff             397 non-null    float64
 11  home_avg_oeff_l2w         397 non-null    float64
 12  road_avg_oeff_l2w         397 non-null    float64
 13  home_avg_deff             397 non-null    float64
 14  road_av

<a name="automl-classification"></a>

## AutoML Classification

### Setup and Preprocessing

In [10]:
py_cls = ClassificationExperiment()

In [11]:
setup_params_cls = {
    "data": combined_train_df,
    "test_data": combined_test_df,
    "target": "CLS_TARGET",
    "ignore_features": ["REG_TARGET"],
}

In [12]:
py_cls.setup(**setup_params_cls)

Unnamed: 0,Description,Value
0,Session id,2231
1,Target,CLS_TARGET
2,Target type,Binary
3,Original data shape,"(1323, 39)"
4,Transformed data shape,"(1323, 38)"
5,Transformed train set shape,"(926, 38)"
6,Transformed test set shape,"(397, 38)"
7,Ignore features,1
8,Numeric features,37
9,Preprocess,True


<pycaret.classification.oop.ClassificationExperiment at 0x7f3d068d3190>

### Compare Models

In [13]:
best_model_cls = py_cls.compare_models(
    turbo=True, sort="Accuracy", exclude=["catboost"]
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.5412,0.0,0.4024,0.5472,0.4406,0.0668,0.0797,0.038
dummy,Dummy Classifier,0.5313,0.5,0.0,0.0,0.0,0.0,0.0,0.056
lr,Logistic Regression,0.5292,0.5474,0.3679,0.5261,0.4086,0.0398,0.0505,0.456
nb,Naive Bayes,0.5281,0.5459,0.0341,0.0455,0.039,-0.0027,-0.0028,0.041
lda,Linear Discriminant Analysis,0.5271,0.5531,0.419,0.5105,0.4435,0.0421,0.048,0.05
qda,Quadratic Discriminant Analysis,0.5097,0.4927,0.4405,0.4693,0.4511,0.0112,0.01,0.043
svm,SVM - Linear Kernel,0.5011,0.0,0.4227,0.2258,0.286,-0.0104,-0.0054,0.043
et,Extra Trees Classifier,0.4945,0.5024,0.3473,0.4781,0.3739,-0.0288,-0.0207,0.384
dt,Decision Tree Classifier,0.4914,0.4867,0.4162,0.4076,0.41,-0.0266,-0.0263,0.05
rf,Random Forest Classifier,0.4847,0.4656,0.3247,0.3965,0.3451,-0.0495,-0.0483,0.551


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [14]:
print(best_model_cls)

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, positive=False, random_state=2231, solver='auto',
                tol=0.0001)


### Create Selected Model

In [15]:
model_cls = py_cls.create_model("ridge")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5484,0.0,0.3636,0.5333,0.4324,0.0792,0.0832
1,0.5054,0.0,0.4545,0.4762,0.4651,0.0056,0.0056
2,0.5484,0.0,0.6364,0.5185,0.5714,0.1045,0.107
3,0.4731,0.0,0.5,0.449,0.4731,-0.0507,-0.051
4,0.5161,0.0,0.4651,0.4762,0.4706,0.0252,0.0252
5,0.5269,0.0,0.3953,0.4857,0.4359,0.0358,0.0364
6,0.5761,0.0,0.3721,0.5714,0.4507,0.13,0.1379
7,0.5217,0.0,0.2791,0.48,0.3529,0.0141,0.0154
8,0.5978,0.0,0.3953,0.6071,0.4789,0.1746,0.1853
9,0.5978,0.0,0.1628,0.875,0.2745,0.1499,0.2521


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

### Tune Selected Model

In [16]:
tuned_model_cls = py_cls.tune_model(model_cls)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5699,0.0,0.3636,0.5714,0.4444,0.121,0.1292
1,0.5376,0.0,0.4773,0.5122,0.4941,0.0694,0.0695
2,0.5376,0.0,0.6364,0.5091,0.5657,0.0843,0.0867
3,0.4624,0.0,0.4773,0.4375,0.4565,-0.0734,-0.0737
4,0.5376,0.0,0.4651,0.5,0.4819,0.0654,0.0656
5,0.5269,0.0,0.3953,0.4857,0.4359,0.0358,0.0364
6,0.5761,0.0,0.3488,0.5769,0.4348,0.1274,0.1378
7,0.4891,0.0,0.2326,0.4167,0.2985,-0.0546,-0.0604
8,0.5761,0.0,0.3488,0.5769,0.4348,0.1274,0.1378
9,0.587,0.0,0.1395,0.8571,0.24,0.1256,0.2242


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [17]:
print(tuned_model_cls)

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, positive=False, random_state=2231, solver='auto',
                tol=0.0001)


### Evaluate Model

In [18]:
py_cls.evaluate_model(tuned_model_cls)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [19]:
# py_cls.interpret_model(tuned_model_cls)

In [20]:
train_predictions_cls = py_cls.predict_model(tuned_model_cls, data=X_train)

In [21]:
test_predictions_cls = py_cls.predict_model(tuned_model_cls, data=X_test)

In [22]:
train_accuracy = accuracy_score(train_predictions_cls["prediction_label"], y_train_cls)
train_precision = precision_score(
    train_predictions_cls["prediction_label"], y_train_cls
)

test_accuracy = accuracy_score(test_predictions_cls["prediction_label"], y_test_cls)
test_precision = precision_score(test_predictions_cls["prediction_label"], y_test_cls)

In [23]:
print(f"Train Accuracy: {train_accuracy:.2f}")
print(f"Train Precision: {train_precision:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test Precision: {test_precision:.2f}")

Train Accuracy: 0.58
Train Precision: 0.43
Test Accuracy: 0.49
Test Precision: 0.10


### Model Finalization and Storage

In [24]:
final_model_cls = py_cls.finalize_model(tuned_model_cls)

In [25]:
problem_type = "Classification"
base_model = "Ridge"
train_performance = round(train_accuracy * 100)
test_performance = round(test_accuracy * 100)

model_id = f"{problem_type}_{base_model}_{train_performance}_{test_performance}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

model_id

'Classification_Ridge_58_49_2024-01-01_19-28-33'

In [26]:
# py_cls.save_model(final_model_cls, f"../models/AutoML/{model_id}")

<a name="automl-regression"></a>

## AutoML Regression

### Setup and Preprocessing

In [27]:
py_reg = RegressionExperiment()

In [28]:
setup_params_reg = {
    "data": combined_train_df,
    "test_data": combined_test_df,
    "target": "REG_TARGET",
    "ignore_features": ["CLS_TARGET"],
}

In [29]:
py_reg.setup(**setup_params_reg)

Unnamed: 0,Description,Value
0,Session id,6331
1,Target,REG_TARGET
2,Target type,Regression
3,Original data shape,"(1323, 39)"
4,Transformed data shape,"(1323, 38)"
5,Transformed train set shape,"(926, 38)"
6,Transformed test set shape,"(397, 38)"
7,Ignore features,1
8,Numeric features,37
9,Preprocess,True


<pycaret.regression.oop.RegressionExperiment at 0x7f3d06b93820>

### Compare Models

In [30]:
best_model_reg = py_reg.compare_models(turbo=True, sort="MAE", exclude=["catboost"])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
ridge,Ridge Regression,11.5953,215.7389,14.6203,0.0261,1.3017,1.204,0.051
br,Bayesian Ridge,11.6106,214.8984,14.6018,0.0307,1.434,1.1316,0.05
en,Elastic Net,11.615,215.2043,14.6079,0.0297,1.3771,1.1641,0.049
lasso,Lasso Regression,11.6336,215.8861,14.6318,0.0264,1.3962,1.1608,0.056
llar,Lasso Least Angle Regression,11.6338,215.8878,14.6319,0.0264,1.3962,1.1608,0.045
lr,Linear Regression,11.681,220.0833,14.7612,0.0054,1.2484,1.2623,0.047
et,Extra Trees Regressor,11.6935,214.9843,14.614,0.0322,1.237,1.263,0.486
huber,Huber Regressor,11.7142,221.2243,14.7802,0.0029,1.3663,1.1825,0.066
ada,AdaBoost Regressor,11.7232,215.1913,14.6335,0.0258,1.1623,1.314,0.255
rf,Random Forest Regressor,11.8098,216.866,14.6825,0.0222,1.2397,1.2658,1.03


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

In [31]:
print(best_model_reg)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None, positive=False,
      random_state=6331, solver='auto', tol=0.0001)


### Create Selected Model

In [32]:
model_reg = py_reg.create_model("ridge")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,13.7807,307.3384,17.5311,-0.395,1.2256,1.6979
1,11.0267,187.3829,13.6888,-0.0155,1.3058,1.1949
2,11.818,208.64,14.4444,0.055,1.3157,1.0732
3,12.22,264.3194,16.2579,-0.0493,1.2371,1.412
4,11.7251,203.6416,14.2703,0.08,1.2917,1.0816
5,10.3634,171.7124,13.1039,0.1198,1.1946,1.0789
6,12.3453,237.1913,15.401,0.0729,1.4645,0.9499
7,9.6131,155.8291,12.4832,0.1217,1.2033,1.1248
8,11.815,201.8327,14.2068,0.1637,1.4881,1.1739
9,11.2456,219.5009,14.8156,0.1078,1.2908,1.2533


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

### Tune Selected Model

In [33]:
tuned_model_reg = py_reg.tune_model(model_reg)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,13.8446,305.1386,17.4682,-0.385,1.2611,1.6751
1,10.8099,180.5792,13.438,0.0214,1.2728,1.1652
2,11.8503,210.4615,14.5073,0.0468,1.3531,1.0774
3,12.0459,260.7423,16.1475,-0.0351,1.2179,1.355
4,11.7234,202.0993,14.2162,0.087,1.3335,1.0715
5,10.2156,167.8061,12.954,0.1399,1.27,1.0365
6,12.416,239.0653,15.4617,0.0656,1.4474,0.9461
7,9.4573,152.8162,12.3619,0.1387,1.1898,1.0772
8,11.8328,202.7207,14.238,0.16,1.4847,1.1199
9,11.3875,222.6365,14.921,0.0951,1.3697,1.2695


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [34]:
print(tuned_model_reg)

Ridge(alpha=9.7, copy_X=True, fit_intercept=True, max_iter=None, positive=False,
      random_state=6331, solver='auto', tol=0.0001)


### Evaluate Model

In [35]:
py_reg.evaluate_model(tuned_model_reg)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [36]:
# py_reg.interpret_model(tuned_model_reg)

In [37]:
train_predictions_reg = py_reg.predict_model(tuned_model_reg, data=X_train)

In [38]:
test_predictions_reg = py_reg.predict_model(tuned_model_reg, data=X_test)

In [39]:
train_mae = mean_absolute_error(train_predictions_reg["prediction_label"], y_train_reg)
train_r2 = r2_score(train_predictions_reg["prediction_label"], y_train_reg)

test_mae = mean_absolute_error(test_predictions_reg["prediction_label"], y_test_reg)
test_r2 = r2_score(test_predictions_reg["prediction_label"], y_test_reg)

In [40]:
print(f"Train MAE: {train_mae:.2f}")
print(f"Train R2: {train_r2:.2f}")
print(f"Test MAE: {test_mae:.2f}")
print(f"Test R2: {test_r2:.2f}")

Train MAE: 11.16
Train R2: -7.92
Test MAE: 12.54
Test R2: -15.88


### Model Finalization and Storage

In [41]:
final_model_reg = py_reg.finalize_model(tuned_model_reg)

In [42]:
problem_type = "Regression"
base_model = "Ridge"
train_performance = round(train_mae, 2)
test_performance = round(test_mae, 2)

model_id = f"{problem_type}_{base_model}_{train_performance}_{test_performance}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

model_id

'Regression_Ridge_11.16_12.54_2024-01-01_19-29-29'

In [43]:
py_reg.save_model(final_model_reg, f"../models/AutoML/{model_id}")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['day_of_season', 'home_win_pct',
                                              'road_win_pct', 'home_win_pct_l2w',
                                              'road_win_pct_l2w', 'home_avg_pts',
                                              'road_avg_pts', 'home_avg_pts_l2w',
                                              'road_avg_pts_l2w',
                                              'home_avg_oeff', 'road_avg_oeff',
                                              'home_avg_oeff_l2w',
                                              'road_avg_oeff_l2w',
                                              'home_av...
                                                               missing_values=nan,
                                                               strategy='most_frequent',
                        

<a name="autodl-classification"></a>

## AutoDL Classification

In [44]:
ak_cls = ak.StructuredDataClassifier(
    max_trials=10,
    overwrite=True,
    loss="accuracy",
)

In [45]:
ak_cls.fit(X_train, y_train_cls)

Trial 10 Complete [00h 00m 04s]
val_accuracy: 0.5886076092720032

Best val_accuracy So Far: 0.6202531456947327
Total elapsed time: 00h 00m 48s
Epoch 1/2
Epoch 2/2
INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets


INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets


<keras.src.callbacks.History at 0x7f3cd0566620>

In [46]:
# Evaluate the best model with testing data.
print(ak_cls.evaluate(X_test, y_test_cls))

[0.6933649182319641, 0.5314861536026001]


In [47]:
ak_cls_model = ak_cls.export_model()
ak_cls_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 37)]              0         
                                                                 
 multi_category_encoding (M  (None, 37)                0         
 ultiCategoryEncoding)                                           
                                                                 
 normalization (Normalizati  (None, 37)                75        
 on)                                                             
                                                                 
 dense (Dense)               (None, 32)                1216      
                                                                 
 re_lu (ReLU)                (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 1024)              33792 

<a id=evaluate></a>

### Evaluate Model

In [48]:
train_pred = ak_cls_model.predict(X_train)
test_pred = ak_cls_model.predict(X_test)



In [49]:
train_pred = train_pred.flatten()
train_pred_labels = [True if x > 0.5 else False for x in train_pred]

In [50]:
test_pred = test_pred.flatten()
test_pred_labels = [True if x > 0.5 else False for x in test_pred]

In [51]:
train_accuracy = accuracy_score(y_train_cls, train_pred_labels)
train_precision = precision_score(y_train_cls, train_pred_labels)
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Training Precision: {train_precision:.2f}")

Training Accuracy: 0.60
Training Precision: 0.56


In [52]:
test_accuracy = accuracy_score(y_test_cls, test_pred_labels)
test_auc = precision_score(y_test_cls, test_pred_labels)
print(f"Testing Accuracy: {test_accuracy:.2f}")
print(f"Testing Precision: {test_precision:.2f}")

Testing Accuracy: 0.53
Testing Precision: 0.10


### Model Storage

In [53]:
problem_type = "Classification"
base_model = "AutoKeras"
train_performance = round(train_accuracy * 100)
test_performance = round(test_accuracy * 100)

model_id = f"{problem_type}_{base_model}_{train_performance}_{test_performance}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

model_id

'Classification_AutoKeras_60_53_2024-01-01_19-30-28'

In [54]:
ak_cls_model.save(f"../models/AutoDL/{model_id}", save_format="tf")

INFO:tensorflow:Assets written to: ../models/AutoDL/Classification_AutoKeras_60_53_2024-01-01_19-30-28/assets


INFO:tensorflow:Assets written to: ../models/AutoDL/Classification_AutoKeras_60_53_2024-01-01_19-30-28/assets


<a name="autodl-regression"></a>

## AutoDL Regression

In [55]:
ak_reg = ak.StructuredDataRegressor(
    max_trials=10,
    overwrite=True,
    loss="mae",
)

In [56]:
ak_reg.fit(X_train, y_train_reg)

Trial 10 Complete [00h 00m 12s]
val_loss: 11.725468635559082

Best val_loss So Far: 11.679612159729004
Total elapsed time: 00h 01m 40s
Epoch 1/41
Epoch 2/41
Epoch 3/41
Epoch 4/41
Epoch 5/41
Epoch 6/41
Epoch 7/41
Epoch 8/41
Epoch 9/41
Epoch 10/41
Epoch 11/41
Epoch 12/41
Epoch 13/41
Epoch 14/41
Epoch 15/41
Epoch 16/41
Epoch 17/41
Epoch 18/41
Epoch 19/41
Epoch 20/41
Epoch 21/41
Epoch 22/41
Epoch 23/41
Epoch 24/41
Epoch 25/41
Epoch 26/41
Epoch 27/41
Epoch 28/41
Epoch 29/41
Epoch 30/41
Epoch 31/41
Epoch 32/41
Epoch 33/41
Epoch 34/41
Epoch 35/41
Epoch 36/41
Epoch 37/41
Epoch 38/41
Epoch 39/41
Epoch 40/41
Epoch 41/41
INFO:tensorflow:Assets written to: ./structured_data_regressor/best_model/assets


INFO:tensorflow:Assets written to: ./structured_data_regressor/best_model/assets


<keras.src.callbacks.History at 0x7f3d06a3ab60>

In [57]:
print(ak_reg.evaluate(X_test, y_test_reg))

[12.179737091064453, 243.3804473876953]


In [58]:
ak_reg_model = ak_reg.export_model()
ak_reg_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 37)]              0         
                                                                 
 multi_category_encoding (M  (None, 37)                0         
 ultiCategoryEncoding)                                           
                                                                 
 normalization (Normalizati  (None, 37)                75        
 on)                                                             
                                                                 
 dense (Dense)               (None, 32)                1216      
                                                                 
 re_lu (ReLU)                (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                1056  

<a id=evaluate></a>

### Evaluate Model

In [59]:
train_pred = ak_reg_model.predict(X_train)
test_pred = ak_reg_model.predict(X_test)



In [60]:
train_mae = mean_absolute_error(y_train_reg, train_pred)
train_r2 = r2_score(y_train_reg, train_pred)
print(f"Training MAE: {train_mae:.2f}")
print(f"Training R2: {train_r2:.2f}")

Training MAE: 10.25
Training R2: 0.18


In [61]:
test_mae = mean_absolute_error(y_test_reg, test_pred)
test_r2 = r2_score(y_test_reg, test_pred)
print(f"Testing MAE: {test_mae:.2f}")
print(f"Testing R2: {test_r2:.2f}")

Testing MAE: 12.18
Testing R2: 0.06


### Model Storage

In [62]:
problem_type = "Regression"
base_model = "AutoKeras"
train_performance = round(train_mae, 2)
test_performance = round(test_mae, 2)

model_id = f"{problem_type}_{base_model}_{train_performance}_{test_performance}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

model_id

'Regression_AutoKeras_10.25_12.18_2024-01-01_19-32-22'

In [63]:
ak_reg_model.save(f"../models/AutoDL/{model_id}", save_format="tf")

INFO:tensorflow:Assets written to: ../models/AutoDL/Regression_AutoKeras_10.25_12.18_2024-01-01_19-32-22/assets


INFO:tensorflow:Assets written to: ../models/AutoDL/Regression_AutoKeras_10.25_12.18_2024-01-01_19-32-22/assets
