# AutoML with PyCaret

PyCaret
* Main Site - https://pycaret.org/
* Docs - https://pycaret.readthedocs.io/en/latest/

## Table of Contents

* [Regression](#Regression)
    * [Setup and Preprocessing](#setup)  
    * [Compare Models](#compare)  
    * [Create Model](#create)  
    * [Tune Model](#tune)  
    * [Evaluate Model](#evaluate)  
    * [Finalize and Store Model](#finalize_and_store)
* [Classification](#Classification)
    * [Setup and Preprocessing](#setup_cls)  
    * [Compare Models](#compare_cls)  
    * [Create Model](#create_cls)  
    * [Tune Model](#tune_cls)  
    * [Evaluate Model](#evaluate_cls)  
    * [Finalize and Store Model](#finalize_and_store_cls)

## Imports and Global Settings

In [1]:
import os
import datetime
import json
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sklearn.metrics import accuracy_score, precision_score
from pycaret.classification import ClassificationExperiment
from pycaret.regression import RegressionExperiment

load_dotenv()
RDS_ENDPOINT = os.getenv("RDS_ENDPOINT")
RDS_PASSWORD = os.getenv("RDS_PASSWORD")

# Pandas Settings
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)
pd.options.display.max_info_columns = 200
pd.options.display.precision = 5

## Load Model Training Data

In [2]:
username = "postgres"
password = RDS_PASSWORD
endpoint = RDS_ENDPOINT
database = "nba_betting"
port = "5432"

connection = create_engine(
    f"postgresql+psycopg2://{username}:{password}@{endpoint}/{database}"
).connect()

In [3]:
df = pd.read_csv("all_data.csv", parse_dates=["game_date"])

### Restrict to Specific Years

<a id='basic_data_overview'></a>

## Basic Data Overview

In [4]:
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3481 entries, 0 to 3480
Data columns (total 59 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   game_id                          3481 non-null   int64         
 1   game_date                        3481 non-null   datetime64[ns]
 2   home_team                        3481 non-null   object        
 3   away_team                        3481 non-null   object        
 4   home_score                       3481 non-null   float64       
 5   away_score                       3481 non-null   float64       
 6   spread                           3481 non-null   float64       
 7   season                           3481 non-null   object        
 8   day_of_week                      3481 non-null   int64         
 9   month                            3481 non-null   int64         
 10  month_of_season                  3481 non-null   int64      

In [5]:
df.head(10)

Unnamed: 0,game_id,game_date,home_team,away_team,home_score,away_score,spread,season,day_of_week,month,month_of_season,week_of_season,total_score,actual_score_diff_hv,vegas_score_diff_hv,vegas_miss,vegas_miss_zscore,vegas_miss_abs,vegas_miss_abs_zscore,days_since_last_game_home,days_since_last_game_away,rest_diff_hv,home_team_last_5,away_team_last_5,home_team_streak,away_team_streak,home_team_win_pct,away_team_win_pct,home_team_avg_point_diff,away_team_avg_point_diff,home_team_avg_point_diff_last_5,away_team_avg_point_diff_last_5,last_5_hv,streak_hv,win_pct_hv,point_diff_hv,point_diff_last_5_hv,avg_score,adjusted_vegas_miss_abs,min_stats_home_sum,pts_stats_home_sum,plus_minus_stats_home_sum,min_box_home_sum,pts_box_home_sum,plus_minus_box_home_sum,min_stats_away_sum,pts_stats_away_sum,plus_minus_stats_away_sum,min_box_away_sum,pts_box_away_sum,plus_minus_box_away_sum,pts_stats_home_wavg,pts_stats_away_wavg,plus_minus_stats_home_wavg,plus_minus_stats_away_wavg,pts_box_home_wavg,plus_minus_box_home_wavg,pts_box_away_wavg,plus_minus_box_away_wavg
0,22000005,2020-12-25,MIA,NOP,111.0,98.0,-3.0,2020-2021,4,12,1,1,209.0,13.0,3.0,10.0,0.76887,10.0,-0.04251,2.0,2.0,0.0,1.0,1.0,-1.0,-1.0,0.0,0.0,-6.0,14.0,,,0.0,0.0,0.0,-20.0,,104.5,0.08921,212.0,102.0,-23.0,189,90,41,236.2,110.0,72.0,238,98,-65,14.55189,14.77858,-4.00047,9.85605,13.1164,6.91005,14.10084,-6.2437
1,22000006,2020-12-25,MIL,GSW,138.0,99.0,-9.5,2020-2021,4,12,1,1,237.0,39.0,9.5,29.5,2.24413,29.5,2.32919,2.0,3.0,-1.0,1.0,1.0,-1.0,-1.0,0.0,0.0,-1.0,-26.0,,,0.0,0.0,0.0,25.0,,118.5,0.26318,240.1,121.0,-5.0,198,114,167,240.1,99.0,-130.0,241,99,-195,17.03124,9.08205,0.7272,-13.97251,12.66162,18.28788,9.36515,-16.58091
2,22000007,2020-12-25,BOS,BKN,95.0,123.0,3.0,2020-2021,4,12,1,1,218.0,-28.0,-3.0,-25.0,-1.87903,25.0,1.78188,2.0,3.0,-1.0,1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,26.0,,,0.0,0.0,0.0,-25.0,,109.0,0.22303,227.6,118.0,10.0,222,90,-120,240.1,125.0,130.0,241,123,140,15.41696,11.05206,1.84139,14.56601,11.54955,-15.21171,13.47303,14.97925
3,22000008,2020-12-25,LAL,DAL,138.0,115.0,-6.0,2020-2021,4,12,1,1,253.0,23.0,6.0,17.0,1.29845,17.0,0.80887,3.0,2.0,1.0,1.0,1.0,-1.0,-1.0,1.0,0.0,-7.0,-4.0,,,0.0,0.0,1.0,-3.0,,126.5,0.15166,240.2,109.0,-35.0,233,135,107,240.1,102.0,-20.0,228,105,-110,12.22606,12.41649,-3.72356,-3.32278,14.53219,10.48927,12.38158,-11.34649
4,22000009,2020-12-25,DEN,LAC,108.0,121.0,3.5,2020-2021,4,12,1,1,229.0,-13.0,-3.5,-9.5,-0.70639,9.5,-0.10332,2.0,3.0,-1.0,1.0,1.0,-1.0,-1.0,1.0,0.0,-2.0,7.0,,,0.0,0.0,1.0,-9.0,,114.5,0.08475,265.1,122.0,-10.0,233,108,-63,239.9,116.0,35.0,239,121,65,14.99774,14.16465,0.35383,5.10088,13.86695,-7.54936,13.27197,8.49372
5,22000021,2020-12-26,MEM,ATL,112.0,122.0,-2.5,2020-2021,5,12,1,1,234.0,-10.0,2.5,-12.5,-0.93335,12.5,0.26156,3.0,3.0,0.0,1.0,1.0,-1.0,-1.0,1.0,0.0,-12.0,20.0,,,0.0,0.0,1.0,-32.0,,117.0,0.11152,215.6,116.0,-53.0,224,105,-37,210.6,111.0,81.0,239,122,50,15.60204,12.62583,-4.47681,11.02089,14.17411,-2.45536,14.02092,7.00418
6,22000023,2020-12-26,DET,CLE,119.0,128.0,-1.0,2020-2021,5,12,1,1,247.0,-9.0,1.0,-10.0,-0.74422,10.0,-0.04251,3.0,3.0,0.0,1.0,1.0,-1.0,-1.0,0.0,1.0,-10.0,7.0,,,0.0,0.0,-1.0,-17.0,,123.5,0.08921,240.1,101.0,-50.0,285,119,-44,201.2,111.0,23.0,240,109,39,11.27489,16.66302,-7.10162,3.53231,15.2386,-5.63509,17.09167,8.2375
7,22000024,2020-12-26,WAS,ORL,120.0,130.0,-2.0,2020-2021,5,12,1,1,250.0,-10.0,2.0,-12.0,-0.89552,12.0,0.20074,3.0,3.0,0.0,1.0,1.0,-1.0,-1.0,0.0,1.0,-6.0,6.0,,,0.0,0.0,-1.0,-12.0,,125.0,0.10706,234.0,103.0,-38.0,220,120,-45,240.1,113.0,30.0,240,130,50,12.74786,12.92961,-1.35684,3.0858,15.25,-4.30909,14.47917,5.5
8,22000025,2020-12-26,NYK,PHI,89.0,109.0,8.0,2020-2021,5,12,1,1,198.0,-20.0,-8.0,-12.0,-0.89552,12.0,0.20074,3.0,3.0,0.0,1.0,1.0,-1.0,-1.0,0.0,1.0,-14.0,6.0,,,0.0,0.0,-1.0,-20.0,,99.0,0.10706,204.2,93.0,-71.0,222,85,-101,240.1,113.0,30.0,241,109,100,13.68022,13.53811,-8.0,3.20783,11.85135,-12.47297,12.48133,13.17842
9,22000026,2020-12-26,CHI,IND,106.0,125.0,6.0,2020-2021,5,12,1,1,231.0,-19.0,-6.0,-13.0,-0.97118,13.0,0.32237,3.0,3.0,0.0,1.0,1.0,-1.0,-1.0,1.0,1.0,-20.0,14.0,,,0.0,0.0,0.0,-34.0,,115.5,0.11598,240.1,104.0,-100.0,182,83,-91,240.1,121.0,70.0,231,121,83,11.50604,15.24781,-12.74261,7.81383,10.7033,-12.92857,14.17749,9.35498


## Data Preparation

### Create Targets

In [6]:
df["REG_TARGET"] = df["actual_score_diff_hv"]
df["CLS_TARGET"] = df["actual_score_diff_hv"] > df["vegas_score_diff_hv"]

### Convert Features to Numeric

### Select Features

In [7]:
df.columns

Index(['game_id', 'game_date', 'home_team', 'away_team', 'home_score',
       'away_score', 'spread', 'season', 'day_of_week', 'month',
       'month_of_season', 'week_of_season', 'total_score',
       'actual_score_diff_hv', 'vegas_score_diff_hv', 'vegas_miss',
       'vegas_miss_zscore', 'vegas_miss_abs', 'vegas_miss_abs_zscore',
       'days_since_last_game_home', 'days_since_last_game_away',
       'rest_diff_hv', 'home_team_last_5', 'away_team_last_5',
       'home_team_streak', 'away_team_streak', 'home_team_win_pct',
       'away_team_win_pct', 'home_team_avg_point_diff',
       'away_team_avg_point_diff', 'home_team_avg_point_diff_last_5',
       'away_team_avg_point_diff_last_5', 'last_5_hv', 'streak_hv',
       'win_pct_hv', 'point_diff_hv', 'point_diff_last_5_hv', 'avg_score',
       'adjusted_vegas_miss_abs', 'min_stats_home_sum', 'pts_stats_home_sum',
       'plus_minus_stats_home_sum', 'min_box_home_sum', 'pts_box_home_sum',
       'plus_minus_box_home_sum', 'min_stats_aw

In [8]:
features_to_use = ["vegas_score_diff_hv"]

### Train Test Split

In [9]:
testing_df = df[df["season"] == "2022-2023"]
training_df = df[df["season"] != "2022-2023"]

In [10]:
cls_training_df = training_df[["game_date"] + features_to_use + ["CLS_TARGET"]]
cls_testing_df = testing_df[["game_date"] + features_to_use + ["CLS_TARGET"]]
reg_training_df = training_df[["game_date"] + features_to_use + ["REG_TARGET"]]
reg_testing_df = testing_df[["game_date"] + features_to_use + ["REG_TARGET"]]

### Baselines

In [11]:
training_MAE = df[df["season"] != "2022-2023"]["vegas_miss_abs"].mean()
testing_MAE = df[df["season"] == "2022-2023"]["vegas_miss_abs"].mean()
print(f"Training MAE: {training_MAE:.2f}")
print(f"Testing MAE: {testing_MAE:.2f}")

Training MAE: 10.66
Testing MAE: 9.74


<a id=Regression></a>

## Regression

In [12]:
py_reg = RegressionExperiment()

<a id=setup></a>

### Setup and Preprocessing

The setup process involves a lot of options. Reference the docs below:   
https://pycaret.readthedocs.io/en/latest/api/regression.html#module-pycaret.regression

In [14]:
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

setup_params_reg = {
    "log_experiment": True,
    "log_profile": False,
    "log_plots": False,
    "experiment_name": f"REG_1_{timestamp}",
    "data": reg_training_df,
    "test_data": reg_testing_df,
    "target": "REG_TARGET",
    "preprocess": False,
    "normalize": False,  # zscore
    "transformation": False,  # yeo-johnson power transform to make data more Gaussian
    "remove_outliers": False,  # using SVD
    "remove_multicollinearity": False,
    "feature_selection": False,
    "pca": False,
    "pca_components": 10,
    "numeric_features": [],
    "ignore_features": ["game_date"],
}

In [15]:
py_reg.setup(**setup_params_reg)

Unnamed: 0,Description,Value
0,Session id,4213
1,Target,REG_TARGET
2,Target type,Regression
3,Original data shape,"(3481, 3)"
4,Transformed data shape,"(3481, 2)"
5,Transformed train set shape,"(2272, 2)"
6,Transformed test set shape,"(1209, 2)"
7,Ignore features,1
8,Numeric features,1


2023/07/03 16:09:17 INFO mlflow.tracking.fluent: Experiment with name 'REG_1_20230703160905' does not exist. Creating a new experiment.


<pycaret.regression.oop.RegressionExperiment at 0x7f393a552560>

<a id=compare></a>

### Compare Models

In [16]:
best_model_reg = py_reg.compare_models(turbo=False, sort="MAE")

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
svm,Support Vector Regression,10.6531,186.1697,13.632,0.1953,0.967,1.1632,0.146
en,Elastic Net,10.6568,184.8376,13.5851,0.2006,1.0162,1.1637,0.104
lasso,Lasso Regression,10.6569,184.8386,13.5851,0.2006,1.0162,1.1638,0.101
llar,Lasso Least Angle Regression,10.6569,184.8386,13.5851,0.2006,1.0162,1.1638,0.088
br,Bayesian Ridge,10.6576,184.8115,13.5841,0.2007,1.0098,1.1734,0.097
ard,Automatic Relevance Determination,10.6576,184.8115,13.5841,0.2007,1.0098,1.1734,0.104
ridge,Ridge Regression,10.6577,184.8106,13.5841,0.2007,1.0093,1.1743,0.105
lr,Linear Regression,10.6577,184.8106,13.5841,0.2007,1.0093,1.1743,0.46
omp,Orthogonal Matching Pursuit,10.6577,184.8106,13.5841,0.2007,1.0093,1.1743,0.088
lar,Least Angle Regression,10.6577,184.8106,13.5841,0.2007,1.0093,1.1743,0.088


Processing:   0%|          | 0/109 [00:00<?, ?it/s]

In [17]:
print(best_model_reg)

SVR()


<a id=create></a>

### Create Selected Model

In [18]:
model_reg = py_reg.create_model("svm")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,11.5166,210.9315,14.5235,0.1084,0.9569,1.3176
1,10.3618,157.5607,12.5523,0.1803,0.9815,1.2497
2,10.696,193.4858,13.9099,0.1678,1.0214,1.1186
3,10.9224,195.3114,13.9754,0.2153,0.9574,1.1187
4,9.8479,169.8398,13.0323,0.2844,0.8886,1.0585
5,10.3495,166.8436,12.9168,0.1996,1.016,1.1457
6,10.552,190.2404,13.7928,0.1635,0.9881,1.1748
7,10.5134,185.9596,13.6367,0.1194,0.9851,1.1682
8,10.5124,190.4341,13.7998,0.2563,0.9566,1.1952
9,11.2588,201.0904,14.1806,0.2583,0.9183,1.085


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

<a id=tune></a>

### Tune Selected Model

In [19]:
tuned_model_reg = py_reg.tune_model(model_reg)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,11.5559,211.9224,14.5576,0.1043,0.9575,1.3318
1,10.3655,157.9912,12.5695,0.1781,0.9813,1.2441
2,10.6733,193.9972,13.9283,0.1656,1.0206,1.1265
3,10.9145,194.1193,13.9327,0.2201,0.9611,1.1275
4,9.8063,168.02,12.9623,0.2921,0.8835,1.0703
5,10.361,166.825,12.9161,0.1997,1.0243,1.1635
6,10.5797,190.6748,13.8085,0.1616,0.9637,1.1884
7,10.5279,186.4538,13.6548,0.1171,1.0021,1.1698
8,10.4941,189.0006,13.7478,0.2619,0.9517,1.2016
9,11.2463,199.5824,14.1274,0.2639,0.9179,1.0978


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [20]:
print(tuned_model_reg)

SVR(C=3.641, epsilon=1.1, shrinking=False)


<a id=evaluate></a>

### Evaluate Model

https://pycaret.readthedocs.io/en/latest/api/regression.html#pycaret.regression.evaluate_model

In [21]:
py_reg.evaluate_model(tuned_model_reg)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

https://pycaret.readthedocs.io/en/latest/api/regression.html#pycaret.regression.interpret_model

In [22]:
# py_reg.interpret_model(tuned_model_reg)

In [23]:
train_predictions_reg = py_reg.predict_model(tuned_model_reg, data=reg_training_df)
test_predictions_reg = py_reg.predict_model(tuned_model_reg, data=reg_testing_df)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Support Vector Regression,10.6142,184.9288,13.5989,0.2048,0.9641,1.1697


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Support Vector Regression,9.8072,156.2392,12.4996,0.1529,0.943,1.2369


In [25]:
def evaluate_reg_model(df, vegas_column, actual_column, prediction_column, display=True):
    # create prediction side column
    df["pred_side"] = df.apply(
        lambda row: "away" if row[prediction_column] < row[vegas_column] else "home",
        axis=1,
    )
    # create actual side column
    df["actual_side"] = df.apply(
        lambda row: "away" if row[actual_column] < row[vegas_column] else "home", axis=1
    )
    # create which is closer column
    df["closer_to_target"] = df.apply(
        lambda row: abs(row[actual_column] - row[prediction_column])
        < abs(row[actual_column] - row[vegas_column]),
        axis=1,
    )

    accuracy = accuracy_score(df["actual_side"], df["pred_side"])
    precision = ""

    if display:
        # print % of Trues in the which is closer column
        closer_to_target_percent = df["closer_to_target"].mean() * 100
        print(
            f"Prediction is closer to target in {closer_to_target_percent:.2f}% of cases"
        )
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision}")

    return accuracy, precision, df

In [27]:
train_acc_reg, train_prec_reg, train_prediction_df_reg = evaluate_reg_model(
    train_predictions_reg, "vegas_score_diff_hv", "REG_TARGET", "prediction_label"
)
test_acc_reg, test_prec_reg, test_prediction_df_reg = evaluate_reg_model(
    test_predictions_reg, "vegas_score_diff_hv", "REG_TARGET", "prediction_label"
)

Prediction is closer to target in 52.02% of cases
Accuracy: 0.5251
Precision: 
Prediction is closer to target in 47.39% of cases
Accuracy: 0.4830
Precision: 


In [28]:
def calculate_roi(df, actual_column, pred_column, pred_prob=None):
    # Adding new columns to the DataFrame to track if the prediction was correct or not
    df["even_win"] = df.apply(
        lambda row: 100 if row[actual_column] == row[pred_column] else -100, axis=1
    )
    df["typical_win"] = df.apply(
        lambda row: 91 if row[actual_column] == row[pred_column] else -100, axis=1
    )

    # Calculate total ROIs
    total_roi_even = df["even_win"].sum()
    total_roi_typical = df["typical_win"].sum()

    # Calculate average ROIs per bet
    average_roi_even = round(total_roi_even / df.shape[0], 2)
    average_roi_typical = round(total_roi_typical / df.shape[0], 2)

    # Prepare result as a DataFrame
    result = pd.DataFrame(
        {
            "Label": ["All Bets, Even Amount", "All Bets, Typical Odds"],
            "Total ROI": [total_roi_even, total_roi_typical],
            "Average ROI per Bet": [average_roi_even, average_roi_typical],
        }
    )

    # Add extra rows if pred_prob is not None
    if pred_prob is not None:
        cutoffs = [0.50, 0.55, 0.60, 0.65, 0.70]
        for cutoff in cutoffs:
            filtered_df = df[df[pred_prob] > cutoff]

            # Calculate total ROIs for filtered DataFrame
            total_roi_even_filtered = filtered_df["even_win"].sum()
            total_roi_typical_filtered = filtered_df["typical_win"].sum()

            # Calculate average ROIs per bet for filtered DataFrame
            average_roi_even_filtered = (
                round(total_roi_even_filtered / filtered_df.shape[0], 2)
                if not filtered_df.empty
                else 0
            )
            average_roi_typical_filtered = (
                round(total_roi_typical_filtered / filtered_df.shape[0], 2)
                if not filtered_df.empty
                else 0
            )

            # Add new rows to result
            new_rows = pd.DataFrame(
                {
                    "Label": [
                        f"Cutoff {int(cutoff*100)}% Bets, Even Odds",
                        f"Cutoff {int(cutoff*100)}% Bets, Typical Odds",
                    ],
                    "Total ROI": [total_roi_even_filtered, total_roi_typical_filtered],
                    "Average ROI per Bet": [
                        average_roi_even_filtered,
                        average_roi_typical_filtered,
                    ],
                }
            )
            result = pd.concat([result, new_rows], ignore_index=True)

        # Calculate ROI using Kelly Criterion
        df["bet_fraction"] = df[pred_prob].map(lambda p: 2 * p - 1 if p > 0.5 else 0)
        total_roi_even_kelly = 0
        total_roi_typical_kelly = 0

        # Group the dataframe by date
        grouped = df.groupby("game_date")

        # Process each group (i.e., each date's games) separately
        for _, group in grouped:
            total_fraction = group["bet_fraction"].sum()
            group["bet_fraction"] /= total_fraction
            group["bet_size"] = group["bet_fraction"] * 100 * group.shape[0]
            group["even_win_kelly"] = group.apply(
                lambda row: row["bet_size"]
                if row[actual_column] == row[pred_column]
                else -row["bet_size"],
                axis=1,
            )
            group["typical_win_kelly"] = group.apply(
                lambda row: 0.91 * row["bet_size"]
                if row[actual_column] == row[pred_column]
                else -row["bet_size"],
                axis=1,
            )
            total_roi_even_kelly += group["even_win_kelly"].sum()
            total_roi_typical_kelly += group["typical_win_kelly"].sum()

        average_roi_even_kelly = round(total_roi_even_kelly / df.shape[0], 2)
        average_roi_typical_kelly = round(total_roi_typical_kelly / df.shape[0], 2)

        new_rows_kelly = pd.DataFrame(
            {
                "Label": [
                    "All Bets, Even Amount, Kelly Criterion",
                    "All Bets, Typical Odds, Kelly Criterion",
                ],
                "Total ROI": [total_roi_even_kelly, total_roi_typical_kelly],
                "Average ROI per Bet": [
                    average_roi_even_kelly,
                    average_roi_typical_kelly,
                ],
            }
        )
        result = pd.concat([result, new_rows_kelly], ignore_index=True)

    return result

In [33]:
roi_results_reg = calculate_roi(test_prediction_df_reg, "actual_side", "pred_side")
roi_results_reg

Unnamed: 0,Label,Total ROI,Average ROI per Bet
0,"All Bets, Even Amount",-4100,-3.39
1,"All Bets, Typical Odds",-9356,-7.74


<a id=finalize_and_store></a>

### Model Finalization and Storage

In [34]:
model_name = "REG_SVM_1"

features_reg = features_to_use
notes = "Single feature: vegas_score_diff_hv"

In [35]:
def save_metrics(
    name,
    notes,
    filename,
    roi_results,
    train_accuracy,
    test_accuracy,
    train_precision,
    test_precision,
    features,
):
    # Convert the metrics to a DataFrame
    metrics_df = pd.DataFrame(
        {
            "train_accuracy": [train_accuracy],
            "test_accuracy": [test_accuracy],
            "train_precision": [train_precision],
            "test_precision": [test_precision],
        }
    )

    # Get current timestamp
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # Create a dictionary for the JSON
    json_dict = {
        "name": name,
        "timestamp": timestamp,
        "notes": notes,
        "roi": roi_results.to_dict("records"),
        "metrics": metrics_df.to_dict("records"),
        "features": features,
    }

    # Check if the file exists
    if os.path.exists(filename):
        # If it does, load the existing data
        with open(filename, "r") as f:
            data = json.load(f)
    else:
        # If it doesn't, create an empty list to hold the data
        data = []

    # Append the new data
    data.append(json_dict)

    # Write the data to the file
    with open(filename, "w") as f:
        json.dump(data, f, indent=4)

In [37]:
metrics_filename = "../models/model_metrics.json"

# An example usage of the function would be
save_metrics(
    model_name,
    notes,
    metrics_filename,
    roi_results_reg,
    train_accuracy=train_acc_reg,
    test_accuracy=test_acc_reg,
    train_precision=train_prec_reg,
    test_precision=test_prec_reg,
    features=features_reg,
)

In [38]:
final_model_reg = py_reg.finalize_model(tuned_model_reg)

In [39]:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
py_reg.save_model(final_model_reg, f"../models/AutoML/{model_name}_{timestamp}")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('placeholder', None),
                 ('actual_estimator',
                  SVR(C=3.641, epsilon=1.1, shrinking=False))]),
 '../models/AutoML/REG_SVM_1_20230703_161804.pkl')

<a id=Classification></a>

## Classification

<a id=setup_cls></a>

### Setup and Preprocessing

In [40]:
py_cls = ClassificationExperiment()

The setup process involves a lot of options. Reference the docs below:   
https://pycaret.readthedocs.io/en/latest/api/regression.html#module-pycaret.regression

In [41]:
setup_params_cls = {
    "log_experiment": True,
    "log_profile": False,
    "log_plots": False,
    "experiment_name": "CLS_1",
    "data": cls_training_df,
    "test_data": cls_testing_df,
    "target": "CLS_TARGET",
    "preprocess": False,
    "normalize": False,  # zscore
    "transformation": False,  # yeo-johnson power transform to make data more Gaussian
    "remove_outliers": False,  # using SVD
    "remove_multicollinearity": False,
    "polynomial_features": False,
    "feature_selection": False,
    "pca": False,
    "pca_components": 10,
    "numeric_features": [],
    "ignore_features": ["game_date"],
}

In [42]:
py_cls.setup(**setup_params_cls)

Unnamed: 0,Description,Value
0,Session id,1823
1,Target,CLS_TARGET
2,Target type,Binary
3,Original data shape,"(3481, 3)"
4,Transformed data shape,"(3481, 2)"
5,Transformed train set shape,"(2272, 2)"
6,Transformed test set shape,"(1209, 2)"
7,Ignore features,1
8,Numeric features,1


2023/07/03 16:18:46 INFO mlflow.tracking.fluent: Experiment with name 'CLS_1' does not exist. Creating a new experiment.


<pycaret.classification.oop.ClassificationExperiment at 0x7f3933fd8d60>

<a id=compare_cls></a>

### Compare Models

In [43]:
best_model_cls = py_cls.compare_models(turbo=False, sort="Accuracy")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.5233,0.492,0.0,0.0,0.0,0.0,0.0,0.552
nb,Naive Bayes,0.5233,0.5199,0.0,0.0,0.0,0.0,0.0,0.119
ridge,Ridge Classifier,0.5233,0.0,0.0,0.0,0.0,0.0,0.0,0.114
dummy,Dummy Classifier,0.5233,0.5,0.0,0.0,0.0,0.0,0.0,0.134
lda,Linear Discriminant Analysis,0.5233,0.492,0.0,0.0,0.0,0.0,0.0,0.117
qda,Quadratic Discriminant Analysis,0.5233,0.5199,0.0,0.0,0.0,0.0,0.0,0.113
mlp,MLP Classifier,0.5194,0.5193,0.2657,0.5156,0.3354,0.0165,0.0256,0.178
gbc,Gradient Boosting Classifier,0.5119,0.5182,0.4007,0.4842,0.4369,0.0139,0.0139,0.241
gpc,Gaussian Process Classifier,0.5101,0.5217,0.3481,0.4814,0.4027,0.0059,0.0063,3.761
rbfsvm,SVM - Radial Kernel,0.5084,0.4947,0.3518,0.4792,0.4037,0.0028,0.0031,0.584


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

In [44]:
print(best_model_cls)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1823, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


<a id=create_cls></a>

### Create Selected Model

In [45]:
model_cls = py_cls.create_model("lr")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5219,0.4657,0.0,0.0,0.0,0.0,0.0
1,0.5219,0.5241,0.0,0.0,0.0,0.0,0.0
2,0.5198,0.5316,0.0,0.0,0.0,0.0,0.0
3,0.5242,0.5076,0.0,0.0,0.0,0.0,0.0
4,0.5242,0.5535,0.0,0.0,0.0,0.0,0.0
5,0.5242,0.4586,0.0,0.0,0.0,0.0,0.0
6,0.5242,0.4697,0.0,0.0,0.0,0.0,0.0
7,0.5242,0.4706,0.0,0.0,0.0,0.0,0.0
8,0.5242,0.4407,0.0,0.0,0.0,0.0,0.0
9,0.5242,0.4974,0.0,0.0,0.0,0.0,0.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

<a id=tune_cls></a>

### Tune Selected Model

In [48]:
tuned_model_cls = py_cls.tune_model(model_cls)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5219,0.4657,0.0,0.0,0.0,0.0,0.0
1,0.5219,0.5241,0.0,0.0,0.0,0.0,0.0
2,0.5198,0.5316,0.0,0.0,0.0,0.0,0.0
3,0.5242,0.5076,0.0,0.0,0.0,0.0,0.0
4,0.5242,0.5535,0.0,0.0,0.0,0.0,0.0
5,0.5242,0.4586,0.0,0.0,0.0,0.0,0.0
6,0.5242,0.4697,0.0,0.0,0.0,0.0,0.0
7,0.5242,0.4706,0.0,0.0,0.0,0.0,0.0
8,0.5242,0.4407,0.0,0.0,0.0,0.0,0.0
9,0.5242,0.4974,0.0,0.0,0.0,0.0,0.0


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [49]:
print(tuned_model_cls)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1823, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


<a id=evaluate_cls></a>

### Evaluate Model

https://pycaret.readthedocs.io/en/latest/api/regression.html#pycaret.regression.evaluate_model

In [50]:
py_cls.evaluate_model(tuned_model_cls)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

https://pycaret.readthedocs.io/en/latest/api/regression.html#pycaret.regression.interpret_model

In [51]:
# py_cls.interpret_model(tuned_model_cls)

In [52]:
train_predictions_cls = py_cls.predict_model(tuned_model_cls, data=cls_training_df)
test_predictions_cls = py_cls.predict_model(tuned_model_cls, data=cls_testing_df)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5233,0.5048,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.4955,0.4935,0.0,0.0,0.0,0.0,0.0


In [53]:
test_predictions_cls.head()

Unnamed: 0,game_date,vegas_score_diff_hv,CLS_TARGET,prediction_label,prediction_score
2272,2022-10-21,-7.0,False,0,0.5309
2273,2022-10-21,2.0,False,0,0.5231
2274,2022-10-21,2.0,False,0,0.5231
2275,2022-10-21,8.5,True,0,0.5175
2276,2022-10-21,2.5,True,0,0.5227


In [54]:
train_predictions_cls["CLS_TARGET"] = train_predictions_cls["CLS_TARGET"].map(
    {True: "Home", False: "Away"}
)
train_predictions_cls["prediction_label"] = train_predictions_cls[
    "prediction_label"
].map({1: "Home", 0: "Away"})
test_predictions_cls["CLS_TARGET"] = test_predictions_cls["CLS_TARGET"].map(
    {True: "Home", False: "Away"}
)
test_predictions_cls["prediction_label"] = test_predictions_cls["prediction_label"].map(
    {1: "Home", 0: "Away"}
)

In [56]:
def calculate_cls_metrics(df):
    # Accuracy is the proportion of true results (both true positives and true negatives) in the population
    accuracy = accuracy_score(df["CLS_TARGET"], df["prediction_label"])

    # Precision is the proportion of true positives against all positive results (both true positives and false positives)
    precision = precision_score(
        df["CLS_TARGET"], df["prediction_label"], pos_label="Home"
    )  # assuming 'Home' is the positive label

    return accuracy, precision

In [57]:
train_acc_cls, train_prec_cls = calculate_cls_metrics(train_predictions_cls)
test_acc_cls, test_prec_cls = calculate_cls_metrics(test_predictions_cls)

In [58]:
roi_results_cls = calculate_roi(
    test_predictions_cls, "CLS_TARGET", "prediction_label", pred_prob="prediction_score"
)
roi_results_cls

Unnamed: 0,Label,Total ROI,Average ROI per Bet
0,"All Bets, Even Amount",-1100.0,-0.91
1,"All Bets, Typical Odds",-6491.0,-5.37
2,"Cutoff 50% Bets, Even Odds",-1100.0,-0.91
3,"Cutoff 50% Bets, Typical Odds",-6491.0,-5.37
4,"Cutoff 55% Bets, Even Odds",0.0,0.0
5,"Cutoff 55% Bets, Typical Odds",0.0,0.0
6,"Cutoff 60% Bets, Even Odds",0.0,0.0
7,"Cutoff 60% Bets, Typical Odds",0.0,0.0
8,"Cutoff 65% Bets, Even Odds",0.0,0.0
9,"Cutoff 65% Bets, Typical Odds",0.0,0.0


<a id=finalize_and_store_cls></a>

### Model Finalization and Storage

In [59]:
model_name = "CLS_LR_1"

features_cls = features_to_use
notes = "Single feature: vegas_score_diff_hv"

In [60]:
metrics_filename = "../models/model_metrics.json"

# An example usage of the function would be
save_metrics(
    model_name,
    notes,
    metrics_filename,
    roi_results_cls,
    train_accuracy=train_acc_cls,
    test_accuracy=test_acc_cls,
    train_precision=train_prec_cls,
    test_precision=test_prec_cls,
    features=features_cls,
)

In [61]:
final_model_cls = py_cls.finalize_model(tuned_model_cls)

In [62]:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
py_cls.save_model(final_model_cls, f"../models/AutoML/{model_name}_{timestamp}")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('placeholder', None),
                 ('actual_estimator',
                  LogisticRegression(C=1.0, class_weight=None, dual=False,
                                     fit_intercept=True, intercept_scaling=1,
                                     l1_ratio=None, max_iter=1000,
                                     multi_class='auto', n_jobs=None,
                                     penalty='l2', random_state=1823,
                                     solver='lbfgs', tol=0.0001, verbose=0,
                                     warm_start=False))],
          verbose=False),
 '../models/AutoML/CLS_LR_1_20230703_163242.pkl')