# AutoML

PyCaret
* Main Site - https://pycaret.org/
* Docs - https://pycaret.readthedocs.io/en/latest/

## Table of Contents

* [Setup and Preprocessing](#setup)  
* [Compare Models](#compare)  
* [Create Model](#create)  
* [Tune Model](#tune)  
* [Evaluate Model](#evaluate)  
* [Finalize and Store Model](#finalize_and_store)  

## Imports and Global Settings

In [1]:
import os
from urllib.parse import quote_plus

import mlflow
import pandas as pd
from sqlalchemy import create_engine
from pycaret.regression import *

# Pandas display settings: allow wide/long frames to render and show 5 decimal places
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
pd.set_option('display.max_info_columns', 200)
pd.set_option('display.precision', 5)


# MLflow tracking location. Honor the MLFLOW_TRACKING_URI environment variable
# when it is set so the notebook is not tied to one machine's absolute path;
# otherwise fall back to the original local file store.
mlflow.set_tracking_uri(
    os.environ.get(
        'MLFLOW_TRACKING_URI',
        'file:/home/jeff/Documents/Data_Science_Projects/NBA_Betting/models/AutoML',
    )
)

## Database Connection

In [2]:
# Connection settings are read from environment variables so credentials do not
# have to be hard-coded in the notebook. Each default is the value that was
# previously written here, so behavior is unchanged when nothing is exported.
username = os.environ.get('NBA_DB_USERNAME', 'postgres')
password = os.environ.get('NBA_DB_PASSWORD', '')
endpoint = os.environ.get('NBA_DB_ENDPOINT', '')
database = os.environ.get('NBA_DB_NAME', 'nba_betting')
port = os.environ.get('NBA_DB_PORT', '5432')

# `port` used to be defined but never placed in the URL; include it so a
# non-default port is honored. The password is URL-escaped because characters
# such as '@' or '/' would otherwise corrupt the connection string.
# NOTE(review): the connection is never closed in this notebook; call
# connection.close() once the data has been loaded if that matters.
connection = create_engine(
    f'postgresql+psycopg2://{username}:{quote_plus(password)}@{endpoint}:{port}/{database}'
).connect()

### Datasets

In [3]:
# Read the whole `model_ready` table into a DataFrame over the open connection.
df = pd.read_sql_table(table_name='model_ready', con=connection)

<a id='basic_data_overview'></a>

## Basic Data Overview

In [4]:
# Summarize the loaded table: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18869 entries, 0 to 18868
Data columns (total 69 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   game_id                    18869 non-null  object 
 1   TARGET_actual_home_margin  18862 non-null  float64
 2   home_team_num              18869 non-null  int64  
 3   away_team_num              18869 non-null  int64  
 4   league_year_end            18869 non-null  int64  
 5   home_line                  18866 non-null  float64
 6   fd_line_home               26 non-null     float64
 7   dk_line_home               0 non-null      float64
 8   covers_consenses_home      25 non-null     float64
 9   wins                       18625 non-null  float64
 10  losses                     18625 non-null  float64
 11  win_pct                    18625 non-null  float64
 12  expected_wins              18625 non-null  float64
 13  expected_losses            18625 non-null  flo

In [5]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,game_id,TARGET_actual_home_margin,home_team_num,away_team_num,league_year_end,home_line,fd_line_home,dk_line_home,covers_consenses_home,wins,losses,win_pct,expected_wins,expected_losses,home_ppg,home_papg,away_wins,away_losses,away_win_pct,away_expected_wins,away_expected_losses,away_ppg,away_papg,g,mp,pts,ast,trb,blk,stl,tov,pf,drb,orb,fg,fga,fg_pct,fg2,fg2a,fg2_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,away_g,away_mp,away_pts,away_ast,away_trb,away_blk,away_stl,away_tov,away_pf,away_drb,away_orb,away_fg,away_fga,away_fg_pct,away_fg2,away_fg2a,away_fg2_pct,away_fg3,away_fg3a,away_fg3_pct,away_ft,away_fta,away_fta_pct
0,20211020NYKBOS,4.0,20,3,22,-2.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,20211101NYKTOR,-9.0,20,28,22,-7.0,,,,5.0,1.0,0.833,4.0,2.0,117.0,109.8,4.0,3.0,0.571,4.0,3.0,103.7,99.7,6.0,1490.0,702.0,143.0,276.0,38.0,46.0,77.0,122.0,218.0,58.0,254.0,546.0,0.465,154.0,300.0,0.513,100.0,246.0,0.407,94.0,129.0,0.729,7.0,1680.0,726.0,138.0,339.0,25.0,78.0,93.0,130.0,238.0,101.0,274.0,643.0,0.426,200.0,415.0,0.482,74.0,228.0,0.325,104.0,139.0,0.748
2,20211110CHIDAL,10.0,5,7,22,-3.5,,,,7.0,3.0,0.7,7.0,3.0,108.9,101.9,7.0,3.0,0.7,4.0,6.0,102.4,104.7,10.0,2400.0,1089.0,229.0,446.0,54.0,88.0,119.0,175.0,352.0,94.0,403.0,873.0,0.462,308.0,604.0,0.51,95.0,269.0,0.353,188.0,219.0,0.858,10.0,2400.0,1024.0,205.0,459.0,46.0,67.0,129.0,198.0,355.0,104.0,377.0,874.0,0.431,251.0,477.0,0.526,126.0,397.0,0.317,144.0,203.0,0.709
3,20211108CHIBKN,23.0,5,2,22,1.0,,,,6.0,3.0,0.667,6.0,3.0,107.9,102.7,7.0,3.0,0.7,6.0,4.0,106.1,103.3,9.0,2160.0,971.0,208.0,390.0,49.0,80.0,110.0,158.0,311.0,79.0,357.0,775.0,0.461,271.0,537.0,0.505,86.0,238.0,0.361,171.0,199.0,0.859,10.0,2400.0,1061.0,245.0,457.0,54.0,61.0,147.0,189.0,386.0,71.0,390.0,834.0,0.468,259.0,487.0,0.532,131.0,347.0,0.378,150.0,193.0,0.777
4,20211112BOSMIL,9.0,3,17,22,-5.5,,,,5.0,6.0,0.455,6.0,5.0,108.4,108.2,6.0,6.0,0.5,6.0,6.0,108.1,107.9,11.0,2765.0,1192.0,250.0,499.0,72.0,94.0,152.0,219.0,386.0,113.0,439.0,1011.0,0.434,301.0,597.0,0.504,138.0,414.0,0.333,176.0,228.0,0.772,12.0,2880.0,1297.0,261.0,557.0,54.0,91.0,155.0,217.0,433.0,124.0,475.0,1082.0,0.439,300.0,592.0,0.507,175.0,490.0,0.357,172.0,233.0,0.738


## PyCaret

<a id=setup></a>

### Setup and Preprocessing

In [6]:
# Columns removed before modeling. game_id is a per-game identifier string
# (e.g. '20211020NYKBOS'), not a numeric feature.
features_to_drop = ['game_id']

In [7]:
# Build the modeling frame without the columns in features_to_drop; `df` is not modified.
model_ready_df = df.drop(labels=features_to_drop, axis='columns')

In [8]:
# model_ready_df.info()

`setup()` accepts a large number of options; see the PyCaret regression API docs for the full list:   
https://pycaret.readthedocs.io/en/latest/api/regression.html#module-pycaret.regression

In [9]:
# Keyword arguments passed to pycaret.regression.setup().
# session_id pins the random seed so the train/test split and cross-validation
# results are reproducible on Restart & Run All; 5773 is the seed PyCaret
# picked at random for the run whose outputs are recorded in this notebook.
setup_params = {'session_id': 5773,
                # MLflow experiment logging
                'log_experiment': True,
                'log_profile': False,
                'log_plots': False,
                'experiment_name': 'NBA_Betting_1',
                # Data, target and hold-out split
                'data': model_ready_df,
                'target': 'TARGET_actual_home_margin',
                'train_size': 0.7,
                # Preprocessing switches (all optional transforms are disabled)
                'preprocess': True,
                'normalize': False,        # zscore
                'transformation': False,   # yeo-johnson power transform to make data more Gaussian
                'remove_outliers': False,  # using SVD
                'remove_multicollinearity': False,
                'polynomial_features': False,
                'trigonometry_features': False,
                'feature_interaction': False,
                'feature_ratio': False,
                'feature_selection': False,
                'feature_selection_threshold': 0.8,
                'pca': False,
                'pca_components': 10,
                # Force league_year_end to be treated as numeric
                'numeric_features': ['league_year_end'],
                # Betting-line columns excluded from the feature set
                'ignore_features': ['dk_line_home', 'fd_line_home', 'covers_consenses_home', 'home_line']
               }

In [10]:
# Initialize the PyCaret regression experiment with the options in setup_params.
nba_betting_regression = setup(**setup_params)

Unnamed: 0,Description,Value
0,session_id,5773
1,Target,TARGET_actual_home_margin
2,Original Data,"(18869, 68)"
3,Missing Values,True
4,Numeric Features,63
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(13202, 45)"


<a id=compare></a>

### Compare Models

In [11]:
# Score the candidate regressors (results table below) and keep the top three (n_select=3).
best_3_models = compare_models(n_select=3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
ridge,Ridge Regression,9.8547,157.432,12.5458,0.1479,1.1709,1.1372,0.026
lr,Linear Regression,9.8576,157.5317,12.5497,0.1473,1.168,1.1414,0.313
br,Bayesian Ridge,9.8647,157.6421,12.5539,0.1468,1.1823,1.1301,0.048
en,Elastic Net,9.8687,157.6942,12.5559,0.1465,1.1939,1.1206,0.138
lasso,Lasso Regression,9.8791,157.9341,12.5655,0.1453,1.2038,1.1126,0.143
gbr,Gradient Boosting Regressor,9.867,158.7134,12.5972,0.1408,1.1385,1.1488,5.425
lightgbm,Light Gradient Boosting Machine,9.9533,161.2539,12.6974,0.1271,1.1244,1.1868,0.737
catboost,CatBoost Regressor,9.9824,161.8603,12.7215,0.1237,1.13,1.2038,10.779
omp,Orthogonal Matching Pursuit,9.9815,161.9649,12.7257,0.1231,1.1945,1.1223,0.025
ada,AdaBoost Regressor,10.0221,162.788,12.7574,0.1189,1.2444,1.1082,1.61


<a id=create></a>

### Create Selected Model

In [12]:
# Train Ridge Regression ('ridge'), the model listed first in the comparison table.
model = create_model('ridge')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,9.725,150.5853,12.2713,0.1398,1.1581,1.1829
1,9.702,158.3527,12.5838,0.1211,1.1764,1.1531
2,9.9576,162.3506,12.7417,0.1509,1.1757,1.1383
3,9.8999,160.2026,12.6571,0.1367,1.1767,1.139
4,9.8038,152.6466,12.355,0.1662,1.1726,1.0879
5,9.9089,156.3398,12.5036,0.1719,1.1731,1.0925
6,9.8331,159.6592,12.6356,0.1478,1.1616,1.083
7,10.0361,161.0688,12.6913,0.1522,1.2,1.1614
8,9.6066,149.6817,12.2344,0.1498,1.1539,1.2072
9,10.074,163.4329,12.7841,0.1423,1.1611,1.1269


<a id=tune></a>

### Tune Selected Model

In [13]:
# Tune the hyperparameters of the ridge model created in the previous cell.
tuned_model = tune_model(model)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,9.7263,150.6466,12.2738,0.1394,1.1587,1.1828
1,9.7027,158.376,12.5848,0.121,1.1757,1.1528
2,9.9612,162.4127,12.7441,0.1506,1.176,1.1383
3,9.8999,160.1573,12.6553,0.1369,1.1774,1.1379
4,9.8033,152.6705,12.356,0.166,1.1718,1.0873
5,9.9121,156.378,12.5051,0.1717,1.1724,1.0929
6,9.8294,159.6204,12.6341,0.148,1.1632,1.0814
7,10.0374,161.1449,12.6943,0.1518,1.1996,1.1617
8,9.606,149.6473,12.233,0.15,1.1544,1.2064
9,10.0769,163.5463,12.7885,0.1417,1.1612,1.1269


<a id=evaluate></a>

### Evaluate Model

https://pycaret.readthedocs.io/en/latest/api/regression.html#pycaret.regression.evaluate_model

In [14]:
# Show PyCaret's interactive plot selector ("Plot Type") for the tuned model.
evaluate_model(tuned_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

https://pycaret.readthedocs.io/en/latest/api/regression.html#pycaret.regression.interpret_model

In [15]:
# interpret_model(tuned_model)

<a id=finalize_and_store></a>

### Model Finalization and Storage

In [16]:
# Finalize the tuned model; per the PyCaret docs, finalize_model refits it on the
# complete dataset, including the hold-out split.
final_model = finalize_model(tuned_model)

In [17]:
# save_model(final_model, '../models/AutoML/Baseline_Ridge_Reg_PyCaret')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True,
                                       features_todrop=['dk_line_home',
                                                        'fd_line_home',
                                                        'covers_consenses_home',
                                                        'home_line'],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=['league_year_end'],
                                       target='TARGET_actual_home_margin',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categor...
                 ('fix_perfect', Remove_100(target='TARGET_actual_home_margin')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_se

In [19]:
# !mlflow ui