# AutoML

PyCaret
* Main Site - https://pycaret.org/
* Docs - https://pycaret.readthedocs.io/en/latest/

## Table of Contents

* [Basic Data Overview](#basic_data_overview)  
* [Setup and Preprocessing](#setup)  
* [Compare Models](#compare)  
* [Create Selected Model](#create)  
* [Tune Selected Model](#tune)  
* [Evaluate Model](#evaluate)  
* [Model Finalization and Storage](#finalize_and_store)  

## Imports and Global Settings

In [1]:
from pathlib import Path

import mlflow
import pandas as pd
from sqlalchemy import create_engine
from pycaret.regression import *

# Pandas display settings: allow wide/long frames to render in full and
# show floats with 5 digits of precision.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.options.display.max_info_columns = 200
pd.options.display.precision = 5

# MLflow tracking store. Resolved relative to the notebook's working
# directory (the same '../models/AutoML' root used for save_model later in
# this notebook) rather than a machine-specific absolute path, so the
# notebook is runnable from any checkout of the project.
MLFLOW_TRACKING_DIR = Path('../models/AutoML').resolve()
mlflow.set_tracking_uri(MLFLOW_TRACKING_DIR.as_uri())

## Database Connection

In [2]:
# Connection settings for the PostgreSQL database that holds the model data.
# NOTE(review): password and endpoint are blank in this notebook; supply real
# values from the environment or a prompt rather than committing them.
username = 'postgres'
password = ''
endpoint = ''
database = 'nba_betting'
port = '5432'

# `port` is part of the URL so that a non-default port is actually honoured.
connection = create_engine(
    f'postgresql+psycopg2://{username}:{password}@{endpoint}:{port}/{database}'
).connect()

### Datasets

In [3]:
# Load every column of the 'model_ready' table over the open connection.
df = pd.read_sql_table(table_name='model_ready', con=connection)

<a id='basic_data_overview'></a>

## Basic Data Overview

In [4]:
# Summarise column names, non-null counts and dtypes of the loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19833 entries, 0 to 19832
Data columns (total 69 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   game_id                    19833 non-null  object 
 1   TARGET_actual_home_margin  19800 non-null  float64
 2   home_team_num              19833 non-null  int64  
 3   away_team_num              19833 non-null  int64  
 4   league_year_end            19833 non-null  int64  
 5   home_line                  19828 non-null  float64
 6   fd_line_home               587 non-null    float64
 7   dk_line_home               562 non-null    float64
 8   covers_consenses_home      587 non-null    float64
 9   wins                       19586 non-null  float64
 10  losses                     19586 non-null  float64
 11  win_pct                    19586 non-null  float64
 12  expected_wins              19586 non-null  float64
 13  expected_losses            19586 non-null  flo

In [5]:
# Preview the first five rows to sanity-check the loaded values.
df.head(n=5)

Unnamed: 0,game_id,TARGET_actual_home_margin,home_team_num,away_team_num,league_year_end,home_line,fd_line_home,dk_line_home,covers_consenses_home,wins,losses,win_pct,expected_wins,expected_losses,home_ppg,home_papg,away_wins,away_losses,away_win_pct,away_expected_wins,away_expected_losses,away_ppg,away_papg,g,mp,pts,ast,trb,blk,stl,tov,pf,drb,orb,fg,fga,fg_pct,fg2,fg2a,fg2_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,away_g,away_mp,away_pts,away_ast,away_trb,away_blk,away_stl,away_tov,away_pf,away_drb,away_orb,away_fg,away_fga,away_fg_pct,away_fg2,away_fg2a,away_fg2_pct,away_fg3,away_fg3a,away_fg3_pct,away_ft,away_fta,away_fta_pct
0,20211020NYKBOS,4.0,20,3,22,-2.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,20211101NYKTOR,-9.0,20,28,22,-7.0,,,,5.0,1.0,0.833,4.0,2.0,117.0,109.8,4.0,3.0,0.571,4.0,3.0,103.7,99.7,6.0,1490.0,702.0,143.0,276.0,38.0,46.0,77.0,122.0,218.0,58.0,254.0,546.0,0.465,154.0,300.0,0.513,100.0,246.0,0.407,94.0,129.0,0.729,7.0,1680.0,726.0,138.0,339.0,25.0,78.0,93.0,130.0,238.0,101.0,274.0,643.0,0.426,200.0,415.0,0.482,74.0,228.0,0.325,104.0,139.0,0.748
2,20211110CHIDAL,10.0,5,7,22,-3.5,,,,7.0,3.0,0.7,7.0,3.0,108.9,101.9,7.0,3.0,0.7,4.0,6.0,102.4,104.7,10.0,2400.0,1089.0,229.0,446.0,54.0,88.0,119.0,175.0,352.0,94.0,403.0,873.0,0.462,308.0,604.0,0.51,95.0,269.0,0.353,188.0,219.0,0.858,10.0,2400.0,1024.0,205.0,459.0,46.0,67.0,129.0,198.0,355.0,104.0,377.0,874.0,0.431,251.0,477.0,0.526,126.0,397.0,0.317,144.0,203.0,0.709
3,20211108CHIBKN,23.0,5,2,22,1.0,,,,6.0,3.0,0.667,6.0,3.0,107.9,102.7,7.0,3.0,0.7,6.0,4.0,106.1,103.3,9.0,2160.0,971.0,208.0,390.0,49.0,80.0,110.0,158.0,311.0,79.0,357.0,775.0,0.461,271.0,537.0,0.505,86.0,238.0,0.361,171.0,199.0,0.859,10.0,2400.0,1061.0,245.0,457.0,54.0,61.0,147.0,189.0,386.0,71.0,390.0,834.0,0.468,259.0,487.0,0.532,131.0,347.0,0.378,150.0,193.0,0.777
4,20211112BOSMIL,9.0,3,17,22,-5.5,,,,5.0,6.0,0.455,6.0,5.0,108.4,108.2,6.0,6.0,0.5,6.0,6.0,108.1,107.9,11.0,2765.0,1192.0,250.0,499.0,72.0,94.0,152.0,219.0,386.0,113.0,439.0,1011.0,0.434,301.0,597.0,0.504,138.0,414.0,0.333,176.0,228.0,0.772,12.0,2880.0,1297.0,261.0,557.0,54.0,91.0,155.0,217.0,433.0,124.0,475.0,1082.0,0.439,300.0,592.0,0.507,175.0,490.0,0.357,172.0,233.0,0.738


## PyCaret

<a id=setup></a>

### Setup and Preprocessing

In [6]:
# Columns removed before modelling; game_id is a per-game string identifier
# (object dtype), not a numeric feature.
features_to_drop = ['game_id']

In [7]:
# Build the modelling frame as a new object; `df` itself is left unchanged.
model_ready_df = df.drop(features_to_drop, axis='columns')

In [8]:
# model_ready_df.info()

PyCaret's `setup()` accepts many options; see the regression API reference for the full list:   
https://pycaret.readthedocs.io/en/latest/api/regression.html#module-pycaret.regression

In [9]:
# Keyword arguments forwarded to pycaret.regression.setup().
# Most preprocessing switches are explicitly disabled so this run acts as a
# plain baseline; flip individual flags to experiment.
setup_params = {'log_experiment': True,    # log runs to the configured MLflow store
                'log_profile': False,
                'log_plots': False,
                'experiment_name': 'NBA_Betting_1',
                'session_id': 355,         # fixed seed so the split and CV folds are reproducible
                'data': model_ready_df,
                'target': 'TARGET_actual_home_margin',
                'train_size': 0.7,
                'preprocess': True,
                'normalize': False,        # zscore
                'transformation': False,   # yeo-johnson power transform to make data more Gaussian
                'remove_outliers': False,  # using SVD
                'remove_multicollinearity': False,
                'polynomial_features': False,
                'trigonometry_features': False,
                'feature_interaction': False,
                'feature_ratio': False,
                'feature_selection': False,
                'feature_selection_threshold': 0.8,
                'pca': False,
                'pca_components': 10,
                # Force numeric treatment of the season column instead of relying on type inference.
                'numeric_features': ['league_year_end'],
                # Betting-line columns are excluded from the feature set; three of
                # them are populated for only a few hundred of the ~19.8k rows.
                'ignore_features': ['dk_line_home', 'fd_line_home', 'covers_consenses_home', 'home_line']
               }

In [10]:
# Initialise the PyCaret regression experiment from the setup_params dict.
nba_betting_regression = setup(**setup_params)

Unnamed: 0,Description,Value
0,session_id,355
1,Target,TARGET_actual_home_margin
2,Original Data,"(19833, 68)"
3,Missing Values,True
4,Numeric Features,63
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(13861, 47)"


Traceback (most recent call last):
  File "/home/jeff/anaconda3/lib/python3.7/site-packages/mlflow/store/tracking/file_store.py", line 256, in list_experiments
    experiment = self._get_experiment(exp_id, view_type)
  File "/home/jeff/anaconda3/lib/python3.7/site-packages/mlflow/store/tracking/file_store.py", line 336, in _get_experiment
    meta = read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "/home/jeff/anaconda3/lib/python3.7/site-packages/mlflow/utils/file_utils.py", line 175, in read_yaml
    raise MissingConfigException("Yaml file '%s' does not exist." % file_path)
mlflow.exceptions.MissingConfigException: Yaml file '/home/jeff/Documents/Data_Science_Projects/NBA_Betting/models/AutoML/mlruns/meta.yaml' does not exist.
  File "/home/jeff/anaconda3/lib/python3.7/site-packages/pycaret/internal/tabular.py", line 1738, in setup
    mlflow.create_experiment(exp_name_log)
  File "/home/jeff/anaconda3/lib/python3.7/site-packages/mlflow/tracking/fluent.py", line 868, in

INFO:logs:setup() succesfully completed......................................


<a id=compare></a>

### Compare Models

In [11]:
# Train and score the available regressors, keeping the three top-ranked
# estimators as a list.
best_3_models = compare_models(n_select=3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
ridge,Ridge Regression,9.9598,160.6595,12.6724,0.1435,1.1819,1.1335,0.025
br,Bayesian Ridge,9.9697,160.8969,12.6817,0.1422,1.192,1.1259,0.051
en,Elastic Net,9.9752,160.9855,12.6852,0.1417,1.2019,1.1173,0.137
lasso,Lasso Regression,9.988,161.2743,12.6967,0.1402,1.2132,1.11,0.141
gbr,Gradient Boosting Regressor,9.9893,162.127,12.7298,0.1357,1.1531,1.1454,5.798
lr,Linear Regression,10.0157,162.2128,12.7325,0.1354,1.178,1.1486,0.266
omp,Orthogonal Matching Pursuit,10.0456,163.5262,12.7845,0.1283,1.1862,1.1242,0.022
lightgbm,Light Gradient Boosting Machine,10.0625,164.1958,12.8111,0.1246,1.1405,1.1819,0.316
catboost,CatBoost Regressor,10.0885,165.1386,12.8479,0.1195,1.1419,1.194,8.355
ada,AdaBoost Regressor,10.1719,166.8027,12.9114,0.1109,1.2562,1.1182,2.283


INFO:logs:create_model_container: 20
INFO:logs:master_model_container: 20
INFO:logs:display_container: 2
INFO:logs:[Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=355, solver='auto', tol=0.001), BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False), ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=355, selection='cyclic', tol=0.0001, warm_start=False)]
INFO:logs:compare_models() succesfully completed......................................


<a id=create></a>

### Create Selected Model

In [12]:
# Fit Ridge regression ('ridge' is the top row of the comparison table) and
# report its per-fold metrics.
model = create_model('ridge')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,9.8025,154.3621,12.4243,0.1439,1.1736,1.1286
1,9.9903,164.6191,12.8304,0.1436,1.1698,1.1029
2,9.9353,158.194,12.5775,0.1266,1.1793,1.1372
3,9.8532,159.6353,12.6347,0.1498,1.1635,1.1628
4,10.163,164.6753,12.8326,0.1588,1.2079,1.1577
5,9.9268,159.2102,12.6179,0.1587,1.2098,1.1496
6,10.4344,178.2105,13.3496,0.1269,1.2081,1.1597
7,9.801,154.1466,12.4156,0.1267,1.1685,1.1161
8,9.9149,158.2984,12.5817,0.137,1.1774,1.131
9,9.7764,155.2438,12.4597,0.1627,1.1614,1.0898


INFO:logs:create_model_container: 21
INFO:logs:master_model_container: 21
INFO:logs:display_container: 3
INFO:logs:Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=355, solver='auto', tol=0.001)
INFO:logs:create_model() succesfully completed......................................


<a id=tune></a>

### Tune Selected Model

In [13]:
# Search hyperparameters starting from the Ridge model and keep the tuned estimator.
tuned_model = tune_model(model)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,9.7994,154.0428,12.4114,0.1457,1.1714,1.1298
1,9.993,164.6639,12.8321,0.1433,1.1722,1.1027
2,9.9356,158.2921,12.5814,0.126,1.1809,1.1342
3,9.8533,159.549,12.6313,0.1502,1.1648,1.1609
4,10.1687,164.786,12.8369,0.1582,1.2044,1.1574
5,9.9253,159.203,12.6176,0.1588,1.2058,1.1495
6,10.4454,178.3625,13.3552,0.1261,1.213,1.162
7,9.8079,154.4214,12.4266,0.1252,1.1696,1.1159
8,9.9198,158.4444,12.5875,0.1362,1.177,1.1292
9,9.7812,155.3955,12.4658,0.1619,1.1662,1.0884


INFO:logs:create_model_container: 22
INFO:logs:master_model_container: 22
INFO:logs:display_container: 4
INFO:logs:Ridge(alpha=8.94, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=355, solver='auto', tol=0.001)
INFO:logs:tune_model() succesfully completed......................................


<a id=evaluate></a>

### Evaluate Model

https://pycaret.readthedocs.io/en/latest/api/regression.html#pycaret.regression.evaluate_model

In [14]:
# Open PyCaret's interactive plot selector for the tuned model.
evaluate_model(tuned_model)

INFO:logs:Initializing evaluate_model()
INFO:logs:evaluate_model(estimator=Ridge(alpha=8.94, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=355, solver='auto', tol=0.001), fold=None, fit_kwargs=None, feature_name=None, groups=None, use_train_data=False)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

https://pycaret.readthedocs.io/en/latest/api/regression.html#pycaret.regression.interpret_model

In [15]:
# interpret_model(tuned_model)

<a id=finalize_and_store></a>

### Model Finalization and Storage

In [16]:
# Refit the tuned model for storage/deployment.
# NOTE(review): per the PyCaret docs this trains on the full dataset,
# hold-out included — confirm for the installed version.
final_model = finalize_model(tuned_model)

INFO:logs:Initializing finalize_model()
INFO:logs:finalize_model(estimator=Ridge(alpha=8.94, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=355, solver='auto', tol=0.001), fit_kwargs=None, groups=None, model_only=True, display=None)
INFO:logs:Finalizing Ridge(alpha=8.94, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=355, solver='auto', tol=0.001)
INFO:logs:Initializing create_model()
INFO:logs:create_model(estimator=Ridge(alpha=8.94, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=355, solver='auto', tol=0.001), fold=None, round=4, cross_validation=True, predict=True, fit_kwargs={}, groups=None, refit=True, verbose=False, system=False, metrics=None, add_to_model_list=False, probability_threshold=None, display=None, kwargs={})
INFO:logs:Checking exceptions
INFO:logs:Importing libraries
INFO:logs:Copying training dataset
INFO:logs:Defining folds
INFO:logs:Declaring metric variab

In [17]:
# save_model(final_model, '../models/AutoML/Baseline_Ridge_Reg_PyCaret')

INFO:logs:Initializing save_model()
INFO:logs:save_model(model=Ridge(alpha=8.94, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=355, solver='auto', tol=0.001), model_name=../models/AutoML/Baseline_Ridge_Reg_PyCaret, prep_pipe_=Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True,
                                      features_todrop=['dk_line_home',
                                                       'fd_line_home',
                                                       'covers_consenses_home',
                                                       'home_line'],
                                      id_columns=[], ml_usecase='regression',
                                      numerical_features=['league_year_end'],
                                      target='TARGET_actual_home_margin',
                                      time_fe

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True,
                                       features_todrop=['dk_line_home',
                                                        'fd_line_home',
                                                        'covers_consenses_home',
                                                        'home_line'],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=['league_year_end'],
                                       target='TARGET_actual_home_margin',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categor...
                 ('fix_perfect', Remove_100(target='TARGET_actual_home_margin')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_se

In [18]:
# !mlflow ui