In [2]:
from time import time
from tqdm import tqdm
from pathlib import Path
from joblib import dump,load

import pandas as pd
import numpy as np
np.random.seed(42)
from scipy.stats import loguniform, powerlaw, uniform, nbinom, randint, boltzmann, binom, beta
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.base import TransformerMixin
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, IsolationForest, VotingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoost, CatBoostRegressor
import lightgbm as lgb
import xgboost as xgb

**Load data, tune the models, and fit the final blend**

In [3]:
# Load the per-game statistics table.
data = pd.read_parquet(Path('data/game_stats.parquet'))

# Feature matrix: positional slice that drops the first six columns and the last two.
# NOTE(review): the slice is order-dependent, so a column reorder in the parquet file
# would silently change the feature set -- confirm against the schema.
X = data.iloc[:,6:-2]
y = data['home_margin'] # target for regression

# Initial split; this test set is held out for scoring.
# random_state pins the split so that re-running this cell (or running cells out of
# order) cannot change which rows end up in the hold-out set. Without it the split
# depends on how far the global NumPy RNG has already advanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
features = X_train.columns
print(X_train.shape)
print(X_test.shape)

(1216, 12)
(305, 12)


In [9]:
# Scaler + estimator pipelines. The final step of each pipeline is named after the
# estimator's class, and the same string is the key into `tuning_grids` below.
gbr_pipe = Pipeline(steps=[
    ('norm', StandardScaler()),
    ('GradientBoostingRegressor', GradientBoostingRegressor(random_state=42)),
])
cat_pipe = Pipeline(steps=[
    ('norm', StandardScaler()),
    ('CatBoostRegressor', CatBoostRegressor(random_seed=42, verbose=False)),
])

# Randomized-search configuration per estimator:
#   'iterations' -> number of parameter settings to sample
#   'grid'       -> parameter name -> list of choices or frozen scipy distribution
tuning_grids = {
    'GradientBoostingRegressor': {
        'iterations': 60,
        'grid': {
            'loss': ['huber'],
            'learning_rate': loguniform(a=.04, b=1, loc=0, scale=1),
            'n_estimators': boltzmann(lambda_=0.01, N=200, loc=10),
        },
    },
    'CatBoostRegressor': {
        'iterations': 20,
        'grid': {
            'l2_leaf_reg': uniform(loc=0, scale=4),      # def=3
            'random_strength': uniform(loc=0, scale=3),  # def=1
            'learning_rate': beta(a=2, b=20, loc=0, scale=1),  # def=0.043
        },
    },
}

In [10]:
%%time

# tune GBR, Cat | record test results
test_results = pd.DataFrame()
top_tuned = []
model_params = {}

for pipe in tqdm([gbr_pipe, cat_pipe]):
    start = time()
    ind = pipe[-1].__class__.__name__
    
    # tune hyperparameters
    grid = {f'{ind}__{key}':val 
            for key,val in tuning_grids[ind]['grid'].items()}
    search = RandomizedSearchCV(pipe, grid, n_iter=tuning_grids[ind]['iterations'],
                            scoring='r2', refit=True,  random_state=42, cv=10, n_jobs=-1)
    cv = search.fit(X_train,y_train)
    
    # make predictions, record final parameters
    tuned = cv.best_estimator_
    tuned.fit(X_train,y_train)
    y_pred = tuned.predict(X_test)
    model_params[ind] = cv.best_params_
    top_tuned.append(tuned)

    # test results
    test_results.loc[ind,'R2'] = r2_score(y_test,y_pred)
    test_results.loc[ind,'RMSE'] = np.sqrt(mean_squared_error(y_test,y_pred))
    test_results.loc[ind,'MAE'] = mean_absolute_error(y_test,y_pred)
    test_results.loc[ind,'MAPE'] = mean_absolute_percentage_error(y_test,y_pred) 
    test_results.loc[ind,'time'] = round(start-time())

100%|██████████| 2/2 [02:59<00:00, 89.96s/it]

CPU times: total: 33.2 s
Wall time: 2min 59s





In [11]:
# Display the list of tuned pipelines collected by the search loop.
top_tuned

[Pipeline(steps=[('norm', StandardScaler()),
                 ('GradientBoostingRegressor',
                  GradientBoostingRegressor(learning_rate=0.1335502201652365,
                                            loss='huber', n_estimators=182,
                                            random_state=42))]),
 Pipeline(steps=[('norm', StandardScaler()),
                 ('CatBoostRegressor',
                  <catboost.core.CatBoostRegressor object at 0x000001A2FE931C30>)])]

In [15]:
%%time
# start = time()

# create blend, pipeline
blend = VotingRegressor([
    ('gbr',top_tuned[0][-1]), ('catboost',top_tuned[1][-1]),
])
pipe_blend = Pipeline([('norm', StandardScaler()), ('vr', blend)])

# fit to train, predict test
ind = 'voting_GBR-Cat'
pipe_blend.fit(X_train,y_train)
y_pred = pipe_blend.predict(X_test)

# test results
test_results.loc[ind,'R2'] = r2_score(y_test,y_pred)
test_results.loc[ind,'RMSE'] = np.sqrt(mean_squared_error(y_test,y_pred))
test_results.loc[ind,'MAE'] = mean_absolute_error(y_test,y_pred)
test_results.loc[ind,'MAPE'] = mean_absolute_percentage_error(y_test,y_pred) 
# test_results.loc[ind,'time'] = round(start-time())

CPU times: total: 14.5 s
Wall time: 3.14 s


In [17]:
# Rank the models by hold-out R2 and colour the table. The error columns use the
# reversed colour map so the same shade marks the better value in every column.
(
    test_results
    .sort_values('R2', ascending=False)
    .style
    .background_gradient(subset='R2', cmap='viridis')
    .background_gradient(subset=['RMSE', 'MAE', 'MAPE'], cmap='viridis_r')
    .format('{:.3f}')
)

Unnamed: 0,R2,RMSE,MAE,MAPE
voting_GBR-Cat,0.849,2.789,1.64,0.435
GradientBoostingRegressor,0.846,2.811,1.721,0.455
CatBoostRegressor,0.843,2.845,1.627,0.435


In [None]:
# Save the blend pipeline in its current fitted state to disk with joblib.
# At this point in the notebook pipe_blend has been fitted on the training split only.
dump(pipe_blend, 'final_model_training_v1.3.joblib')

In [20]:
# Refit the blend on the entire dataset (training + hold-out rows).
# This refits the same pipe_blend object in place, replacing the train-only fit,
# so the hold-out metrics recorded earlier do not describe the refitted model.
pipe_blend.fit(X,y)

In [38]:
# Display the fitted blend pipeline.
pipe_blend

In [21]:
# Save the blend pipeline to disk with joblib; this is the file reloaded below
# as final_model.
dump(pipe_blend, 'final_model_all_v1.3.joblib')

['final_model_all_v1.3.joblib']

---
**Reload Voting Regressor (sklearn v1.3)**

In [36]:
# Load the saved model from file (or create it by running the cells above).
# joblib.load unpickles the file, so only load model files from a trusted source.
final_model = load('final_model_all_v1.3.joblib')

---
**Example for Slides**

In [5]:
# Identifier of the example game; matched against the `game` column of `data` below.
gid = '2023-05-19-LA-SLC'

In [8]:
# Show the example game's row transposed, so each column reads as a labelled line.
data.loc[data['game'] == gid].T

Unnamed: 0,3101
game,2023-05-19-LA-SLC
date,2023-05-19 19:00:00+00:00
home,Shred
away,Aviators
home_score,25
away_score,13
home_completions,233
away_completions,268
home_throws,244
away_throws,292


In [46]:
gid = '2023-05-19-LA-SLC'

# Select the example game's row and take the same positional feature slice that
# was used to build X.
game_features = data.loc[data['game'] == gid].iloc[:, 6:-2]

# Predict the home margin for that single row and round it to a whole number.
predicted_margin = round(final_model.predict(game_features)[0])
print('model predicted home margin\n', predicted_margin)

model predicted home margin
 12
