In [0]:
# File location and type
file_location = "/FileStore/tables/Period_1_Game_Stats_Final_ModelReady.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
nhl_db = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(nhl_db)

won,Shots,Shots_Against,Goals,Goals_Against,Takeaways,Takeaways_Against,Hits,Hits_Against,Blocked Shots,Blocked Shots Against,Giveaways,Giveaways_Against,Missed Shots,Missed Shots_Against,Penalities,Penalities_Against,#Won Faceoffs,#Lost Faceoffs,HoA_away,HoA_home
0,8.0,8.0,0.0,3.0,1.0,3.0,14.0,5.0,3.0,3.0,6.0,7.0,4.0,0.0,1.0,1.0,10.0,12.0,1,0
1,8.0,8.0,3.0,0.0,3.0,1.0,5.0,14.0,3.0,3.0,7.0,6.0,0.0,4.0,1.0,1.0,12.0,10.0,0,1
1,11.0,12.0,0.0,1.0,0.0,2.0,4.0,4.0,6.0,8.0,2.0,0.0,6.0,3.0,1.0,3.0,11.0,9.0,1,0
0,12.0,11.0,1.0,0.0,2.0,0.0,4.0,4.0,8.0,6.0,0.0,2.0,3.0,6.0,3.0,1.0,9.0,11.0,0,1
1,9.0,8.0,0.0,1.0,3.0,2.0,4.0,5.0,7.0,1.0,7.0,5.0,3.0,8.0,2.0,1.0,7.0,11.0,1,0
0,8.0,9.0,1.0,0.0,2.0,3.0,5.0,4.0,1.0,7.0,5.0,7.0,8.0,3.0,1.0,2.0,11.0,7.0,0,1
0,4.0,8.0,0.0,1.0,5.0,5.0,12.0,5.0,3.0,4.0,2.0,2.0,3.0,3.0,1.0,3.0,5.0,12.0,1,0
1,8.0,4.0,1.0,0.0,5.0,5.0,5.0,12.0,4.0,3.0,2.0,2.0,3.0,3.0,3.0,1.0,12.0,5.0,0,1
0,5.0,19.0,0.0,1.0,2.0,4.0,9.0,6.0,3.0,3.0,4.0,5.0,2.0,6.0,5.0,3.0,7.0,11.0,1,0
1,19.0,5.0,1.0,0.0,4.0,2.0,6.0,9.0,3.0,3.0,5.0,4.0,6.0,2.0,3.0,5.0,11.0,7.0,0,1


In [0]:
! pip install lightgbm

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
# Importing correct packages
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

# Machine Learning Libraries
import lightgbm as lgb
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Figures
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")

In [0]:
nhl_db=nhl_db.toPandas()

# convert from spark to pandas dataframe format
df = nhl_db

# No null values
df = df[:-9611]
df.drop(df.index[18800:40871],0,inplace=True)
df = df.reset_index(drop=True)
df.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
Out[23]: won                      0
Shots                    0
Shots_Against            0
Goals                    0
Goals_Against            0
Takeaways                0
Takeaways_Against        0
Hits                     0
Hits_Against             0
Blocked Shots            0
Blocked Shots Against    0
Giveaways                0
Giveaways_Against        0
Missed Shots             0
Missed Shots_Against     0
Penalities               0
Penalities_Against       0
#Won Faceoffs            0
#Lost Faceoffs           0
HoA_away                 0
HoA_home                 0
dtype: int64

In [0]:
# split data into X and y
y = df.pop('won')
X = df
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [0]:
# Lightgbm model
model = lgb.LGBMClassifier(num_leaves = 31, learning_rate=0.09,max_depth=-5,random_state=42)
model.fit(X_train,y_train,eval_set=[(X_test,y_test),(X_train,y_train)],
          verbose=20,eval_metric='logloss')

[20]	training's binary_logloss: 0.588145	valid_0's binary_logloss: 0.602557
[40]	training's binary_logloss: 0.564346	valid_0's binary_logloss: 0.599359
[60]	training's binary_logloss: 0.546199	valid_0's binary_logloss: 0.600596
[80]	training's binary_logloss: 0.529762	valid_0's binary_logloss: 0.60314
[100]	training's binary_logloss: 0.514817	valid_0's binary_logloss: 0.604746
Out[25]: LGBMClassifier(learning_rate=0.09, max_depth=-5, random_state=42)

In [0]:
print('Training accuracy {:.4f}'.format(model.score(X_train,y_train)))
print('Testing accuracy {:.4f}'.format(model.score(X_test,y_test)))

# Interestingly, slightly better accuracy
# make predictions for test data
from sklearn.metrics import precision_score, recall_score
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print('Precision score: ', (precision_score(y_test, predictions)))
print('Recall score: ', recall_score(y_test, predictions))

Training accuracy 0.7570
Testing accuracy 0.6538
Accuracy: 65.38%
Precision score:  0.6565500154846702
Recall score:  0.6459475929311396


In [0]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
randomforest = RandomForestClassifier(random_state = 5,oob_score = True) 
model = randomforest.fit(X_train, y_train)

In [0]:
# get importance
importance = model.feature_importances_

# summarize feature importance
pd.DataFrame(list(zip(X_train.columns, importance)), 
             columns = ['predictor','feature importance']).sort_values("feature importance", ascending = False)

Unnamed: 0,predictor,feature importance
2,Goals,0.086894
3,Goals_Against,0.0782
6,Hits,0.063282
7,Hits_Against,0.06267
0,Shots,0.059923
1,Shots_Against,0.058892
17,#Lost Faceoffs,0.057535
16,#Won Faceoffs,0.056804
8,Blocked Shots,0.053829
9,Blocked Shots Against,0.053589


In [0]:
from sklearn.metrics import precision_score, recall_score
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print('Precision score: ', (precision_score(y_test, predictions)))
print('Recall score: ', recall_score(y_test, predictions))

Accuracy: 65.32%
Precision score:  0.6568047337278107
Recall score:  0.6425959780621572


In [0]:
!pip install xgboost

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
# XGBoost classifier
import xgboost as xgb
xg_reg = xgb.XGBClassifier(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
xg_reg.fit(X_train,y_train)

Out[31]: XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.3,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=10, n_jobs=0,
              num_parallel_tree=1, objective='reg:linear', predictor='auto',
              random_state=0, ...)

In [0]:
from sklearn.metrics import mean_squared_error
y_pred = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % (rmse))
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print('Precision score: ', (precision_score(y_test, predictions)))
print('Recall score: ', recall_score(y_test, predictions))

RMSE: 0.592380
Accuracy: 64.91%
Precision score:  0.6388101983002833
Recall score:  0.6870810481413772


# Hyperparameter Tuning
## Gridsearchcv

In [0]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

clf_lgb_grid = lgb.LGBMClassifier(num_leaves = 31, learning_rate=0.09,max_depth=-5,random_state=42)
gkf = KFold(n_splits=5, shuffle=True, random_state=42).split(X=X_train, y=y_train)

param_grid = {
    'num_leaves': [31, 62],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
    }
gsearch = GridSearchCV(estimator=clf_lgb_grid, param_grid=param_grid, cv=gkf)
lgb_model = gsearch.fit(X=X_train, y=y_train)

print(lgb_model.best_params_, lgb_model.best_score_)

{'learning_rate': 0.06, 'max_depth': 10, 'num_leaves': 31} 0.6612352683137512


## Bayesian Optimization

In [0]:
! pip install bayesian-optimization

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier as RFC
from bayes_opt import BayesianOptimization
def rfc_cv(n_estimators, min_samples_split, max_features, data, targets):
    """Random Forest cross validation.
    This function will instantiate a random forest classif"er with parameters
    n_estimators, min_samples_split, and max_features. Combined with data and
    targets this will in turn be used to perform cross validation. The result
    of cross validation is returned.
    Our goal is to f"nd combinations of n_estimators, min_samples_split, and
    max_features that minimzes the log loss.
    """
    estimator = RFC(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        max_features=max_features,
        random_state=2
    )
    cval = cross_val_score(estimator, data, targets,
    scoring='neg_log_loss', cv=4)
    return cval.mean()

In [0]:
def optimize_rfc(data, targets):
    """Apply Bayesian Optimization to Random Forest parameters."""
    def rfc_crossval(n_estimators, min_samples_split, max_features):
        """Wrapper of RandomForest cross validation.
        Notice how we ensure n_estimators and min_samples_split are casted
        to integer before we pass them along. Moreover, to avoid max_features
        taking values outside the (0, 1) range, we also ensure it is capped
        accordingly.
        """
        return rfc_cv(
        n_estimators=int(n_estimators),
        min_samples_split=int(min_samples_split),
        max_features=max(min(max_features, 0.999), 1e-3),
        data=data,
        targets=targets,
    )
    optimizer = BayesianOptimization(
        f=rfc_crossval,
        pbounds={
        "n_estimators": (10, 250),
        "min_samples_split": (2, 25),
        "max_features": (0.1, 0.999),
        },
        random_state=1234,
        verbose=2
    )
    optimizer.maximize(n_iter=10)
    print("Final result:", optimizer.max)

In [0]:
optimize_rfc(X_train, y_train)

|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m-0.6136  [0m | [0m 0.2722  [0m | [0m 16.31   [0m | [0m 115.1   [0m |
| [0m 2       [0m | [0m-0.6223  [0m | [0m 0.806   [0m | [0m 19.94   [0m | [0m 75.42   [0m |
| [95m 3       [0m | [95m-0.6131  [0m | [95m 0.3485  [0m | [95m 20.44   [0m | [95m 240.0   [0m |
| [0m 4       [0m | [0m-0.6255  [0m | [0m 0.8875  [0m | [0m 10.23   [0m | [0m 130.2   [0m |
| [0m 5       [0m | [0m-0.6215  [0m | [0m 0.7144  [0m | [0m 18.39   [0m | [0m 98.86   [0m |
| [0m 6       [0m | [0m-0.6203  [0m | [0m 0.8021  [0m | [0m 18.14   [0m | [0m 230.7   [0m |
| [0m 7       [0m | [0m-0.6244  [0m | [0m 0.9241  [0m | [0m 16.19   [0m | [0m 80.11   [0m |
| [0m 8       [0m | [0m-0.6154  [0m | [0m 0.498   [0m | [0m 20.94   [0m | [0m 245.3   [0m |
| [0m 9       [0m | [0m-0.6208  [0m | [0m 0.7962  

## HyperOpt

In [0]:
! pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[?25l[K     |▏                               | 10 kB 20.2 MB/s eta 0:00:01[K     |▍                               | 20 kB 24.2 MB/s eta 0:00:01[K     |▋                               | 30 kB 12.7 MB/s eta 0:00:01[K     |▉                               | 40 kB 9.8 MB/s eta 0:00:01[K     |█                               | 51 kB 5.6 MB/s eta 0:00:01[K     |█▎                              | 61 kB 6.4 MB/s eta 0:00:01[K     |█▌                              | 71 kB 6.5 MB/s eta 0:00:01[K     |█▋                              | 81 kB 5.8 MB/s eta 0:00:01[K     |█▉                              | 92 kB 6.4 MB/s eta 0:00:01[K     |██                              | 102 kB 5.5 MB/s eta 0:00:01[K     |██▎                             | 112 kB 5.5 MB/s eta 0:00:01[K     |██▌                             | 122 kB 5.5 MB/s eta 0:00:01[K     |██▊                             | 133 kB 5.5 MB/s eta 0:0

In [0]:
# define an objective function
def objective(args):
    case, val = args
    if case == 'case 1':
        return val
    else:
        return val ** 2

# define a search space
from hyperopt import hp
space = hp.choice('a',
  [
    ('case 1', 1 + hp.lognormal('c1', 0, 1)),
    ('case 2', hp.uniform('c2', -10, 10))
  ])

# minimize the objective over the space
from hyperopt import fmin, tpe, space_eval
best = fmin(objective, space, algo=tpe.suggest, max_evals=100) # algo: Tree of Parzen Estimators (TPE)


  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?] 30%|███       | 30/100 [00:00<00:00, 285.96trial/s, best loss: 0.3117729916790779] 59%|█████▉    | 59/100 [00:00<00:00, 190.24trial/s, best loss: 0.05073281512389674] 80%|████████  | 80/100 [00:00<00:00, 167.93trial/s, best loss: 0.00018975468107330513] 98%|█████████▊| 98/100 [00:00<00:00, 146.13trial/s, best loss: 0.00018975468107330513]100%|██████████| 100/100 [00:00<00:00, 160.41trial/s, best loss: 0.00018975468107330513]


In [0]:
print(best)
print(space_eval(space, best))

{'a': 1, 'c2': 0.013775147225104534}
('case 2', 0.013775147225104534)


## HyperOpt with MLFlow

In [0]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-1.25.1-py3-none-any.whl (16.8 MB)
[?25l[K     |                                | 10 kB 16.4 MB/s eta 0:00:02[K     |                                | 20 kB 23.0 MB/s eta 0:00:01[K     |                                | 30 kB 13.7 MB/s eta 0:00:02[K     |                                | 40 kB 10.4 MB/s eta 0:00:02[K     |                                | 51 kB 6.6 MB/s eta 0:00:03[K     |▏                               | 61 kB 7.5 MB/s eta 0:00:03[K     |▏                               | 71 kB 7.9 MB/s eta 0:00:03[K     |▏                               | 81 kB 8.8 MB/s eta 0:00:02[K     |▏                               | 92 kB 7.9 MB/s eta 0:00:03[K     |▏                               | 102 kB 6.1 MB/s eta 0:00:03[K     |▏                               | 112 kB 6.1 MB/s eta 0:00:03[K     |▎                               | 122 kB 6.1 MB/s eta 0:00:03[K     |▎                               | 133 kB 6.1 MB/s eta 0:00:03

In [0]:
!pip install xgboost

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
from hyperopt import fmin, tpe, hp, SparkTrials, Trials, STATUS_OK
from hyperopt.pyll import scope
from math import exp
import mlflow.xgboost
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score

In [0]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'binary:logistic',
    'seed': 123, # Set a seed for deterministic training
    }

In [0]:
from mlflow.models.signature import infer_signature
def train_model(params):
    # With MLflow autologging, hyperparameters and the trained model are automatically logged to ML!"ow.
    mlflow.xgboost.autolog()
    with mlflow.start_run(nested=True):
        train = xgb.DMatrix(data=X_train, label=y_train)
        test = xgb.DMatrix(data=X_test, label=y_test)
        # Pass in the test set so xgb can track an evaluation metric. XGBoost terminates training when the evaluation metric is no longer improving.
        booster = xgb.train(params=params, dtrain=train, num_boost_round=1000, evals=[(test, "test")], early_stopping_rounds=50)
        predictions_test = booster.predict(test)
        auc_score = roc_auc_score(y_test, predictions_test)
        mlflow.log_metric('auc', auc_score)

        signature = infer_signature(X_train, booster.predict(train))
        mlflow.xgboost.log_model(booster, "model", signature=signature)
        # Set the loss to -1*auc_score so fmin maximizes the auc_score

        return {'status': STATUS_OK, 'loss': -1*auc_score, 'booster': booster.attributes()}
    
# Greater parallelism will lead to speedups, but a less optimal hyperparameter sweep. A reasonable value for parallelism is the square root of max_evals.
spark_trials = SparkTrials(parallelism=10)

In [0]:
# Databricks CLI
token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
dbutils.fs.put("file:///root/.databrickscfg", "[DEFAULT]\nhost=https://community.cloud.databricks.com\ntoken = " +token,overwrite=True)

Wrote 98 bytes.
Out[46]: True

In [0]:
# Run fmin within an MLflow run context so that each hyperparameter configuration is logged as a child run of a parent. run called "xgboost_models" .
with mlflow.start_run(run_name='xgboost_models'):
    best_params = fmin(
        fn=train_model,
        space=search_space,
        algo=tpe.suggest,
        max_evals=96,
        trials=spark_trials,
        rstate=np.random.default_rng(123)
        )


  1%|          | 1/96 [01:46<2:48:26, 106.39s/trial, best loss: -0.6889736165130935]  2%|▏         | 2/96 [02:00<1:21:33, 52.06s/trial, best loss: -0.7020092958095239]   3%|▎         | 3/96 [02:21<58:43, 37.89s/trial, best loss: -0.7020092958095239]    4%|▍         | 4/96 [02:41<47:17, 30.84s/trial, best loss: -0.7146449154688116]  5%|▌         | 5/96 [03:08<44:41, 29.47s/trial, best loss: -0.7146449154688116]  6%|▋         | 6/96 [03:36<43:28, 28.98s/trial, best loss: -0.7146449154688116]  7%|▋         | 7/96 [03:52<36:42, 24.75s/trial, best loss: -0.7146449154688116]  8%|▊         | 8/96 [04:13<34:33, 23.56s/trial, best loss: -0.7146449154688116]  9%|▉         | 9/96 [04:16<24:51, 17.14s/trial, best loss: -0.7146449154688116] 10%|█         | 10/96 [04:23<20:05, 14.02s/trial, best loss: -0.7146449154688116] 11%|█▏        | 11/96 [05:10<34:16, 24.20s/trial, best loss: -0.7146449154688116] 12%|█▎        | 12/96 [05:42<36:55, 26.38s/trial, best loss: -0.7146449154688116] 14