In [2]:
import logging
from tqdm import tqdm
import sys
sys.path.append('.')
from pathlib import Path
import pandas as pd
from cbastats.DBHelper import MongoDBHelper
import os
from dotenv import load_dotenv
from utils.datarefresher import get_current_season, set_logging_config
import neptune

In [3]:
config={}


In [4]:
DOTENV_PATH="."
env_path = Path(DOTENV_PATH) / '.env'
if not (env_path.exists()):
    print('.env file is missing.')
    sys.exit()
load_dotenv(dotenv_path=env_path,verbose=True)

True

In [5]:
needed_envs = ['MONGODB_PWD', 'MONGODB_USERNAME', 'MONGODB_ENDPOINT','LOGGER_NAME']
envs = os.environ
# only checks if user wants to save data to DB
# check if all needed environment variables are present

for needed_env in needed_envs:
    if needed_env not in envs:
        raise Exception(f"Missing environment variable: {needed_env}.\n     Please check if these environment variables are present: {needed_envs}")
    config[needed_env] = envs[needed_env]

In [6]:
mongodbio = MongoDBHelper()
client = mongodbio.create_connection(
    config['MONGODB_USERNAME'], config['MONGODB_PWD'], config['MONGODB_ENDPOINT'])
nba_db = client['nbaStats']
# coll_nbaGames = nba_db['nbaGames']
# coll_nbaGamesStaging= nba_db['nbaGamesStaging']
# coll_nbaBoxScores = nba_db['nbaProcessedBoxScores']

# in the future, when you have multiple features table, iterate through the tables
# or better yet, built a feature store
coll_feat = nba_db['nbaTeamFeat_Past10Games_Avg']

existing database ['cbaStats', 'nbaStats', 'admin', 'local']


In [7]:
logger = set_logging_config(config['LOGGER_NAME'],False)

In [8]:
config['random_state'] = 248

# Models

## Team based - simple average of past xx games


# Generate Training/Validation/Test Data

In [9]:
# import libraries
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# from dtreeviz.trees import *

  return torch._C._cuda_getDeviceCount() > 0


In [178]:
# load the processed data
teamstats = pd.DataFrame(mongodbio.select_records(coll_feat,
                                                  filter={"season":{"$in":['2020-2021','2019-2020','2018-2019',
                                                                          '2017-2018','2016-2017','2015-2016']}},
                                                  field={'_id':0}))
teamstats = add_datepart(teamstats, 'DATE')
teamstats.columns

Index(['season', 'VISITOR', 'DATEWeek', 'VISITOR_PTS', 'HOME', 'HOME_PTS',
       'boxscores_url', 'game_id', 'Pace_home', 'eFG%_home', 'TOV%_home',
       'TS%_home', '3PAr_home', 'FTr_home', 'DRB%_home', 'TRB%_home',
       'AST%_home', 'STL%_home', 'BLK%_home', 'DRtg_home', 'ORB%_home',
       'FT/FGA_home', 'ORtg_home', 'Pace_visitor', 'eFG%_visitor',
       'TOV%_visitor', 'ORB%_visitor', 'FT/FGA_visitor', 'ORtg_visitor',
       'TS%_visitor', '3PAr_visitor', 'FTr_visitor', 'DRB%_visitor',
       'TRB%_visitor', 'AST%_visitor', 'STL%_visitor', 'BLK%_visitor',
       'DRtg_visitor', 'TOTAL_PTS', 'HOME_VISITOR_PTS_DIFF', 'HOME_WIN',
       'DATEYear', 'DATEMonth', 'DATEDay', 'DATEDayofweek', 'DATEDayofyear',
       'DATEIs_month_end', 'DATEIs_month_start', 'DATEIs_quarter_end',
       'DATEIs_quarter_start', 'DATEIs_year_end', 'DATEIs_year_start',
       'DATEElapsed'],
      dtype='object')

In [179]:
print(teamstats.shape)

(4803, 53)


In [180]:
# set what are dependent variables, what are independent variables
config['dep_variable'] = ['HOME_WIN']
config['ind_variable'] = ([col for col in teamstats.columns if col.endswith("_home")] + 
                         [col for col in teamstats.columns if col.endswith("_visitor")])

In [181]:
# get test set
other_idx,test_idx = train_test_split(list(teamstats.index),test_size=0.1,random_state=config['random_state'])
test_set = teamstats.loc[test_idx,config['ind_variable']+config['dep_variable']].copy()

# get train set and valid set
# don't want to mix non-test set with training set, therefore "other_set"
other_set = teamstats.loc[other_idx,config['ind_variable']+config['dep_variable']].copy()
train_idx,valid_idx = train_test_split(list(other_set.reset_index(drop=True).index),test_size=0.2,random_state=config['random_state'])
[len(idx) for idx in [train_idx,valid_idx,test_idx]]

[3457, 865, 481]

In [182]:
# split continuous variables and categorical variabls
cont_var, cat_var=cont_cat_split(other_set,dep_var=config['dep_variable'])
print(cont_var,cat_var)

['Pace_home', 'eFG%_home', 'TOV%_home', 'TS%_home', '3PAr_home', 'FTr_home', 'DRB%_home', 'TRB%_home', 'AST%_home', 'STL%_home', 'BLK%_home', 'DRtg_home', 'ORB%_home', 'FT/FGA_home', 'ORtg_home', 'Pace_visitor', 'eFG%_visitor', 'TOV%_visitor', 'ORB%_visitor', 'FT/FGA_visitor', 'ORtg_visitor', 'TS%_visitor', '3PAr_visitor', 'FTr_visitor', 'DRB%_visitor', 'TRB%_visitor', 'AST%_visitor', 'STL%_visitor', 'BLK%_visitor', 'DRtg_visitor'] []


In [183]:
# return a tabular object
to=TabularPandas(other_set,cat_names=cat_var,cont_names=cont_var,
                 y_names=config['dep_variable'],splits=(train_idx,valid_idx))
len(to.train),len(to.valid)

(3457, 865)

In [184]:
xs,y = to.train.xs,to.train.y
valid_xs,valid_y = to.valid.xs,to.valid.y
print(len(xs))
print(len(valid_xs))

3457
865


# Initialize Neptune to Record Experiments

In [148]:
neptune.init(project_qualified_name='jjnotjimmyjohn/NBA-Score-Prediction')
# neptune.init(project_qualified_name='jjnotjimmyjohn/sandbox',api_token=neptune.ANONYMOUS_API_TOKEN)

Project(jjnotjimmyjohn/NBA-Score-Prediction)

In [416]:
# experiment name should not show other things except your training objective
# winclassification

# objective
# model params
# features
# accuracy

# TODO: instead of using pandas built-ins to calculate features, use custom functions. 
# that way you can use function as parameters, easier to log and streamline
def record_experiment(model,feat_name,model_objective,
                      x,y,valid_x,valid_y,metric_name,metric_func,
                      tags:list,
                      proj_name='jjnotjimmyjohn/NBA-Score-Prediction'):
    # create experiment
    neptune.set_project(proj_name)
    neptune.create_experiment(name=model_objective)
    if hasattr(model,'best_estimator_'):
        param_log = model.best_estimator_.get_params()
    else:
        param_log = model.get_params()
    neptune.log_text('best_model', str(param_log))
    try:
        neptune.log_text('model_type',type(model).__name__)
    except:
        print('No model name is recorded')
        neptune.log_text('model_type',"unknown")
    neptune.log_metric('training_'+metric_name, metric_func(y,model.predict(xs)))
    neptune.log_metric('validation_'+metric_name, metric_func(valid_y,model.predict(valid_xs)))
    if tags:
        for tag in tags:
            neptune.append_tags(*tags)
    # for key,value in args.items():
    #     neptune.set_property(key,value)
    neptune.stop()

# Decision Tree

In [150]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
import scipy.stats as stats
from sklearn.metrics import SCORERS,accuracy_score,f1_score

In [19]:
# parameters distribution to randomized search
dt_params_dist={
    'min_samples_split':[5,10,15,20],
    'min_samples_leaf':[10,20,30,40,60],
    'max_features':[0.5,0.6,0.7,0.8,0.9],
}
# base model
base_model = DecisionTreeClassifier(random_state=config['random_state'])

In [50]:
print(coll_feat.name)

nbaTeamFeat_PastNGames_Avg


In [20]:
model = RandomizedSearchCV(base_model,dt_params_dist,n_iter=90,n_jobs=6,
                                   cv=8,verbose=5,return_train_score=True,refit=True)
model.fit(xs, y)

Fitting 8 folds for each of 90 candidates, totalling 720 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:    1.8s
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:    3.2s
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed:    5.5s
[Parallel(n_jobs=6)]: Done 276 tasks      | elapsed:    8.9s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:   13.4s
[Parallel(n_jobs=6)]: Done 636 tasks      | elapsed:   17.9s
[Parallel(n_jobs=6)]: Done 720 out of 720 | elapsed:   19.9s finished


RandomizedSearchCV(cv=8, estimator=DecisionTreeClassifier(random_state=248),
                   n_iter=90, n_jobs=6,
                   param_distributions={'max_features': [0.5, 0.6, 0.7, 0.8,
                                                         0.9],
                                        'min_samples_leaf': [10, 20, 30, 40,
                                                             60],
                                        'min_samples_split': [5, 10, 15, 20]},
                   return_train_score=True, verbose=5)

In [21]:
print(accuracy_score(y,model.predict(xs)))
print(accuracy_score(valid_y,model.predict(valid_xs)))

0.7038518206693327
0.5808080808080808


In [22]:
# create experiment
# neptune.set_project('jjnotjimmyjohn/NBA-Score-Prediction')
# neptune.create_experiment(name='TeamBasedModel_moredata',description='',params=model.get_params())
# neptune.append_tag(f'summarize_past_games')
# neptune.append_tag(f'decision_tree')
# neptune.append_tag(f'randomized_search_cv')
# neptune.log_metric('training_error', mean_absolute_error(y,model.predict(xs)))
# neptune.log_metric('validation_error', mean_absolute_error(valid_y,model.predict(valid_xs)))
# neptune.log_text('best_model', str(model.best_estimator_))
# for key,value in args.items():
#     neptune.set_property(key,value)
# neptune.stop()

# Randome Forest

In [23]:
# parameters distribution to randomized search
dt_params_dist={
    'min_samples_split':[5,10,15,20],
    'max_samples':[0.5,0.7,0.8,0.9,1],
    'min_samples_leaf':[30,40,50],
    'max_features':[0.6,0.7,0.8,0.9,1],
}
# base model


In [185]:
base_model = RandomForestClassifier(n_jobs=-1, n_estimators=2000, oob_score=True, min_samples_leaf=45)
base_model.fit(xs,y)
print(accuracy_score(y,base_model.predict(xs)))
print(accuracy_score(valid_y,base_model.predict(valid_xs)))

0.719409892970784
0.6323699421965318


In [25]:
model = RandomizedSearchCV(base_model,dt_params_dist,n_iter=100,n_jobs=-1,
                                   cv=8,verbose=5,return_train_score=True,refit=True)
model.fit(xs, y)

Fitting 8 folds for each of 100 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 150 tasks      | elapsed: 28.9min
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed: 54.4min
[Parallel(n_jobs=-1)]: Done 438 tasks      | elapsed: 81.7min
[Parallel(n_jobs=-1)]: Done 636 tasks      | elapsed: 120.1min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed: 152.5min finished


RandomizedSearchCV(cv=8,
                   estimator=RandomForestClassifier(min_samples_leaf=45,
                                                    n_estimators=2000,
                                                    n_jobs=-1, oob_score=True),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_features': [0.6, 0.7, 0.8, 0.9, 1],
                                        'max_samples': [0.5, 0.7, 0.8, 0.9, 1],
                                        'min_samples_leaf': [30, 40, 50],
                                        'min_samples_split': [5, 10, 15, 20]},
                   return_train_score=True, verbose=5)

In [26]:
print(accuracy_score(y,model.predict(xs)))
print(accuracy_score(valid_y,model.predict(valid_xs)))

0.7331088191959587
0.6338383838383839


In [59]:
record_experiment(model=model,feat_name=coll_feat.name,model_objective="winloss_classification",
                  x=xs,y=y,valid_x=valid_xs,valid_y=valid_y,metric_name='Accuracy',metric_func=accuracy_score,
                 proj_name='jjnotjimmyjohn/NBA-Score-Prediction')

https://ui.neptune.ai/jjnotjimmyjohn/NBA-Score-Prediction/e/NBAS-6


In [27]:
model.best_estimator_.feature_importances_

array([0.01541239, 0.01973962, 0.02409648, 0.03867845, 0.02027797,
       0.01485195, 0.01925435, 0.02022374, 0.02326313, 0.02133321,
       0.02107099, 0.11722661, 0.01321176, 0.01650976, 0.15388962,
       0.02079595, 0.04096719, 0.01557033, 0.01808111, 0.01483974,
       0.07810076, 0.05194351, 0.02509932, 0.01461921, 0.02030002,
       0.02028742, 0.02362852, 0.01930176, 0.02542785, 0.07199728])

In [28]:
def rf_feat_importance(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_}
                       ).sort_values('imp', ascending=False)

In [29]:
fi = rf_feat_importance(model.best_estimator_, xs)
fi

Unnamed: 0,cols,imp
14,ORtg_home,0.15389
11,DRtg_home,0.117227
20,ORtg_visitor,0.078101
29,DRtg_visitor,0.071997
21,TS%_visitor,0.051944
16,eFG%_visitor,0.040967
3,TS%_home,0.038678
28,BLK%_visitor,0.025428
22,3PAr_visitor,0.025099
2,TOV%_home,0.024096


# XGBoost

In [186]:
import xgboost as xg

In [228]:
base_model = xg.XGBClassifier(n_estimators = 6, seed = config['random_state'],max_depth=4,reg_lambda=0.1, 
                             learning_rate=0.4)
base_model.fit(xs,y)
print(accuracy_score(y,base_model.predict(xs)))
print(accuracy_score(valid_y,base_model.predict(valid_xs)))

0.7162279433034423
0.6497109826589595


In [218]:
xg_params_dist={
    "n_estimators":[6],
    "max_depth":[4],
    "reg_lambda":[0.1],
    "learning_rate":[0.4]
}

In [219]:
model = RandomizedSearchCV(base_model,xg_params_dist,n_iter=500,n_jobs=-1,
                                   cv=8,verbose=3,return_train_score=True,refit=True)
model.fit(xs, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.


Fitting 8 folds for each of 1 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    0.6s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    0.6s remaining:    0.2s




[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.8s finished


RandomizedSearchCV(cv=8,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           gpu_id=-1, importance_type='gain',
                                           interaction_constraints='',
                                           learning_rate='0.4',
                                           max_delta_step=0, max_depth=4,
                                           min_child_weight=1, missing=nan,
                                           monotone_constraints='()',
                                           n_estimators=6, n_jobs=6,
                                           num_parallel_tree=1,
                                           random_state=248, reg_alpha=0,
                                           reg_lambda=0.1, 

In [229]:
print(accuracy_score(y,base_model.predict(xs)))
print(accuracy_score(valid_y,base_model.predict(valid_xs)))

0.7162279433034423
0.6497109826589595


In [222]:
record_experiment(model=model,feat_name=coll_feat.name,model_objective="winloss_classification",
                  x=xs,y=y,valid_x=valid_xs,valid_y=valid_y,metric_name='Accuracy',metric_func=accuracy_score,
                  tags=['use recent 6 seasons'],
                  proj_name='jjnotjimmyjohn/NBA-Score-Prediction')

https://ui.neptune.ai/jjnotjimmyjohn/NBA-Score-Prediction/e/NBAS-16


# SVM

In [394]:
from sklearn.svm import SVC

In [403]:
svc = SVC(C=0.5,kernel='linear')

svc.fit(xs,y)

print(accuracy_score(y,svc.predict(xs)))
print(accuracy_score(valid_y,svc.predict(valid_xs)))

0.6528782181081862
0.6358381502890174


In [417]:
record_experiment(model=svc,feat_name=coll_feat.name,model_objective="winloss_classification",
                  x=xs,y=y,valid_x=valid_xs,valid_y=valid_y,metric_name='Accuracy',metric_func=accuracy_score,
                  tags=['use recent 6 seasons'],
                  proj_name='jjnotjimmyjohn/NBA-Score-Prediction')

https://ui.neptune.ai/jjnotjimmyjohn/NBA-Score-Prediction/e/NBAS-19


'Sequential'

# Neural Networks

In [230]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [321]:
# normalize data
from sklearn.preprocessing import MinMaxScaler
minmaxscaler = MinMaxScaler()
minmaxscaler.fit(xs)
xs_norm = minmaxscaler.transform(xs)
valid_xs_norm = minmaxscaler.transform(valid_xs)

In [336]:
# covnert to Tensors
tensor_xs_norm = torch.Tensor(xs_norm) # transform to torch tensor
tensor_y = torch.Tensor(y.values).unsqueeze(1)
tensor_valid_xs_norm = torch.Tensor(valid_xs_norm) # transform to torch tensor
tensor_valid_y = torch.Tensor(valid_y.values).unsqueeze(1)

In [338]:
# convert to Dataset
xs_ds=torch.utils.data.TensorDataset(tensor_xs_norm,tensor_y)
valid_xs_ds=torch.utils.data.TensorDataset(tensor_valid_xs_norm,tensor_valid_y)

In [377]:
data_loader = torch.utils.data.DataLoader(xs_ds,
                                          batch_size=128,
                                          shuffle=True)

In [378]:
for data in data_loader:
    print(data[0].shape)
    print(data[1].shape)
    break

torch.Size([128, 30])
torch.Size([128, 1])


In [381]:
model = nn.Sequential(nn.Linear(tensor_xs_norm.shape[1],2048),
                      nn.ReLU(),
                      nn.Linear(2048,1024),
                      nn.ReLU(),
                      nn.Linear(1024,512),
                      nn.ReLU(),
                      nn.Linear(512,256),
                      nn.ReLU(),
                      nn.Linear(256,128),
                      nn.ReLU(),
                      nn.Linear(128,64),
                      nn.ReLU(),
                      nn.Linear(64,32),
                      nn.ReLU(),
                      nn.Linear(32,1),
                      nn.Sigmoid()
                     )

criterion = torch.nn.MSELoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    for data in data_loader:

        # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients
        optimizer.zero_grad()

        # get output from the model, given the inputs
        outputs = model(data[0])

        # get loss for the predicted output
        loss = criterion(outputs, data[1])
        # get gradients w.r.t to parameters
        loss.backward()

        # update parameters
        optimizer.step()

    print('epoch {}, loss {}'.format(epoch, loss.item()))

epoch 0, loss 0.20004931092262268
epoch 1, loss 0.2347828447818756
epoch 2, loss 0.2199774831533432
epoch 3, loss 0.144601970911026
epoch 4, loss 0.6668237447738647
epoch 5, loss 0.16621433198451996
epoch 6, loss 0.0429544635117054
epoch 7, loss 0.0805291160941124
epoch 8, loss 0.3907792568206787
epoch 9, loss 0.07786593586206436


In [383]:
(((model(tensor_valid_xs_norm)>0.5).squeeze()==tensor_valid_y)*1.0).mean()

tensor(0.5446)

In [None]:
## Model - Player based

In [None]:
# players? -> include minutes played
# injured players?
# use news to guess how many minutes he will play