In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold

import xgboost as xgb
print('XGB version:', xgb.__version__)

import lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation
print('LGB version:', lgb.__version__)

from tqdm import tqdm

import shap
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path  #for Windows/Linux compatibility

XGB version: 2.0.0
LGB version: 4.1.0


In [16]:
df = pd.read_csv("transformed.csv")

In [17]:
df.columns

Index(['Team_home', 'GAME_DATE_EST', 'HOME_TEAM_WINS', 'PTS_home', 'FGM_home',
       'FGA_home', 'FG_PCT_home', '3PM_home', '3PA_home', 'FG3_PCT_home',
       'FTM_home', 'FTA_home', 'FT_PCT_home', 'OREB_home', 'DREB_home',
       'REB_home', 'AST_home', 'STL_home', 'BLK_home', 'TOV_home', 'PF_home',
       '+/-_home', 'HOME_TEAM_ID', 'GAME_ID', 'Team_away', 'PTS_away',
       'FGM_away', 'FGA_away', 'FG_PCT_away', '3PM_away', '3PA_away',
       'FG3_PCT_away', 'FTM_away', 'FTA_away', 'FT_PCT_away', 'OREB_away',
       'DREB_away', 'REB_away', 'AST_away', 'STL_away', 'BLK_away', 'TOV_away',
       'PF_away', '+/-_away', 'VISITOR_TEAM_ID', 'SEASON', 'PLAYOFF',
       'TARGET'],
      dtype='object')

In [18]:
latest_season = df['SEASON'].unique().max()

train = df[df['SEASON'] < (latest_season)]
test = df[df['SEASON'] >= (latest_season - 1)]

train.to_csv("train.csv",index=False)
test.to_csv("test.csv",index=False)

In [19]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [20]:
predict = predict = np.ones((train.shape[0],))
true =  train['TARGET']

accuracy_score(true,predict), roc_auc_score(true,predict)

(0.5905921980260066, 0.5)

In [21]:
predict = predict = np.ones((test.shape[0],))
true =  test['TARGET']

accuracy_score(true,predict), roc_auc_score(true,predict)

(0.5799457994579946, 0.5)

In [22]:
def fix_datatypes(df):
    df['GAME_DATE_EST'] = df['GAME_DATE_EST'].apply(lambda x: x[:10])
    df['GAME_DATE_EST'] = pd.to_datetime(df['GAME_DATE_EST'])

    long_integer_fields = ['GAME_ID', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'SEASON']

    #convert long integer fields to int32 from int64
    for field in long_integer_fields:
        df[field] = df[field].astype('int32')
    
    #convert the remaining int64s to int8
    for field in df.select_dtypes(include=['int64']).columns.tolist():
        df[field] = df[field].astype('int8')
        
    #convert float64s to float16s
    for field in df.select_dtypes(include=['float64']).columns.tolist():
        df[field] = df[field].astype('float16')
        
    return df

train = fix_datatypes(train)
test = fix_datatypes(test)

In [23]:
def add_rolling_means(df, location):
    
    location_id = location + "_TEAM_ID"

    # sort games by the order in which they were played for each home or visitor team
    df = df.sort_values(by = [location_id, 'GAME_DATE_EST'], axis=0, ascending=[True, True,], ignore_index=True)
    
    # rolling means
    feature_list = ['HOME_TEAM_WINS','PTS_home',
       'FGM_home', 'FGA_home', 'FG_PCT_home', '3PM_home', '3PA_home',
       'FG3_PCT_home', 'FTM_home', 'FTA_home', 'FT_PCT_home', 'OREB_home',
       'DREB_home', 'REB_home', 'AST_home', 'STL_home', 'BLK_home', 'TOV_home',
       'PF_home', '+/-_home']
    
    if location == 'VISITOR':
        feature_list = ['HOME_TEAM_WINS', 'PTS_away',
            'FGM_away', 'FGA_away', 'FG_PCT_away', '3PM_away', '3PA_away',
            'FG3_PCT_away', 'FTM_away', 'FTA_away', 'FT_PCT_away', 'OREB_away',
            'DREB_away', 'REB_away', 'AST_away', 'STL_away', 'BLK_away', 'TOV_away',
            'PF_away', '+/-_away']

    roll_feature_list = []
    for feature in feature_list:
        roll_feature_name = location + '_' + feature + '_AVG_LAST_' + '5_' + location
        if feature == 'HOME_TEAM_WINS': #remove the "HOME_" for better readability
            roll_feature_name = location + '_' + feature[5:] + '_AVG_LAST_' + '5_' + location
        roll_feature_list.append(roll_feature_name)
        df[roll_feature_name] = df.groupby(['HOME_TEAM_ID'])[feature].rolling(5, closed= "left").mean().values

    return df

train = add_rolling_means(train, 'HOME')
train = add_rolling_means(train, 'VISITOR')
test = add_rolling_means(test, 'HOME')
test = add_rolling_means(test, 'VISITOR')

In [24]:
target = train['TARGET']
test_target = test['TARGET']

category_columns = ['HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'SEASON', 'HOME_TEAM_WINS', 'PLAYOFF', 'CONFERENCE_x', 'CONFERENCE_y',]

all_columns = train.columns.tolist()
drop_columns = ['TARGET', 'GAME_DATE_EST', 'GAME_ID',] #not really useful as-is

# non-rolling features, which would be data leakage
drop_columns1 = ['HOME_TEAM_WINS', 
       'PTS_home', 'FGM_home', 'FGA_home', 'FG_PCT_home', 
       '3PM_home', '3PA_home', 'FG3_PCT_home', 'FTM_home', 'FTA_home', 
       'FT_PCT_home', 'OREB_home', 'DREB_home', 'REB_home', 'AST_home', 
       'STL_home', 'BLK_home', 'TOV_home', 'PF_home', '+/-_home', 'Team_home']
drop_columns2 = [
       'PTS_away', 'FGM_away', 'FGA_away', 'FG_PCT_away',
       '3PM_away', '3PA_away', 'FG3_PCT_away', 'FTM_away', 'FTA_away',
       'FT_PCT_away', 'OREB_away', 'DREB_away', 'REB_away', 'AST_away',
       'STL_away', 'BLK_away', 'TOV_away', 'PF_away', '+/-_away', 'Team_away']

drop_columns = drop_columns + drop_columns1
drop_columns = drop_columns + drop_columns2 

use_columns = [item for item in all_columns if item not in drop_columns]

train = train[use_columns]
test = test[use_columns]

In [25]:
K_FOLDS = 5
SEED = 13

In [26]:
GPU = True

LightGBM

In [27]:
%%time

NUM_BOOST_ROUND = 700
EARLY_STOPPING = 200
LOG_EVALUATION = 100

train_oof = np.zeros((train.shape[0],))
test_preds = 0
train_oof_shap = np.zeros((train.shape[0],train.shape[1]+1))
#train_oof_shap_interact = np.zeros((train.shape[0],train.shape[1]+1,train.shape[1]+1))
test_preds_shap = 0

lgb_params= {
            'seed': SEED,
            'verbose': 0,           
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': 'auc', 
            #'num_leaves': 31,
            #'learning_rate': 0.05,
            #'feature_fraction': 0.9,
            #'bagging_fraction': 0.8,
            #'bagging_freq': 5,

            }

gpu_params= {
            'device': 'gpu',
            'gpu_platform_id': 0,
            'gpu_device_id': 0,
             }

if GPU:
    lgb_params = lgb_params | gpu_params
    

# K-fold cross validation

kf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED)

for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(train, target))):
    
    train_df, val_df = train.iloc[train_ind], train.iloc[val_ind]
    train_target, val_target = target[train_ind], target[val_ind]

    train_lgbdataset = lgb.Dataset(train_df, label=train_target,)
    val_lgbdataset = lgb.Dataset(val_df, label=val_target, reference = train_lgbdataset )

    model =  lgb.train(lgb_params, 
                       train_lgbdataset,
                       valid_sets=val_lgbdataset,
                       num_boost_round = NUM_BOOST_ROUND,
                       callbacks=[log_evaluation(LOG_EVALUATION),early_stopping(EARLY_STOPPING,verbose=False)],
                       #verbose_eval= VERBOSE_EVAL,
                      )

    temp_oof = model.predict(val_df)
    temp_oof_shap = model.predict(val_df, pred_contrib=True)
    
    temp_test = model.predict(test)
    temp_test_shap = model.predict(test, pred_contrib=True)

    train_oof[val_ind] = temp_oof
    test_preds += temp_test/K_FOLDS

    train_oof_shap[val_ind, :] = temp_oof_shap
    test_preds_shap += temp_test_shap/K_FOLDS
    

    #for accuracy score, prediction probabilities must be convert to binary scores (Win or Lose)
    #determine optimum threshold for conveting probablities using ROC curve
    #generally 0.5 works for balanced data
    #fpr = false positive rate, tpr = true postive rate
    fpr, tpr, thresholds = roc_curve(val_target,temp_oof)
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    temp_oof_binary = (temp_oof > optimal_threshold).astype(int)

    print(accuracy_score(val_target, temp_oof_binary), roc_auc_score(val_target, temp_oof))
    

    
# Out-of-Fold composite for train data

fpr, tpr, thresholds = roc_curve(target,train_oof)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
train_oof_binary = (train_oof > optimal_threshold).astype(int)

print()
print("Composite Train OOF CV Scores:")
print()
print("Accuracy Score:",accuracy_score(target, train_oof_binary))
print("AUC Score:", roc_auc_score(target, train_oof))
print("Optimal Threshold:", optimal_threshold)

#scores for Test data

test_preds_binary = (test_preds > optimal_threshold).astype(int)
print()
print("Test data Scores:")
print()
print("Accuracy Score:",accuracy_score(test_target, test_preds_binary))
print("AUC Score:", roc_auc_score(test_target, test_preds))


0it [00:00, ?it/s]

[100]	valid_0's auc: 0.641206
[200]	valid_0's auc: 0.632416


1it [00:03,  3.03s/it]

0.6211082827491679 0.6434009846417703
[100]	valid_0's auc: 0.625669
[200]	valid_0's auc: 0.617949


2it [00:06,  3.26s/it]

0.6048560798903466 0.6322576420100291
[100]	valid_0's auc: 0.626797
[200]	valid_0's auc: 0.62522


3it [00:09,  3.19s/it]

0.581081081081081 0.6302377749292449
[100]	valid_0's auc: 0.62178
[200]	valid_0's auc: 0.613847


4it [00:12,  3.07s/it]

0.5740305522914219 0.6268786567334661
[100]	valid_0's auc: 0.638904
[200]	valid_0's auc: 0.635611


5it [00:15,  3.06s/it]

0.6055620838229534 0.6452177182000091

Composite Train OOF CV Scores:

Accuracy Score: 0.5943521854927151
AUC Score: 0.6353467706007583
Optimal Threshold: 0.6018298719710422

Test data Scores:

Accuracy Score: 0.5989159891598916
AUC Score: 0.6626578233343383
CPU times: total: 1min 43s
Wall time: 15.3 s





XGBoost

In [28]:
%%time

NUM_BOOST_ROUND = 700

train_oof = np.zeros((train.shape[0],))
test_preds = 0
train_oof_shap = np.zeros((train.shape[0],train.shape[1]+1))
train_oof_shap_interact = np.zeros((train.shape[0],train.shape[1]+1,train.shape[1]+1))
test_preds_shap = 0

xgb_params= {
            'seed': SEED,
            'eval_metric': 'auc',
            }

gpu_params= {
             'tree_method': 'hist',
             'device': 'gpu'
             }

if GPU:
    xgb_params = xgb_params | gpu_params
    

# K-fold cross validation

test_dmatrix = xgb.DMatrix(test)

kf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED)

for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(train, target))):
    
    train_df, val_df = train.iloc[train_ind], train.iloc[val_ind]
    train_target, val_target = target[train_ind], target[val_ind]

    train_dmatrix = xgb.DMatrix(train_df, label=train_target)
    val_dmatrix = xgb.DMatrix(val_df, label=val_target)

    model =  xgb.train(xgb_params, 
                       train_dmatrix, 
                       num_boost_round = NUM_BOOST_ROUND,
                      )

    temp_oof = model.predict(val_dmatrix)
    temp_oof_shap = model.predict(val_dmatrix, pred_contribs=True)
    temp_oof_shap_interact = model.predict(val_dmatrix, pred_interactions=True)
    
    temp_test = model.predict(test_dmatrix)
    temp_test_shap = model.predict(test_dmatrix, pred_contribs=True)

    train_oof[val_ind] = temp_oof
    test_preds += temp_test/K_FOLDS

    train_oof_shap[val_ind, :] = temp_oof_shap
    train_oof_shap_interact[val_ind, :,:] = temp_oof_shap_interact
    test_preds_shap += temp_test_shap/K_FOLDS
    
    #for accuracy score, prediction probabilities must be convert to binary scores (Win or Lose)
    #determine optimum threshold for conveting probablities using ROC curve
    #generally 0.5 works for balanced data
    #fpr = false positive rate, tpr = true postive rate
    fpr, tpr, thresholds = roc_curve(val_target,temp_oof)
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    temp_oof_binary = (temp_oof > optimal_threshold).astype(int)

    print(accuracy_score(val_target, temp_oof_binary), roc_auc_score(val_target, temp_oof))
    

    
# Out-of-Fold composite for train data

fpr, tpr, thresholds = roc_curve(target,train_oof)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
train_oof_binary = (train_oof > optimal_threshold).astype(int)

print()
print("Composite Train OOF CV Scores:")
print()
print("Accuracy Score:",accuracy_score(target, train_oof_binary))
print("AUC Score:", roc_auc_score(target, train_oof))
print("Optimal Threshold:", optimal_threshold)

#scores for Test data

test_preds_binary = (test_preds > optimal_threshold).astype(int)
print()
print("Test data Scores:")
print()
print("Accuracy Score:",accuracy_score(test_target, test_preds_binary))
print("AUC Score:", roc_auc_score(test_target, test_preds))



1it [00:09,  9.29s/it]

0.542001174858038 0.5562487869573656


2it [00:18,  9.20s/it]

0.5553162326218916 0.5608179934974572


3it [00:27,  9.11s/it]

0.5697218958088524 0.5486531163935883


4it [00:36,  9.17s/it]

0.5487661574618097 0.547304011777696


5it [00:45,  9.18s/it]

0.5626713670191931 0.562223792562772

Composite Train OOF CV Scores:

Accuracy Score: 0.5497023343255523
AUC Score: 0.5550406654168019
Optimal Threshold: 0.56333327293396

Test data Scores:

Accuracy Score: 0.6086720867208673
AUC Score: 0.691350015073862
CPU times: total: 41.3 s
Wall time: 46 s



