In [27]:
import neptune
from neptune.integrations.lightgbm import NeptuneCallback as LGBCallback
from neptune.integrations.xgboost import NeptuneCallback as XGBCallback

from src.functions import *

In [2]:
from lightgbm import early_stopping, log_evaluation
from optuna.visualization import (
    plot_optimization_history,
    plot_param_importances
)
from sklearn.metrics import (
    roc_curve,
    roc_auc_score,
    accuracy_score
)

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
import lightgbm as lgb
import xgboost as xgb
import pandas as pd
import optuna
import json
from tqdm import tqdm

In [4]:
import os

# Neptune credentials are read from the environment so that no secret is stored
# in the notebook. The tokens that used to be hardcoded here have been exposed
# and should be revoked in the Neptune UI.
# A value of None is passed through to neptune.init_run, which then performs its
# own lookup of the NEPTUNE_API_TOKEN environment variable.
NEPTUNE_API_TOKEN_OPTUNA = os.environ.get("NEPTUNE_API_TOKEN_OPTUNA")
NEPTUNE_API_TOKEN = os.environ.get("NEPTUNE_API_TOKEN")

In [5]:
# --- Run configuration ---------------------------------------------------
# Model / execution switches
MODEL_NAME = "lightgbm"          # later cells branch on "xgboost" / "lightgbm"
GPU = True                       # True: GPU_PARAMS are merged into STATIC_PARAMS
OPTUNA = False                   # True: tune with Optuna; False: load configs/<MODEL_NAME>.json
OPTUNA_CV = "StratifiedKFold"    # later cells branch on "StratifiedKFold" / "TimeSeriesSplit"

# Input feature tables
TRAIN_NAME = "train_features.csv"
TEST_NAME = "test_features.csv"

# Free-text note stored on the Neptune run
LOGGING_NOTE = "running"

In [6]:
PROJECT = "edwardbk29/KHDL"
PROJECT_OPTUNA = "quan-tran-tu/KHDL-optuna"
SOURCE = "13.ipynb"

# Validate the model choice before a Neptune run is opened, so a typo in
# MODEL_NAME does not leave an orphaned run and an undefined callback.
if MODEL_NAME not in ("xgboost", "lightgbm"):
    raise ValueError(f"Unsupported MODEL_NAME: {MODEL_NAME!r} (expected 'xgboost' or 'lightgbm')")

# Main Neptune run for this training session; SOURCE is uploaded with it.
run = neptune.init_run(
    project=PROJECT,
    source_files=[SOURCE],
    api_token=NEPTUNE_API_TOKEN,
)

# Each boosting library needs its own Neptune callback class. The two classes
# were previously swapped (xgboost received the LightGBM callback and vice versa).
if MODEL_NAME == "xgboost":
    neptune_callback = XGBCallback(run=run)
else:  # "lightgbm" -- guaranteed by the validation above
    neptune_callback = LGBCallback(run=run)



https://app.neptune.ai/edwardbk29/KHDL/e/KHDL-2


In [7]:
# Describe the run: free-text note, a tag for the model family, and the
# names of the train/test feature files that were used.
run["note"] = LOGGING_NOTE
run["sys/tags"].add([MODEL_NAME])
run["dataset/train"] = TRAIN_NAME
run["dataset/test"] = TEST_NAME

In [8]:
# --- Training hyper-parameters (each is also logged to Neptune below) ------
K_FOLDS = 5
SEED = 13
NUM_BOOST_ROUND = 2000  # xgb param
ENABLE_CATEGORICAL = False
EARLY_STOPPING = 200

# --- Console output controls (not logged) ----------------------------------
XGB_VERBOSITY = 0        # xgb param
LGB_VERBOSITY = -1       # lgb param
VERBOSE_EVAL = False     # lgb param
LOG_EVALUATION = 10000   # lgb display parameter

# --- Neptune logging --------------------------------------------------------
run["model/parameters/GPU"] = GPU
run["model/parameters/OPTUNA"] = OPTUNA

# The Optuna settings only exist (and are only logged) when tuning is enabled.
if OPTUNA:
    OPTUNA_FOLDS = 5
    OPTUNA_TRIALS = 150
    run["model/optuna/optuna_cv"] = OPTUNA_CV
    run["model/optuna/optuna_folds"] = OPTUNA_FOLDS
    run["model/optuna/optuna_trials"] = OPTUNA_TRIALS

run["model/parameters/k_folds"] = K_FOLDS
run["model/parameters/seed"] = SEED
run["model/parameters/num_boost_round"] = NUM_BOOST_ROUND
run["model/parameters/enable_categorical"] = ENABLE_CATEGORICAL
run["model/parameters/early_stopping"] = EARLY_STOPPING

In [9]:
# Pick the boosting library together with its fixed (non-tuned) parameters and
# the extra parameters that switch training onto the GPU.
if MODEL_NAME == "xgboost":
    BASE_MODEL = xgb
    STATIC_PARAMS = dict(
        seed=SEED,
        eval_metric="auc",
        objective="binary:logistic",
        verbosity=XGB_VERBOSITY,
    )
    GPU_PARAMS = dict(
        tree_method="gpu_hist",
        predictor="gpu_predictor",
    )
elif MODEL_NAME == "lightgbm":
    BASE_MODEL = lgb
    STATIC_PARAMS = dict(
        seed=SEED,
        verbosity=LGB_VERBOSITY,
        boosting_type="gbdt",
        objective="binary",
        metric="auc",
    )
    GPU_PARAMS = dict(
        device="gpu",
        gpu_platform_id=0,
        gpu_device_id=0,
    )

# GPU settings are layered on top of the static ones only when requested.
if GPU:
    STATIC_PARAMS = {**STATIC_PARAMS, **GPU_PARAMS}

In [10]:
# Read the train and test feature tables named in the configuration cell.
train, test = (pd.read_csv(csv_name) for csv_name in (TRAIN_NAME, TEST_NAME))

In [11]:
CATEGORY_COLUMNS = ["SEASON", "HOME_TEAM_ID", "VISITOR_TEAM_ID"]

# Pass both splits through the project helper with identical settings.
# NOTE(review): each split is encoded independently; confirm that
# encode_categoricals produces the same encoding for train and test.
train, test = (
    encode_categoricals(frame, CATEGORY_COLUMNS, MODEL_NAME, ENABLE_CATEGORICAL)
    for frame in (train, test)
)

In [12]:
DROP_COLUMNS = ["TARGET", "GAME_DATE_EST", "GAME_ID"]

# Take the labels out before the frames are narrowed to feature columns only.
target = train["TARGET"]
test_target = test["TARGET"]
test_target_original = test["TARGET"]  # save for later probability calibration

# Candidate columns come from the project helper; anything listed in
# DROP_COLUMNS (label and identifier columns) is then excluded.
all_columns = remove_non_rolling(train)
use_columns = [column for column in all_columns if column not in DROP_COLUMNS]

train, test = train[use_columns], test[use_columns]
test_original = test.copy()  # save for later probability calibration

# Record the final feature list on the Neptune run.
run["model/features"].log(use_columns)

In [13]:
def run_optuna():
    """Tune hyperparameters with Optuna and record the study in its own Neptune run.

    Reads the notebook-level configuration (MODEL_NAME, STATIC_PARAMS,
    OPTUNA_CV, OPTUNA_FOLDS, OPTUNA_TRIALS, ...) and the prepared ``train`` /
    ``target`` data, maximises the objective for the selected model, and logs
    the options, best value and best parameters to the PROJECT_OPTUNA project.

    Returns:
        dict: ``study.best_params`` of the finished study.

    Raises:
        ValueError: if MODEL_NAME is neither "xgboost" nor "lightgbm".
    """
    # Resolve the objective first so an unsupported model fails before a
    # Neptune run is opened.
    if MODEL_NAME == "xgboost":
        func = lambda trial: XGB_objective(trial, train, target, STATIC_PARAMS, ENABLE_CATEGORICAL, NUM_BOOST_ROUND, OPTUNA_CV, OPTUNA_FOLDS, SEED)
    elif MODEL_NAME == "lightgbm":
        func = lambda trial: LGB_objective(trial, train, target, CATEGORY_COLUMNS, STATIC_PARAMS, ENABLE_CATEGORICAL, NUM_BOOST_ROUND, OPTUNA_CV, OPTUNA_FOLDS, SEED, EARLY_STOPPING)
    else:
        raise ValueError(f"Unsupported MODEL_NAME: {MODEL_NAME!r} (expected 'xgboost' or 'lightgbm')")

    # Log a separate Neptune run for Optuna hyperparameter tuning.
    run2 = neptune.init_run(
        project=PROJECT_OPTUNA,
        source_files=[SOURCE],
        api_token=NEPTUNE_API_TOKEN_OPTUNA,
    )

    # try/finally guarantees the run is closed even if a trial raises.
    try:
        run2["options/optuna_cv"] = OPTUNA_CV
        run2["options/optuna_folds"] = OPTUNA_FOLDS
        run2["options/optuna_trials"] = OPTUNA_TRIALS
        run2["options/GPU"] = GPU
        run2["options/enable_categorical"] = ENABLE_CATEGORICAL
        run2["features"].log(use_columns)
        run2["sys/tags"].add([MODEL_NAME])

        study = optuna.create_study(direction='maximize')
        study.optimize(func, n_trials=OPTUNA_TRIALS)

        #optuna_utils.log_study_metadata(study, run2)

        print("Study Best Value:",study.best_value)
        print("Study Best Params:",study.best_params)

        # NOTE(review): the figures returned by these two calls are discarded,
        # so nothing is rendered when they run inside this function.
        plot_optimization_history(study)
        plot_param_importances(study)

        run2["best_value"] = study.best_value
        run2["best_params"] = study.best_params
        run2["static_params"] = STATIC_PARAMS
    finally:
        run2.stop()

    return study.best_params

In [14]:
# Tuned hyperparameters come either from a saved JSON config for the selected
# model or from a fresh Optuna study.
if not OPTUNA:
    with open(f"configs/{MODEL_NAME}.json") as config_file:
        tuned_params = json.load(config_file)
else:
    tuned_params = run_optuna()

# On key collisions the tuned value overrides the static one.
model_params = {**STATIC_PARAMS, **tuned_params}

run["model/params"] = model_params

In [15]:
def get_scores(target, preds):
    """Score predicted probabilities against binary labels and print a summary.

    Accuracy needs hard labels, so the probabilities are binarised at the ROC
    threshold that maximises ``tpr - fpr`` (generally 0.5 works for balanced
    data, but the data-driven threshold is used here).

    Args:
        target: true binary labels.
        preds: predicted probabilities; must support element-wise comparison
            and ``.astype`` (e.g. a numpy array or pandas Series).

    Returns:
        tuple: ``(preds_binary, acc_score, auc_score, optimal_threshold)``.
    """
    # fpr = false positive rate, tpr = true positive rate
    fpr, tpr, thresholds = roc_curve(target, preds)
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]

    # roc_curve computes each (fpr, tpr) point for "score >= threshold", so the
    # comparison must be inclusive; a strict ">" drops the sample that sits
    # exactly on the chosen threshold and no longer reproduces the chosen point.
    preds_binary = (preds >= optimal_threshold).astype(int)

    acc_score = accuracy_score(target, preds_binary)
    auc_score = roc_auc_score(target, preds)

    print()
    print("Scores:")
    print()
    print("Accuracy Score:", acc_score)
    print("AUC Score:", auc_score)
    print("Optimal Threshold:", optimal_threshold)

    return preds_binary, acc_score, auc_score, optimal_threshold

In [16]:
def get_shapley(MODEL_NAME, model, data):
    """Return per-feature prediction contributions from a trained booster.

    Args:
        MODEL_NAME: "xgboost" or "lightgbm"; selects the library-specific
            keyword (``pred_contribs`` vs ``pred_contrib``).
        model: trained model exposing ``predict``.
        data: input accepted by ``model.predict``.

    Returns:
        Whatever ``model.predict`` returns with contributions enabled.

    Raises:
        ValueError: if MODEL_NAME is not a supported model (previously this
            surfaced as an UnboundLocalError on the return statement).
    """
    if MODEL_NAME == "xgboost":
        return model.predict(data, pred_contribs=True)
    if MODEL_NAME == "lightgbm":
        return model.predict(data, pred_contrib=True)
    raise ValueError(f"Unsupported MODEL_NAME: {MODEL_NAME!r} (expected 'xgboost' or 'lightgbm')")

def get_shapley_interactions(MODEL_NAME, model, data):
    """Return Shapley interaction values, or a zero placeholder for LightGBM.

    Args:
        MODEL_NAME: "xgboost" or "lightgbm".
        model: trained model exposing ``predict`` (used for xgboost only).
        data: input for ``model.predict``; for lightgbm only ``data.shape``
            (rows, columns) is read.

    Returns:
        For xgboost, the result of ``model.predict(data, pred_interactions=True)``.
        For lightgbm (not currently supported), an all-zero array of shape
        ``(rows, columns + 1, columns + 1)``.

    Raises:
        ValueError: if MODEL_NAME is not a supported model (previously this
            surfaced as an UnboundLocalError on the return statement).
    """
    if MODEL_NAME == "xgboost":
        return model.predict(data, pred_interactions=True)
    if MODEL_NAME == "lightgbm":  # not currently supported
        n_rows, n_cols = data.shape[0], data.shape[1]
        return np.zeros((n_rows, n_cols + 1, n_cols + 1))
    raise ValueError(f"Unsupported MODEL_NAME: {MODEL_NAME!r} (expected 'xgboost' or 'lightgbm')")

In [21]:
# Starts empty; the training cell adds one row of summary metrics
# (Label, Accuracy, AUC, Threshold) to this frame.
results = pd.DataFrame()

In [None]:
# Initialize out-of-fold (OOF) arrays for predictions and Shapley values.
# The Shapley array has one column more than the feature count
# (presumably the bias term of the contribution output -- confirm).
train_oof = np.zeros((train.shape[0],))
train_oof_shap = np.zeros((train.shape[0], train.shape[1] + 1))
# Marks rows that actually received an OOF prediction. With TimeSeriesSplit
# some rows never appear in a validation fold and must not be scored as 0.
oof_rows = np.zeros(train.shape[0], dtype=bool)
# Shapley interaction values are intentionally not collected here: their
# storage was already disabled, so computing them per fold was wasted work.

# K-fold cross validation
if OPTUNA_CV == "StratifiedKFold":
    kf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED)
elif OPTUNA_CV == "TimeSeriesSplit":
    kf = TimeSeriesSplit(n_splits=K_FOLDS)
else:
    raise ValueError(f"Unsupported OPTUNA_CV: {OPTUNA_CV!r} (expected 'StratifiedKFold' or 'TimeSeriesSplit')")

# tqdm wraps the split generator directly and is told the fold count, so the
# progress bar can show a total.
for train_ind, val_ind in tqdm(kf.split(train, target), total=K_FOLDS):

    # The splitter yields positional indices, so use .iloc for frames and labels.
    train_df, val_df = train.iloc[train_ind], train.iloc[val_ind]
    train_target, val_target = target.iloc[train_ind], target.iloc[val_ind]

    if MODEL_NAME == "xgboost":
        train_dmatrix = xgb.DMatrix(train_df, label=train_target, enable_categorical=ENABLE_CATEGORICAL)
        val_dmatrix = xgb.DMatrix(val_df, label=val_target, enable_categorical=ENABLE_CATEGORICAL)
        val_data = val_dmatrix

        model = xgb.train(
            model_params,
            train_dmatrix,
            num_boost_round=NUM_BOOST_ROUND,
            callbacks=[neptune_callback],
        )

    if MODEL_NAME == "lightgbm":
        train_lgbdataset = lgb.Dataset(train_df, label=train_target, categorical_feature=CATEGORY_COLUMNS)
        val_lgbdataset = lgb.Dataset(val_df, label=val_target, reference=train_lgbdataset, categorical_feature=CATEGORY_COLUMNS)
        val_data = val_df

        # NOTE(review): num_boost_round is commented out, so the number of
        # rounds comes from model_params or the LightGBM default, and no
        # early-stopping callback is passed although EARLY_STOPPING is logged.
        model = lgb.train(
            model_params,
            train_lgbdataset,
            valid_sets=val_lgbdataset,
            #num_boost_round = NUM_BOOST_ROUND,
            callbacks=[log_evaluation(LOG_EVALUATION), neptune_callback],
        )

    temp_oof = model.predict(val_data)
    temp_oof_shap = get_shapley(MODEL_NAME, model, val_data)

    train_oof[val_ind] = temp_oof
    train_oof_shap[val_ind, :] = temp_oof_shap
    oof_rows[val_ind] = True

    # Per-fold scores are printed by get_scores.
    temp_oof_binary, acc_score, auc_score, optimal_threshold = get_scores(val_target, temp_oof)

# Out-of-Fold composite for train data, restricted to rows that were predicted.
# With StratifiedKFold every row is covered, so this equals scoring all rows.
train_oof_binary, acc_score, auc_score, optimal_threshold = get_scores(target[oof_rows], train_oof[oof_rows])

run["train/accuracy"] = acc_score
run["train/AUC"] = auc_score
run["train/optimal_threshold"] = optimal_threshold

df = {'Label': 'Train', 'Accuracy': acc_score, 'AUC': auc_score, 'Threshold': optimal_threshold}
# DataFrame.append was removed in pandas 2.0; build a one-row frame and
# concatenate instead (skipping the concat while `results` is still empty).
train_row = pd.DataFrame([df])
results = train_row if results.empty else pd.concat([results, train_row], ignore_index=True)

In [None]:
# Build an unfitted scikit-learn style classifier for the selected library,
# configured with the same parameters as the cross-validated model.
if MODEL_NAME == "xgboost":
    model = XGBClassifier(n_estimators=NUM_BOOST_ROUND, **model_params)
elif MODEL_NAME == "lightgbm":
    model = LGBMClassifier(verbose_eval=False, **model_params)

# Data for the calibration comparison: training features/labels and the
# untouched copies of the test features/labels saved earlier.
X_train, y_train = train[use_columns], target
X_test, y_test = test_original, test_target_original

# Wrap the base model with isotonic and sigmoid calibration (5-fold each).
model_isotonic = CalibratedClassifierCV(model, cv=5, method="isotonic")
model_sigmoid = CalibratedClassifierCV(model, cv=5, method="sigmoid")

clf_list = [
    (model, "Base Model"),
    (model_isotonic, "Model + Isotonic"),
    (model_sigmoid, "Model + Sigmoid"),
]

plot_calibration_curve(clf_list, X_train, y_train, X_test, y_test, n_bins=7)

In [None]:
# Score every classifier in clf_list with the project helper, which returns a
# metrics frame plus a (possibly updated) classifier list; the frame's index is
# then turned into a regular column for display.
df_scores, clf_list = calculate_classification_metrics(
    clf_list,
    X_train,
    y_train,
    X_test,
    y_test,
)
df_scores = df_scores.reset_index()
df_scores

In [None]:
# Keep the first classifier whose label equals best_calibrated_model and
# persist it under MODELS_PATH.
# NOTE(review): best_calibrated_model and MODELS_PATH are not defined in the
# cells shown here -- confirm where they come from.
model = [
    candidate
    for candidate, label in clf_list
    if label == best_calibrated_model
][0]
joblib.dump(model, MODELS_PATH / 'model.pkl')