### Model Construction and Evaluation

In [1]:
import glob
import os
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from constants import DATA_DIR
from utils import csv_concatenate, calculate_FPTS, calculate_MAE, calculate_RMSE

In [3]:
np.random.seed(23)
warnings.filterwarnings("ignore")

In [None]:
def cross_val(reg_base, X, y, nfolds=5, verbose=1, show_train=None):
    """Estimate train and validation error of a regressor over repeated random splits.

    This is repeated hold-out rather than a strict K-fold partition: each of the
    ``nfolds`` rounds draws a fresh random split that holds out ``1/nfolds`` of
    the rows, seeded with the round index so that results are reproducible.

    Parameters
    ----------
    reg_base : estimator
        Unfitted regressor exposing ``fit``/``predict``. A fresh clone is fitted
        in every round, so the object passed in is left untouched.
    X, y : array-like
        Feature matrix and target vector, split together by ``train_test_split``.
    nfolds : int, default 5
        Number of rounds; also sets the hold-out fraction to ``1/nfolds``.
    verbose : int, default 1
        When ``show_train`` is not given, ``verbose == 1`` enables the training
        error report.
    show_train : bool or None, default None
        Explicit switch for the training error report (accepted because existing
        call sites pass it). ``None`` falls back to ``verbose == 1``.

    Returns
    -------
    list
        Validation RMSE of every round, in round order.

    Raises
    ------
    ValueError
        If ``nfolds`` is smaller than 2 (the hold-out fraction would be the
        whole data set).
    """
    if nfolds < 2:
        raise ValueError('nfolds must be at least 2, got {}'.format(nfolds))
    if show_train is None:
        show_train = (verbose == 1)

    mae_results_train, rmse_results_train = [], []
    mae_results_valid, rmse_results_valid = [], []

    for i in tqdm(range(nfolds)):
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, test_size=1/nfolds, stratify=None, random_state=i)

        # Fit an independent copy so rounds cannot leak state into each other
        # and the caller's estimator is not refitted as a side effect.
        reg = clone(reg_base)
        reg.fit(X_train, y_train)

        y_pred_train = reg.predict(X_train)
        mae_results_train.append(calculate_MAE(y_pred_train, y_train))
        rmse_results_train.append(calculate_RMSE(y_pred_train, y_train))

        y_pred_valid = reg.predict(X_valid)
        mae_results_valid.append(calculate_MAE(y_pred_valid, y_valid))
        rmse_results_valid.append(calculate_RMSE(y_pred_valid, y_valid))

    if show_train:
        print('[Training Error]')
        print('MAE: {:10.7f} +- {:10.7f}'.format(np.mean(mae_results_train), np.std(mae_results_train)))
        print('RMSE: {:10.7f} +- {:10.7f}'.format(np.mean(rmse_results_train), np.std(rmse_results_train)))

        print('\n[TEST]')

    # The held-out scores are always reported, whatever the verbosity.
    print('MAE:', np.mean(mae_results_valid))
    print('RMSE:', np.mean(rmse_results_valid))
    return rmse_results_valid


In [7]:
np.std([1,1,2])

0.4714045207910317

### 1. Baseline - Simple Average

In [None]:
df_baseline = csv_concatenate(os.path.join(DATA_DIR, 'Dataframes','Modelling', 'Baseline'))
df_baseline['FPTS_pred'] = calculate_FPTS(df_baseline)

In [None]:
print('MAE:', calculate_MAE(df_baseline['FPTS_pred'], df_baseline['FPTS']))
print('RMSE:', calculate_RMSE(df_baseline['FPTS_pred'], df_baseline['FPTS']))

### 2. Linear Regression with basic 9 variables

In [None]:
df_baseline = df_baseline.sort_values(by=['Date','Name']).reset_index(drop=True)
basic =  ['PTS','3P','AST','TRB','STL','BLK','TOV', 'DD', 'TD']

X = df_baseline.loc[:, basic]
X = MinMaxScaler().fit_transform(X)
y = df_baseline['FPTS'].values.reshape(-1,1).flatten()

reg = LinearRegression()
cross_val(reg, X, y, show_train=True)

### 3. Weighted Model

Choose weighting scheme

In [None]:
original_stats = ['SG', 'F', 'C', 'PTS', '3P', 'AST', 'TRB', 'STL', 'BLK', 'TOV', 'DD', 'TD', 'MP', 'FT',
                  'FTA', 'FGA', '3PA', 'DRB', 'ORB', 'USG_perc', 'DRtg', 'ORtg', 'AST_perc', 'DRB_perc',
                  'ORB_perc', 'BLK_perc', 'TOV_perc', 'STL_perc', 'eFG_perc', 'FG_perc', '3P_perc', 'FT_perc']
len(original_stats)

### Choosing the best weighting scheme

In [None]:
# Score the same linear model on the feature set produced by each weighting
# scheme. The scheme name is printed first so every block of scores in the
# cell output can be attributed to the scheme that produced it.
for weighting in ['sqrt', 'linear', 'quad']:
    print('\n[Weighting: {}]'.format(weighting))
    df_features = csv_concatenate(os.path.join(DATA_DIR, 'Dataframes','Modelling', 'Features', weighting))

    X = df_features.loc[:, original_stats]
    X = MinMaxScaler().fit_transform(X)
    y = df_features['FPTS'].values.reshape(-1,1).flatten()

    reg = LinearRegression()
    cross_val(reg, X, y)

In [None]:
weighting = 'linear'

df_features = csv_concatenate(os.path.join(DATA_DIR, 'Dataframes','Modelling', 'Features', weighting))
df_features['FPTS_pred'] = calculate_FPTS(df_features)

print('MAE:', calculate_MAE(df_features['FPTS_pred'], df_features['FPTS']))
print('RMSE:', calculate_RMSE(df_features['FPTS_pred'], df_features['FPTS']))

### Linear Regression with basic 9 variables (weighted features)

In [None]:
X = df_features.loc[:, basic]
X = MinMaxScaler().fit_transform(X)
y = df_features['FPTS'].values.reshape(-1,1).flatten()

reg = LinearRegression()
cross_val(reg, X, y)

### Feature Selection with Feature Importances

In [None]:
features = ['Salary', 'Starter', 'Rest', 'Rota_All', 'Rota_Pos', 'Home', 'SG', 'F', 'C', 'Value', 'FPTS_std',
             'PTS', '3P', 'AST', 'TRB', 'STL', 'BLK', 'TOV', 'DD', 'TD', 'MP', 'FT', 'FTA', 'FGA', '3PA', 'DRB',
             'ORB', 'USG_perc', 'DRtg', 'ORtg', 'AST_perc', 'DRB_perc', 'ORB_perc', 'BLK_perc', 'TOV_perc', 
             'STL_perc', 'eFG_perc', 'FG_perc', '3P_perc', 'FT_perc']
len(features)

In [None]:
X = df_features.loc[:, features]
X = MinMaxScaler().fit_transform(X)
y = df_features['FPTS'].values.reshape(-1,1).flatten()

In [None]:
model = GradientBoostingRegressor()
model.fit(X, y)

In [None]:
top_features = pd.Series(model.feature_importances_, index = features).sort_values()
top_features.plot(kind = "barh", figsize=(15,10) ,title='Top Features')
plt.show()

In [None]:
omit_lowest = 10
selected = list(top_features[omit_lowest:].index)
len(top_features[omit_lowest:])

### Linear Regression with Selected Features

In [None]:
X = df_features.loc[:, selected]
X = MinMaxScaler().fit_transform(X)
y = df_features['FPTS'].values.reshape(-1,1).flatten()

reg = LinearRegression()

reg.fit(X, y)
cross_val(reg, X, y)

### LightGBM and Hyperparameter Tuning with Bayesian Optimization

In [None]:
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from bayes_opt.observer import JSONLogger, ScreenLogger
from bayes_opt.event import Events

In [None]:
X = df_features.loc[:, features]
X = MinMaxScaler().fit_transform(X)

In [None]:
def bayes_parameter_opt_lgb(X, y, init_round, opt_round, n_folds, random_seed):
    """Search LightGBM hyperparameters with Bayesian optimization on cross-validated RMSE.

    Parameters
    ----------
    X, y
        Features and target, wrapped once into an ``lgb.Dataset`` that every
        optimization step reuses.
    init_round : int
        Passed to ``maximize`` as ``init_points``.
    opt_round : int
        Passed to ``maximize`` as ``n_iter``.
    n_folds : int
        Number of folds given to ``lgb.cv``.
    random_seed : int
        Seed used both for ``lgb.cv`` and for the optimizer.

    Returns
    -------
    dict
        ``lgbBO.max['params']``: the raw parameter values of the best step.
        Integer-valued settings (``max_depth``, ``num_leaves``,
        ``n_estimators``) are NOT rounded here; ``lgb_eval`` rounds them only
        for its own ``lgb.cv`` call.

    Side effects
    ------------
    Writes one JSON log per call to ``DATA_DIR/Logs/<timestamp>.json`` and
    prints the best RMSE of every evaluated parameter set.
    """
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)

    def lgb_eval(feature_fraction, bagging_fraction, lambda_l1, lambda_l2, max_depth, num_leaves, 
                 min_split_gain, min_child_weight, learning_rate, n_estimators):
        """Objective for the optimizer: negated best mean CV RMSE for one parameter set."""
        # Settings that are held fixed for every evaluation.
        params = {
            "objective" : "regression",
            "max_bin": 255,
            "bagging_freq": 1,
            "min_child_samples": 20,
            "boosting": "gbdt",
            "verbosity": 1,
            "early_stopping_round": 200,
            "metric" : 'rmse'
        }
        
        # The optimizer proposes continuous values: clamp fractions to [0, 1],
        # keep regularization non-negative and round integer-valued settings.
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['max_depth'] = int(round(max_depth))
        params['num_leaves'] = int(round(num_leaves))
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        params['learning_rate'] = learning_rate
        params['n_estimators'] = int(round(n_estimators))
        
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed,
                           verbose_eval=None, stratified=False)
        
        # Print RMSE for each round of lgbBO for rough tracking of the optimization process
        min_index = cv_result['rmse-mean'].index(min(cv_result['rmse-mean']))
        print("RMSE: {} +- {}".format(round(cv_result['rmse-mean'][min_index],7),
                                      round(cv_result['rmse-stdv'][min_index],7)))
        
        # The optimizer maximizes, so return the negated minimum of the mean RMSE.
        return (-1.0 * np.array(cv_result['rmse-mean'])).max()
    
    # Search bounds (lower, upper) for every tuned hyperparameter.
    lgbBO = BayesianOptimization(lgb_eval, {'feature_fraction': (0.3, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'lambda_l1': (0, 5),
                                            'lambda_l2': (0, 3),
                                            'max_depth': (5, 200),
                                            'num_leaves' : (10, 500),
                                            'min_split_gain': (0.001, 0.1),
                                            'min_child_weight': (0, 10),
                                            'learning_rate': (0.01, 0.1),
                                            'n_estimators': (100, 5000)
                                           },
                                 random_state=random_seed)
    
    # Save progress for each round into a JSON file which can be monitored in an editor (e.g. VSCode).
    # This somehow suppresses the terminal output (https://github.com/fmfn/BayesianOptimization/issues/167)
    # NOTE(review): the `OPTMIZATION_STEP` spelling looks like the name used by older
    # bayes_opt releases; newer ones may spell it `OPTIMIZATION_STEP` - confirm against
    # the installed version.
    logger = JSONLogger(path=DATA_DIR+"/Logs/{}.json".format(pd.Timestamp.now().strftime('%Y%m%d-%Hh%Mm')))
    lgbBO.subscribe(Events.OPTMIZATION_STEP, logger)
    
    lgbBO.maximize(init_points=init_round, n_iter=opt_round, acq='ei')
    
    return lgbBO.max['params']

In [None]:
opt_params = bayes_parameter_opt_lgb(X, y,
                                     init_round=50,
                                     opt_round=50,
                                     n_folds=5,
                                     random_seed=23)

In [None]:
df_params = pd.read_json(DATA_DIR+'/Logs/20190527-10h37m.json', lines=True)
df_params = df_params.loc[:,['target', 'params']].sort_values(by='target', ascending=False).reset_index()

In [None]:
df_params.head(5)

In [None]:
opt_params = df_params.loc[0, 'params']

In [None]:
for key in opt_params.keys():
    if key in ['max_depth', 'num_leaves', 'n_estimators']:
        opt_params[key] = int(round(opt_params[key]))

In [None]:
# Re-fit LightGBM with the tuned parameters on several random 80/20 splits and
# report the mean and spread of the validation RMSE.
err_buf = []
n_iters = 5

for i in tqdm(range(n_iters)):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=i)
    d_train = lgb.Dataset(X_train, label=y_train)
    # reference=d_train lets the validation set share the training set's binning.
    d_valid = lgb.Dataset(X_valid, label=y_valid, reference=d_train)
    watchlist = [d_valid]

    # The third positional parameter of lgb.train is num_boost_round, so the
    # watchlist must be passed by keyword to be registered as validation data.
    model = lgb.train(opt_params, d_train, valid_sets=watchlist, verbose_eval=1)

    preds = model.predict(X_valid)
    err = calculate_RMSE(preds, y_valid)    
    err_buf.append(err)
    print('RMSE: ' + str(err))
    
print('\nMean RMSE: ' + str(np.mean(err_buf)) + ' +/- ' + str(np.std(err_buf)))

In [None]:
import xgboost as xgb

opt_params_xgb = {'max_depth':6, 'n_estimators':250, 'min_child_weight':4, 'colsample_bytree':0.6, 
                  'colsample_bylevel':0.7, 'subsample':1.0, 'gamma':0.0, 'learning_rate':0.026944654231987667}

reg = xgb.XGBRegressor(**opt_params_xgb)
cross_val(reg, X, y, show_train=True)

###  Neural Network

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold
from keras.callbacks import EarlyStopping 

In [None]:
es_cb = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto')

In [None]:
def model_1():
    """Build and compile the baseline network: input-width -> 64 -> 32 -> 1.

    The input width is read from the notebook-level feature matrix ``X`` at
    call time. The model is compiled with MSE loss and the Adam optimizer, and
    its summary is printed before it is returned.
    """
    n_inputs = X.shape[1]
    stack = [
        Dense(n_inputs, input_dim=n_inputs, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1),
    ]

    net = Sequential()
    for layer in stack:
        net.add(layer)

    net.compile(optimizer='adam', loss='mse')
    net.summary()
    return net

In [None]:
def model_2():
    """Build and compile the wider network: input-width -> 64 -> 128 -> 32 -> dropout -> 1.

    The input width is read from the notebook-level feature matrix ``X`` at
    call time. A 20% dropout sits before the output layer. The model is
    compiled with MSE loss and the Adam optimizer, and its summary is printed
    before it is returned.
    """
    n_inputs = X.shape[1]
    stack = [
        Dense(n_inputs, input_dim=n_inputs, activation='relu'),
        Dense(64, activation='relu'),
        Dense(128, activation='relu'),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1),
    ]

    net = Sequential()
    for layer in stack:
        net.add(layer)

    net.compile(optimizer='adam', loss='mse')
    net.summary()
    return net

In [None]:
def model_3():
    """Build and compile the baseline network with dropout: input-width -> 64 -> 32 -> dropout -> 1.

    The input width is read from the notebook-level feature matrix ``X`` at
    call time. A 20% dropout sits before the output layer. The model is
    compiled with MSE loss and the Adam optimizer, and its summary is printed
    before it is returned.
    """
    n_inputs = X.shape[1]
    stack = [
        Dense(n_inputs, input_dim=n_inputs, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1),
    ]

    net = Sequential()
    for layer in stack:
        net.add(layer)

    net.compile(optimizer='adam', loss='mse')
    net.summary()
    return net

In [None]:
model = KerasRegressor(build_fn=model_1,
                       epochs=30,
                       batch_size=32,
                       validation_split=0.2,
                       shuffle=True,
                       verbose=1)
h1 = model.fit(X, y)

In [None]:
plt.plot(h1.history['loss'])  
plt.plot(h1.history['val_loss'])  
plt.title('Model Loss')  
plt.ylabel('Loss')  
plt.xlabel('Epoch')  
plt.legend(['Train', 'Validation'], loc='upper right')  
plt.show()

In [None]:
model = KerasRegressor(build_fn=model_2,
                       epochs=30,
                       batch_size=32,
                       validation_split=0.2,
                       shuffle=True,
                       verbose=1)

h2 = model.fit(X, y)

In [None]:
model = KerasRegressor(build_fn=model_3,
                       epochs=30,
                       batch_size=32,
                       validation_split=0.2,
                       shuffle=True,
                       verbose=1)

h3 = model.fit(X, y)

In [None]:
for hist in [h1, h3]:
    plt.subplot(111)  
    plt.plot(hist.history['loss'])  
    plt.plot(hist.history['val_loss'])  
    plt.title('Model Loss')  
    plt.ylabel('Loss')  
    plt.xlabel('Epoch')  
    plt.legend(['Train', 'Validation'], loc='upper right')  
    plt.show()

In [None]:
model = KerasRegressor(build_fn=model_1,
                       epochs=30,
                       batch_size=32,
                       validation_split=0.2,
                       shuffle=True,
                       verbose=1)

kfold = KFold(n_splits=5, shuffle=True)

In [None]:
# Score both metrics from ONE cross-validation pass. Two separate
# cross_val_score calls would train every fold twice and, because `kfold`
# shuffles without a fixed random_state, would evaluate MAE and MSE on
# different splits.
cv_scores = cross_validate(model, X, y, cv=kfold, n_jobs=1,
                           scoring=['neg_mean_absolute_error', 'neg_mean_squared_error'])
results_MAE = cv_scores['test_neg_mean_absolute_error']
# Despite the name, this holds the negated MSE per fold; the root is taken when printing.
results_RMSE = cv_scores['test_neg_mean_squared_error']

In [None]:
# results_RMSE holds the negated MSE per fold: flip the sign, then take the root.
print(np.sqrt(-results_RMSE))
print("Results: %.4f RMSE" % np.sqrt(np.mean(-results_RMSE)))

# MAE is already in the units of the target: flip the sign only, no square root.
print(-results_MAE)
print("Results: %.4f MAE" % np.mean(-results_MAE))

In [None]:
model = KerasRegressor(build_fn=model_3,
                       epochs=15,
                       batch_size=32,
                       validation_split=0.2,
                       shuffle=True,
                       verbose=2)

kfold = KFold(n_splits=5, shuffle=True)

In [None]:
# Score both metrics from ONE cross-validation pass. Two separate
# cross_val_score calls would train every fold twice and, because `kfold`
# shuffles without a fixed random_state, would evaluate MAE and MSE on
# different splits.
cv_scores = cross_validate(model, X, y, cv=kfold, n_jobs=1,
                           scoring=['neg_mean_absolute_error', 'neg_mean_squared_error'])
results_MAE = cv_scores['test_neg_mean_absolute_error']
# Despite the name, this holds the negated MSE per fold; the root is taken when printing.
results_RMSE = cv_scores['test_neg_mean_squared_error']

In [None]:
# results_RMSE holds the negated MSE per fold: flip the sign, then take the root.
print(np.sqrt(-results_RMSE))
print("Results: %.4f RMSE" % np.sqrt(np.mean(-results_RMSE)))

# MAE is already in the units of the target: flip the sign only, no square root.
print(-results_MAE)
print("Results: %.4f MAE" % np.mean(-results_MAE))

### Prediction

In [None]:
### Train Test Split
X = df_features.sort_values(by=['Date','Name']).reset_index(drop=True)

target_month = 201903

start = 20190301
end = 20190331

test_indices = (df_features['Date'] >= start) & (df_features['Date'] <= end)
train_indices = [not value for value in test_indices]

X_train = df_features.loc[train_indices, selected]
X_test = df_features.loc[test_indices, selected]

y_train = df_features.loc[train_indices, 'FPTS'].values.reshape(-1,1).flatten()
y_test = df_features.loc[test_indices, 'FPTS'].values.reshape(-1,1).flatten()

# X_train = MinMaxScaler().fit_transform(X_train)
# X_test = MinMaxScaler().fit_transform(X_test)

In [None]:
X_train.shape, X_test.shape

In [None]:
pred_baseline = df_baseline.loc[(df_baseline['Date'] >= start) & (df_baseline['Date'] <= end), 'FPTS_pred'].reset_index(drop=True)
actual = df_baseline.loc[(df_baseline['Date'] >= start) & (df_baseline['Date'] <= end), 'FPTS'].reset_index(drop=True)

print(calculate_MAE(pred_baseline, actual))
print(calculate_RMSE(pred_baseline, actual))

In [None]:
reg = LinearRegression()
reg.fit(X_train, y_train)
pred_lm = reg.predict(X_test)

print(calculate_MAE(pred_lm, y_test))
print(calculate_RMSE(pred_lm, y_test))

In [None]:
# Fit LightGBM on the training months and score it on the held-out month.
d_train = lgb.Dataset(X_train, label=y_train)
# reference=d_train lets the test set share the training set's binning.
d_test = lgb.Dataset(X_test, label=y_test, reference=d_train)

# Monitor the held-out month built above, not a dataset left over from the
# earlier random-split loop (which mixes rows from every month).
watchlist = [d_test]

# Work on a copy and cast the integer-valued settings here, so this cell does
# not depend on another cell having mutated the stored dict in place.
opt_params = dict(df_params.loc[0, 'params'])
for key in ('max_depth', 'num_leaves', 'n_estimators'):
    if key in opt_params:
        opt_params[key] = int(round(opt_params[key]))

# The third positional parameter of lgb.train is num_boost_round, so the
# watchlist must be passed by keyword to be registered as validation data.
model = lgb.train(opt_params, d_train, valid_sets=watchlist, verbose_eval=1)
pred_gbm = model.predict(X_test)

print(calculate_MAE(pred_gbm, y_test))
print(calculate_RMSE(pred_gbm, y_test))

In [None]:
best_parameters = {'max_depth':6, 'n_estimators':250, 'min_child_weight':4, 'colsample_bytree':0.6, 
                   'colsample_bylevel':0.7, 'subsample':1.0, 'gamma':0.0, 'learning_rate':0.026944654231987667}

reg = xgb.XGBRegressor(**best_parameters)
reg.fit(X_train, y_train, verbose=1)
pred_xgb = reg.predict(X_test)

print(calculate_MAE(pred_xgb, y_test))
print(calculate_RMSE(pred_xgb, y_test))

In [None]:
def advanced_model():
    """Build and compile the deeper network used for the final prediction.

    Layout: input-width -> dropout -> 128 -> dropout -> 256 -> 64 -> 32 -> 1,
    with 20% dropout after the first two dense layers only. The input width is
    read from the notebook-level ``X_train`` at call time. The model is
    compiled with MSE loss and the Adam optimizer.
    """
    n_inputs = X_train.shape[1]
    stack = [
        Dense(n_inputs, input_dim=n_inputs, activation='relu'),
        Dropout(0.2),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(256, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1),
    ]

    net = Sequential()
    for layer in stack:
        net.add(layer)

    net.compile(optimizer='adam', loss='mse')
    return net

In [None]:
model = KerasRegressor(build_fn=advanced_model,
                       epochs=30,
                       batch_size=64,
                       validation_split=0.2,
                       shuffle=True,
                       verbose=1)

h = model.fit(X_train, y_train)

In [None]:
pred_nn = model.predict(X_test)
print(calculate_MAE(pred_nn, y_test))
print(calculate_RMSE(pred_nn, y_test))

### Write prediction into csv

In [None]:
# Identifying columns of the held-out rows plus the LightGBM prediction as `Pred`.
df_pred = (
    df_features
    .loc[test_indices, ['Date', 'Name', 'Team', 'FPTS', 'Pos', 'Salary']]
    .assign(Pred=pred_gbm)
)

In [None]:
df_pred.to_csv(os.path.join(DATA_DIR, 'Predictions/{}.csv'.format(pd.Timestamp.now().strftime('%Y%m%d-%Hh%Mm'))), index=False)