### Robustness Checks with Added Noise and a t-test

In [1]:
import os
import glob
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm
from datetime import datetime
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [None]:
# Resolve the project root: when the notebook runs from the `notebooks`
# sub-directory, step up one level. Only a trailing `notebooks` path component
# is removed, so other parts of the path that merely contain that text
# (e.g. `/home/notebooks_user/...`) are left untouched.
cwd = os.getcwd()
if os.path.basename(cwd) == 'notebooks':
    cwd = os.path.dirname(cwd)
data_dir = os.path.join(cwd, 'data')

# Fixed seed so the NumPy random draws made later in the notebook are reproducible.
np.random.seed(6)

In [None]:
def cross_val(reg_base, X, y, show_train=False):
    """Score a regressor over five repeated random 80/20 train/test splits.

    Despite the name, this is not K-fold cross-validation: every round draws
    its own shuffled split via ``train_test_split`` with ``random_state`` set
    to the round index (0..4), so the splits are the same on every call.

    Parameters
    ----------
    reg_base : estimator exposing ``fit(X, y)`` and ``predict(X)``
        The same object is refit in each round (no copy is made), so it is
        left fitted on the last split when the function returns.
    X : feature matrix accepted by ``train_test_split`` and the estimator.
    y : target values aligned with the rows of ``X``.
    show_train : bool, default False
        When true, the mean training-set errors are printed as well.

    Returns
    -------
    list
        The test-set RMSE of each round, in round order.
    """
    mae_results_train = []
    rmse_results_train = []

    mae_results_test = []
    rmse_results_test = []

    for k in tqdm(range(5)):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, shuffle=True, random_state=k)
        reg = reg_base

        reg.fit(X_train, y_train)

        # Errors on the data the model was fitted on.
        y_pred_train = reg.predict(X_train)
        mae_results_train.append(calculate_MAE(y_pred_train, y_train))
        rmse_results_train.append(calculate_RMSE(y_pred_train, y_train))

        # Errors on the held-out 20%.
        y_pred_test = reg.predict(X_test)
        mae_results_test.append(calculate_MAE(y_pred_test, y_test))
        rmse_results_test.append(calculate_RMSE(y_pred_test, y_test))

    if show_train:
        print('[TRAIN]')
        print('MAE:', np.mean(mae_results_train))
        print('RMSE:', np.mean(rmse_results_train))
        print('\n[TEST]')

    print('MAE:', np.mean(mae_results_test))
    print('RMSE:', np.mean(rmse_results_test))
    return rmse_results_test

In [None]:
# Name of the feature sub-directory to load.
weighting = 'quad'

# Load the feature table from <data_dir>/Dataframes/modelling/features/<weighting>
# (csv_concatenate is a helper defined outside this cell).
features_path = os.path.join(data_dir, 'Dataframes', 'modelling', 'features', weighting)
df_features = csv_concatenate(features_path)

# Baseline prediction produced by the calculate_FPTS helper, stored next to the
# observed FPTS column so the two can be compared directly.
df_features['FPTS_pred'] = calculate_FPTS(df_features)

fpts_pred = df_features['FPTS_pred']
fpts_true = df_features['FPTS']
print('MAE:', calculate_MAE(fpts_pred, fpts_true))
print('RMSE:', calculate_RMSE(fpts_pred, fpts_true))

In [None]:
# Columns of df_features used as model inputs (order defines the column order of X).
selected = [
    'TOV_perc', 'BLK_perc', 'STL', 'STL_perc', 'DRtg',
    'ORB', 'eFG_perc', 'ORB_perc', 'FT_perc', '3P_perc',
    'Home', 'DD', 'Rota_Pos', 'DRB_perc', 'BLK',
    'AST_perc', 'Rota_All', 'MP', 'FPTS_std', 'Value',
    'FT', 'Rest', 'AST', 'TOV', 'PTS',
    'TRB', 'USG_perc', 'Starter', 'Salary',
]

In [None]:
# Feature matrix: the selected columns, each rescaled to [0, 1] by MinMaxScaler.
X = MinMaxScaler().fit_transform(df_features[selected])

# Target vector: the observed FPTS values as a flat 1-D array (a copy).
y = df_features['FPTS'].values.flatten()

In [None]:
# Hyper-parameters passed to the XGBoost regressor.
best_parameters = dict(
    max_depth=6,
    n_estimators=250,
    min_child_weight=4,
    colsample_bytree=0.6,
    colsample_bylevel=0.7,
    subsample=1.0,
    gamma=0.0,
    learning_rate=0.026944654231987667,
)

# Scores recorded from an earlier run with these settings:
# MAE: 6.848631675865012, RMSE: 8.958142274893145

# NOTE(review): `xgb` is not imported in the visible import cell; presumably
# `import xgboost as xgb` is expected to be in scope - confirm.
reg = xgb.XGBRegressor(**best_parameters)

# Per-round test RMSEs on the noise-free features, kept for later comparison.
results = cross_val(reg, X, y, show_train=True)

### Add Gaussian Noise

In [None]:
# Columns that receive added noise in the next cell.
cont = [
    'TOV_perc', 'BLK_perc', 'STL', 'STL_perc', 'DRtg',
    'ORB', 'eFG_perc', 'ORB_perc', 'FT_perc', '3P_perc',
    'DD', 'Rota_Pos', 'DRB_perc', 'BLK', 'AST_perc',
    'Rota_All', 'MP', 'FPTS_std', 'Value', 'FT',
    'Rest', 'AST', 'TOV', 'PTS', 'TRB',
    'USG_perc', 'Salary',
]

# Columns left noise-free.
binary = ['Home', 'Starter']

In [None]:
# Rebuild the feature matrix with noise added to the `cont` columns only.

# Scale the `cont` columns to [0, 1] before perturbing them.
X_cont = df_features.loc[:, cont]
X_cont = MinMaxScaler().fit_transform(X_cont)

# Draw one standard-normal value per cell, then squeeze each noise column into
# [0, 0.2]. After this rescaling the noise is non-negative (not zero-mean), so
# every perturbed feature is shifted upwards as well as jittered.
# NOTE(review): confirm that this positive shift is intended.
noise = np.random.normal(0, 1, X_cont.shape)
# `feature_range` is passed by keyword as a tuple: newer scikit-learn releases
# validate constructor parameters and reject a list here.
noise = MinMaxScaler(feature_range=(0, 0.2)).fit_transform(noise)
X_cont = X_cont + noise

# The `binary` columns are scaled but receive no noise.
X_binary = df_features.loc[:, binary]
X_binary = MinMaxScaler().fit_transform(X_binary)

# Column order is now the `cont` columns followed by the `binary` columns.
X = np.concatenate([X_cont, X_binary], axis=1)

In [None]:
# Fresh regressor with the same hyper-parameters, evaluated on the current X / y.
# The returned list of per-round test RMSEs is the value of the cell's last expression.
reg = xgb.XGBRegressor(**best_parameters)
cross_val(reg, X, y, show_train=False)

### Calculating the t-statistic

In [None]:
# One-sample t-statistic: t = (mu_0 - sample_mean) / (s / sqrt(n)).
null = 9.9434          # hypothesised mean RMSE (presumably the noisy-feature run - confirm)
alternative = 8.9581   # recorded mean test RMSE of the noise-free run

# NOTE(review): `std` was never defined in the notebook. It is taken here as the
# sample standard deviation of `results`, the per-round test RMSEs of the
# noise-free run, whose mean is `alternative` - confirm this is the intended sample.
n_rounds = len(results)
std = np.std(results, ddof=1)

# The standard error of the mean is s / sqrt(n), not s / n.
t = (null - alternative) / (std / np.sqrt(n_rounds))

# NOTE(review): check the critical value and significance level quoted below
# against a t-table for df = n - 1.
print(round(t, 2),'>15.54 at 0.1% significance level with df=4')