### This is a new version of MLEAP scripts, started in late Aug 2022.
It will combine the IProject_MLEAP_ANN and IP_MLEAP scripts while improving on them.

#### Outline

1. Load libraries and data.
2. pEDA. Look at feature distribution, fix them if they do not look right.
3. Train-test split. Most likely a couple of years will go into the test set (2015-2018?). Impute missing values.
4. Transform numerical features, add one-hot encoding (OHE) for inds.
5. Fit classic models: ols as a baseline, then xgb.
6. Fit DL.


In [1]:
# 0. Import libraries #

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, time, math, re, warnings, random, gc, dill, optuna, pickle
import statsmodels.api as sm
from random import sample

from sklearn.model_selection import train_test_split, KFold, PredefinedSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNetCV
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.inspection import permutation_importance
from category_encoders import MEstimateEncoder
from xgboost import XGBRegressor

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

# Global notebook configuration (plot style, warnings, pandas display, garbage collector).
# NOTE(review): the 'seaborn-white' style name appears to have been renamed in newer
# matplotlib releases -- confirm it still resolves in the target environment.
plt.style.use('seaborn-white')
# Silences every warning category for the whole session, which also hides
# deprecation notices and pandas chained-assignment warnings raised further down.
warnings.simplefilter(action='ignore')
# Show up to 110 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 110)
# Make sure automatic garbage collection is switched on.
gc.enable()

In [2]:
### target encoding ###
# source: https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices/notebook

class CrossFoldEncoder:
    """Out-of-fold wrapper around a supervised (target-style) encoder.

    `encoder` is an encoder *class* that is instantiated as
    `encoder(cols=cols, **kwargs)` and exposes `fit(X, y)` / `transform(X)`.
    Rows are split with a 4-fold `KFold`; every row is encoded by an encoder
    that was fitted on the other folds only.
    """

    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        # Extra keyword arguments forwarded to every encoder instance.
        self.kwargs_ = kwargs
        self.cv_ = KFold(n_splits=4)

    def fit_transform(self, X, y, cols):
        """Fit one encoder per fold and return the out-of-fold encodings.

        For each split the encoder is fitted on the larger part and applied
        to the held-out part; stacking the held-out parts yields an encoding
        for every row of `X`. The fitted encoders are kept in
        `self.fitted_encoders_` for later use by `transform`.

        Returns a DataFrame holding only `cols`, renamed to `<col>_encoded`.
        """
        self.fitted_encoders_ = []
        self.cols_ = cols
        held_out_parts = []
        for fit_rows, held_out_rows in self.cv_.split(X):
            fold_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fold_encoder.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
            held_out = X.iloc[held_out_rows, :]
            held_out_parts.append(fold_encoder.transform(held_out)[cols])
            self.fitted_encoders_.append(fold_encoder)
        stacked = pd.concat(held_out_parts)
        stacked.columns = [name + "_encoded" for name in stacked.columns]
        return stacked

    def transform(self, X):
        """Encode unseen data by averaging the encodings of all fold encoders.

        Returns a DataFrame holding only the encoded columns, renamed to
        `<col>_encoded`.
        """
        from functools import reduce

        per_fold = [
            fold_encoder.transform(X)[self.cols_]
            for fold_encoder in self.fitted_encoders_
        ]
        # Element-wise sum over folds (missing cells count as 0), then the mean.
        total = reduce(lambda acc, nxt: acc.add(nxt, fill_value=0), per_fold)
        averaged = total / len(per_fold)
        averaged.columns = [name + "_encoded" for name in averaged.columns]
        return averaged

In [3]:
# Detect TPU, return appropriate distribution strategy
# The resolver call raises ValueError when it cannot be constructed; in that
# case `tpu` is set to None and the default strategy is used below.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver() 
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): `tf.distribute.experimental.TPUStrategy` looks like the older
    # experimental alias of `tf.distribute.TPUStrategy` -- confirm against the
    # installed TensorFlow version before relying on it.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # No TPU: fall back to whatever strategy TensorFlow currently has in scope.
    strategy = tf.distribute.get_strategy() 

# Number of replicas the chosen strategy keeps in sync.
print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [4]:
# Rolling-window XGBoost experiment.
#
# For every starting period `min_prd` the cell
#   1. takes the periods  min_prd-1 ... min_prd+windows_width+2  from the data,
#   2. trains on periods  <  min_prd+windows_width-1,
#      validates on       == min_prd+windows_width-1,
#      tests on           == min_prd+windows_width,
#      and keeps          == min_prd+windows_width+1  as an "extra" validation month,
#   3. fits four XGBoost variants (fixed hyperparameters, grid search, Optuna with an
#      overfit penalty, Optuna without it) and stores their R^2 in `results`.

#min_prd_list = range(100, 676, 25)
min_prd_list = [450, 550]
#min_prd_list = range(100, 691, 10)

windows_width = 3*12
cv_regularizer=0.3   # weight of the |train R^2 - validation R^2| penalty in the Optuna objective
optuna_trials = 50
time0 = time.time()

results = pd.DataFrame(columns = ['min_prd', 'xgbf_train', 'xgbf_val', 'xgbf_test', 
                                  'xgbgs_train', 'xgbgs_val', 'xgbgs_test', 
                                  'xgbo_train', 'xgbo_val', 'xgbo_test',
                                 'xgbonr_train', 'xgbonr_val', 'xgbonr_test'])
results.min_prd = min_prd_list


def mae_and_r2(model, X_part, y_part):
    """Return `(MAE, R^2)` of `model` on one data split, predicting only once."""
    pred = model.predict(X_part)
    return mean_absolute_error(y_part, pred), r2_score(y_part, pred)


def make_objective(splits, overfit_penalty):
    """Build the Optuna objective for one window.

    `splits` maps 'train' / 'val' / 'val_extra' to `(X, y)` pairs.
    The objective is the mean of the validation and extra-validation R^2, minus
    `overfit_penalty * |train R^2 - that mean|`. A penalty of 0 gives the plain
    validation score (and skips the train prediction).
    """
    X_tr, y_tr = splits['train']
    X_va, y_va = splits['val']
    X_ve, y_ve = splits['val_extra']

    def objective(trial):
        params = {
        "tree_method": 'gpu_hist',
        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "n_estimators": trial.suggest_int("n_estimators", 500, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 5),
        "learning_rate": trial.suggest_uniform("learning_rate", 0.0005, 0.03),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.05, 0.95),
        "subsample": trial.suggest_uniform("subsample", 0.1, 0.95),
        "alpha": trial.suggest_loguniform("alpha", 0.1, 50.0),
        "lambda": trial.suggest_loguniform("lambda", 0.1, 500.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-10, 100.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 0.1, 100)    }

        # `n_jobs` is the scikit-learn style name; the previous `njobs` spelling was
        # forwarded to the booster as an unknown parameter.
        model = XGBRegressor(**params, n_jobs=-1)
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], early_stopping_rounds=50, verbose = False)

        score_val = r2_score(y_va, model.predict(X_va))
        score_val_extra = r2_score(y_ve, model.predict(X_ve))
        score_val = (score_val+score_val_extra)/2
        if overfit_penalty == 0:
            return score_val

        score_train = r2_score(y_tr, model.predict(X_tr))
        overfit = np.abs(score_train-score_val)
        return score_val-overfit_penalty*overfit

    return objective


def tune_and_refit_xgb(splits, X_refit, y_refit, overfit_penalty, n_trials, label):
    """Run an Optuna study, refit XGBoost with the best parameters and report scores.

    Returns `(study, model, best_params, [train R^2, val R^2, test R^2])`.
    `label` only affects the heading of the printed score summary.
    """
    start = time.time()
    study = optuna.create_study(direction="maximize")
    study.optimize(make_objective(splits, overfit_penalty), n_trials=n_trials)
    print('Total time for hypermarameter optimization ', time.time()-start)
    for key, value in study.best_params.items():
        print(f"{key:>20s} : {value}")
    print(f"{'best objective value':>20s} : {study.best_value}")

    best_params = study.best_params
    best_params['tree_method']='gpu_hist'
    # NOTE(review): trials are scored with early stopping, but this refit always grows
    # the full `n_estimators` trees -- confirm this is intended.
    model = XGBRegressor(**best_params)
    model.fit(X_refit, y_refit)

    scores = {name: mae_and_r2(model, *splits[name]) for name in ('train', 'val', 'val_extra', 'test')}
    print(label + ' train: \n', 
          *scores['train'], '\nvalidation \n',
          *scores['val'],
          *scores['val_extra'], '\ntest \n',
          *scores['test'])
    return study, model, best_params, [scores['train'][1], scores['val'][1], scores['test'][1]]


# The raw data does not depend on `min_prd`, so it is loaded once instead of once per window.
# NOTE(review): `pickle.load` executes arbitrary code -- only use it on a trusted file.
with open('../input/kaggle-46pkl/IMLEAP_v4.pkl', 'rb') as pickled_one:
    df_full = pickle.load(pickled_one)

for min_prd in min_prd_list:

    time_prep = time.time()

    # Boolean indexing returns a new frame, so `df_full` is never modified below.
    df = df_full[df_full.prd.isin(range(min_prd-1, min_prd+windows_width+3))]
    # Drop columns that are populated in fewer than half of the rows of this window.
    df_cnt = df.count()
    empty_cols = list(df_cnt[df_cnt<int(df.shape[0]/2)].index)
    df = df.drop(columns=empty_cols)
    #display(df.shape, df.head(), df.year.describe(), df.count())

    # Trim extreme returns, then de-mean RET within each period.
    df = df[(df.RET>-50)&(df.RET<75)].copy()
    df['RET'] = df.RET - df.groupby('prd').RET.transform('mean')

    # Missing-value indicator columns for selected features.
    features_miss_dummies = ['amhd', 'BAspr']
    for col in features_miss_dummies:
        if col in df.columns:
            df[col+'_miss'] = df[col].isnull().astype(int)

    temp_cols = ['PERMNO', 'year', 'prd']   # needed for the split only, not used as features
    df = df.reset_index(drop=True)
    X_all = df.copy()
    y_all = X_all.pop('RET')

    train_indx = X_all.prd<(min_prd+windows_width-1)
    val_indx = X_all.prd==(min_prd+windows_width-1)
    val_indx_extra = X_all.prd==(min_prd+windows_width+1)
    test_indx = X_all.prd==(min_prd+windows_width)

    X_train, y_train = X_all[train_indx], y_all[train_indx]
    X_val, y_val = X_all[val_indx], y_all[val_indx]
    X_val_extra, y_val_extra = X_all[val_indx_extra], y_all[val_indx_extra]
    X_test, y_test = X_all[test_indx], y_all[test_indx]

    #display(X_train.head(3), X_train.tail(3), y_train.head(3), y_train.tail(3))
    display(X_train.shape, X_val.shape, X_test.shape, X_train.prd.describe(), X_val.prd.describe(), X_test.prd.describe())

    # `drop` returns fresh frames, so the column assignments below are not made on slices.
    X_train = X_train.drop(columns=temp_cols)
    X_val = X_val.drop(columns=temp_cols)
    X_val_extra = X_val_extra.drop(columns=temp_cols)
    X_test = X_test.drop(columns=temp_cols)

    #display(X_train.tail())
    col_cat = ['ind']
    col_num = [x for x in X_train.columns if x not in col_cat]
    feature_parts = (X_train, X_val, X_val_extra, X_test)
    # Impute numeric gaps with the *training* median (computed once per column);
    # missing categories get the sentinel -1000.
    for col in col_num:
        train_median = X_train[col].median()
        for part in feature_parts:
            part[col] = part[col].fillna(train_median)
    for col in col_cat:
        for part in feature_parts:
            part[col] = part[col].fillna(value=-1000)

    #display(X_train.tail())
    feature_transformer = ColumnTransformer([('num', StandardScaler(), col_num),
                                            ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore", drop='if_binary'), col_cat)], 
                                            remainder="passthrough")

    print('Number of features before transformation: ', X_train.shape)
    # Fit the scaler / encoder on the training window only; keep the original row index
    # so features stay aligned with the y_* series.
    train_array = feature_transformer.fit_transform(X_train)
    feature_names = feature_transformer.get_feature_names_out()
    X_train = pd.DataFrame(train_array, columns=feature_names, index=X_train.index)
    X_val = pd.DataFrame(feature_transformer.transform(X_val), columns=feature_names, index=X_val.index)
    X_val_extra = pd.DataFrame(feature_transformer.transform(X_val_extra), columns=feature_names, index=X_val_extra.index)
    X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_names, index=X_test.index)
    print('time to do feature proprocessing: ', time.time()-time_prep)
    print('Number of features after transformation: ', X_train.shape, X_val.shape, X_val_extra.shape, X_test.shape)
    #display(X_train.tail())

    splits = {'train': (X_train, y_train), 'val': (X_val, y_val),
              'val_extra': (X_val_extra, y_val_extra), 'test': (X_test, y_test)}

    # train + validation
    X = pd.concat([X_train, X_val])
    y = pd.concat([y_train, y_val])
    #display(X,y)

    # train + validation + extra validation
    X_ = pd.concat([X_train, X_val, X_val_extra])
    y_ = pd.concat([y_train, y_val, y_val_extra])
    #display(X,y, X_,y_)

    print('mae of a constant model', mean_absolute_error(df.RET, np.ones(df.shape[0])*(df.RET.mean())))
    print('R2 of a constant model', r2_score(df.RET, np.ones(df.shape[0])*(df.RET.mean())))

    # --- 1. XGBoost with fixed hyperparameters ---
    xgb1 = XGBRegressor(tree_method = 'gpu_hist', n_estimators=400, max_depth=4, eta=0.02, colsample_bytree=0.4, subsample=0.6)
    xgb1.fit(X_train, y_train)
    xgb1_scores = {name: mae_and_r2(xgb1, *part) for name, part in splits.items()}
    print('XGB train:', *xgb1_scores['train'])
    print('XGB val:', *xgb1_scores['val'])
    print('XGB val extra:', *xgb1_scores['val_extra'])
    print('XGB test:', *xgb1_scores['test'])

    results.loc[results.min_prd==min_prd,'xgbf_train':'xgbf_test'] = \
    [xgb1_scores['train'][1], xgb1_scores['val'][1], xgb1_scores['test'][1]]

    # --- 2. Grid search with a single predefined train/validation split ---
    time1 = time.time()

    # -1 marks rows that are always used for training, 0 marks the validation fold.
    split_index = np.where(X.index.isin(X_train.index), -1, 0)
    pds = PredefinedSplit(test_fold = split_index)

    xgb = XGBRegressor(tree_method = 'gpu_hist')
    param_grid = {'n_estimators':[400, 600, 800], 'max_depth':[2,3,4], 'eta':[0.006, 0.012, 0.02], 
                  'subsample':[0.6], 'colsample_bytree':[0.6]}
    xgbgs = GridSearchCV(estimator = xgb, cv=pds, param_grid=param_grid)

    # Fit with all data
    # NOTE(review): `split_index` only covers the rows of X (train + val), while the fit
    # receives X_ (train + val + val_extra). The extra-validation month comes *after* the
    # test month, so if the search refits on everything it is given, the model scored on
    # the test month below has seen later data -- confirm that this is intended.
    xgbgs.fit(X_, y_)

    print('XGB', xgbgs.best_params_, xgbgs.best_score_, time.time()-time1)
    xgbgs_scores = {name: mae_and_r2(xgbgs, *part) for name, part in splits.items()}
    print('XGB train:', *xgbgs_scores['train'])
    print('XGB validation:', *xgbgs_scores['val'])
    print('XGB validation extra:', *xgbgs_scores['val_extra'])
    print('XGB test:', *xgbgs_scores['test'])

    results.loc[results.min_prd==min_prd,'xgbgs_train':'xgbgs_test'] = \
    [xgbgs_scores['train'][1], xgbgs_scores['val'][1], xgbgs_scores['test'][1]]

    # --- 3. Optuna with the overfit penalty ---
    study, optuna_xgb, optuna_hyperpars, r2_triplet = tune_and_refit_xgb(
        splits, X, y, cv_regularizer, optuna_trials, 'Optuna XGB')
    results.loc[results.min_prd==min_prd,'xgbo_train':'xgbo_test'] = r2_triplet

    # --- 4. Optuna without the overfit penalty ---
    study, optuna_xgb, optuna_hyperpars, r2_triplet = tune_and_refit_xgb(
        splits, X, y, 0, optuna_trials, 'Optuna no regularization XGB')
    results.loc[results.min_prd==min_prd,'xgbonr_train':'xgbonr_test'] = r2_triplet

    display(results)

(99700, 47)

(2833, 47)

(2787, 47)

count    99700.000000
mean       466.677944
std         10.329035
min        449.000000
25%        458.000000
50%        467.000000
75%        476.000000
max        484.000000
Name: prd, dtype: float64

count    2833.0
mean      485.0
std         0.0
min       485.0
25%       485.0
50%       485.0
75%       485.0
max       485.0
Name: prd, dtype: float64

count    2787.0
mean      486.0
std         0.0
min       486.0
25%       486.0
50%       486.0
75%       486.0
max       486.0
Name: prd, dtype: float64

Number of features before transformation:  (99700, 44)
time to do feature proprocessing: 
Number of features after transformation:  (99700, 92) (2833, 92) (2727, 92) (2787, 92)
mae of a constant model 10.210542205300575
R2 of a constant model 0.0
XGB train: 9.670539161825753 0.04047777643589612
XGB val: 14.119854935793931 -0.011400105894938894
XGB val extra: 12.667085128683043 -0.009290873117867138
XGB test: 13.45023251492294 0.0008179448039563608
XGB {'colsample_bytree': 0.6, 'eta': 0.006, 'max_depth': 2, 'n_estimators': 400, 'subsample': 0.6} -0.0014937493919477962 41.834593296051025
XGB train: 9.804290496236602 0.00898440965303382
XGB validation: 13.960111789536413 0.010042227685765681
XGB validation extra: 12.641099390315963 0.003887766344075394
XGB test: 13.452277073068617 0.008149983693027663


[32m[I 2022-08-27 00:47:50,656][0m A new study created in memory with name: no-name-e875f872-d9b3-4f92-881f-fc9d841cab49[0m
[32m[I 2022-08-27 00:47:51,224][0m Trial 0 finished with value: -0.0007265291061437695 and parameters: {'n_estimators': 866, 'max_depth': 4, 'learning_rate': 0.02946866574654414, 'colsample_bytree': 0.7004908071798813, 'subsample': 0.8650019044752588, 'alpha': 19.797983010810842, 'lambda': 338.3189353997928, 'gamma': 33.5380466463627, 'min_child_weight': 0.20592496401715538}. Best is trial 0 with value: -0.0007265291061437695.[0m
[32m[I 2022-08-27 00:47:51,739][0m Trial 1 finished with value: -0.0009094442743605935 and parameters: {'n_estimators': 650, 'max_depth': 3, 'learning_rate': 0.01325299515644477, 'colsample_bytree': 0.2990319817594579, 'subsample': 0.8661658646515018, 'alpha': 1.4988019691227545, 'lambda': 54.3118413213327, 'gamma': 1.2996531668257383, 'min_child_weight': 7.649554023983412}. Best is trial 0 with value: -0.0007265291061437695.[0m


Total time for hypermarameter optimization  30.53866720199585
        n_estimators : 971
           max_depth : 2
       learning_rate : 0.024334782050585386
    colsample_bytree : 0.9057994192782877
           subsample : 0.2976425176778151
               alpha : 0.47827668577248345
              lambda : 16.35981377840171
               gamma : 2.0810286286633868e-07
    min_child_weight : 0.4810122977731772
best objective value : 0.00031110849886244467
Optuna XGB train: 
 9.72859471789888 0.02423348931867686 
validation 
 13.912509022342867 0.023819598776590678 12.647600183627393 -0.005055102106645837 
test 
 13.389030515477902 0.009544358916269147


[32m[I 2022-08-27 00:48:24,790][0m A new study created in memory with name: no-name-01a6c6df-f5ea-4f52-9dd3-fa5d65c1d4ac[0m
[32m[I 2022-08-27 00:48:25,263][0m Trial 0 finished with value: -0.0008953176017372222 and parameters: {'n_estimators': 648, 'max_depth': 4, 'learning_rate': 0.02601633610308092, 'colsample_bytree': 0.08388983973247395, 'subsample': 0.5269402505764262, 'alpha': 0.11925493669957936, 'lambda': 7.620301986449849, 'gamma': 0.0003789255528772363, 'min_child_weight': 0.6960884748459083}. Best is trial 0 with value: -0.0008953176017372222.[0m
[32m[I 2022-08-27 00:48:25,806][0m Trial 1 finished with value: -0.000634411152681924 and parameters: {'n_estimators': 906, 'max_depth': 5, 'learning_rate': 0.026205397896207943, 'colsample_bytree': 0.28721328431055615, 'subsample': 0.12827408588624015, 'alpha': 2.8772763197456417, 'lambda': 101.6080385167994, 'gamma': 18.651202955977872, 'min_child_weight': 4.6699732152429965}. Best is trial 1 with value: -0.000634411152681

Total time for hypermarameter optimization  28.151900053024292
        n_estimators : 846
           max_depth : 2
       learning_rate : 0.008560138152457993
    colsample_bytree : 0.9245558855779281
           subsample : 0.4103808444598669
               alpha : 18.473488948031964
              lambda : 31.349779529849965
               gamma : 0.004300106539303193
    min_child_weight : 1.7917430812465125
best objective value : 0.00056961366691749
Optuna no regularization XGB train: 
 9.764602203250904 0.015343040291121435 
validation 
 13.956890496055795 0.013097737411393529 12.639451211300443 -0.004039511993314093 
test 
 13.419957277037822 0.007159205573067706


Unnamed: 0,min_prd,xgbf_train,xgbf_val,xgbf_test,xgbgs_train,xgbgs_val,xgbgs_test,xgbo_train,xgbo_val,xgbo_test,xgbonr_train,xgbonr_val,xgbonr_test
0,450,0.040478,-0.0114,0.000818,0.008984,0.010042,0.00815,0.024233,0.02382,0.009544,0.015343,0.013098,0.007159
1,550,,,,,,,,,,,,


(78603, 47)

(2065, 47)

(2059, 47)

count    78603.000000
mean       566.290854
std         10.352088
min        549.000000
25%        557.000000
50%        566.000000
75%        575.000000
max        584.000000
Name: prd, dtype: float64

count    2065.0
mean      585.0
std         0.0
min       585.0
25%       585.0
50%       585.0
75%       585.0
max       585.0
Name: prd, dtype: float64

count    2059.0
mean      586.0
std         0.0
min       586.0
25%       586.0
50%       586.0
75%       586.0
max       586.0
Name: prd, dtype: float64

Number of features before transformation:  (78603, 44)
time to do feature proprocessing: 
Number of features after transformation:  (78603, 92) (2065, 92) (2042, 92) (2059, 92)
mae of a constant model 8.175653033280861
R2 of a constant model 0.0
XGB train: 8.126013299046427 0.0500953377217731
XGB val: 7.527006619549129 -0.010190099713712542
XGB val extra: 6.837919375551958 0.018002988465907643
XGB test: 6.7424119650891425 -0.0009784770822560684
XGB {'colsample_bytree': 0.6, 'eta': 0.006, 'max_depth': 4, 'n_estimators': 400, 'subsample': 0.6} -0.004847000815498381 37.86766219139099
XGB train: 8.215210209331026 0.0231606141626165
XGB validation: 7.490845302602235 0.005404060755053175
XGB validation extra: 6.823230532344146 0.022118759568284685
XGB test: 6.743501158234606 0.0001022404600998783


[32m[I 2022-08-27 00:49:39,104][0m A new study created in memory with name: no-name-184fb5fb-27df-4b40-8b9b-e926f16eba89[0m
[32m[I 2022-08-27 00:49:39,551][0m Trial 0 finished with value: -0.0021910083604527062 and parameters: {'n_estimators': 929, 'max_depth': 3, 'learning_rate': 0.019990577754940182, 'colsample_bytree': 0.6243257757614618, 'subsample': 0.8605684474369651, 'alpha': 37.325672517453505, 'lambda': 0.26077295619444163, 'gamma': 0.0002506564663551603, 'min_child_weight': 0.6092207239830253}. Best is trial 0 with value: -0.0021910083604527062.[0m
[32m[I 2022-08-27 00:49:40,105][0m Trial 1 finished with value: -0.0016698733449104775 and parameters: {'n_estimators': 916, 'max_depth': 5, 'learning_rate': 0.028017696143870757, 'colsample_bytree': 0.6048860437229358, 'subsample': 0.9054073379960189, 'alpha': 9.105333672798453, 'lambda': 0.7439982012888658, 'gamma': 1.098242065015018e-06, 'min_child_weight': 4.69941116989102}. Best is trial 1 with value: -0.00166987334491

Total time for hypermarameter optimization  30.797585010528564
        n_estimators : 719
           max_depth : 4
       learning_rate : 0.029850007974330347
    colsample_bytree : 0.2792337582549027
           subsample : 0.3071600200670509
               alpha : 6.700556893259565
              lambda : 2.363612527944004
               gamma : 1.1872054570162817e-09
    min_child_weight : 8.58598326833255
best objective value : 0.0011345088839243567
Optuna XGB train: 
 8.024718679818443 0.08139392224433217 
validation 
 7.374187578746526 0.0543716430651342 6.862126920589324 0.01050752229402696 
test 
 6.750379642554292 -0.005417369507786596


[32m[I 2022-08-27 00:50:13,818][0m A new study created in memory with name: no-name-b0056a1d-8ee9-44d7-8aaf-72a3d649ca09[0m
[32m[I 2022-08-27 00:50:14,232][0m Trial 0 finished with value: -0.0007940918494059868 and parameters: {'n_estimators': 695, 'max_depth': 3, 'learning_rate': 0.012477869160831292, 'colsample_bytree': 0.43201366679568437, 'subsample': 0.9058406166499534, 'alpha': 1.5349582175111465, 'lambda': 4.220512964926569, 'gamma': 16.4312196292642, 'min_child_weight': 62.26001527308806}. Best is trial 0 with value: -0.0007940918494059868.[0m
[32m[I 2022-08-27 00:50:14,620][0m Trial 1 finished with value: -0.0022271878550667035 and parameters: {'n_estimators': 624, 'max_depth': 3, 'learning_rate': 0.014650874718615309, 'colsample_bytree': 0.8133125193500063, 'subsample': 0.19308862002659244, 'alpha': 1.1919762992828908, 'lambda': 27.2573208629768, 'gamma': 1.2381487788311002e-09, 'min_child_weight': 14.01508151733115}. Best is trial 0 with value: -0.0007940918494059868

Total time for hypermarameter optimization  27.238728046417236
        n_estimators : 601
           max_depth : 5
       learning_rate : 0.014114177668153254
    colsample_bytree : 0.9355411327925054
           subsample : 0.7585749517569261
               alpha : 3.132363982312066
              lambda : 0.15604121631006554
               gamma : 0.0002555067323765672
    min_child_weight : 0.10247191965990914
best objective value : 0.0025222084481339446
Optuna no regularization XGB train: 
 7.985496920436526 0.09800171906091115 
validation 
 7.3373597276472085 0.06864964987222566 6.867395673761703 0.008540262154366984 
test 
 6.7451673466885955 -0.0062645317540666046


Unnamed: 0,min_prd,xgbf_train,xgbf_val,xgbf_test,xgbgs_train,xgbgs_val,xgbgs_test,xgbo_train,xgbo_val,xgbo_test,xgbonr_train,xgbonr_val,xgbonr_test
0,450,0.040478,-0.0114,0.000818,0.008984,0.010042,0.00815,0.024233,0.02382,0.009544,0.015343,0.013098,0.007159
1,550,0.050095,-0.01019,-0.000978,0.023161,0.005404,0.000102,0.081394,0.054372,-0.005417,0.098002,0.06865,-0.006265


In [5]:
# Average every score column over the evaluated windows (column 0 is `min_prd`, so it is skipped).
# The mean is displayed explicitly: as a bare expression that is not the last line of the
# cell, it was computed and then silently discarded.
display(results.iloc[:,1:].mean())
# cv_regularizer = 0.5
# optuna_trials = 80
# Seconds elapsed since `time0` was set at the top of the experiment cell.
print(time.time()-time0)

230.40159392356873


In [6]:
# Write the per-window score table to a CSV file in the working directory.
results.to_csv('kp103_temp_xgp.csv')

# In the first run, grid search (gs) performed best with an R^2 of 1.1%,
# vs 0.7% for Optuna with cv_reg=0.2 and 0.5% for the simple XGB.

In [7]:
# General point:
# Compared to a NN, XGB is harder to regularize.
# In a NN, you can simply shrink the coefficients towards a constant prediction.
# In XGB, you cannot do that; the only way to regularize is via hyperparameters.
# In other words, by tweaking hyperparameters in a NN you can get arbitrarily close to the R^2=0.0 prediction of a constant model.
# In XGB, you cannot do that.
# By setting eta as low as 0.1% you can bring R^2 down to 0.1%, but lowering eta further actually increases abs(R^2).