### This is a new version of MLEAP scripts, started in late Aug 2022.
It will combine IProject_MLEAP_ANN and IP_MLEAP script, while improving them.

#### Outline

1. Load libraries and data.
2. pEDA. Look at feature distribution, fix them if they do not look right.
3. Train-test split. Most likely couple years into test set. 2015-2018?. Impute missing values.
4. Transform numerical features, add ohe for inds.
5. Fit classic models: ols as a baseline, then xgb.
6. Fir DL.


Notes:
ideally, I want to use time-based cross-validation.
since I have panel data, it is not a trivial task.
need to find some solution online.
e.g., https://towardsdatascience.com/time-based-cross-validation-d259b13d42b8.

for now, will try to do siple for loop.


In [27]:
# 0. Import libraries #

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, time, math, re, warnings, random, gc, dill, optuna, pickle
import statsmodels.api as sm
from random import sample

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNetCV
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.inspection import permutation_importance
from category_encoders import MEstimateEncoder
from xgboost import XGBRegressor

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

plt.style.use('seaborn-white')
warnings.simplefilter(action='ignore')
pd.set_option('display.max_columns', 110)
gc.enable()

In [28]:
### target encoding ###
# source: https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices/notebook

class CrossFoldEncoder:
    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=4)

    # Fit an encoder on one split and transform the feature on the
    # other. Iterating over the splits in all folds gives a complete
    # transformation. We also now have one trained encoder on each
    # fold.
    def fit_transform(self, X, y, cols):
        self.fitted_encoders_ = []
        self.cols_ = cols
        X_encoded = []
        for idx_encode, idx_train in self.cv_.split(X):
            fitted_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            X_encoded.append(fitted_encoder.transform(X.iloc[idx_train, :])[cols])
            self.fitted_encoders_.append(fitted_encoder)
        X_encoded = pd.concat(X_encoded)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    # To transform the test data, average the encodings learned from
    # each fold.
    def transform(self, X):
        from functools import reduce

        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols_])
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

In [29]:
# Detect TPU, return appropriate distribution strategy
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver() 
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy() 

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [30]:
# for loop to see appx performance over the whole sample with some rolling window

time0 = time.time()

min_prd_list = range(100, 676, 25)
windows_width = 3*12
cv_regularizer=0.04
optuna_trials = 20

results = pd.DataFrame(columns = ['min_prd', 'xgbf', 'xgbgs', 'xgbo'])
results.min_prd = min_prd_list

for min_prd in min_prd_list:
    
    
    with open('../input/kaggle-46pkl/IMLEAP_v4.pkl', 'rb') as pickled_one:
        df = pickle.load(pickled_one)
    df = df[df.prd.isin(range(min_prd-1, min_prd+windows_width+2))]
    df_cnt = df.count()
    empty_cols = list(df_cnt[df_cnt<int(df.shape[0]/2)].index)
    df.drop(columns=empty_cols, inplace=True)
    display(df.shape, df.head(), df.year.describe(), df.count())
    
    df = df[(df.RET>-50)&(df.RET<75)]
    meanret = df.groupby('prd').RET.mean().to_frame().reset_index().rename(columns={'RET':'mRET'})
    df = pd.merge(df, meanret, on='prd', how='left')
    df.RET = df.RET-df.mRET
    df.drop(columns='mRET', inplace=True)

    features_miss_dummies = ['amhd', 'BAspr']
    for col in features_miss_dummies:
        if col in df.columns:
            df[col+'_miss'] = df[col].isnull().astype(int)

    temp_cols = ['PERMNO', 'prd', 'year']
    train = df[df.prd<(min_prd+windows_width)]
    test = df[df.prd==(min_prd+windows_width)]
    train.drop(columns=temp_cols, inplace=True)
    test.drop(columns=temp_cols, inplace=True)

    col_ignore = ['RET']
    col_cat = ['ind']
    col_num = [x for x in train.columns if x not in col_ignore+col_cat]
    for col in col_num:
        train[col] = train[col].fillna(train[col].median())
        test[col] = test[col].fillna(train[col].median())
    for col in col_cat:
        train[col] = train[col].fillna(value=-1000)
        test[col] = test[col].fillna(value=-1000)

    X_train = train.copy()
    y_train = X_train.pop('RET')
    X_test = test.copy()
    y_test = X_test.pop('RET')

    feature_transformer = ColumnTransformer([('num', StandardScaler(), col_num),
                                            ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore", drop='if_binary'), col_cat)], 
                                            remainder="passthrough")

    print('Number of features before transformation: ', X_train.shape)
    X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
    X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())
    print('time to do feature proprocessing: ')
    print('Number of features after transformation: ', X_train.shape)

    print('mae of a constant model', mean_absolute_error(df.RET, np.ones(df.shape[0])*(df.RET.mean())))
    print('R2 of a constant model', r2_score(df.RET, np.ones(df.shape[0])*(df.RET.mean())))

    xgb1 = XGBRegressor(tree_method = 'gpu_hist', n_estimators=300, max_depth=5, eta=0.03, colsample_bytree=0.6)
    xgb1.fit(X_train, y_train)
    print('XGB train:', mean_absolute_error(y_train, xgb1.predict(X_train)), r2_score(y_train, xgb1.predict(X_train)))

    time1 = time.time()
    xgb = XGBRegressor(tree_method = 'gpu_hist')
    param_grid = {'n_estimators':[400, 700], 'max_depth':[2,3,4], 'eta':[0.006, 0.012, 0.02], 'subsample':[0.6], 'colsample_bytree':[0.6]}
    xgbm = GridSearchCV(xgb, param_grid, cv=2, verbose=2, scoring='r2')
    xgbm.fit(X_train, y_train)
    print('XGB', xgbm.best_params_, xgbm.best_score_, time.time()-time1)
    print('XGB train:', mean_absolute_error(y_train, xgbm.predict(X_train)), r2_score(y_train, xgbm.predict(X_train)), time.time()-time1)

    time1 = time.time()
    def objective(trial, cv_runs=1, n_splits=2, n_jobs=-1):

        params = {
        "tree_method": 'gpu_hist',
        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "n_estimators": trial.suggest_int("n_estimators", 500, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 5),
        "learning_rate": trial.suggest_uniform("learning_rate", 0.001, 0.05),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.1, 0.95),
        "subsample": trial.suggest_uniform("subsample", 0.3, 0.95),
        "alpha": trial.suggest_loguniform("alpha", 0.1, 30.0),
        "lambda": trial.suggest_loguniform("lambda", 0.1, 200.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-10, 10.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 0.1, 50)    }

        temp_out = []

        for i in range(cv_runs):

            X = X_train
            y = y_train
            model = XGBRegressor(**params, njobs=-1)
            rkf = KFold(n_splits=n_splits, shuffle=True)
            X_values = X.values
            y_values = y.values
            y_pred = np.zeros_like(y_values)
            y_pred_train = np.zeros_like(y_values)
            for train_index, test_index in rkf.split(X_values):
                X_A, X_B = X_values[train_index, :], X_values[test_index, :]
                y_A, y_B = y_values[train_index], y_values[test_index]
                model.fit(X_A, y_A, eval_set=[(X_B, y_B)], verbose = False)
                y_pred[test_index] = model.predict(X_B)
                y_pred_train[train_index] = model.predict(X_A)

            score_train = r2_score(y_train, y_pred_train)
            score_test = r2_score(y_train, y_pred) 
            overfit = (score_train-score_test)
            temp_out.append(score_test-cv_regularizer*overfit)

        return (np.mean(temp_out))

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=optuna_trials)
    print('Total time for hypermarameter optimization ', time.time()-time1)
    hp = study.best_params
    for key, value in hp.items():
        print(f"{key:>20s} : {value}")
    print(f"{'best objective value':>20s} : {study.best_value}")
    optuna_hyperpars = study.best_params
    optuna_hyperpars['tree_method']='gpu_hist'
    optuna_xgb = XGBRegressor(**optuna_hyperpars)
    optuna_xgb.fit(X_train, y_train)
    print('Optuna XGB train:', 
          mean_absolute_error(y_train, optuna_xgb.predict(X_train)), r2_score(y_train, optuna_xgb.predict(X_train)), time.time()-time1)

    # Evaluate performance of XGB models:
    r2_xgb1 = r2_score(y_test, xgb1.predict(X_test))
    r2_xgbgs = r2_score(y_test, xgbm.predict(X_test))
    r2_xgbo = r2_score(y_test, optuna_xgb.predict(X_test))

    print('Min_prd: ', min_prd)
    print('Constant guess: ', mean_absolute_error(y_test, np.ones(len(y_test))*y_test.mean()), 
          r2_score(y_test, np.ones(len(y_test))*y_test.mean()))
    print('XGB test:', mean_absolute_error(y_test, xgb1.predict(X_test)), r2_xgb1)
    print('XGB GS test:', mean_absolute_error(y_test, xgbm.predict(X_test)), r2_xgbgs)
    print('Optuna XGB test:', mean_absolute_error(y_test, optuna_xgb.predict(X_test)), r2_xgbo)

    results.loc[results.min_prd==min_prd,'xgbf':'xgbo'] = r2_xgb1, r2_xgbgs, r2_xgbo
    
print(time.time()-time0, results)

(31363, 40)

Unnamed: 0,PERMNO,prd,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l3amhd,l3MAX,l6amhd,l6MAX,l12amhd,l12MAX,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
24,10006,99,30.00761,1966,-1.8425,25.0,-0.483038,0.179707,0.260061,0.077403,5.4095,23.511075,1.475168,1.402148,1.282759,1.336867,3.3976,1.506242,1.833065,1.648786,5.695347,-0.417907,0.135547,0.222194,0.067345,5.434935,1.516443,2.6538,1.479387,4.2841,1.436185,4.0548,1.669988,2.6538,18.881331,0.765013,0.702024,1.195248,1.19392,1.231535
25,10006,100,22.973353,1966,-13.9454,25.0,-0.483038,0.179707,0.260061,0.077403,-1.8425,29.337614,1.47898,0.790153,0.754158,1.239788,1.4671,0.877792,1.758302,1.629121,5.680309,-0.417907,0.135547,0.222194,0.067345,5.444597,1.475168,3.3976,1.498098,3.1745,1.411486,4.5856,1.606326,3.3976,-0.948286,0.890444,0.81015,1.250279,1.236353,1.235768
26,10006,101,0.562306,1966,-4.8379,25.0,-0.483038,0.179707,0.260061,0.077403,-13.9454,20.186279,1.582242,1.380279,1.368466,1.238541,2.1292,2.117274,1.705033,1.723923,5.521941,-0.417907,0.135547,0.222194,0.067345,5.491554,1.47898,1.4671,1.516443,2.6538,1.439482,3.1527,1.516363,1.4671,-1.27889,0.985169,0.90836,1.250547,1.195605,1.222002
27,10006,102,-6.780997,1966,-2.9268,25.0,-0.483038,0.179707,0.260061,0.077403,-4.8379,5.036237,1.641163,1.941973,1.031749,1.220998,5.3222,2.173122,1.674638,1.793334,5.476547,-0.417907,0.135547,0.222194,0.067345,5.479246,1.582242,2.1292,1.475168,3.3976,1.479387,4.2841,1.522883,2.1292,-2.716252,1.199252,1.070752,1.265243,1.20419,1.233468
28,10006,103,-8.513334,1966,0.9968,25.0,-0.483038,0.179707,0.260061,0.077403,-2.9268,-9.564465,1.713909,2.434331,1.877713,1.166406,7.8011,2.697039,1.892122,1.886296,5.451468,-0.417907,0.135547,0.222194,0.067345,5.582127,1.641163,5.3222,1.47898,1.4671,1.498098,3.1745,1.485769,5.3222,-6.697673,1.477127,1.395748,1.287226,1.312899,1.275484


count    31363.000000
mean      1967.710965
std          1.003547
min       1966.000000
25%       1967.000000
50%       1968.000000
75%       1969.000000
max       1969.000000
Name: year, dtype: float64

PERMNO          31363
prd             31363
mom242          31136
year            31363
RET             31363
ind             31363
bm              31363
op              31363
gp              31363
inv             31363
mom11           31363
mom122          31363
amhd            29013
ivol_capm       31361
ivol_ff5        31361
beta_bw         31363
MAX             31363
vol1m           31360
vol6m           31357
vol12m          31350
size            31363
lbm             31363
lop             31363
lgp             31363
linv            31363
llme            31363
l1amhd          28926
l1MAX           31362
l3amhd          28737
l3MAX           31360
l6amhd          28420
l6MAX           31357
l12amhd         27713
l12MAX          31362
l12mom122       31191
l12ivol_capm    31349
l12ivol_ff5     31349
l12beta_bw      31352
l12vol6m        31339
l12vol12m       31166
dtype: int64

Number of features before transformation:  (29248, 37)
time to do feature proprocessing: 
Number of features after transformation:  (29248, 81)
mae of a constant model 6.553607277016165
R2 of a constant model 0.0
XGB train: 6.110803010839697 0.1613195732684598
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.5s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.5s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.6, eta=0

[32m[I 2022-08-25 20:26:59,584][0m A new study created in memory with name: no-name-42a6b2fb-da50-4d0d-b517-c775148162db[0m


XGB {'colsample_bytree': 0.6, 'eta': 0.006, 'max_depth': 2, 'n_estimators': 400, 'subsample': 0.6} 0.007141845676770453 33.45027804374695
XGB train: 6.501211644578929 0.017764871358617373 33.639312744140625


[32m[I 2022-08-25 20:27:03,002][0m Trial 0 finished with value: 0.00837475305945023 and parameters: {'n_estimators': 811, 'max_depth': 3, 'learning_rate': 0.015809168749134343, 'colsample_bytree': 0.7303260396555257, 'subsample': 0.821827140244139, 'alpha': 4.585072642654388, 'lambda': 172.61709004372338, 'gamma': 1.246469979908026e-07, 'min_child_weight': 2.789055118345178}. Best is trial 0 with value: 0.00837475305945023.[0m
[32m[I 2022-08-25 20:27:06,074][0m Trial 1 finished with value: -0.0035525262176526323 and parameters: {'n_estimators': 785, 'max_depth': 3, 'learning_rate': 0.042476489217899926, 'colsample_bytree': 0.25503878339370606, 'subsample': 0.5739258233263153, 'alpha': 4.034067844032434, 'lambda': 0.36699803832709577, 'gamma': 7.90757179229881e-10, 'min_child_weight': 49.49373547616465}. Best is trial 0 with value: 0.00837475305945023.[0m
[32m[I 2022-08-25 20:27:09,789][0m Trial 2 finished with value: 0.008508418281056118 and parameters: {'n_estimators': 923, 'm

Total time for hypermarameter optimization  69.65527367591858
        n_estimators : 515
           max_depth : 3
       learning_rate : 0.010410483464710794
    colsample_bytree : 0.3695446944536873
           subsample : 0.5267656343893188
               alpha : 0.12323610234667441
              lambda : 55.78352402158807
               gamma : 7.219311092685869
    min_child_weight : 1.2647527759912247
best objective value : 0.011514032845086612
Optuna XGB train: 6.450757514878683 0.03505892810226641 70.69066262245178
Min_prd:  100
Constant guess:  6.674820474635347 0.0
XGB test: 6.797339346041856 -0.03392572645519665
XGB GS test: 6.768171325492354 -0.01937416663345548
Optuna XGB test: 6.77696546348316 -0.02464833390884169


(40682, 41)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l3amhd,l3MAX,l6amhd,l6MAX,l12amhd,l12MAX,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
49,10006,124,57.022356,27.230111,1968,-10.5198,25.0,-0.22327,0.183384,0.269118,0.100395,3.4619,12.086336,1.536049,1.37765,1.078594,0.854703,4.982,1.945352,2.653852,2.174071,5.86758,-0.149515,0.173745,0.242714,0.119169,5.740148,1.573985,21.135115,1.636271,4.2086,1.653423,2.5316,1.833293,21.135115,6.066114,1.906794,1.54551,1.069805,1.668624,1.814489
50,10006,125,45.576895,32.149862,1968,-6.2596,25.0,-0.22327,0.183384,0.269118,0.100395,-10.5198,24.581595,1.493226,2.217285,1.948001,0.794842,2.7293,2.301335,2.7978,2.238881,5.751293,-0.149515,0.173745,0.242714,0.119169,5.660875,1.536049,4.982,1.641999,2.2022,1.585251,1.7344,1.792446,4.982,27.909142,1.503199,1.346875,1.007441,1.599164,1.760787
51,10006,126,16.380849,21.566933,1968,4.3219,25.0,-0.22327,0.183384,0.269118,0.100395,-6.2596,13.253947,1.469247,4.800074,4.663338,0.777908,7.2477,4.297812,3.251701,2.512787,5.691229,-0.149515,0.173745,0.242714,0.119169,5.648296,1.493226,2.7293,1.573985,21.135115,1.624355,2.1833,1.744677,2.7293,25.117139,1.051638,0.849621,0.989661,1.421761,1.677152
52,10006,127,32.954723,53.105785,1968,9.7618,25.0,-0.22327,0.183384,0.269118,0.100395,4.3219,11.098127,1.375395,1.477982,1.203894,0.787439,4.4095,1.764988,3.249921,2.531631,5.737749,-0.149515,0.173745,0.242714,0.119169,5.606947,1.469247,7.2477,1.536049,4.982,1.636271,4.2086,1.727351,7.2477,26.865911,1.174851,1.018713,1.014681,1.442506,1.555119
53,10006,128,34.099293,39.4152,1968,3.945,25.0,-0.22327,0.183384,0.269118,0.100395,9.7618,21.666549,1.246353,1.734429,1.621548,0.750551,5.4695,1.618184,3.253052,2.540064,5.82476,-0.149515,0.173745,0.242714,0.119169,5.549943,1.375395,4.4095,1.493226,2.7293,1.641999,2.2022,1.738743,4.4095,20.035787,1.903329,1.637691,0.996995,1.592287,1.605068


count    40682.000000
mean      1969.798437
std          0.978257
min       1968.000000
25%       1969.000000
50%       1970.000000
75%       1971.000000
max       1971.000000
Name: year, dtype: float64

PERMNO          40682
prd             40682
mom482          35826
mom242          40286
year            40682
RET             40682
ind             40682
bm              40682
op              40682
gp              40682
inv             40678
mom11           40682
mom122          40682
amhd            37268
ivol_capm       40681
ivol_ff5        40681
beta_bw         40682
MAX             40682
vol1m           40681
vol6m           40682
vol12m          40678
size            40682
lbm             40682
lop             40682
lgp             40682
linv            40682
llme            40682
l1amhd          37313
l1MAX           40682
l3amhd          37398
l3MAX           40682
l6amhd          37547
l6MAX           40682
l12amhd         37988
l12MAX          40682
l12mom122       40497
l12ivol_capm    40673
l12ivol_ff5     40673
l12beta_bw      40680
l12vol6m        40637
l12vol12m       40296
dtype: int64

Number of features before transformation:  (38133, 38)
time to do feature proprocessing: 
Number of features after transformation:  (38133, 82)
mae of a constant model 7.1541100803200415
R2 of a constant model 0.0
XGB train: 6.700516343240393 0.15021333381905422
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.5s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.5s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.6, eta

[32m[I 2022-08-25 20:28:47,329][0m A new study created in memory with name: no-name-a7ff125b-761e-458c-837c-69df14e61abb[0m


XGB train: 7.036265023358253 0.043365857443366185 34.68202042579651


[32m[I 2022-08-25 20:28:51,373][0m Trial 0 finished with value: 0.015288317643964829 and parameters: {'n_estimators': 983, 'max_depth': 3, 'learning_rate': 0.01154814845863925, 'colsample_bytree': 0.2293283693559149, 'subsample': 0.3650444826578865, 'alpha': 0.15329303614416784, 'lambda': 0.49187784197675005, 'gamma': 1.2155606555846599e-05, 'min_child_weight': 0.1531219653388143}. Best is trial 0 with value: 0.015288317643964829.[0m
[32m[I 2022-08-25 20:28:55,283][0m Trial 1 finished with value: 0.01121760968834697 and parameters: {'n_estimators': 509, 'max_depth': 5, 'learning_rate': 0.0281792222801985, 'colsample_bytree': 0.42170961849016797, 'subsample': 0.8712022017176322, 'alpha': 0.12345806987126445, 'lambda': 29.718661974350074, 'gamma': 4.0955384605193656e-10, 'min_child_weight': 25.38752740649166}. Best is trial 0 with value: 0.015288317643964829.[0m
[32m[I 2022-08-25 20:28:57,773][0m Trial 2 finished with value: 0.012197031031307788 and parameters: {'n_estimators': 5

Total time for hypermarameter optimization  81.06092238426208
        n_estimators : 709
           max_depth : 4
       learning_rate : 0.008387138866353446
    colsample_bytree : 0.8445083533050689
           subsample : 0.3022790767756601
               alpha : 1.294635770631884
              lambda : 57.654590387654764
               gamma : 1.032000595807169e-06
    min_child_weight : 10.280741968150178
best objective value : 0.01978513447381966
Optuna XGB train: 6.971079713781372 0.058627818958291056 83.22256517410278
Min_prd:  125
Constant guess:  6.801798451895731 0.0
XGB test: 6.690736270893462 0.001886781638528956
XGB GS test: 6.7089180299557585 0.006943571377436508
Optuna XGB test: 6.683467341627174 0.004293040007036231


(50984, 41)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l3amhd,l3MAX,l6amhd,l6MAX,l12amhd,l12MAX,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
74,10006,149,-11.907145,-28.068949,1970,8.9537,25.0,-0.192505,0.140962,0.224605,0.154915,4.9963,-17.560372,2.5138,1.195243,1.150948,0.868877,2.5408,1.271715,1.649324,1.41005,5.406362,-0.516957,0.139113,0.231174,0.041194,5.536724,2.464311,2.0168,2.158325,2.7241,1.844002,1.8199,1.418393,2.0168,-25.445548,1.197438,0.833119,0.936794,1.8533,2.111649
75,10006,150,-0.991806,-21.116609,1970,-4.5062,25.0,-0.192505,0.140962,0.224605,0.154915,8.9537,-12.90143,2.586287,1.067049,1.027625,0.862274,2.9512,1.165072,1.689843,1.429232,5.497059,-0.516957,0.139113,0.231174,0.041194,5.536724,2.5138,2.5408,2.316701,5.599,1.876815,2.5908,1.450799,2.5408,-15.063563,0.765013,0.647098,1.029569,1.737736,2.018194
76,10006,151,11.783248,-29.778642,1970,8.8171,25.0,-0.192505,0.140962,0.224605,0.154915,-4.5062,-14.344204,2.608496,1.592679,1.337876,0.828411,2.3886,1.61032,1.667813,1.457507,5.455755,-0.516957,0.139113,0.231174,0.041194,5.644581,2.586287,2.9512,2.464311,2.0168,1.998218,1.967,1.574844,2.9512,-19.087141,1.129631,1.097229,1.015623,1.335658,1.622773
77,10006,152,4.517407,-29.833415,1970,1.5353,25.0,-0.192505,0.140962,0.224605,0.154915,8.8171,-17.941026,2.622414,1.200474,1.015513,0.831283,2.5782,1.351295,1.411577,1.459923,5.531153,-0.516957,0.139113,0.231174,0.041194,5.634556,2.608496,2.3886,2.5138,2.5408,2.158325,2.7241,1.667734,2.3886,-18.329996,1.325809,1.023188,1.006169,1.294217,1.63513
78,10006,153,18.690525,-30.734165,1971,4.0036,25.0,-0.192505,0.140962,0.224605,0.154915,1.5353,-8.976912,2.604032,0.765013,0.696519,0.82754,1.4066,0.86687,1.234667,1.431408,5.545719,-0.516957,0.139113,0.231174,0.041194,5.621882,2.622414,2.5782,2.586287,2.9512,2.316701,5.599,1.76202,2.5782,-21.681417,1.111892,0.742139,0.998186,1.266684,1.593312


count    50984.000000
mean      1971.849776
std          0.964612
min       1970.000000
25%       1971.000000
50%       1972.000000
75%       1973.000000
max       1973.000000
Name: year, dtype: float64

PERMNO          50984
prd             50984
mom482          43631
mom242          50521
year            50984
RET             50984
ind             50984
bm              50984
op              50984
gp              50984
inv             50962
mom11           50984
mom122          50984
amhd            44589
ivol_capm       50981
ivol_ff5        50981
beta_bw         50984
MAX             50984
vol1m           50979
vol6m           50984
vol12m          50984
size            50984
lbm             50984
lop             50984
lgp             50984
linv            50984
llme            50984
l1amhd          44685
l1MAX           50984
l3amhd          44913
l3MAX           50984
l6amhd          45179
l6MAX           50984
l12amhd         45585
l12MAX          50984
l12mom122       50788
l12ivol_capm    50980
l12ivol_ff5     50980
l12beta_bw      50984
l12vol6m        50942
l12vol12m       50509
dtype: int64

Number of features before transformation:  (47985, 38)
time to do feature proprocessing: 
Number of features after transformation:  (47985, 84)
mae of a constant model 7.537774691011074
R2 of a constant model 0.0
XGB train: 7.029949276574985 0.12444542849884155
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.5s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.5s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=

[32m[I 2022-08-25 20:30:50,065][0m A new study created in memory with name: no-name-988d4d85-bbbd-414a-b82c-8423eb0176dc[0m


XGB train: 7.264593573026108 0.053180788794843004 36.91641306877136


[32m[I 2022-08-25 20:30:58,377][0m Trial 0 finished with value: -0.014986951626088657 and parameters: {'n_estimators': 977, 'max_depth': 5, 'learning_rate': 0.039496235929273746, 'colsample_bytree': 0.47755882240892644, 'subsample': 0.4937362023406756, 'alpha': 7.205838412740077, 'lambda': 71.99196404996096, 'gamma': 8.921361117124002, 'min_child_weight': 2.582826224626684}. Best is trial 0 with value: -0.014986951626088657.[0m
[32m[I 2022-08-25 20:31:02,047][0m Trial 1 finished with value: 0.006455059442313697 and parameters: {'n_estimators': 676, 'max_depth': 3, 'learning_rate': 0.03871467579249942, 'colsample_bytree': 0.1423404311938064, 'subsample': 0.5440010105600128, 'alpha': 1.5160563582558173, 'lambda': 5.831062709432314, 'gamma': 6.652695464129989e-07, 'min_child_weight': 7.790574045374248}. Best is trial 1 with value: 0.006455059442313697.[0m
[32m[I 2022-08-25 20:31:04,585][0m Trial 2 finished with value: 0.01641066848331539 and parameters: {'n_estimators': 535, 'max_

Total time for hypermarameter optimization  79.1415696144104
        n_estimators : 535
           max_depth : 3
       learning_rate : 0.013858241605672308
    colsample_bytree : 0.8103376027975585
           subsample : 0.9436046217238852
               alpha : 0.3786176803261718
              lambda : 80.96062129548245
               gamma : 1.6575510918590155e-10
    min_child_weight : 0.2916098749750353
best objective value : 0.01641066848331539
Optuna XGB train: 7.305544621701751 0.03659721634402702 80.53257203102112
Min_prd:  150
Constant guess:  8.924405010653333 0.0
XGB test: 8.760029922668958 0.009361205960809249
XGB GS test: 8.769167863016227 0.018258046195946398
Optuna XGB test: 8.762136421188014 0.019385496099062816


(59945, 41)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l3amhd,l3MAX,l6amhd,l6MAX,l12amhd,l12MAX,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
99,10006,174,-8.863675,24.298177,1972,1.7951,25.0,-0.12852,0.102706,0.189707,0.057118,3.9857,-13.084494,1.727817,0.765013,0.647098,0.838275,1.7642,0.86687,1.356555,1.390944,5.658946,-0.062534,0.116521,0.201542,0.081797,5.774872,1.737888,2.8289,1.627297,2.3539,1.606479,2.5787,1.817857,2.8289,34.408776,0.765013,0.647098,0.879409,1.51586,1.391454
100,10006,175,-8.89543,19.337724,1972,-6.623,25.0,-0.12852,0.102706,0.189707,0.057118,1.7951,2.102147,1.706153,0.916285,0.827985,0.797384,1.977,0.938854,1.161795,1.337902,5.68066,-0.062534,0.116521,0.201542,0.081797,5.657089,1.727817,1.7642,1.662882,1.7081,1.633409,3.6509,1.772195,1.7642,39.316493,1.461319,1.333326,0.980061,1.402991,1.390889
101,10006,176,-23.761723,5.490262,1972,-2.1741,25.0,-0.12852,0.102706,0.189707,0.057118,-6.623,7.548838,1.804927,1.910382,1.747425,0.685677,1.5767,1.941405,1.246814,1.360487,5.603795,-0.062534,0.116521,0.201542,0.081797,5.614529,1.706153,1.977,1.737888,2.8289,1.621238,5.1584,1.690663,1.977,13.329088,1.33676,1.041994,1.01034,1.508092,1.425334
102,10006,177,-27.752649,2.121427,1973,-6.4767,25.0,-0.12852,0.102706,0.189707,0.057118,-2.1741,-4.357035,1.89013,1.299676,1.228433,0.781756,2.8151,1.426847,1.273572,1.401402,5.587195,-0.062534,0.116521,0.201542,0.081797,5.663813,1.804927,1.5767,1.727817,1.7642,1.627297,2.3539,1.63698,1.5767,7.864953,0.840915,0.653553,0.982159,1.507691,1.433275
103,10006,178,-20.022258,-1.415186,1973,-8.0078,25.0,-0.12852,0.102706,0.189707,0.057118,-6.4767,-6.618875,1.939386,1.020299,0.940395,0.771634,1.5254,1.104205,1.29738,1.415379,5.524929,-0.062534,0.116521,0.201542,0.081797,5.668655,1.89013,2.8151,1.706153,1.977,1.662882,1.7081,1.606,2.8151,8.899177,0.765013,0.647098,1.008138,1.454644,1.423127


count    59945.000000
mean      1973.906748
std          0.951204
min       1972.000000
25%       1973.000000
50%       1974.000000
75%       1975.000000
max       1975.000000
Name: year, dtype: float64

PERMNO          59945
prd             59945
mom482          50928
mom242          58102
year            59945
RET             59945
ind             59945
bm              59945
op              59945
gp              59945
inv             59921
mom11           59945
mom122          59945
amhd            45301
ivol_capm       59933
ivol_ff5        59933
beta_bw         59945
MAX             59945
vol1m           59925
vol6m           59929
vol12m          59919
size            59945
lbm             59945
lop             59945
lgp             59945
linv            59945
llme            59945
l1amhd          45543
l1MAX           59944
l3amhd          46032
l3MAX           59943
l6amhd          46751
l6MAX           59940
l12amhd         48575
l12MAX          59944
l12mom122       59661
l12ivol_capm    59930
l12ivol_ff5     59930
l12beta_bw      59941
l12vol6m        59872
l12vol12m       57924
dtype: int64

Number of features before transformation:  (56061, 38)
time to do feature proprocessing: 
Number of features after transformation:  (56061, 85)
mae of a constant model 8.558942696185106
R2 of a constant model 0.0
XGB train: 8.164309167669105 0.1153934212396065
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0

[32m[I 2022-08-25 20:32:52,737][0m A new study created in memory with name: no-name-ac791e38-bfe0-4620-84d4-0df22ed7b334[0m


XGB train: 8.31880391898526 0.07409681937414536 39.4064404964447


[32m[I 2022-08-25 20:32:58,043][0m Trial 0 finished with value: 0.018514943850713315 and parameters: {'n_estimators': 831, 'max_depth': 4, 'learning_rate': 0.0030593424229678166, 'colsample_bytree': 0.5415326874933835, 'subsample': 0.5984404864569632, 'alpha': 8.350014148514163, 'lambda': 3.534050812904276, 'gamma': 3.630810963124753e-05, 'min_child_weight': 7.631115727348554}. Best is trial 0 with value: 0.018514943850713315.[0m
[32m[I 2022-08-25 20:33:01,076][0m Trial 1 finished with value: 0.020374639823574302 and parameters: {'n_estimators': 591, 'max_depth': 3, 'learning_rate': 0.0212008815190838, 'colsample_bytree': 0.5020753284865785, 'subsample': 0.5212851063881505, 'alpha': 7.661214511865505, 'lambda': 10.21063366054143, 'gamma': 4.194026018821317e-05, 'min_child_weight': 0.523520157461911}. Best is trial 1 with value: 0.020374639823574302.[0m
[32m[I 2022-08-25 20:33:08,456][0m Trial 2 finished with value: 0.001950797565977872 and parameters: {'n_estimators': 843, 'max

Total time for hypermarameter optimization  79.07651734352112
        n_estimators : 681
           max_depth : 3
       learning_rate : 0.012352255179372554
    colsample_bytree : 0.630017570188665
           subsample : 0.8263993213290479
               alpha : 0.39241857146648035
              lambda : 188.57096900189796
               gamma : 1.2189481265207593e-07
    min_child_weight : 11.760249354667314
best objective value : 0.02210224742800277
Optuna XGB train: 8.438046684016827 0.03911375520040972 80.86829543113708
Min_prd:  175
Constant guess:  8.25713871823768 0.0
XGB test: 8.288244243217152 -0.008671408117530532
XGB GS test: 8.282135384707418 -0.002783352890953994
Optuna XGB test: 8.279505662507296 -0.0008202101224408231


(74160, 41)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l3amhd,l3MAX,l6amhd,l6MAX,l12amhd,l12MAX,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
124,10006,199,-21.326783,-27.592396,1974,7.46,25.0,-0.095507,0.111088,0.19813,0.145981,14.5694,-33.89205,2.612473,2.720668,2.269109,0.912102,5.8604,2.919359,2.337286,2.451826,5.314277,0.012504,0.109494,0.199984,0.018882,5.573985,2.463967,3.2387,2.37247,6.6347,2.400974,3.3731,2.461242,3.2387,-10.931905,1.70399,1.341432,0.816881,1.535675,1.535648
125,10006,200,-15.979064,-20.880103,1974,-14.9857,25.0,-0.095507,0.111088,0.19813,0.145981,7.46,-10.641475,2.712756,2.445177,2.150569,0.895589,5.0044,2.517541,2.414667,2.482452,5.378412,0.012504,0.109494,0.199984,0.018882,5.400138,2.612473,5.8604,2.41141,5.1454,2.349388,2.6834,2.454348,5.8604,-4.724409,1.712226,1.61932,0.855923,1.606818,1.565273
126,10006,201,-25.154673,-19.973751,1975,22.5261,25.0,-0.095507,0.111088,0.19813,0.145981,-14.9857,-34.175124,2.796417,1.958997,1.64116,0.853395,3.7549,2.233901,2.438581,2.319402,5.224262,0.012504,0.109494,0.199984,0.018882,5.782123,2.712756,5.0044,2.463967,3.2387,2.386218,3.9473,2.46481,5.0044,-17.450559,2.707695,2.40491,0.931452,2.173476,1.863954
127,10006,202,-15.116342,-6.138191,1975,5.7854,25.0,-0.095507,0.111088,0.19813,0.145981,22.5261,-43.065927,2.84953,1.996934,1.390339,0.798751,6.7922,1.687576,2.442646,2.256743,5.432138,0.012504,0.109494,0.199984,0.018882,5.771265,2.796417,3.7549,2.612473,5.8604,2.37247,6.6347,2.433094,3.7549,28.761815,1.482107,1.414542,1.011982,2.404947,2.013676
128,10006,203,-18.160941,-9.913526,1975,-3.6453,25.0,-0.095507,0.111088,0.19813,0.145981,5.7854,-26.745104,2.86267,2.014875,1.868052,0.806562,4.9189,2.116001,2.397099,2.261266,5.477258,0.012504,0.109494,0.199984,0.018882,5.71744,2.84953,6.7922,2.712756,5.0044,2.41141,5.1454,2.395316,6.7922,37.577008,2.066732,1.866476,0.992914,2.510439,2.032948


count    74160.000000
mean      1976.049757
std          0.951021
min       1974.000000
25%       1975.000000
50%       1976.000000
75%       1977.000000
max       1978.000000
Name: year, dtype: float64

PERMNO          74160
prd             74160
mom482          54046
mom242          71549
year            74160
RET             74160
ind             74160
bm              74160
op              74160
gp              74160
inv             74145
mom11           74160
mom122          74160
amhd            45795
ivol_capm       74145
ivol_ff5        74145
beta_bw         74160
MAX             74160
vol1m           74124
vol6m           74103
vol12m          74048
size            74160
lbm             74160
lop             74160
lgp             74160
linv            74160
llme            74160
l1amhd          45652
l1MAX           74158
l3amhd          45391
l3MAX           74149
l6amhd          45090
l6MAX           74127
l12amhd         45135
l12MAX          74158
l12mom122       72354
l12ivol_capm    74063
l12ivol_ff5     74063
l12beta_bw      74101
l12vol6m        73795
l12vol12m       71835
dtype: int64

Number of features before transformation:  (69310, 38)
time to do feature proprocessing: 
Number of features after transformation:  (69310, 85)
mae of a constant model 7.861186206831879
R2 of a constant model 0.0
XGB train: 7.600086536833699 0.10428903353658525
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=

[32m[I 2022-08-25 20:34:58,537][0m A new study created in memory with name: no-name-7b087586-0e0e-42f6-aaf6-aa722ea1e219[0m


XGB train: 7.731967824273231 0.06495266870263083 41.918771743774414


[32m[I 2022-08-25 20:35:06,759][0m Trial 0 finished with value: -0.017480878919772437 and parameters: {'n_estimators': 835, 'max_depth': 5, 'learning_rate': 0.048413636947056665, 'colsample_bytree': 0.34244229462823017, 'subsample': 0.7648419304842575, 'alpha': 17.07778781810277, 'lambda': 6.243702986672548, 'gamma': 0.3363649898490194, 'min_child_weight': 5.030066166702621}. Best is trial 0 with value: -0.017480878919772437.[0m
[32m[I 2022-08-25 20:35:14,505][0m Trial 1 finished with value: 0.013755001573807563 and parameters: {'n_estimators': 795, 'max_depth': 5, 'learning_rate': 0.00761188937345186, 'colsample_bytree': 0.17320538707515393, 'subsample': 0.4862969292505399, 'alpha': 9.640530254327272, 'lambda': 115.77951708351858, 'gamma': 0.00034832353843975046, 'min_child_weight': 0.24449324289712906}. Best is trial 1 with value: 0.013755001573807563.[0m
[32m[I 2022-08-25 20:35:20,157][0m Trial 2 finished with value: 0.0151619776721157 and parameters: {'n_estimators': 563, '

Total time for hypermarameter optimization  104.98953151702881
        n_estimators : 644
           max_depth : 4
       learning_rate : 0.003567597519292111
    colsample_bytree : 0.9391713475226822
           subsample : 0.47756823338305215
               alpha : 0.2318314220373237
              lambda : 3.2570048381870285
               gamma : 0.004740881684304755
    min_child_weight : 10.148655265325896
best objective value : 0.016289454064616363
Optuna XGB train: 7.836983763119917 0.030940659282065797 107.67837500572205
Min_prd:  200
Constant guess:  6.8284052824169645 0.0
XGB test: 6.723532156790887 0.012098797248819793
XGB GS test: 6.736076781695843 0.010889794009659393
Optuna XGB test: 6.758095387179109 0.007636890323768797


(86617, 41)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l3amhd,l3MAX,l6amhd,l6MAX,l12amhd,l12MAX,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
149,10006,224,-2.366213,38.90382,1976,9.7961,25.0,0.487787,0.124561,0.219545,0.023495,0.6156,24.003392,1.901191,0.984374,0.787596,0.802318,2.7468,1.211152,1.164005,1.364113,5.622267,0.562476,0.107357,0.197156,0.162551,5.391124,1.931497,1.4516,1.895809,2.1869,2.025173,1.4066,2.413926,1.4516,-1.586476,1.154582,1.120293,0.84282,1.482218,1.77916
150,10006,225,21.8183,31.99966,1977,1.0635,25.0,0.487787,0.124561,0.219545,0.023495,9.7961,25.368526,1.914337,1.215784,1.011524,0.82857,2.6743,1.243655,1.191593,1.385471,5.719358,0.562476,0.107357,0.197156,0.162551,5.391124,1.901191,2.7468,1.889733,2.562,1.985085,3.9655,2.252034,2.7468,18.010264,0.891562,0.775986,0.772055,1.400632,1.614133
151,10006,226,21.822937,25.721451,1977,-2.5956,25.0,0.487787,0.124561,0.219545,0.023495,1.0635,14.105366,1.897737,0.815442,0.700757,0.78162,2.1569,0.86687,1.158762,1.189728,5.733493,0.562476,0.107357,0.197156,0.162551,5.582603,1.914337,2.6743,1.931497,1.4516,1.970979,1.7263,2.171167,2.6743,-4.14792,2.437163,2.002417,0.814432,1.668506,1.703926
152,10006,227,15.859033,31.116312,1977,-0.0164,25.0,0.487787,0.124561,0.219545,0.023495,-2.5956,11.744288,1.907512,1.593928,1.484515,0.778512,4.8943,1.578993,1.259617,1.189544,5.697775,0.562476,0.107357,0.197156,0.162551,5.603824,1.897737,2.1569,1.901191,2.7468,1.895809,2.1869,2.125732,2.1569,9.306295,1.674217,1.578206,0.804047,1.694692,1.667251
153,10006,228,13.940975,34.038497,1977,4.6925,25.0,0.487787,0.124561,0.219545,0.023495,-0.0164,5.392767,1.922541,1.20252,1.060071,0.771213,1.8924,1.243518,1.228898,1.189544,5.701405,0.562476,0.107357,0.197156,0.162551,5.63991,1.907512,4.8943,1.914337,2.6743,1.889733,2.562,2.078258,4.8943,17.070464,1.304524,1.025788,0.812099,1.607106,1.619101


count    86617.000000
mean      1978.074396
std          0.947723
min       1976.000000
25%       1977.000000
50%       1978.000000
75%       1979.000000
max       1980.000000
Name: year, dtype: float64

PERMNO          86617
prd             86617
mom482          72687
mom242          84429
year            86617
RET             86617
ind             86617
bm              86617
op              86617
gp              86617
inv             86574
mom11           86617
mom122          86617
amhd            49136
ivol_capm       86601
ivol_ff5        86601
beta_bw         86617
MAX             86617
vol1m           86551
vol6m           86496
vol12m          86386
size            86617
lbm             86617
lop             86617
lgp             86617
linv            86617
llme            86617
l1amhd          48938
l1MAX           86613
l3amhd          48550
l3MAX           86592
l6amhd          47948
l6MAX           86568
l12amhd         46559
l12MAX          86613
l12mom122       82984
l12ivol_capm    86431
l12ivol_ff5     86431
l12beta_bw      86509
l12vol6m        86014
l12vol12m       85416
dtype: int64

Number of features before transformation:  (82013, 38)
time to do feature proprocessing: 
Number of features after transformation:  (82013, 85)
mae of a constant model 7.7454386005664935
R2 of a constant model 0.0
XGB train: 7.384893941000947 0.0828598058566341
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=

[32m[I 2022-08-25 20:37:34,612][0m A new study created in memory with name: no-name-9f7db972-f75c-4494-8bf3-dc44987488f8[0m


XGB train: 7.483541971299699 0.053299962005836754 45.05655264854431


[32m[I 2022-08-25 20:37:39,591][0m Trial 0 finished with value: 0.011794808934071426 and parameters: {'n_estimators': 622, 'max_depth': 4, 'learning_rate': 0.016958475971440366, 'colsample_bytree': 0.13371013751424343, 'subsample': 0.8384707775972222, 'alpha': 0.12819699545139285, 'lambda': 19.429037240979593, 'gamma': 0.49533111883986464, 'min_child_weight': 1.2598683978033811}. Best is trial 0 with value: 0.011794808934071426.[0m
[32m[I 2022-08-25 20:37:44,452][0m Trial 1 finished with value: 0.012672678305798636 and parameters: {'n_estimators': 859, 'max_depth': 3, 'learning_rate': 0.01014131486519938, 'colsample_bytree': 0.12255263234902204, 'subsample': 0.470355892022717, 'alpha': 0.3622937164385349, 'lambda': 0.6516320681240616, 'gamma': 8.027898109114712e-08, 'min_child_weight': 26.589155297801575}. Best is trial 1 with value: 0.012672678305798636.[0m
[32m[I 2022-08-25 20:37:54,811][0m Trial 2 finished with value: 0.013596185898776883 and parameters: {'n_estimators': 963

Total time for hypermarameter optimization  111.90343046188354
        n_estimators : 701
           max_depth : 3
       learning_rate : 0.016556589404129664
    colsample_bytree : 0.4065998411555825
           subsample : 0.5362055623259836
               alpha : 1.5234730901197817
              lambda : 4.027889687332672
               gamma : 9.48207934555906e-06
    min_child_weight : 44.67666021919712
best objective value : 0.01447795241982691
Optuna XGB train: 7.532710768229944 0.03515546864141861 114.39416766166687
Min_prd:  225
Constant guess:  10.789309041003193 0.0
XGB test: 10.584344777730841 0.0246590710211394
XGB GS test: 10.597557070825852 0.024404214808965596
Optuna XGB test: 10.602054445815849 0.024533244962536838


(85749, 41)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l3amhd,l3MAX,l6amhd,l6MAX,l12amhd,l12MAX,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
174,10006,249,15.44049,-13.483924,1979,-1.9895,25.0,0.344796,0.12391,0.224713,0.08608,-2.38,-9.398265,1.806977,1.121349,1.068897,0.843122,2.761,1.18351,1.465939,1.408005,5.595858,0.246732,0.128162,0.228031,0.051037,5.707839,1.735479,2.7014,1.882601,1.4066,1.911064,2.5307,1.878987,2.7014,-2.840033,1.10569,0.886346,0.936283,1.255599,1.22199
175,10006,250,7.656898,-12.128055,1979,-1.8823,25.0,0.344796,0.12391,0.224713,0.08608,-1.9895,1.438464,1.78546,1.139783,0.994278,0.807884,1.9893,1.186615,1.436818,1.338013,5.583587,0.246732,0.128162,0.228031,0.051037,5.576376,1.806977,2.761,1.824922,3.1948,1.979787,2.2025,1.919239,2.761,-2.197162,1.708117,1.620786,0.976068,1.289759,1.310662
176,10006,251,13.910247,-12.205583,1979,7.6646,25.0,0.344796,0.12391,0.224713,0.08608,-1.8823,1.566341,1.729562,0.792122,0.658194,0.804317,2.0897,0.962481,1.390096,1.338323,5.554358,0.246732,0.128162,0.228031,0.051037,5.542759,1.78546,1.9893,1.735479,2.7014,1.932696,3.3774,1.943009,1.9893,-12.452139,0.828827,0.717779,0.988329,1.308817,1.268762
177,10006,252,27.035379,-9.191941,1979,7.0125,25.0,0.344796,0.12391,0.224713,0.08608,7.6646,-3.544113,1.714285,1.41333,1.23472,0.836807,3.4514,1.590503,1.451393,1.387657,5.635703,0.246732,0.128162,0.228031,0.051037,5.5805,1.729562,2.0897,1.806977,2.761,1.882601,1.4066,1.951792,2.0897,-14.287881,0.990059,0.892042,0.992036,1.315554,1.254191
178,10006,253,12.516907,-8.290927,1979,-3.907,25.0,0.344796,0.12391,0.224713,0.08608,7.0125,-5.728738,1.721803,1.33536,1.185008,0.845385,3.3821,1.438224,1.410186,1.388858,5.711155,0.246732,0.128162,0.228031,0.051037,5.68215,1.714285,3.4514,1.78546,1.9893,1.824922,3.1948,1.942359,3.4514,-15.414663,1.309469,0.923172,0.948718,1.362666,1.272217


count    85749.000000
mean      1980.137028
std          0.947544
min       1979.000000
25%       1979.000000
50%       1980.000000
75%       1981.000000
max       1982.000000
Name: year, dtype: float64

PERMNO          85749
prd             85749
mom482          80250
mom242          84505
year            85749
RET             85749
ind             85749
bm              85749
op              85749
gp              85749
inv             85737
mom11           85749
mom122          85749
amhd            49668
ivol_capm       85738
ivol_ff5        85738
beta_bw         85749
MAX             85749
vol1m           85690
vol6m           85641
vol12m          85532
size            85749
lbm             85749
lop             85749
lgp             85749
linv            85749
llme            85749
l1amhd          49554
l1MAX           85746
l3amhd          49309
l3MAX           85726
l6amhd          48799
l6MAX           85697
l12amhd         47442
l12MAX          85746
l12mom122       84890
l12ivol_capm    85580
l12ivol_ff5     85580
l12beta_bw      85644
l12vol6m        85271
l12vol12m       84763
dtype: int64

Number of features before transformation:  (81307, 38)
time to do feature proprocessing: 
Number of features after transformation:  (81307, 85)
mae of a constant model 8.116751240947098
R2 of a constant model 0.0
XGB train: 7.8489579147754664 0.08026146156005265
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta

[32m[I 2022-08-25 20:40:17,820][0m A new study created in memory with name: no-name-3efb788b-cd07-4337-99bc-36b04dc24e77[0m


XGB train: 8.016250291531522 0.031354032007939625 45.50160336494446


[32m[I 2022-08-25 20:40:25,493][0m Trial 0 finished with value: 0.0006103057716062722 and parameters: {'n_estimators': 991, 'max_depth': 4, 'learning_rate': 0.039715185693922794, 'colsample_bytree': 0.3044855299741879, 'subsample': 0.8642642025859553, 'alpha': 0.2034515796835656, 'lambda': 112.28746913704283, 'gamma': 0.054001199826481334, 'min_child_weight': 5.082308123428978}. Best is trial 0 with value: 0.0006103057716062722.[0m
[32m[I 2022-08-25 20:40:29,395][0m Trial 1 finished with value: 0.009345943286151147 and parameters: {'n_estimators': 802, 'max_depth': 2, 'learning_rate': 0.00997950899495097, 'colsample_bytree': 0.7157617217345404, 'subsample': 0.8402113131280957, 'alpha': 1.4014308381872345, 'lambda': 44.466147812169005, 'gamma': 1.1039066333222566e-06, 'min_child_weight': 0.4203740734080024}. Best is trial 1 with value: 0.009345943286151147.[0m
[32m[I 2022-08-25 20:40:33,706][0m Trial 2 finished with value: 0.010221703186951224 and parameters: {'n_estimators': 72

Total time for hypermarameter optimization  94.83282732963562
        n_estimators : 620
           max_depth : 3
       learning_rate : 0.015418111313419566
    colsample_bytree : 0.6014203404940136
           subsample : 0.9447919074508159
               alpha : 3.877424066038285
              lambda : 0.15962393232819785
               gamma : 4.474944977432207e-08
    min_child_weight : 25.721454743805236
best objective value : 0.010697830193806964
Optuna XGB train: 8.015395809073299 0.029664219580690987 96.95533061027527
Min_prd:  250
Constant guess:  7.309043435698192 0.0
XGB test: 7.3501955324514405 -0.004758927497545029
XGB GS test: 7.307363885485978 0.003917956329142047
Optuna XGB test: 7.317106562321939 0.00593865869626331


(85011, 41)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l3amhd,l3MAX,l6amhd,l6MAX,l12amhd,l12MAX,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
199,10006,274,37.459012,59.70085,1981,-3.2759,25.0,0.51329,0.111463,0.226944,0.098149,9.8296,6.843997,1.283075,1.471052,1.422033,0.972026,4.0494,1.629143,1.728732,1.872404,6.13005,0.545005,0.118539,0.224607,0.1738,5.907127,1.292463,3.293,1.272274,2.2447,1.206043,3.976,1.301524,3.293,9.938743,1.724933,1.648195,1.034048,1.662274,1.467041
200,10006,275,32.174173,43.778358,1981,-1.7113,25.0,0.51329,0.111463,0.226944,0.098149,-3.2759,25.866894,1.260908,1.104413,0.732167,0.959088,2.6871,1.273156,1.607273,1.853357,6.107744,0.545005,0.118539,0.224607,0.1738,5.832092,1.283075,4.0494,1.268666,6.0923,1.296753,4.1908,1.308714,4.0494,34.869606,1.18734,1.172984,1.045721,1.718915,1.508377
201,10006,276,21.81087,32.034169,1981,-7.7127,25.0,0.51329,0.111463,0.226944,0.098149,-1.7113,49.228166,1.200905,1.244822,0.936037,0.876275,3.1196,1.359986,1.517022,1.666039,6.090044,0.545005,0.118539,0.224607,0.1738,5.643256,1.260908,2.6871,1.292463,3.293,1.324853,4.091,1.282287,2.6871,16.788256,2.176823,1.783361,1.098593,2.078251,1.694509
202,10006,277,13.128183,28.372543,1981,5.5495,25.0,0.51329,0.111463,0.226944,0.098149,-7.7127,52.793976,1.102388,1.03419,0.905935,0.832826,2.3112,1.060035,1.523327,1.546585,6.021416,0.545005,0.118539,0.224607,0.1738,5.615421,1.200905,3.1196,1.283075,4.0494,1.272274,2.2447,1.266251,3.1196,-10.965141,1.950141,1.698754,1.111977,2.044336,1.789059
203,10006,278,22.130729,27.011807,1981,-8.8825,25.0,0.51329,0.111463,0.226944,0.098149,5.5495,30.979424,1.017502,0.944081,0.780361,0.80814,3.5774,1.012657,1.34096,1.497498,6.072026,0.545005,0.118539,0.224607,0.1738,5.677941,1.102388,2.3112,1.260908,2.6871,1.268666,6.0923,1.22824,2.3112,-11.056036,1.381642,1.0586,1.116465,2.087183,1.821832


count    85011.000000
mean      1982.240404
std          0.975249
min       1981.000000
25%       1981.000000
50%       1982.000000
75%       1983.000000
max       1984.000000
Name: year, dtype: float64

PERMNO          85011
prd             85011
mom482          78370
mom242          83792
year            85011
RET             85011
ind             85011
bm              85011
op              85011
gp              85011
inv             85001
mom11           85011
mom122          85011
amhd            51414
ivol_capm       84999
ivol_ff5        84999
beta_bw         85011
MAX             85011
vol1m           84968
vol6m           84955
vol12m          84886
size            85011
lbm             85011
lop             85011
lgp             85011
linv            85011
llme            85011
l1amhd          50650
l1MAX           85008
l3amhd          49057
l3MAX           84993
l6amhd          46663
l6MAX           84974
l12amhd         45569
l12MAX          85008
l12mom122       83983
l12ivol_capm    84873
l12ivol_ff5     84873
l12beta_bw      84933
l12vol6m        84542
l12vol12m       83998
dtype: int64

Number of features before transformation:  (80386, 38)
time to do feature proprocessing: 
Number of features after transformation:  (80386, 85)
mae of a constant model 8.383451711057207
R2 of a constant model 0.0
XGB train: 8.111317686498943 0.09241572742493986
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=

[32m[I 2022-08-25 20:42:43,550][0m A new study created in memory with name: no-name-16b7aa50-9f18-4d62-80f3-af81be75bb4d[0m


XGB train: 8.227672560980455 0.05910536002236311 45.516396284103394


[32m[I 2022-08-25 20:42:47,615][0m Trial 0 finished with value: 0.014061348592122625 and parameters: {'n_estimators': 663, 'max_depth': 3, 'learning_rate': 0.02251978511800619, 'colsample_bytree': 0.6595321042913489, 'subsample': 0.37913276511248595, 'alpha': 1.0068298446202433, 'lambda': 0.24943068905678328, 'gamma': 7.4181592327416975e-06, 'min_child_weight': 10.092520808053056}. Best is trial 0 with value: 0.014061348592122625.[0m
[32m[I 2022-08-25 20:42:52,267][0m Trial 1 finished with value: 0.016563321936989738 and parameters: {'n_estimators': 981, 'max_depth': 2, 'learning_rate': 0.02947310288107735, 'colsample_bytree': 0.867913262766826, 'subsample': 0.7087426325386783, 'alpha': 0.10866131290191476, 'lambda': 0.5119021459601684, 'gamma': 0.0009192854024913386, 'min_child_weight': 1.1015320312148122}. Best is trial 1 with value: 0.016563321936989738.[0m
[32m[I 2022-08-25 20:42:56,201][0m Trial 2 finished with value: 0.016878145798262675 and parameters: {'n_estimators': 7

Total time for hypermarameter optimization  114.61047339439392
        n_estimators : 693
           max_depth : 5
       learning_rate : 0.011868495043167594
    colsample_bytree : 0.43539904579824357
           subsample : 0.46481811563000125
               alpha : 29.008487087999395
              lambda : 191.6403821494657
               gamma : 3.0020182671759544e-09
    min_child_weight : 0.11334395933925108
best objective value : 0.018248456036676686
Optuna XGB train: 8.228583416495713 0.05303035505421494 118.83995366096497
Min_prd:  275
Constant guess:  7.313710789903448 0.0
XGB test: 7.315589976502754 0.013317218293824484
XGB GS test: 7.303835723170866 0.017266134436935476
Optuna XGB test: 7.280995652870029 0.022633949782203455


(89414, 41)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l3amhd,l3MAX,l6amhd,l6MAX,l12amhd,l12MAX,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
224,10006,299,-19.434584,-38.39036,1983,0.1605,25.0,0.467418,0.097309,0.220497,0.118136,-4.5159,-9.790204,1.628314,1.03303,0.727775,0.814121,1.4386,1.157497,1.825477,1.691859,5.560153,0.274945,0.118348,0.237469,0.091161,5.825327,1.626665,4.1337,1.715833,4.1367,1.781569,5.0932,1.422049,4.1337,-30.659974,0.986262,0.964204,0.843484,1.687154,1.437885
225,10006,300,-21.388149,-35.843377,1983,5.9567,25.0,0.467418,0.097309,0.220497,0.118136,0.1605,-7.157811,1.59834,1.063466,0.936522,0.799767,2.3259,1.152686,1.614224,1.667907,5.568027,0.274945,0.118348,0.237469,0.091161,5.74077,1.628314,1.4386,1.671621,4.0013,1.786906,4.5785,1.44086,1.4386,-30.101733,1.193522,1.015454,0.852702,1.554393,1.44087
226,10006,301,-13.964189,-34.528211,1983,-1.4988,25.0,0.467418,0.097309,0.220497,0.118136,5.9567,-12.456541,1.521537,1.313733,1.155573,0.808885,4.1307,1.43344,1.474423,1.647114,5.632566,0.274945,0.118348,0.237469,0.091161,5.811722,1.59834,2.3259,1.626665,4.1337,1.762098,3.9561,1.526839,2.3259,-29.730986,1.64575,1.498313,0.865615,1.602011,1.493237
227,10006,302,-17.68153,-29.776396,1983,2.7008,25.0,0.467418,0.097309,0.220497,0.118136,-1.4988,0.904921,1.48534,2.426025,2.027979,0.82545,4.3466,2.604619,1.663623,1.780087,5.621046,0.274945,0.118348,0.237469,0.091161,5.718464,1.521537,4.1307,1.628314,1.4386,1.715833,4.1367,1.559589,4.1307,-29.282674,0.983334,0.916348,0.864828,1.585183,1.48668
228,10006,303,-18.595559,-22.961625,1983,-2.1893,25.0,0.784207,0.088427,0.222761,-0.032575,2.7008,8.15636,1.298865,1.393089,1.343269,0.796333,3.7153,1.412173,1.616113,1.743533,5.654198,0.467418,0.097309,0.220497,0.118136,5.650019,1.48534,4.3466,1.59834,2.3259,1.671621,4.0013,1.691079,4.3466,-28.654955,1.557424,1.486731,0.901204,1.618876,1.559004


count    89414.000000
mean      1984.326616
std          0.986718
min       1983.000000
25%       1984.000000
50%       1984.000000
75%       1985.000000
max       1986.000000
Name: year, dtype: float64

PERMNO          89414
prd             89414
mom482          77366
mom242          87910
year            89414
RET             89414
ind             89414
bm              89414
op              89414
gp              89414
inv             89384
mom11           89414
mom122          89414
amhd            67483
ivol_capm       89406
ivol_ff5        89406
beta_bw         89414
MAX             89414
vol1m           89392
vol6m           89332
vol12m          89193
size            89414
lbm             89414
lop             89414
lgp             89414
linv            89414
llme            89414
l1amhd          66888
l1MAX           89409
l3amhd          65668
l3MAX           89393
l6amhd          63736
l6MAX           89361
l12amhd         59151
l12MAX          89409
l12mom122       88531
l12ivol_capm    89301
l12ivol_ff5     89301
l12beta_bw      89339
l12vol6m        89013
l12vol12m       88279
dtype: int64

Number of features before transformation:  (84583, 38)
time to do feature proprocessing: 
Number of features after transformation:  (84583, 85)
mae of a constant model 8.393769117304654
R2 of a constant model 0.0
XGB train: 8.000355608201236 0.09705730394764789
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=

[32m[I 2022-08-25 20:45:30,623][0m A new study created in memory with name: no-name-19b3fbb8-271b-402d-a253-b31d352c4992[0m


XGB train: 8.165630249798962 0.04491956015580256 44.55995202064514


[32m[I 2022-08-25 20:45:36,827][0m Trial 0 finished with value: 0.01133172457126716 and parameters: {'n_estimators': 563, 'max_depth': 5, 'learning_rate': 0.024621442014199153, 'colsample_bytree': 0.6692765609583952, 'subsample': 0.5708882627737302, 'alpha': 0.8095008713389503, 'lambda': 26.843108540541774, 'gamma': 2.5419559569322462e-08, 'min_child_weight': 0.3486141817549073}. Best is trial 0 with value: 0.01133172457126716.[0m
[32m[I 2022-08-25 20:45:46,700][0m Trial 1 finished with value: 0.013649453276395202 and parameters: {'n_estimators': 937, 'max_depth': 5, 'learning_rate': 0.008922820238053766, 'colsample_bytree': 0.5645332963763817, 'subsample': 0.5073385590714552, 'alpha': 10.532285470795207, 'lambda': 1.6395672693649879, 'gamma': 1.550942136082451e-07, 'min_child_weight': 5.159343485952076}. Best is trial 1 with value: 0.013649453276395202.[0m
[32m[I 2022-08-25 20:45:52,358][0m Trial 2 finished with value: 0.014390491494072023 and parameters: {'n_estimators': 907,

Total time for hypermarameter optimization  106.48562169075012
        n_estimators : 674
           max_depth : 3
       learning_rate : 0.015507139165931536
    colsample_bytree : 0.9265980975459144
           subsample : 0.6849594479584783
               alpha : 0.10086193844245119
              lambda : 87.68914911460818
               gamma : 1.1236155405156776e-10
    min_child_weight : 0.11509837084394178
best objective value : 0.018872476727951493
Optuna XGB train: 8.175172129404523 0.039448178976969284 109.0122184753418
Min_prd:  300
Constant guess:  9.021598745386575 0.0
XGB test: 9.081703222798415 -0.009404386780343499
XGB GS test: 9.050739697306778 -0.0005889493404265256
Optuna XGB test: 9.051690485775001 -0.0018547694606243503


(92653, 41)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l3amhd,l3MAX,l6amhd,l6MAX,l12amhd,l12MAX,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
1809,10031,339,,,1986,4.8854,42.0,-0.662806,0.088703,0.587186,-0.033767,-3.1516,18.939107,,0.765013,0.647098,0.407036,1.4066,0.86687,1.133002,1.21848,2.518412,-0.216519,0.088703,0.587186,-0.033767,2.308691,,1.4066,,2.9112,,2.7087,,1.4066,-18.859893,0.765013,0.647098,0.531826,1.376322,
1810,10031,351,,-18.540757,1987,-0.46,42.0,-0.594143,0.05461,0.738385,0.69761,-0.48,-2.232171,,0.765013,0.647098,0.454047,1.4066,0.86687,1.133002,1.189544,2.54508,-0.216519,0.088703,0.587186,-0.033767,2.633187,,1.4066,,2.8331,,1.4066,,1.4066,6.666485,1.923407,1.545611,0.473251,1.593294,
1811,10031,352,,-21.316912,1987,15.3195,42.0,-0.594143,0.05461,0.738385,0.69761,-0.46,-7.233473,,0.765013,0.647098,0.46571,1.4066,0.86687,1.133002,1.189544,2.54508,-0.216519,0.088703,0.587186,-0.033767,2.668694,,1.4066,,1.4066,,1.4066,,1.4066,37.460954,1.417848,1.194093,0.558271,1.620862,
1812,10031,353,,-6.534634,1987,-2.7227,42.0,-0.594143,0.05461,0.738385,0.69761,15.3195,-7.233473,,3.389156,2.914959,0.434743,15.7675,3.445551,1.532002,1.253984,2.691684,-0.216519,0.088703,0.587186,-0.033767,2.645164,,1.4066,,1.4066,,2.9112,,1.4066,42.902406,0.765013,0.647098,0.571238,1.632742,
1813,10031,354,,-8.523998,1987,-23.8558,42.0,-0.594143,0.05461,0.738385,0.69761,-2.7227,2.196919,,1.130335,1.101901,0.451557,2.36,1.142533,1.601022,1.276181,2.668694,-0.216519,0.088703,0.587186,-0.033767,2.645164,,15.7675,,1.4066,,2.8331,,15.7675,57.128111,0.765013,0.647098,0.582214,1.612387,


count    92653.000000
mean      1986.389032
std          1.001490
min       1985.000000
25%       1986.000000
50%       1986.000000
75%       1987.000000
max       1988.000000
Name: year, dtype: float64

PERMNO          92653
prd             92653
mom482          76796
mom242          91021
year            92653
RET             92653
ind             92653
bm              92653
op              92653
gp              92653
inv             92559
mom11           92653
mom122          92653
amhd            73401
ivol_capm       92648
ivol_ff5        92648
beta_bw         92653
MAX             92653
vol1m           92638
vol6m           92569
vol12m          92424
size            92653
lbm             92653
lop             92653
lgp             92653
linv            92653
llme            92653
l1amhd          73486
l1MAX           92650
l3amhd          73591
l3MAX           92636
l6amhd          73644
l6MAX           92620
l12amhd         74003
l12MAX          92650
l12mom122       92052
l12ivol_capm    92563
l12ivol_ff5     92563
l12beta_bw      92596
l12vol6m        92297
l12vol12m       91407
dtype: int64

Number of features before transformation:  (87485, 38)
time to do feature proprocessing: 
Number of features after transformation:  (87485, 85)
mae of a constant model 8.904606616730787
R2 of a constant model 0.0
XGB train: 8.592757719404272 0.08396147947934063
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=

[32m[I 2022-08-25 20:48:07,714][0m A new study created in memory with name: no-name-3619763e-f793-4526-b6b3-5282027900cd[0m


XGB train: 8.781819533294557 0.029098676615708197 44.513224363327026


[32m[I 2022-08-25 20:48:13,343][0m Trial 0 finished with value: 0.005809530005552209 and parameters: {'n_estimators': 711, 'max_depth': 4, 'learning_rate': 0.034012829502802704, 'colsample_bytree': 0.6377745880564336, 'subsample': 0.7915834430286488, 'alpha': 3.556950829683322, 'lambda': 179.36878343604005, 'gamma': 1.5524516954314835e-07, 'min_child_weight': 1.0613723371756534}. Best is trial 0 with value: 0.005809530005552209.[0m
[32m[I 2022-08-25 20:48:20,774][0m Trial 1 finished with value: 0.00332523141157047 and parameters: {'n_estimators': 687, 'max_depth': 5, 'learning_rate': 0.017649328941305203, 'colsample_bytree': 0.4010402318747599, 'subsample': 0.3845140031582333, 'alpha': 0.7389622850836521, 'lambda': 4.333247252188515, 'gamma': 1.7151950026360052, 'min_child_weight': 0.6207617073354732}. Best is trial 0 with value: 0.005809530005552209.[0m
[32m[I 2022-08-25 20:48:24,273][0m Trial 2 finished with value: 0.00814621122268955 and parameters: {'n_estimators': 559, 'ma

Total time for hypermarameter optimization  114.28232026100159
        n_estimators : 617
           max_depth : 2
       learning_rate : 0.02325594233014819
    colsample_bytree : 0.3201790452489346
           subsample : 0.8549237333671368
               alpha : 0.6654114246647382
              lambda : 1.6249527925645046
               gamma : 1.2623897196432703e-05
    min_child_weight : 0.40488639555076245
best objective value : 0.010046485631344401
Optuna XGB train: 8.80627536154854 0.021212191335875485 115.96125650405884
Min_prd:  325
Constant guess:  7.5647841018428155 0.0
XGB test: 7.577887957775383 -0.006069675653999962
XGB GS test: 7.52660420458592 0.004754988127891879
Optuna XGB test: 7.5167968941732335 0.006591995701068831


(94366, 41)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l3amhd,l3MAX,l6amhd,l6MAX,l12amhd,l12MAX,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
0,10005,375,,-72.706514,1989,-0.7,30.0,0.490174,-0.214332,0.0,-0.230583,-0.71,-33.516946,,0.765013,0.647098,0.84344,1.4066,0.86687,1.485001,3.167929,-0.424011,0.75014,-0.084639,0.015282,0.306039,-0.087557,,1.4066,,1.4066,,1.4066,,1.4066,-52.647854,0.765013,0.647098,0.380573,2.621943,6.82792
1,10005,376,,-68.53901,1989,-20.74,30.0,0.490174,-0.214332,0.0,-0.230583,-0.7,-48.453613,,0.765013,0.647098,0.569757,1.4066,0.86687,1.48487,2.598143,-0.424011,0.75014,-0.084639,0.015282,0.306039,0.163748,,1.4066,,1.4066,,1.4066,,1.4066,-61.937195,5.362759,5.279563,0.44174,3.313448,4.436441
2,10005,377,,-60.041983,1989,-25.65,30.0,0.490174,-0.214332,0.0,-0.230583,-20.74,-48.510651,,4.169273,3.399652,0.580613,1.4066,4.364358,2.310132,2.88207,-0.647218,0.75014,-0.084639,0.015282,0.306039,0.163748,,1.4066,,1.4066,,1.4066,,1.4066,-43.100755,0.765013,0.647098,0.456509,3.313442,4.197256
3,10005,378,,-67.083392,1989,-0.68,30.0,0.490174,-0.214332,0.0,-0.230583,-22.380465,-38.209694,,5.565847,5.250996,0.663164,1.4066,6.81372,3.190628,2.799597,-0.934794,0.75014,-0.084639,0.015282,0.306039,-0.241753,,1.4066,,1.4066,,1.4066,,1.4066,-22.391633,5.852977,5.112499,0.452695,4.107071,4.246072
4,10005,379,,-70.826677,1989,32.6433,30.0,0.490174,-0.214332,0.0,-0.230583,-0.68,-53.776947,,0.765013,0.647098,0.372045,1.4066,0.86687,2.840962,2.26516,-0.934794,0.75014,-0.084639,0.015282,0.306039,-0.241753,,1.4066,,1.4066,,1.4066,,1.4066,-36.888819,4.932686,3.77844,0.664521,4.378863,3.604576


count    94366.000000
mean      1988.458640
std          1.010222
min       1987.000000
25%       1988.000000
50%       1988.000000
75%       1989.000000
max       1990.000000
Name: year, dtype: float64

PERMNO          94366
prd             94366
mom482          78252
mom242          92830
year            94366
RET             94366
ind             94366
bm              94366
op              94366
gp              94366
inv             94274
mom11           94366
mom122          94366
amhd            70952
ivol_capm       94366
ivol_ff5        94366
beta_bw         94366
MAX             94366
vol1m           94355
vol6m           94256
vol12m          94078
size            94366
lbm             94366
lop             94366
lgp             94366
linv            94366
llme            94366
l1amhd          71191
l1MAX           94364
l3amhd          71672
l3MAX           94343
l6amhd          72310
l6MAX           94323
l12amhd         73411
l12MAX          94364
l12mom122       93899
l12ivol_capm    94261
l12ivol_ff5     94261
l12beta_bw      94292
l12vol6m        94044
l12vol12m       93168
dtype: int64

Number of features before transformation:  (88871, 38)
time to do feature proprocessing: 
Number of features after transformation:  (88871, 86)
mae of a constant model 8.817138442623339
R2 of a constant model 0.0
XGB train: 8.478471230016714 0.08294077560577218
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=

[32m[I 2022-08-25 20:50:55,152][0m A new study created in memory with name: no-name-25967b89-5e43-47ea-b848-74b71dce51d0[0m


XGB train: 8.656638085401468 0.03297978664514112 47.90201115608215


[32m[I 2022-08-25 20:50:59,904][0m Trial 0 finished with value: 0.008209587951112724 and parameters: {'n_estimators': 726, 'max_depth': 3, 'learning_rate': 0.040297692152042736, 'colsample_bytree': 0.9034893109445548, 'subsample': 0.8919079951062683, 'alpha': 3.2912638286910356, 'lambda': 29.261468598020347, 'gamma': 0.00012417339382724668, 'min_child_weight': 12.404995209513311}. Best is trial 0 with value: 0.008209587951112724.[0m
[32m[I 2022-08-25 20:51:03,265][0m Trial 1 finished with value: 0.008200462093778987 and parameters: {'n_estimators': 638, 'max_depth': 2, 'learning_rate': 0.049456083017704114, 'colsample_bytree': 0.2874180245243252, 'subsample': 0.7742478255237798, 'alpha': 0.9011644304040384, 'lambda': 0.41498038093097556, 'gamma': 1.5778684010949337e-08, 'min_child_weight': 0.9858773376377351}. Best is trial 0 with value: 0.008209587951112724.[0m
[32m[I 2022-08-25 20:51:10,091][0m Trial 2 finished with value: 0.0063056390616766204 and parameters: {'n_estimators'

Total time for hypermarameter optimization  115.52088236808777
        n_estimators : 845
           max_depth : 3
       learning_rate : 0.009068990449564034
    colsample_bytree : 0.5400978236732108
           subsample : 0.7096671572739489
               alpha : 22.975358291197413
              lambda : 117.76905647098188
               gamma : 1.3886427266615174e-10
    min_child_weight : 17.147587781497684
best objective value : 0.01192189822824498
Optuna XGB train: 8.678826805920373 0.02459732478735055 118.47987055778503
Min_prd:  350
Constant guess:  8.80632708605012 0.0
XGB test: 8.900224224272616 -0.014594664544531177
XGB GS test: 8.84693806544295 -0.005260274491055261
Optuna XGB test: 8.838932127253665 -0.003355999634563789


(98052, 41)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l3amhd,l3MAX,l6amhd,l6MAX,l12amhd,l12MAX,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
0,10005,375,,-72.706514,1989,-0.7,30.0,0.490174,-0.214332,0.0,-0.230583,-0.71,-33.516946,,0.765013,0.647098,0.84344,1.4066,0.86687,1.485001,3.167929,-0.424011,0.75014,-0.084639,0.015282,0.306039,-0.087557,,1.4066,,1.4066,,1.4066,,1.4066,-52.647854,0.765013,0.647098,0.380573,2.621943,6.82792
1,10005,376,,-68.53901,1989,-20.74,30.0,0.490174,-0.214332,0.0,-0.230583,-0.7,-48.453613,,0.765013,0.647098,0.569757,1.4066,0.86687,1.48487,2.598143,-0.424011,0.75014,-0.084639,0.015282,0.306039,0.163748,,1.4066,,1.4066,,1.4066,,1.4066,-61.937195,5.362759,5.279563,0.44174,3.313448,4.436441
2,10005,377,,-60.041983,1989,-25.65,30.0,0.490174,-0.214332,0.0,-0.230583,-20.74,-48.510651,,4.169273,3.399652,0.580613,1.4066,4.364358,2.310132,2.88207,-0.647218,0.75014,-0.084639,0.015282,0.306039,0.163748,,1.4066,,1.4066,,1.4066,,1.4066,-43.100755,0.765013,0.647098,0.456509,3.313442,4.197256
3,10005,378,,-67.083392,1989,-0.68,30.0,0.490174,-0.214332,0.0,-0.230583,-22.380465,-38.209694,,5.565847,5.250996,0.663164,1.4066,6.81372,3.190628,2.799597,-0.934794,0.75014,-0.084639,0.015282,0.306039,-0.241753,,1.4066,,1.4066,,1.4066,,1.4066,-22.391633,5.852977,5.112499,0.452695,4.107071,4.246072
4,10005,379,,-70.826677,1989,32.6433,30.0,0.490174,-0.214332,0.0,-0.230583,-0.68,-53.776947,,0.765013,0.647098,0.372045,1.4066,0.86687,2.840962,2.26516,-0.934794,0.75014,-0.084639,0.015282,0.306039,-0.241753,,1.4066,,1.4066,,1.4066,,1.4066,-36.888819,4.932686,3.77844,0.664521,4.378863,3.604576


count    98052.000000
mean      1990.551962
std          1.006492
min       1989.000000
25%       1990.000000
50%       1991.000000
75%       1991.000000
max       1992.000000
Name: year, dtype: float64

PERMNO          98052
prd             98052
mom482          81859
mom242          96658
year            98052
RET             98052
ind             98052
bm              98052
op              98052
gp              98052
inv             97951
mom11           98052
mom122          98052
amhd            70084
ivol_capm       98049
ivol_ff5        98049
beta_bw         98052
MAX             98052
vol1m           98041
vol6m           97967
vol12m          97805
size            98052
lbm             98052
lop             98052
lgp             98052
linv            98052
llme            98052
l1amhd          70160
l1MAX           98051
l3amhd          70307
l3MAX           98034
l6amhd          70538
l6MAX           98022
l12amhd         71705
l12MAX          98051
l12mom122       97737
l12ivol_capm    97949
l12ivol_ff5     97949
l12beta_bw      97986
l12vol6m        97780
l12vol12m       96892
dtype: int64

Number of features before transformation:  (92069, 38)
time to do feature proprocessing: 
Number of features after transformation:  (92069, 86)
mae of a constant model 9.959264525018126
R2 of a constant model 0.0
XGB train: 9.70897017052431 0.06965662748584933
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.3s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0

[32m[I 2022-08-25 20:53:44,430][0m A new study created in memory with name: no-name-f909fb93-ca5a-4bca-8e2c-40a486ba3434[0m


XGB train: 9.888658675456666 0.025484454735415718 47.231123208999634


[32m[I 2022-08-25 20:53:48,372][0m Trial 0 finished with value: 0.006097047337995761 and parameters: {'n_estimators': 638, 'max_depth': 2, 'learning_rate': 0.014091136261926582, 'colsample_bytree': 0.7657592741211049, 'subsample': 0.8317122201794536, 'alpha': 0.13012639407147297, 'lambda': 180.40218401672968, 'gamma': 2.2277040293118926, 'min_child_weight': 18.85841234906928}. Best is trial 0 with value: 0.006097047337995761.[0m
[32m[I 2022-08-25 20:53:53,376][0m Trial 1 finished with value: 0.005803787153266028 and parameters: {'n_estimators': 800, 'max_depth': 3, 'learning_rate': 0.01352322830983041, 'colsample_bytree': 0.6854230011835314, 'subsample': 0.8048467774376769, 'alpha': 2.7528075345233973, 'lambda': 0.2773401387911674, 'gamma': 0.14808118657536465, 'min_child_weight': 0.47963653885367113}. Best is trial 0 with value: 0.006097047337995761.[0m
[32m[I 2022-08-25 20:53:59,897][0m Trial 2 finished with value: 0.0016206577369834007 and parameters: {'n_estimators': 553, '

Total time for hypermarameter optimization  114.0503785610199
        n_estimators : 638
           max_depth : 2
       learning_rate : 0.014091136261926582
    colsample_bytree : 0.7657592741211049
           subsample : 0.8317122201794536
               alpha : 0.13012639407147297
              lambda : 180.40218401672968
               gamma : 2.2277040293118926
    min_child_weight : 18.85841234906928
best objective value : 0.006097047337995761
Optuna XGB train: 9.937875927825779 0.011762849940295084 115.91636371612549
Min_prd:  375
Constant guess:  9.550258459737007 0.0
XGB test: 9.528634419328005 0.001776083736201195
XGB GS test: 9.503797490986711 0.006979402041612848
Optuna XGB test: 9.498108312795948 0.008173245684614527


(99307, 46)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,BAspr,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l1BAspr,l3amhd,l3MAX,l3BAspr,l6amhd,l6MAX,l6BAspr,l12amhd,l12MAX,l12BAspr,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
263,10010,399,283.349209,170.270066,1991,14.7642,12.0,-1.763275,0.178266,0.82803,0.349928,-13.6553,59.863713,1.269015,3.168363,2.753972,1.340979,6.5211,3.155761,3.825545,3.964292,1.709402,4.658076,-0.665503,0.215755,0.874858,0.193029,4.197954,1.303864,7.2371,1.481481,4.484977,9.2373,0.689655,5.531972,5.4495,2.666667,6.251243,7.2371,1.25,105.899232,5.558217,4.995912,1.019399,5.667817,5.290583
264,10010,400,283.349209,170.270066,1991,10.5694,12.0,-1.763275,0.178266,0.82803,0.349928,14.7642,11.031084,1.2508,2.607292,2.407733,1.317632,7.178,2.694437,3.645754,3.826436,1.492537,4.800046,-0.665503,0.215755,0.874858,0.193029,4.436899,1.269015,6.5211,1.709402,2.476694,8.7479,1.503759,5.081887,9.6156,1.0,6.231516,6.5211,2.020202,105.899232,4.219118,3.955294,1.094237,5.676083,5.315685
265,10010,401,283.349209,170.270066,1991,-24.3011,12.0,-1.763275,0.178266,0.82803,0.349928,10.5694,71.406961,1.198504,2.826425,2.613714,1.122542,8.3759,3.266944,3.547472,3.565168,0.671141,4.960313,-0.665503,0.215755,0.874858,0.193029,4.149217,1.2508,7.178,1.492537,1.303864,7.2371,1.481481,4.90617,7.975,1.694915,6.204714,7.178,1.351351,105.899232,3.588282,2.519782,1.442106,6.022361,5.494856
266,10010,402,283.349209,170.270066,1991,-14.333,12.0,-1.763275,0.178266,0.82803,0.349928,-22.380465,105.899232,1.138996,5.012703,3.518111,1.187104,8.4676,5.131519,3.878334,3.760797,1.73913,4.687965,-0.665503,0.215755,0.874858,0.193029,4.051237,1.198504,8.3759,0.671141,1.269015,6.5211,1.709402,4.484977,9.2373,0.689655,6.049708,8.3759,1.470588,105.899232,3.145976,2.794118,1.488411,5.660681,5.441735
267,10010,403,283.349209,170.270066,1991,-1.4001,12.0,-1.763275,0.178266,0.82803,0.349928,-14.333,88.254769,1.051825,3.935613,2.822852,1.288922,12.363,4.489517,3.720855,3.783681,2.803738,4.539475,-0.665503,0.215755,0.874858,0.193029,3.926769,1.138996,8.4676,1.73913,1.2508,7.178,1.492537,2.476694,8.7479,1.503759,5.832994,8.4676,3.389831,105.899232,3.990279,3.948573,1.612393,5.553907,5.30042


count    99307.000000
mean      1992.608507
std          1.007642
min       1991.000000
25%       1992.000000
50%       1993.000000
75%       1993.000000
max       1994.000000
Name: year, dtype: float64

PERMNO          99307
prd             99307
mom482          86889
mom242          98087
year            99307
RET             99307
ind             99307
bm              99307
op              99307
gp              99307
inv             99235
mom11           99307
mom122          99307
amhd            76497
ivol_capm       99305
ivol_ff5        99305
beta_bw         99307
MAX             99307
vol1m           99296
vol6m           99255
vol12m          99143
BAspr           73203
size            99307
lbm             99307
lop             99307
lgp             99307
linv            99307
llme            99307
l1amhd          76319
l1MAX           99304
l1BAspr         71684
l3amhd          75954
l3MAX           99293
l3BAspr         68572
l6amhd          75349
l6MAX           99289
l6BAspr         64086
l12amhd         74590
l12MAX          99304
l12BAspr        55213
l12mom122       99009
l12ivol_capm    99262
l12ivol_ff5     99262
l12beta_bw      99286
l12vol6m        99085
l12vol12m 

Number of features before transformation:  (93329, 44)
time to do feature proprocessing: 
Number of features after transformation:  (93329, 92)
mae of a constant model 9.672994876869886
R2 of a constant model 0.0
XGB train: 9.420535194195514 0.07314486258088526
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.3s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   1.4s
[CV] END colsample_bytree=0.6, eta=

[32m[I 2022-08-25 20:56:34,277][0m A new study created in memory with name: no-name-8eeb336f-cd06-4008-8a54-75b767b7ca38[0m


XGB train: 9.595724262339575 0.026934979178415253 50.27580285072327


[32m[I 2022-08-25 20:56:38,474][0m Trial 0 finished with value: 0.005759122604463357 and parameters: {'n_estimators': 825, 'max_depth': 2, 'learning_rate': 0.04823598942492452, 'colsample_bytree': 0.80499973985738, 'subsample': 0.9156988534699153, 'alpha': 0.876543519301316, 'lambda': 31.5793389972349, 'gamma': 0.0006225576404953054, 'min_child_weight': 2.12656446496109}. Best is trial 0 with value: 0.005759122604463357.[0m
[32m[I 2022-08-25 20:56:43,173][0m Trial 1 finished with value: 0.005997799603415438 and parameters: {'n_estimators': 850, 'max_depth': 2, 'learning_rate': 0.013372048296814976, 'colsample_bytree': 0.8250614612843398, 'subsample': 0.7389447898319327, 'alpha': 3.3110558293805097, 'lambda': 9.035560293433845, 'gamma': 1.5501442021534912e-09, 'min_child_weight': 33.23759950154523}. Best is trial 1 with value: 0.005997799603415438.[0m
[32m[I 2022-08-25 20:56:48,983][0m Trial 2 finished with value: -0.005091092811945744 and parameters: {'n_estimators': 743, 'max_

Total time for hypermarameter optimization  102.224609375
        n_estimators : 665
           max_depth : 3
       learning_rate : 0.008154389391467096
    colsample_bytree : 0.8967515230587886
           subsample : 0.5942647055111392
               alpha : 0.3080017108605482
              lambda : 189.5739859207184
               gamma : 1.8293111355227924e-07
    min_child_weight : 0.8814218444600123
best objective value : 0.007488828931675319
Optuna XGB train: 9.629880927657037 0.016660258026348185 104.81796431541443
Min_prd:  400
Constant guess:  9.657962431562893 0.0
XGB test: 9.62087685771785 0.006133389918914767
XGB GS test: 9.622542370658485 0.007228779508588468
Optuna XGB test: 9.629691695743013 0.004854936781629293


(102182, 46)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,BAspr,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l1BAspr,l3amhd,l3MAX,l3BAspr,l6amhd,l6MAX,l6BAspr,l12amhd,l12MAX,l12BAspr,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
288,10010,424,148.13418,-62.697082,1993,-15.6346,12.0,-1.623669,0.013844,0.291257,0.8003,-7.3829,-24.536334,2.356309,3.171143,2.755517,1.025037,7.1319,3.247245,3.250157,3.536904,3.773585,4.011162,-1.838236,0.165652,0.658097,0.8003,4.343682,2.344284,4.9111,5.357143,2.257265,2.8461,3.896104,2.140223,15.6507,2.083333,1.239298,4.9111,3.797468,-49.976941,4.62406,3.990327,1.453285,3.616714,4.064687
289,10010,425,101.76139,-58.326806,1993,-11.6236,12.0,-1.623669,0.013844,0.291257,0.8003,-15.6346,-20.026525,2.319279,2.464863,2.396834,0.93789,4.2443,2.455998,2.837847,3.526523,4.444444,3.844108,-1.838236,0.165652,0.658097,0.8003,4.211913,2.356309,7.1319,3.773585,2.288018,2.8461,4.6875,2.159371,14.988,2.197802,1.56279,7.1319,2.857143,-48.526911,2.413109,2.010203,1.480784,3.467116,4.028793
290,10010,426,117.655807,-58.532664,1993,20.2928,12.0,-1.623669,0.013844,0.291257,0.8003,-11.6236,-22.505261,2.375564,3.422118,2.553819,0.914418,5.3934,3.461057,3.038084,3.576669,2.631579,3.72348,-1.838236,0.165652,0.658097,0.8003,4.076367,2.319279,4.2443,4.444444,2.344284,4.9111,5.357143,2.243236,5.4835,2.469136,1.764717,4.2443,3.389831,-40.574374,2.450042,2.158362,1.394343,3.457382,3.812707
291,10010,427,155.756932,-48.367102,1993,-13.016,12.0,-1.623669,0.013844,0.291257,0.8003,20.2928,-36.48873,2.538076,5.092653,4.290265,0.700595,14.6231,5.425358,3.609285,3.578975,1.960784,3.910066,-1.838236,0.165652,0.658097,0.8003,4.154188,2.375564,5.3934,2.631579,2.356309,7.1319,3.773585,2.257265,2.8461,3.896104,1.971426,5.3934,1.470588,-39.60537,5.309489,4.505264,1.380782,3.965839,3.896158
292,10010,428,61.482636,-65.093531,1993,20.5017,12.0,-1.623669,0.013844,0.291257,0.8003,-13.016,-36.684897,2.590794,3.55916,2.977536,0.799717,8.4986,3.874691,3.789539,3.641639,5.0,3.773491,-1.838236,0.165652,0.658097,0.8003,4.343945,2.538076,14.6231,1.960784,2.319279,4.2443,4.444444,2.288018,2.8461,4.6875,2.095885,14.6231,2.631579,-33.948972,2.597573,2.26997,1.512959,4.014497,3.84077


count    102182.000000
mean       1994.716124
std           0.994935
min        1993.000000
25%        1994.000000
50%        1995.000000
75%        1996.000000
max        1996.000000
Name: year, dtype: float64

PERMNO          102182
prd             102182
mom482           87382
mom242          100438
year            102182
RET             102182
ind             102182
bm              102182
op              102182
gp              102182
inv             102148
mom11           102182
mom122          102182
amhd             85532
ivol_capm       102178
ivol_ff5        102178
beta_bw         102182
MAX             102182
vol1m           102172
vol6m           102108
vol12m          101974
BAspr           100566
size            102182
lbm             102182
lop             102182
lgp             102182
linv            102182
llme            102182
l1amhd           85396
l1MAX           102175
l1BAspr         100597
l3amhd           85087
l3MAX           102157
l3BAspr         100635
l6amhd           84563
l6MAX           102141
l6BAspr         100531
l12amhd          83700
l12MAX          102175
l12BAspr         94697
l12mom122       101709
l12ivol_capm    102101
l12ivol_ff5     102101
l12beta_bw 

Number of features before transformation:  (95853, 44)
time to do feature proprocessing: 
Number of features after transformation:  (95853, 92)
mae of a constant model 9.375745237906912
R2 of a constant model 0.0
XGB train: 9.060360096757641 0.07527572391660609
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta=

[32m[I 2022-08-25 20:59:13,415][0m A new study created in memory with name: no-name-ea8d5e35-5d64-4488-b6de-2328760a8309[0m


XGB train: 9.264390698311827 0.018518375219927052 50.35354495048523


[32m[I 2022-08-25 20:59:18,081][0m Trial 0 finished with value: 0.006866988004475612 and parameters: {'n_estimators': 902, 'max_depth': 2, 'learning_rate': 0.023516894023226927, 'colsample_bytree': 0.3218806645071921, 'subsample': 0.3329226495903268, 'alpha': 0.46744653606850045, 'lambda': 0.534702536348529, 'gamma': 8.163747030006883, 'min_child_weight': 0.4712755923752075}. Best is trial 0 with value: 0.006866988004475612.[0m
[32m[I 2022-08-25 20:59:26,077][0m Trial 1 finished with value: 0.007179150575405751 and parameters: {'n_estimators': 922, 'max_depth': 4, 'learning_rate': 0.003156037731425779, 'colsample_bytree': 0.6781060558438197, 'subsample': 0.45565910128925075, 'alpha': 2.4681648123857562, 'lambda': 18.86556410547153, 'gamma': 0.9457583603406172, 'min_child_weight': 1.7461703259213068}. Best is trial 1 with value: 0.007179150575405751.[0m
[32m[I 2022-08-25 20:59:37,890][0m Trial 2 finished with value: -0.035757909662161885 and parameters: {'n_estimators': 985, 'ma

Total time for hypermarameter optimization  116.27700185775757
        n_estimators : 675
           max_depth : 2
       learning_rate : 0.01567688865911201
    colsample_bytree : 0.5643224582137997
           subsample : 0.6698422019465207
               alpha : 11.121249929987147
              lambda : 21.20477068580261
               gamma : 6.257263680842202e-07
    min_child_weight : 0.10842289502104378
best objective value : 0.008109866342916206
Optuna XGB train: 9.273415089986655 0.015569196331451929 118.31046509742737
Min_prd:  425
Constant guess:  9.364989596407753 0.0
XGB test: 9.259611324257023 0.00927523096303362
XGB GS test: 9.274234808150892 0.007465935485271635
Optuna XGB test: 9.279454702242596 0.007930612205281684


(108887, 46)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,BAspr,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l1BAspr,l3amhd,l3MAX,l3BAspr,l6amhd,l6MAX,l6BAspr,l12amhd,l12MAX,l12BAspr,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
350,10011,449,283.349209,120.876558,1995,-0.43,34.0,-2.486223,0.258106,0.875338,0.516551,-7.4467,62.087334,2.203319,3.397123,3.021503,0.591144,5.2897,3.321408,4.042566,3.530399,4.166667,4.475688,-1.862198,0.132619,0.757579,0.335288,3.974058,2.158299,11.8414,3.816794,2.246729,3.7639,2.5,2.492715,3.4878,0.934579,4.213494,11.8414,1.351351,100.655293,1.538877,1.288538,0.647352,2.929669,3.074934
351,10011,450,205.913318,95.440525,1995,-8.8033,34.0,-2.486223,0.258106,0.875338,0.516551,-0.43,43.018972,2.055711,1.691854,1.403498,0.529444,3.4263,1.903369,2.962005,3.527886,1.680672,4.475688,-1.862198,0.132619,0.757579,0.335288,4.025352,2.203319,5.2897,4.166667,2.232079,3.3123,1.694915,2.295714,11.8848,2.564103,3.981594,5.2897,2.380952,53.833664,1.642887,1.515819,0.763396,2.74982,2.941439
352,10011,451,221.03639,71.141338,1995,-9.5109,34.0,-2.486223,0.258106,0.875338,0.516551,-8.8033,4.809002,1.994961,4.082701,3.614374,0.889628,8.0435,4.223638,2.857356,3.4293,1.886792,4.388676,-1.862198,0.132619,0.757579,0.335288,4.334673,2.055711,3.4263,1.680672,2.158299,11.8414,3.816794,2.282986,17.071,1.481481,3.937745,3.4263,1.550388,43.713086,2.723997,2.325965,0.67079,2.191232,2.755011
353,10011,452,168.594228,68.49689,1995,-6.49,34.0,-2.486223,0.258106,0.875338,0.516551,-9.5109,-24.282511,2.029321,1.958872,1.492302,0.955259,3.9016,1.971703,2.845821,3.36399,0.990099,4.579211,-1.862198,0.132619,0.757579,0.335288,4.591956,1.994961,8.0435,1.886792,2.203319,5.2897,4.166667,2.246729,3.7639,2.5,3.821606,8.0435,2.112676,73.239085,4.861236,3.538108,0.631505,2.827082,3.005351
354,10011,453,236.841444,69.211874,1996,8.0806,34.0,-2.486223,0.258106,0.875338,0.516551,-6.49,-26.926242,1.99877,2.60995,2.444418,0.89182,4.5215,2.700381,2.99323,3.36139,3.0,4.517336,-1.862198,0.132619,0.757579,0.335288,4.532237,2.029321,3.9016,0.990099,2.055711,3.4263,1.680672,2.232079,3.3123,1.694915,3.248493,3.9016,1.538462,105.899232,2.666218,2.28462,0.69345,2.876936,3.023678


count    108887.000000
mean       1996.781535
std           0.968163
min        1995.000000
25%        1996.000000
50%        1997.000000
75%        1998.000000
max        1998.000000
Name: year, dtype: float64

PERMNO          108887
prd             108887
mom482           87758
mom242          106704
year            108887
RET             108887
ind             108887
bm              108887
op              108887
gp              108887
inv             108790
mom11           108887
mom122          108887
amhd             95617
ivol_capm       108881
ivol_ff5        108881
beta_bw         108887
MAX             108887
vol1m           108877
vol6m           108802
vol12m          108644
BAspr           105438
size            108887
lbm             108887
lop             108887
lgp             108887
linv            108887
llme            108887
l1amhd           95533
l1MAX           108883
l1BAspr         105560
l3amhd           95274
l3MAX           108865
l3BAspr         105775
l6amhd           94783
l6MAX           108849
l6BAspr         105924
l12amhd          94024
l12MAX          108883
l12BAspr        106224
l12mom122       108559
l12ivol_capm    108798
l12ivol_ff5     108798
l12beta_bw 

Number of features before transformation:  (102533, 44)
time to do feature proprocessing: 
Number of features after transformation:  (102533, 92)
mae of a constant model 10.140774622556364
R2 of a constant model 0.0
XGB train: 9.659322244479393 0.07092865382863278
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, e

[32m[I 2022-08-25 21:02:08,509][0m A new study created in memory with name: no-name-ed33f6a0-9fda-4cdc-9602-60d897324a9c[0m


XGB train: 9.832710286259864 0.02838988710259338 52.64358353614807


[32m[I 2022-08-25 21:02:13,441][0m Trial 0 finished with value: 0.00979115834059853 and parameters: {'n_estimators': 865, 'max_depth': 2, 'learning_rate': 0.023065710668784666, 'colsample_bytree': 0.6976659977019947, 'subsample': 0.6651324375532808, 'alpha': 0.5589226950553211, 'lambda': 0.14365537125435116, 'gamma': 2.0891348309890828e-05, 'min_child_weight': 2.2317107767158255}. Best is trial 0 with value: 0.00979115834059853.[0m
[32m[I 2022-08-25 21:02:22,970][0m Trial 1 finished with value: -0.010728277805102916 and parameters: {'n_estimators': 773, 'max_depth': 5, 'learning_rate': 0.0396561082687114, 'colsample_bytree': 0.7008145627161387, 'subsample': 0.5738330908133162, 'alpha': 3.731904248586211, 'lambda': 69.85233544992262, 'gamma': 0.4173643551843362, 'min_child_weight': 0.39570301870778596}. Best is trial 0 with value: 0.00979115834059853.[0m
[32m[I 2022-08-25 21:02:31,936][0m Trial 2 finished with value: -0.01438942884426983 and parameters: {'n_estimators': 771, 'ma

Total time for hypermarameter optimization  114.0547513961792
        n_estimators : 733
           max_depth : 3
       learning_rate : 0.008268722836425688
    colsample_bytree : 0.34251300952544905
           subsample : 0.6220085981026944
               alpha : 1.4632707328387815
              lambda : 7.527001248509026
               gamma : 0.1684758109865483
    min_child_weight : 7.547306535443816
best objective value : 0.010214535970146946
Optuna XGB train: 9.862107081799374 0.020109116383312253 117.08282375335693
Min_prd:  450
Constant guess:  13.5583345263632 0.0
XGB test: 13.366165935209699 0.01283646666221916
XGB GS test: 13.402638448397843 0.009184515519970238
Optuna XGB test: 13.41197447194481 0.00815673566504127


(105719, 46)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,BAspr,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l1BAspr,l3amhd,l3MAX,l3BAspr,l6amhd,l6MAX,l6BAspr,l12amhd,l12MAX,l12BAspr,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
375,10011,474,26.008669,-25.175669,1997,4.3137,34.0,-0.605004,0.14208,0.464046,0.8003,-3.8686,-19.026681,2.047956,1.21784,1.214213,0.513672,2.2136,1.234739,2.129959,2.396758,1.176471,4.412075,-1.885951,0.25031,0.753169,0.766547,4.607743,2.012275,3.5094,2.824859,2.016314,3.2787,2.12766,1.831441,8.4297,2.941176,1.530987,3.5094,1.923077,-18.211667,1.110868,0.961548,0.646142,2.599593,2.582935
376,10011,475,23.676379,-20.179582,1997,1.3049,34.0,-0.605004,0.14208,0.464046,0.8003,4.3137,-7.383117,2.008437,4.74904,4.067699,0.506943,20.2132,4.991303,2.652903,2.68762,1.675978,4.458326,-1.885951,0.25031,0.753169,0.766547,4.438922,2.047956,2.2136,1.176471,2.00644,2.8381,2.298851,2.009115,9.0709,2.097902,1.626085,2.2136,3.333333,-10.711172,2.210422,2.152782,0.639891,2.612252,2.373065
377,10011,476,33.362887,-18.402128,1997,2.8533,34.0,-0.605004,0.14208,0.464046,0.8003,1.3049,0.489275,1.989774,0.918331,0.857661,0.504465,1.6739,1.096745,2.366091,2.661125,0.534759,4.491021,-1.885951,0.25031,0.753169,0.766547,4.405243,2.008437,20.2132,1.675978,2.012275,3.5094,2.824859,2.051244,8.2123,2.247191,1.703041,20.2132,2.380952,-17.068809,1.7922,1.420329,0.639312,2.095041,2.374053
378,10011,477,34.705872,-17.296715,1998,2.2582,34.0,-0.605004,0.14208,0.464046,0.8003,2.8533,16.164007,1.928822,0.899183,0.704377,0.528401,3.8669,0.86687,2.303914,2.488976,0.537634,4.523811,-1.885951,0.25031,0.753169,0.766547,4.278491,1.989774,1.6739,0.534759,2.047956,2.2136,1.176471,2.016314,3.2787,2.12766,1.771183,1.6739,2.816901,-14.734834,3.190956,3.166502,0.520025,2.248575,2.392976
607,10016,474,-9.74097,14.793497,1997,-9.742,21.0,-0.791018,0.072323,0.2959,0.014439,4.9171,-0.607002,1.665406,1.261913,1.176158,0.581543,3.0901,1.264878,1.758746,2.02596,0.416667,5.310389,-1.049362,-0.000786,0.269403,-0.230583,5.210249,1.772101,1.7982,1.785714,1.863802,4.8364,1.801802,1.744671,2.5832,2.040816,1.802498,1.7982,2.830189,-2.871914,1.078966,0.990447,0.500919,2.088419,2.299801


count    105719.000000
mean       1998.813175
std           0.951883
min        1997.000000
25%        1998.000000
50%        1999.000000
75%        2000.000000
max        2000.000000
Name: year, dtype: float64

PERMNO          105719
prd             105719
mom482           85412
mom242          103700
year            105719
RET             105719
ind             105719
bm              105719
op              105719
gp              105719
inv             105654
mom11           105719
mom122          105719
amhd             95259
ivol_capm       105718
ivol_ff5        105718
beta_bw         105719
MAX             105719
vol1m           105716
vol6m           105677
vol12m          105605
BAspr            99273
size            105719
lbm             105719
lop             105719
lgp             105719
linv            105719
llme            105719
l1amhd           95253
l1MAX           105717
l1BAspr          99691
l3amhd           95191
l3MAX           105708
l3BAspr         100442
l6amhd           95088
l6MAX           105707
l6BAspr         100452
l12amhd          94895
l12MAX          105717
l12BAspr        101260
l12mom122       105606
l12ivol_capm    105695
l12ivol_ff5     105695
l12beta_bw 

Number of features before transformation:  (99209, 44)
time to do feature proprocessing: 
Number of features after transformation:  (99209, 92)
mae of a constant model 12.065001504034083
R2 of a constant model 0.0
XGB train: 11.542158028170723 0.0683018951190858
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.3s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta

[32m[I 2022-08-25 21:05:02,155][0m A new study created in memory with name: no-name-7805a597-b972-4875-9d7d-c610c9c89fc5[0m


XGB train: 11.757632120495467 0.027636870661727486 52.19143843650818


[32m[I 2022-08-25 21:05:08,683][0m Trial 0 finished with value: 0.009766068295833298 and parameters: {'n_estimators': 518, 'max_depth': 5, 'learning_rate': 0.01861593342147072, 'colsample_bytree': 0.2323609184215765, 'subsample': 0.564220068794236, 'alpha': 0.25878484424939097, 'lambda': 9.389710674326418, 'gamma': 0.008952725288914299, 'min_child_weight': 0.2947546649591775}. Best is trial 0 with value: 0.009766068295833298.[0m
[32m[I 2022-08-25 21:05:16,137][0m Trial 1 finished with value: 0.011936011435644519 and parameters: {'n_estimators': 894, 'max_depth': 4, 'learning_rate': 0.008537321954931943, 'colsample_bytree': 0.2930143447949545, 'subsample': 0.7692188977885224, 'alpha': 0.11172280910297488, 'lambda': 52.891288887538195, 'gamma': 3.128679408045531e-09, 'min_child_weight': 0.47247711184827795}. Best is trial 1 with value: 0.011936011435644519.[0m
[32m[I 2022-08-25 21:05:24,707][0m Trial 2 finished with value: 0.011503219607737801 and parameters: {'n_estimators': 737

Total time for hypermarameter optimization  133.79924774169922
        n_estimators : 831
           max_depth : 4
       learning_rate : 0.010502156218834937
    colsample_bytree : 0.6345463422907152
           subsample : 0.8129198571758328
               alpha : 0.42710880135067364
              lambda : 155.01198150902675
               gamma : 1.3873974842266697e-10
    min_child_weight : 0.5537201152755507
best objective value : 0.012965150933984067
Optuna XGB train: 11.714226808212636 0.034695843641973756 138.6061770915985
Min_prd:  475
Constant guess:  13.648124608226308 0.0
XGB test: 13.493621350849795 0.01653321511831174
XGB GS test: 13.485694602288772 0.018099312155868463
Optuna XGB test: 13.4876446904041 0.017724635657002108


(95298, 46)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,BAspr,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l1BAspr,l3amhd,l3MAX,l3BAspr,l6amhd,l6MAX,l6BAspr,l12amhd,l12MAX,l12BAspr,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
632,10016,499,34.913656,26.266253,1999,8.1926,21.0,-1.693802,0.079165,0.235056,-0.230583,-4.1875,35.573893,1.207979,1.936031,1.745435,0.388086,5.1903,1.980852,1.943183,1.817426,2.597403,5.827506,-0.92299,0.018059,0.235379,-0.079193,5.228399,1.455754,5.9423,1.269841,1.588418,4.2677,1.0,1.682986,2.8806,1.351351,2.093882,5.9423,2.654867,-20.659951,3.366311,3.204773,0.611095,2.517477,2.27978
633,10016,500,48.977513,40.863722,1999,-8.9248,21.0,-1.693802,0.079165,0.235056,-0.230583,8.1926,17.655505,1.219978,2.377887,1.967402,0.380211,4.7298,2.395814,2.129051,1.788896,0.609756,5.909571,-0.92299,0.018059,0.235379,-0.079193,5.330182,1.207979,5.1903,2.597403,1.527171,2.0228,0.986842,1.64553,3.3614,0.657895,2.140377,5.1903,1.239669,-2.757373,2.398036,2.32529,0.585464,2.588438,2.33483
634,10016,501,39.291939,28.261144,2000,-10.3438,21.0,-1.693802,0.079165,0.235056,-0.230583,-8.9248,19.780481,1.387478,2.819161,2.411974,0.368661,5.6404,2.912162,2.388399,1.92066,3.503185,5.820906,-0.92299,0.018059,0.235379,-0.079193,5.70596,1.219978,4.7298,0.609756,1.455754,5.9423,1.269841,1.668308,1.9557,0.649351,1.911826,4.7298,1.153846,13.040913,1.523856,1.359584,0.601084,2.584622,2.344358
635,10016,502,9.334185,17.50446,2000,12.8053,21.0,-1.693802,0.079165,0.235056,-0.230583,-10.3438,10.10004,1.519072,3.586856,3.250253,0.499061,6.6691,4.190953,2.769244,2.235977,0.352734,5.716281,-0.92299,0.018059,0.235379,-0.079193,5.700272,1.387478,5.6404,3.503185,1.207979,5.1903,2.597403,1.588418,4.2677,1.0,1.852761,5.6404,0.784314,19.507886,0.868158,0.806485,0.592462,2.562278,2.221828
636,10016,503,9.261171,14.471627,2000,-2.4181,21.0,-1.693802,0.079165,0.235056,-0.230583,12.8053,0.593516,1.585015,3.248652,2.874087,0.514101,6.8621,3.29922,3.038162,2.386511,2.054795,5.840578,-0.92299,0.018059,0.235379,-0.079193,5.684946,1.519072,6.6691,0.352734,1.219978,4.7298,0.609756,1.527171,2.0228,0.986842,1.817956,6.6691,1.515152,23.556268,1.370183,1.261058,0.595658,2.496284,2.125555


count    95298.000000
mean      2000.896399
std          0.960603
min       1999.000000
25%       2000.000000
50%       2001.000000
75%       2002.000000
max       2003.000000
Name: year, dtype: float64

PERMNO          95298
prd             95298
mom482          79488
mom242          93550
year            95298
RET             95298
ind             95298
bm              95298
op              95298
gp              95298
inv             95207
mom11           95298
mom122          95298
amhd            84633
ivol_capm       95293
ivol_ff5        95293
beta_bw         95298
MAX             95298
vol1m           95292
vol6m           95206
vol12m          95083
BAspr           88082
size            95298
lbm             95298
lop             95298
lgp             95298
linv            95298
llme            95298
l1amhd          84732
l1MAX           95292
l1BAspr         87985
l3amhd          84898
l3MAX           95262
l3BAspr         87768
l6amhd          85145
l6MAX           95239
l6BAspr         87471
l12amhd         85718
l12MAX          95292
l12BAspr        87125
l12mom122       95019
l12ivol_capm    95189
l12ivol_ff5     95189
l12beta_bw      95215
l12vol6m        95052
l12vol12m 

Number of features before transformation:  (88987, 44)
time to do feature proprocessing: 
Number of features after transformation:  (88987, 92)
mae of a constant model 12.533797015909979
R2 of a constant model 0.0
XGB train: 12.145985945010235 0.08088662998682272
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, et

[32m[I 2022-08-25 21:08:13,022][0m A new study created in memory with name: no-name-7300430d-cded-4cc2-8b50-58905c9cef95[0m


XGB train: 12.336848101879724 0.04798811067566022 48.51966881752014


[32m[I 2022-08-25 21:08:17,873][0m Trial 0 finished with value: 0.015036988860111391 and parameters: {'n_estimators': 899, 'max_depth': 2, 'learning_rate': 0.024564812074828003, 'colsample_bytree': 0.6365721970673731, 'subsample': 0.6924871799801764, 'alpha': 1.025825286496445, 'lambda': 0.6434245081022784, 'gamma': 0.00030781977021990584, 'min_child_weight': 5.646271975687925}. Best is trial 0 with value: 0.015036988860111391.[0m
[32m[I 2022-08-25 21:08:22,623][0m Trial 1 finished with value: 0.015103607709707974 and parameters: {'n_estimators': 580, 'max_depth': 4, 'learning_rate': 0.01790495828925795, 'colsample_bytree': 0.457669000640876, 'subsample': 0.4774399150902327, 'alpha': 1.402535820251213, 'lambda': 10.786693056743633, 'gamma': 0.2705482254398916, 'min_child_weight': 0.4918849408409259}. Best is trial 1 with value: 0.015103607709707974.[0m
[32m[I 2022-08-25 21:08:29,627][0m Trial 2 finished with value: 0.015455688164812962 and parameters: {'n_estimators': 909, 'max

Total time for hypermarameter optimization  120.91310548782349
        n_estimators : 998
           max_depth : 4
       learning_rate : 0.010418306713864362
    colsample_bytree : 0.3133044192167022
           subsample : 0.932849274772066
               alpha : 0.1010149308831013
              lambda : 30.586073755056848
               gamma : 6.280336232691761e-06
    min_child_weight : 8.749729572829054
best objective value : 0.01582082156622757
Optuna XGB train: 12.33544982274985 0.04674926543109492 126.35574221611023
Min_prd:  500
Constant guess:  10.208351337073477 0.0
XGB test: 9.932024684838842 0.0495424404432826
XGB GS test: 9.903912087752586 0.05459643833826966
Optuna XGB test: 9.899188038231635 0.055387304196670706


(88078, 46)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,BAspr,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l1BAspr,l3amhd,l3MAX,l3BAspr,l6amhd,l6MAX,l6BAspr,l12amhd,l12MAX,l12BAspr,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
864,10019,524,-80.628208,-72.706514,2001,64.3237,22.0,0.456985,0.024248,0.438549,-0.207652,-18.4496,-61.937195,6.974454,7.874195,6.946893,0.574234,21.135115,8.239788,7.983689,7.756113,5.555556,1.839648,0.133067,0.095399,0.379546,0.8003,4.416116,6.841097,13.4046,7.5,6.24626,14.0495,4.0,4.172441,10.3268,8.108108,4.897832,13.4046,1.910828,-19.751637,7.874195,6.946893,0.828611,7.708467,6.632332
865,10019,525,-80.628208,-72.706514,2002,-36.172,22.0,0.456985,0.024248,0.438549,-0.207652,26.592465,-61.937195,7.02955,7.874195,6.946893,0.618013,21.135115,8.239788,7.983689,7.756113,1.904762,2.337228,0.133067,0.095399,0.379546,0.8003,3.878006,6.974454,21.135115,5.555556,6.488633,21.135115,11.504425,5.425547,21.135115,7.978723,4.867878,21.135115,6.060606,71.543684,7.537647,6.946893,0.761543,7.983689,6.587764
866,10019,526,-80.628208,-72.706514,2002,6.1732,22.0,0.456985,0.024248,0.438549,-0.207652,-22.380465,-61.937195,7.242336,7.874195,6.946893,0.669389,18.7547,8.239788,7.983689,7.756113,7.5,1.890441,0.133067,0.095399,0.379546,0.8003,4.290538,7.02955,21.135115,1.904762,6.841097,13.4046,7.5,6.018584,21.135115,3.846154,4.726778,21.135115,2.597403,12.326471,7.874195,6.946893,0.798752,7.983689,6.964654
867,10019,527,-80.628208,-72.706514,2002,52.7523,22.0,0.456985,0.024248,0.438549,-0.207652,6.1732,-61.937195,7.354029,7.257745,6.191064,0.708928,10.3826,7.253446,7.983689,7.756113,10.526316,1.951566,0.133067,0.095399,0.379546,0.8003,3.9215,7.242336,18.7547,7.5,6.974454,21.135115,5.555556,6.24626,14.0495,4.0,4.422981,18.7547,4.166667,74.308882,6.155932,5.038919,0.862209,7.983689,7.048842
868,10019,528,-80.628208,-72.706514,2002,2.1971,22.0,0.456985,0.024248,0.438549,-0.207652,26.592465,-61.937195,7.376474,7.874195,6.946893,0.689922,21.135115,8.239788,7.983689,7.756113,7.54717,2.376064,0.133067,0.095399,0.379546,0.8003,3.900881,7.354029,10.3826,10.526316,7.02955,21.135115,1.904762,6.488633,21.135115,11.504425,4.332441,10.3826,2.12766,44.013624,5.082835,3.907795,0.890379,7.983689,7.075186


count    88078.000000
mean      2003.040771
std          0.961813
min       2001.000000
25%       2002.000000
50%       2003.000000
75%       2004.000000
max       2005.000000
Name: year, dtype: float64

PERMNO          88078
prd             88078
mom482          76756
mom242          86597
year            88078
RET             88078
ind             88078
bm              88078
op              88078
gp              88078
inv             88013
mom11           88078
mom122          88078
amhd            79370
ivol_capm       88076
ivol_ff5        88076
beta_bw         88078
MAX             88078
vol1m           88074
vol6m           87967
vol12m          87795
BAspr           85032
size            88078
lbm             88078
lop             88078
lgp             88078
linv            88078
llme            88078
l1amhd          79247
l1MAX           88074
l1BAspr         85139
l3amhd          79005
l3MAX           88047
l3BAspr         85303
l6amhd          78778
l6MAX           88017
l6BAspr         85524
l12amhd         78656
l12MAX          88074
l12BAspr        82548
l12mom122       87808
l12ivol_capm    87955
l12ivol_ff5     87955
l12beta_bw      87988
l12vol6m        87793
l12vol12m 

Number of features before transformation:  (82903, 44)
time to do feature proprocessing: 
Number of features after transformation:  (82903, 92)
mae of a constant model 9.865920644923682
R2 of a constant model 0.0
XGB train: 9.607009612483386 0.08343238287855514
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=

[32m[I 2022-08-25 21:11:10,213][0m A new study created in memory with name: no-name-7d6d3ca2-7836-4a18-a8a1-1f1d789a6da3[0m


XGB train: 9.82901782792788 0.030747723504207736 47.32248830795288


[32m[I 2022-08-25 21:11:14,104][0m Trial 0 finished with value: 0.005271857158222116 and parameters: {'n_estimators': 832, 'max_depth': 2, 'learning_rate': 0.04931517656937908, 'colsample_bytree': 0.5824242082478082, 'subsample': 0.5498876580158043, 'alpha': 0.3508232780466895, 'lambda': 4.873293620927068, 'gamma': 1.1123842773930086e-05, 'min_child_weight': 13.922116143471701}. Best is trial 0 with value: 0.005271857158222116.[0m
[32m[I 2022-08-25 21:11:18,129][0m Trial 1 finished with value: -0.0034307505601002997 and parameters: {'n_estimators': 525, 'max_depth': 4, 'learning_rate': 0.04154668654512801, 'colsample_bytree': 0.16279475546246241, 'subsample': 0.5195737438702127, 'alpha': 1.2854001662645105, 'lambda': 0.8776983010417051, 'gamma': 4.626183374281864e-09, 'min_child_weight': 3.1018733212936973}. Best is trial 0 with value: 0.005271857158222116.[0m
[32m[I 2022-08-25 21:11:21,943][0m Trial 2 finished with value: 0.004460005361445562 and parameters: {'n_estimators': 6

Total time for hypermarameter optimization  115.24856281280518
        n_estimators : 868
           max_depth : 3
       learning_rate : 0.02026559337396688
    colsample_bytree : 0.25130285484184134
           subsample : 0.31197680711003545
               alpha : 13.145402763210186
              lambda : 33.91961971552681
               gamma : 3.528776628825982e-08
    min_child_weight : 16.864035531707575
best objective value : 0.008093638401111382
Optuna XGB train: 9.808368749871093 0.03429281339280321 118.16722297668457
Min_prd:  525
Constant guess:  8.126500500194087 0.0
XGB test: 8.060704934892494 0.026679110225521163
XGB GS test: 8.036618679857735 0.028794101222188617
Optuna XGB test: 8.031346259319342 0.03147307244347575


(85043, 46)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,BAspr,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l1BAspr,l3amhd,l3MAX,l3BAspr,l6amhd,l6MAX,l6BAspr,l12amhd,l12MAX,l12BAspr,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
1106,10025,549,-69.024047,-64.972497,2004,12.8779,15.0,-0.513525,0.048325,0.323181,0.070749,26.592465,-45.175222,2.862324,1.756749,1.501836,0.850673,5.1208,1.82011,1.788478,4.071073,1.34715,4.377464,-1.137114,0.059508,0.350297,-0.071339,4.634901,2.897483,3.6375,1.243094,2.862958,4.2016,0.465116,2.743738,7.5292,0.134228,2.257347,3.6375,0.307692,-50.490385,4.315112,3.883518,0.608191,5.412216,4.166177
1107,10025,550,-68.58638,-58.749269,2004,0.3036,15.0,-0.513525,0.048325,0.323181,0.070749,12.8779,44.060028,2.804626,2.72042,2.583923,0.761546,9.1867,2.873858,1.940165,3.949224,0.364299,4.50483,-1.137114,0.059508,0.350297,-0.071339,3.969097,2.862324,5.1208,1.34715,2.887925,2.4589,0.527704,2.761634,4.4944,0.751315,2.380322,5.1208,3.012912,-53.104849,4.405369,3.496512,0.634109,5.720214,4.387683
1108,10025,551,-65.920808,-67.320879,2004,-3.1697,15.0,-0.513525,0.048325,0.323181,0.070749,0.3036,49.780673,2.835775,0.852188,0.657714,0.795531,1.5453,0.901612,1.929759,3.480026,0.089286,4.50846,-1.137114,0.059508,0.350297,-0.071339,4.05212,2.804626,9.1867,0.364299,2.897483,3.6375,1.243094,2.829408,1.8767,1.533019,2.31487,9.1867,0.140449,-61.937195,6.009503,5.567252,0.65194,6.150585,4.673124
1109,10025,552,-69.716441,-66.38608,2004,11.5088,15.0,-0.513525,0.048325,0.323181,0.070749,-3.1697,26.547382,2.756714,1.744959,1.594028,0.749173,5.0395,1.725115,1.939281,2.806362,0.722022,4.477179,-1.137114,0.059508,0.350297,-0.071339,4.224549,2.835775,1.5453,0.089286,2.862324,5.1208,1.34715,2.862958,4.2016,0.465116,2.500611,1.5453,0.229095,-61.937195,7.350406,5.120025,0.674696,6.845902,5.011527
1110,10025,553,-35.633534,-66.274593,2004,-10.9478,15.0,-0.513525,0.048325,0.323181,0.070749,11.5088,49.68028,2.669581,1.116454,0.902261,0.623812,3.5474,1.141249,1.858085,2.446564,0.595238,4.605384,-1.137114,0.059508,0.350297,-0.071339,4.054984,2.756714,5.0395,0.722022,2.804626,9.1867,0.364299,2.887925,2.4589,0.527704,2.630135,5.0395,0.860832,-61.937195,4.355598,3.619636,0.710192,5.358175,5.189241


count    85043.000000
mean      2005.128217
std          0.943302
min       2004.000000
25%       2004.000000
50%       2005.000000
75%       2006.000000
max       2007.000000
Name: year, dtype: float64

PERMNO          85043
prd             85043
mom482          77060
mom242          83859
year            85043
RET             85043
ind             85043
bm              85043
op              85043
gp              85043
inv             84909
mom11           85043
mom122          85043
amhd            81466
ivol_capm       85042
ivol_ff5        85042
beta_bw         85043
MAX             85043
vol1m           85042
vol6m           84874
vol12m          84608
BAspr           81271
size            85043
lbm             85043
lop             85043
lgp             85043
linv            85043
llme            85043
l1amhd          81364
l1MAX           85038
l1BAspr         81177
l3amhd          81123
l3MAX           85001
l3BAspr         81085
l6amhd          80680
l6MAX           84968
l6BAspr         80893
l12amhd         79796
l12MAX          85038
l12BAspr        81095
l12mom122       84856
l12ivol_capm    84929
l12ivol_ff5     84929
l12beta_bw      84963
l12vol6m        84788
l12vol12m 

Number of features before transformation:  (80668, 44)
time to do feature proprocessing: 
Number of features after transformation:  (80668, 92)
mae of a constant model 8.207385221748945
R2 of a constant model 0.0
XGB train: 7.982650268503723 0.08794051382124346
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=

[32m[I 2022-08-25 21:14:00,027][0m A new study created in memory with name: no-name-db09d280-a45b-4d1b-a5b1-5466e0b5441a[0m


XGB train: 8.162100935678442 0.03306926922351294 48.11498737335205


[32m[I 2022-08-25 21:14:10,898][0m Trial 0 finished with value: -0.035532998594841006 and parameters: {'n_estimators': 967, 'max_depth': 5, 'learning_rate': 0.036493494060618005, 'colsample_bytree': 0.8861460099756683, 'subsample': 0.5394896359239336, 'alpha': 3.559116036433234, 'lambda': 3.692340231453164, 'gamma': 4.342440041898271, 'min_child_weight': 0.24303445933922418}. Best is trial 0 with value: -0.035532998594841006.[0m
[32m[I 2022-08-25 21:14:13,928][0m Trial 1 finished with value: 0.005818748174866388 and parameters: {'n_estimators': 501, 'max_depth': 3, 'learning_rate': 0.00473181292412814, 'colsample_bytree': 0.1578953264905537, 'subsample': 0.6685764476150368, 'alpha': 6.997706440068765, 'lambda': 0.33394912321611453, 'gamma': 3.604534702674281e-10, 'min_child_weight': 0.8418535406718569}. Best is trial 1 with value: 0.005818748174866388.[0m
[32m[I 2022-08-25 21:14:19,480][0m Trial 2 finished with value: 0.0019380840710236137 and parameters: {'n_estimators': 954, 

Total time for hypermarameter optimization  100.19071412086487
        n_estimators : 587
           max_depth : 2
       learning_rate : 0.023625462161335403
    colsample_bytree : 0.4469533320911829
           subsample : 0.7732353524100077
               alpha : 1.8375933456955929
              lambda : 6.836245033046999
               gamma : 0.0012596064722560072
    min_child_weight : 9.953199864596234
best objective value : 0.009009396307618765
Optuna XGB train: 8.19340096650884 0.021033249982477242 101.80624341964722
Min_prd:  550
Constant guess:  6.757637506147568 0.0
XGB test: 6.746015825507648 -0.0022614119899928564
XGB GS test: 6.735048684373413 0.0013169627783855553
Optuna XGB test: 6.736441359054837 -0.0004336835411822282


(79401, 46)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,BAspr,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l1BAspr,l3amhd,l3MAX,l3BAspr,l6amhd,l6MAX,l6BAspr,l12amhd,l12MAX,l12BAspr,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
1131,10025,574,-2.974832,138.462531,2006,3.1985,15.0,-0.896731,0.092108,0.384218,-0.009293,3.65,29.081776,1.206031,1.971906,1.655145,0.772577,4.24,1.953144,2.076001,1.899123,0.03827,5.40753,-0.458862,0.058436,0.356411,-0.027007,5.06914,1.170735,2.8956,1.29361,1.528294,4.0896,0.431241,1.961437,1.8887,0.458482,2.553807,2.8956,0.569064,32.664685,3.995464,3.486598,0.496637,2.144327,1.822242
1132,10025,575,-26.926201,141.474082,2006,22.0297,15.0,-0.896731,0.092108,0.384218,-0.009293,3.1985,23.810656,1.238849,1.932768,1.469588,0.733751,6.8663,2.112008,2.187022,1.887418,0.22214,5.442536,-0.458862,0.058436,0.356411,-0.027007,5.148162,1.206031,4.24,0.03827,1.165798,2.8,0.617829,1.779929,1.6955,0.24426,2.424054,4.24,0.440313,68.816168,1.778135,1.597251,0.546035,2.232185,1.894171
1133,10025,576,-2.145041,157.746715,2006,6.5899,15.0,-0.896731,0.092108,0.384218,-0.009293,22.0297,32.26278,1.233384,1.627409,1.481364,0.77107,5.3763,1.761302,1.849104,1.931074,0.121175,5.652782,-0.458862,0.058436,0.356411,-0.027007,5.12557,1.238849,6.8663,0.22214,1.170735,2.8956,1.29361,1.718863,15.0031,0.134831,2.330424,6.8663,0.049975,88.399166,1.210893,1.027329,0.534169,2.270802,1.854214
1134,10025,577,-6.58708,170.270066,2006,-6.4885,15.0,-0.896731,0.092108,0.384218,-0.009293,6.5899,71.854469,1.165663,0.967349,0.836301,0.720582,2.7182,1.024173,1.550442,1.889576,0.344037,5.721243,-0.458862,0.058436,0.356411,-0.027007,5.065979,1.233384,5.3763,0.121175,1.206031,4.24,0.03827,1.528294,4.0896,0.431241,2.285802,5.3763,0.633276,63.216325,1.581157,1.340521,0.569174,2.339334,1.887731
1135,10025,578,-13.76767,170.270066,2006,0.5515,15.0,-2.833924,0.138217,0.49263,-0.230583,-6.4885,98.151432,0.966655,1.729306,1.656696,0.677494,5.9039,1.766046,1.651156,1.792183,0.151057,5.658745,-0.458862,0.058436,0.356411,-0.027007,4.990124,1.165663,2.7182,0.344037,1.238849,6.8663,0.22214,1.165798,2.8,0.617829,2.261726,2.7182,0.685714,72.131876,2.662614,2.197339,0.626926,2.570129,1.990144


count    79401.000000
mean      2007.202554
std          0.972074
min       2006.000000
25%       2006.000000
50%       2007.000000
75%       2008.000000
max       2009.000000
Name: year, dtype: float64

PERMNO          79401
prd             79401
mom482          71843
mom242          77707
year            79401
RET             79401
ind             79401
bm              79401
op              79401
gp              79401
inv             79229
mom11           79401
mom122          79401
amhd            77019
ivol_capm       79400
ivol_ff5        79400
beta_bw         79401
MAX             79401
vol1m           79399
vol6m           79207
vol12m          78903
BAspr           78124
size            79401
lbm             79401
lop             79401
lgp             79401
linv            79401
llme            79401
l1amhd          77037
l1MAX           79398
l1BAspr         78035
l3amhd          77062
l3MAX           79357
l3BAspr         77897
l6amhd          77045
l6MAX           79325
l6BAspr         77744
l12amhd         76876
l12MAX          79398
l12BAspr        76893
l12mom122       79222
l12ivol_capm    79298
l12ivol_ff5     79298
l12beta_bw      79335
l12vol6m        79138
l12vol12m 

Number of features before transformation:  (74917, 44)
time to do feature proprocessing: 
Number of features after transformation:  (74917, 92)
mae of a constant model 9.37771448482403
R2 of a constant model 0.0
XGB train: 8.760247406031798 0.08801593642949712
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0

[32m[I 2022-08-25 21:16:30,274][0m A new study created in memory with name: no-name-5288407e-79e6-40be-95d0-8afa4ee78125[0m


XGB train: 8.96426959307641 0.0329700854269932 44.876866579055786


[32m[I 2022-08-25 21:16:37,500][0m Trial 0 finished with value: -0.01817829058720192 and parameters: {'n_estimators': 948, 'max_depth': 4, 'learning_rate': 0.036810429385138094, 'colsample_bytree': 0.7657108173552764, 'subsample': 0.5531484067987946, 'alpha': 4.10144474526287, 'lambda': 0.40489107646781697, 'gamma': 2.313574522906363e-05, 'min_child_weight': 4.342095520211626}. Best is trial 0 with value: -0.01817829058720192.[0m
[32m[I 2022-08-25 21:16:41,344][0m Trial 1 finished with value: 0.0028207286007919164 and parameters: {'n_estimators': 639, 'max_depth': 3, 'learning_rate': 0.04071450093723057, 'colsample_bytree': 0.16672943692574527, 'subsample': 0.8938768214972186, 'alpha': 1.1576882807905784, 'lambda': 0.8280540879257247, 'gamma': 2.7979366423936194e-10, 'min_child_weight': 2.6338992394639837}. Best is trial 1 with value: 0.0028207286007919164.[0m
[32m[I 2022-08-25 21:16:46,318][0m Trial 2 finished with value: 0.005772743866480837 and parameters: {'n_estimators': 8

Total time for hypermarameter optimization  86.84788966178894
        n_estimators : 509
           max_depth : 2
       learning_rate : 0.017147852308275726
    colsample_bytree : 0.3735095670847926
           subsample : 0.6802727435343728
               alpha : 21.759838932398416
              lambda : 18.033517505460377
               gamma : 9.34455689217273
    min_child_weight : 0.39756461408827354
best objective value : 0.007937796374567296
Optuna XGB train: 9.024606503172446 0.01470599127067962 88.24956822395325
Min_prd:  575
Constant guess:  14.023942927689596 0.0
XGB test: 14.192609780601327 -0.01490470258145593
XGB GS test: 14.087690210426105 -0.005434828800949587
Optuna XGB test: 14.097072510523505 -0.008217293847714968


(75993, 46)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,BAspr,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l1BAspr,l3amhd,l3MAX,l3BAspr,l6amhd,l6MAX,l6BAspr,l12amhd,l12MAX,l12BAspr,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
1156,10025,599,158.189432,-5.022474,2008,-0.4991,15.0,-1.952836,0.208828,0.532784,0.079522,0.8335,-36.267527,0.5084,2.045092,1.564722,1.192492,6.4372,2.92494,2.532906,2.525805,0.553926,5.333826,-2.833924,0.138217,0.49263,-0.230583,5.885158,0.313061,5.7842,0.518303,0.195908,6.7404,0.416791,-0.073123,11.3755,0.664894,-0.123588,5.7842,0.385138,65.921853,1.894051,1.627898,0.908137,2.089061,2.006254
1157,10025,600,124.460223,-20.289089,2008,-6.9809,15.0,-1.952836,0.208828,0.532784,0.079522,-0.4991,-31.960679,0.67604,2.13246,1.283755,1.261232,9.8376,3.93523,2.800506,2.684324,0.977199,5.331117,-2.833924,0.138217,0.49263,-0.230583,5.835001,0.5084,6.4372,0.553926,0.21569,2.5582,0.190476,-0.027171,4.2986,0.163934,-0.216606,6.4372,0.476515,31.631842,2.000991,1.969442,0.882032,2.070222,2.027439
1158,10025,601,135.589552,-21.721917,2008,-7.3709,15.0,-1.952836,0.208828,0.532784,0.079522,-6.9809,-31.294902,0.816429,2.261645,1.714087,1.265152,5.5953,2.884987,2.977204,2.772837,0.210011,5.249631,-2.833924,0.138217,0.49263,-0.230583,5.811174,0.67604,9.8376,0.977199,0.313061,5.7842,0.518303,-0.0134,4.033,0.209974,-0.151384,9.8376,0.215054,16.640814,1.428514,1.255339,0.926147,1.908736,2.050792
1159,10025,602,106.971105,-29.424658,2008,-33.8723,15.0,-1.952836,0.208828,0.532784,0.079522,-7.3709,-38.829846,1.003851,2.089856,1.745884,1.263638,5.6901,2.598776,2.864899,2.79013,1.175517,5.175006,-2.833924,0.138217,0.49263,-0.230583,5.858889,0.816429,5.5953,0.210011,0.5084,6.4372,0.553926,0.195908,6.7404,0.416791,-0.186485,5.5953,0.287293,22.908957,2.017218,1.651721,1.007585,2.083395,2.093771
1160,10025,603,39.462652,-64.862586,2008,0.253,15.0,-1.584946,0.181465,0.518737,-0.020983,-22.380465,-43.58062,1.113378,3.987491,3.447501,1.263138,3.9483,4.136117,3.263374,3.009246,0.293945,4.760279,-1.952836,0.208828,0.532784,0.079522,5.828432,1.003851,5.6901,1.175517,0.67604,9.8376,0.977199,0.21569,2.5582,0.190476,-0.200377,5.6901,0.416393,27.70753,1.55209,1.325,1.109816,2.115923,2.128906


count    75993.000000
mean      2009.293435
std          0.988989
min       2008.000000
25%       2008.000000
50%       2009.000000
75%       2010.000000
max       2011.000000
Name: year, dtype: float64

PERMNO          75993
prd             75993
mom482          66357
mom242          74466
year            75993
RET             75993
ind             75993
bm              75993
op              75993
gp              75993
inv             75838
mom11           75993
mom122          75993
amhd            73690
ivol_capm       75992
ivol_ff5        75992
beta_bw         75993
MAX             75993
vol1m           75990
vol6m           75886
vol12m          75679
BAspr           75536
size            75993
lbm             75993
lop             75993
lgp             75993
linv            75993
llme            75993
l1amhd          73692
l1MAX           75992
l1BAspr         75518
l3amhd          73695
l3MAX           75972
l3BAspr         75506
l6amhd          73678
l6MAX           75956
l6BAspr         75445
l12amhd         73578
l12MAX          75992
l12BAspr        75332
l12mom122       75844
l12ivol_capm    75930
l12ivol_ff5     75930
l12beta_bw      75955
l12vol6m        75811
l12vol12m 

Number of features before transformation:  (71322, 44)
time to do feature proprocessing: 
Number of features after transformation:  (71322, 92)
mae of a constant model 10.037046635152372
R2 of a constant model 0.0
XGB train: 9.760712862590289 0.09358310824487537
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta

[32m[I 2022-08-25 21:18:47,956][0m A new study created in memory with name: no-name-c333cff7-656e-4e51-951e-b865c60c2fba[0m


XGB train: 9.929117147790853 0.05635211483289826 46.126760721206665


[32m[I 2022-08-25 21:18:52,233][0m Trial 0 finished with value: 0.005322787624741703 and parameters: {'n_estimators': 933, 'max_depth': 2, 'learning_rate': 0.0035504132012419737, 'colsample_bytree': 0.17049506314726862, 'subsample': 0.43025035502262304, 'alpha': 0.48045147834486623, 'lambda': 3.715943603247201, 'gamma': 1.551561268987683, 'min_child_weight': 0.10610909085414198}. Best is trial 0 with value: 0.005322787624741703.[0m
[32m[I 2022-08-25 21:18:57,777][0m Trial 1 finished with value: 0.006723857611341455 and parameters: {'n_estimators': 508, 'max_depth': 5, 'learning_rate': 0.029465870298917392, 'colsample_bytree': 0.5967227255707233, 'subsample': 0.7558594578377145, 'alpha': 0.6956784829403881, 'lambda': 167.51717677154733, 'gamma': 3.596485745389904e-10, 'min_child_weight': 1.1914748262851673}. Best is trial 1 with value: 0.006723857611341455.[0m
[32m[I 2022-08-25 21:19:01,606][0m Trial 2 finished with value: -0.0005760884478441175 and parameters: {'n_estimators': 

Total time for hypermarameter optimization  94.00881671905518
        n_estimators : 656
           max_depth : 4
       learning_rate : 0.023842781044028122
    colsample_bytree : 0.846786254608283
           subsample : 0.49735372953457946
               alpha : 0.2666874782213661
              lambda : 82.98814139840577
               gamma : 0.0011233868771867684
    min_child_weight : 0.38064788283321005
best objective value : 0.009552537534777548
Optuna XGB train: 9.860614932384733 0.06916504137619384 96.93542790412903
Min_prd:  600
Constant guess:  7.151687740274089 0.0
XGB test: 7.194567545345738 0.006236133789213394
XGB GS test: 7.163474458498653 0.00725061280046424
Optuna XGB test: 7.205065529671993 4.012371960093741e-05


(72134, 46)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,BAspr,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l1BAspr,l3amhd,l3MAX,l3BAspr,l6amhd,l6MAX,l6BAspr,l12amhd,l12MAX,l12BAspr,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
1181,10025,624,-33.213243,-7.391161,2010,6.1775,15.0,-1.051382,0.024348,0.282811,0.187862,-22.380465,105.899232,0.870701,2.88454,2.491471,1.042598,3.0466,3.074331,2.646712,3.08848,0.076746,5.183114,-1.584946,0.181465,0.518737,-0.020983,4.638982,1.577377,5.7746,0.114155,1.854471,4.6104,0.202224,2.174688,4.5574,0.106752,2.390736,5.7746,0.902778,-57.420339,5.671538,3.734227,0.985554,5.641742,4.94001
1182,10025,625,-20.492828,12.269626,2010,-9.0219,15.0,-1.051382,0.024348,0.282811,0.187862,6.1775,27.912777,0.496119,1.779394,1.283221,0.992666,5.7236,1.997209,2.550382,2.89394,0.141593,5.243151,-1.584946,0.181465,0.518737,-0.020983,4.925337,0.870701,3.0466,0.076746,1.720513,3.5128,0.179426,1.992475,6.1459,0.143431,2.428806,3.0466,0.432484,-46.44693,3.140712,2.651936,0.99276,4.970791,5.028419
1183,10025,626,-37.557134,31.626169,2010,-5.0219,15.0,-1.051382,0.024348,0.282811,0.187862,-9.0219,11.380036,0.417178,2.047126,1.913765,1.126987,7.2128,4.226972,2.894209,2.874546,0.641574,5.148709,-1.584946,0.181465,0.518737,-0.020983,5.12368,0.496119,5.7236,0.141593,1.577377,5.7746,0.114155,1.954377,5.0362,0.522466,2.411381,5.7236,0.555556,-23.033321,4.105175,3.845008,0.986286,5.052746,5.121087
1184,10025,627,-48.7609,30.25479,2010,21.0118,15.0,-1.100725,0.176879,0.498498,-0.078728,-5.0219,-4.803229,0.328364,3.347596,2.637752,1.12671,12.1202,3.699867,3.165143,2.899452,0.126957,5.09802,-1.051382,0.024348,0.282811,0.187862,5.186815,0.417178,7.2128,0.641574,0.870701,3.0466,0.076746,1.854471,4.6104,0.202224,2.402947,7.2128,1.108647,41.924835,3.267046,2.774924,0.985537,4.931069,5.064231
1185,10025,628,-36.440453,77.12089,2010,-18.1761,15.0,-1.100725,0.176879,0.498498,-0.078728,21.0118,-25.218487,0.339769,1.49587,0.993694,1.161303,4.1322,1.798698,3.032113,2.873359,0.402414,5.264156,-1.051382,0.024348,0.282811,0.187862,5.382046,0.328364,12.1202,0.126957,0.496119,5.7236,0.141593,1.720513,3.5128,0.179426,2.345499,12.1202,0.209393,50.689511,2.190124,2.024474,0.972879,4.687398,4.996808


count    72134.000000
mean      2011.347118
std          1.000128
min       2010.000000
25%       2011.000000
50%       2011.000000
75%       2012.000000
max       2013.000000
Name: year, dtype: float64

PERMNO          72134
prd             72134
mom482          64969
mom242          71280
year            72134
RET             72134
ind             72134
bm              72134
op              72134
gp              72134
inv             71987
mom11           72134
mom122          72134
amhd            70467
ivol_capm       72133
ivol_ff5        72133
beta_bw         72134
MAX             72134
vol1m           72133
vol6m           72080
vol12m          71966
BAspr           71963
size            72134
lbm             72134
lop             72134
lgp             72134
linv            72134
llme            72134
l1amhd          70471
l1MAX           72129
l1BAspr         71947
l3amhd          70470
l3MAX           72115
l3BAspr         71923
l6amhd          70414
l6MAX           72101
l6BAspr         71877
l12amhd         70264
l12MAX          72129
l12BAspr        71683
l12mom122       72038
l12ivol_capm    72087
l12ivol_ff5     72087
l12beta_bw      72094
l12vol6m        72054
l12vol12m 

Number of features before transformation:  (68474, 44)
time to do feature proprocessing: 
Number of features after transformation:  (68474, 91)
mae of a constant model 7.929203557642096
R2 of a constant model 0.0
XGB train: 7.614579200435014 0.10740142593489
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.0

[32m[I 2022-08-25 21:21:11,970][0m A new study created in memory with name: no-name-ba9a56d3-9709-48f9-9c56-f547d7384dae[0m


XGB train: 7.828758732420091 0.040164488128036524 43.893948554992676


[32m[I 2022-08-25 21:21:21,877][0m Trial 0 finished with value: -0.02756818022204511 and parameters: {'n_estimators': 990, 'max_depth': 5, 'learning_rate': 0.03750624270771889, 'colsample_bytree': 0.5606786338927369, 'subsample': 0.7132237014079852, 'alpha': 3.0247723658862555, 'lambda': 0.14515931770000318, 'gamma': 0.00028025061074782835, 'min_child_weight': 8.882808380781123}. Best is trial 0 with value: -0.02756818022204511.[0m
[32m[I 2022-08-25 21:21:28,728][0m Trial 1 finished with value: -0.010691635155818094 and parameters: {'n_estimators': 699, 'max_depth': 5, 'learning_rate': 0.035994077505641195, 'colsample_bytree': 0.39485794973622357, 'subsample': 0.7295880981637665, 'alpha': 10.518285007775635, 'lambda': 2.724478265294206, 'gamma': 2.5977902760869326e-10, 'min_child_weight': 38.61082680490899}. Best is trial 1 with value: -0.010691635155818094.[0m
[32m[I 2022-08-25 21:21:33,441][0m Trial 2 finished with value: 0.007172902571093585 and parameters: {'n_estimators': 

Total time for hypermarameter optimization  97.3347270488739
        n_estimators : 868
           max_depth : 2
       learning_rate : 0.013868329963668649
    colsample_bytree : 0.4393040670491829
           subsample : 0.45423772830872244
               alpha : 0.7953939799955668
              lambda : 44.464833772439434
               gamma : 0.5014330335588163
    min_child_weight : 2.2100718698548856
best objective value : 0.009888624410767832
Optuna XGB train: 7.876640243138913 0.02196288131081059 99.35954713821411
Min_prd:  625
Constant guess:  7.739690600361432 0.0
XGB test: 7.8125700982203465 -0.02018297952134418
XGB GS test: 7.772694962113474 -0.013107832891068316
Optuna XGB test: 7.768919000518592 -0.012510181276847554


(66111, 46)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,BAspr,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l1BAspr,l3amhd,l3MAX,l3BAspr,l6amhd,l6MAX,l6BAspr,l12amhd,l12MAX,l12BAspr,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
1206,10025,649,30.604529,47.614289,2012,-0.9851,15.0,-0.953163,0.060639,0.373348,-0.025756,0.1724,14.759414,1.259688,1.2047,0.923609,1.058419,2.5839,1.533648,2.224711,2.413616,0.404975,5.260005,-1.100725,0.176879,0.498498,-0.078728,5.226473,1.278464,2.9882,0.371854,1.383731,4.1328,0.513906,1.510489,8.7149,0.558214,1.43161,2.9882,0.397878,7.446355,1.532176,1.429862,1.297452,2.318278,2.736604
1207,10025,650,97.106725,44.386493,2012,26.1222,15.0,-0.953163,0.060639,0.373348,-0.025756,-0.9851,13.571797,1.141539,0.765013,0.647098,1.026432,1.4066,0.86687,1.761031,2.337123,0.579206,5.250206,-1.100725,0.176879,0.498498,-0.078728,5.238599,1.259688,2.5839,0.404975,1.369098,3.7563,0.490055,1.482684,9.2927,0.468883,1.488235,2.5839,0.579216,20.52537,1.116961,0.998634,1.245832,2.151214,2.495713
1208,10025,651,138.786753,50.502876,2012,7.876,15.0,-0.899741,0.072943,0.362002,0.184931,26.1222,18.270241,1.103795,1.517394,1.511676,1.007201,5.7938,1.525676,1.485825,2.33476,0.136519,5.482288,-0.953163,0.060639,0.373348,-0.025756,5.188325,1.141539,1.4066,0.579206,1.278464,2.9882,0.371854,1.460316,8.6057,0.567577,1.494722,1.4066,0.307377,28.446093,1.852721,1.680914,1.262682,2.178625,2.343239
1209,10025,652,162.91372,87.364374,2012,7.1633,15.0,-0.899741,0.072943,0.362002,0.184931,7.876,60.787559,1.047262,0.937883,0.768468,0.991882,3.1364,0.989203,1.371582,2.299344,0.578902,5.558824,-0.953163,0.060639,0.373348,-0.025756,5.001724,1.103795,5.7938,0.136519,1.259688,2.5839,0.404975,1.383731,4.1328,0.513906,1.504704,5.7938,0.367918,0.922674,1.208446,0.908065,1.307383,2.056547,2.32501
1210,10025,653,174.780357,113.102274,2012,20.3276,15.0,-0.899741,0.072943,0.362002,0.184931,7.1633,72.766685,0.949094,1.120278,0.963537,1.038772,4.3231,1.269428,1.271617,2.116573,0.335306,5.628101,-0.953163,0.060639,0.373348,-0.025756,5.005778,1.047262,3.1364,0.578902,1.141539,1.4066,0.579206,1.369098,3.7563,0.490055,1.523562,3.1364,0.681302,14.425587,2.072178,1.96863,1.081815,2.335976,2.367897


count    66111.000000
mean      2013.422970
std          1.009211
min       2012.000000
25%       2013.000000
50%       2013.000000
75%       2014.000000
max       2015.000000
Name: year, dtype: float64

PERMNO          66111
prd             66111
mom482          62388
mom242          65530
year            66111
RET             66111
ind             66111
bm              66111
op              66111
gp              66111
inv             66000
mom11           66111
mom122          66111
amhd            64693
ivol_capm       66109
ivol_ff5        66109
beta_bw         66111
MAX             66111
vol1m           66108
vol6m           66066
vol12m          65985
BAspr           66022
size            66111
lbm             66111
lop             66111
lgp             66111
linv            66111
llme            66111
l1amhd          64677
l1MAX           66107
l1BAspr         66017
l3amhd          64655
l3MAX           66100
l3BAspr         65998
l6amhd          64619
l6MAX           66093
l6BAspr         65978
l12amhd         64571
l12MAX          66107
l12BAspr        65935
l12mom122       66049
l12ivol_capm    66076
l12ivol_ff5     66076
l12beta_bw      66081
l12vol6m        66057
l12vol12m 

Number of features before transformation:  (62745, 44)
time to do feature proprocessing: 
Number of features after transformation:  (62745, 92)
mae of a constant model 7.4311450540111785
R2 of a constant model 0.0
XGB train: 7.102998486516917 0.11496400870943368
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta

[32m[I 2022-08-25 21:23:36,984][0m A new study created in memory with name: no-name-6a78c6b3-c4c3-4944-821b-b3267daab7f2[0m


XGB train: 7.285970160193694 0.04909639497219265 42.33363389968872


[32m[I 2022-08-25 21:23:41,820][0m Trial 0 finished with value: 0.005934044239519656 and parameters: {'n_estimators': 898, 'max_depth': 3, 'learning_rate': 0.013970707160544726, 'colsample_bytree': 0.8448742919963662, 'subsample': 0.3896946637885105, 'alpha': 0.4208552572168877, 'lambda': 79.98885655352393, 'gamma': 1.5171802297443182e-06, 'min_child_weight': 0.7178420046803845}. Best is trial 0 with value: 0.005934044239519656.[0m
[32m[I 2022-08-25 21:23:47,359][0m Trial 1 finished with value: 0.004510653682763724 and parameters: {'n_estimators': 937, 'max_depth': 3, 'learning_rate': 0.019082538960682405, 'colsample_bytree': 0.9063572459045574, 'subsample': 0.34944577971150903, 'alpha': 0.24312218987902928, 'lambda': 37.06873922960986, 'gamma': 2.0386676291445407, 'min_child_weight': 4.32935086765049}. Best is trial 0 with value: 0.005934044239519656.[0m
[32m[I 2022-08-25 21:23:49,997][0m Trial 2 finished with value: 0.007682508625491711 and parameters: {'n_estimators': 512, '

Total time for hypermarameter optimization  85.87788891792297
        n_estimators : 647
           max_depth : 3
       learning_rate : 0.012450448458307003
    colsample_bytree : 0.24137118054832493
           subsample : 0.8316929767773923
               alpha : 29.93291444686363
              lambda : 0.5837812878112858
               gamma : 0.04927291921313879
    min_child_weight : 3.72748330467612
best objective value : 0.008893848798401294
Optuna XGB train: 7.32321188261344 0.03272832722748553 87.77718997001648
Min_prd:  650
Constant guess:  6.455628673143056 0.0
XGB test: 6.3659382164190115 0.04702019993165385
XGB GS test: 6.3439868155999255 0.04081648160705098
Optuna XGB test: 6.321963503175968 0.04786719426337405


(60932, 46)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,BAspr,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l1BAspr,l3amhd,l3MAX,l3BAspr,l6amhd,l6MAX,l6BAspr,l12amhd,l12MAX,l12BAspr,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
1231,10025,674,27.416551,-18.3196,2014,9.4132,15.0,-1.27065,0.13971,0.476473,0.037948,-10.5278,-57.331083,-0.453503,1.169023,0.920656,0.869967,1.4066,1.272945,2.137565,2.669958,0.550875,5.087173,-0.899741,0.072943,0.362002,0.184931,6.146838,-0.530406,2.3184,0.462161,-0.798248,3.9944,0.182274,-0.646577,2.3122,0.342742,-0.354324,2.3184,0.047699,105.899232,1.154988,1.042161,0.75938,1.379997,1.399697
1232,10025,675,28.555097,-19.497988,2014,16.8626,15.0,-0.980287,0.091844,0.388207,0.09299,9.4132,-57.158226,-0.219798,3.222446,2.820006,1.011708,10.1363,3.617839,2.391608,2.532018,0.622463,5.177135,-1.27065,0.13971,0.476473,0.037948,6.031552,-0.453503,1.4066,0.550875,-0.61369,2.9798,0.45045,-0.78228,6.1561,0.245098,-0.396455,1.4066,0.040102,91.596131,4.875088,4.522004,0.821775,2.256988,1.924399
1233,10025,676,68.314359,-15.979361,2014,3.9264,15.0,-0.980287,0.091844,0.388207,0.09299,16.8626,-56.790596,-0.112437,2.09263,1.927199,1.034622,5.965,2.230764,2.446523,2.631069,0.768849,5.332964,-1.27065,0.13971,0.476473,0.037948,6.113684,-0.219798,10.1363,0.622463,-0.530406,2.3184,0.462161,-0.793291,3.5475,0.454002,-0.403102,10.1363,0.394964,58.268392,0.765013,0.654865,0.827337,2.210333,1.880524
1234,10025,677,80.406968,-28.051947,2014,-10.5785,15.0,-0.980287,0.091844,0.388207,0.09299,3.9264,-47.916712,-0.058889,0.85028,0.806903,1.000746,2.4426,0.86687,2.346222,2.599881,0.383401,5.371477,-1.27065,0.13971,0.476473,0.037948,6.082726,-0.112437,5.965,0.768849,-0.453503,1.4066,0.550875,-0.798248,3.9944,0.182274,-0.466285,5.965,0.064045,60.21641,1.443863,1.275272,0.847987,2.247112,1.909812
1235,10025,678,58.24535,-41.759961,2014,21.4418,15.0,-0.980287,0.091844,0.388207,0.09299,-10.5785,-43.009059,0.128252,2.112572,1.65776,0.983134,4.6989,2.23995,2.203089,2.253126,0.189343,5.259667,-1.27065,0.13971,0.476473,0.037948,6.031191,-0.058889,2.4426,0.383401,-0.219798,10.1363,0.622463,-0.61369,2.9798,0.45045,-0.538077,2.4426,0.139451,29.091358,5.116995,4.540554,0.901574,3.008285,2.369508


count    60932.000000
mean      2015.503381
std          1.010979
min       2014.000000
25%       2015.000000
50%       2015.000000
75%       2016.000000
max       2017.000000
Name: year, dtype: float64

PERMNO          60932
prd             60932
mom482          57428
mom242          60392
year            60932
RET             60932
ind             60932
bm              60932
op              60932
gp              60932
inv             60785
mom11           60932
mom122          60932
amhd            59954
ivol_capm       60931
ivol_ff5        60931
beta_bw         60932
MAX             60932
vol1m           60930
vol6m           60881
vol12m          60786
BAspr           60867
size            60932
lbm             60932
lop             60932
lgp             60932
linv            60932
llme            60932
l1amhd          59947
l1MAX           60931
l1BAspr         60866
l3amhd          59929
l3MAX           60920
l3BAspr         60857
l6amhd          59887
l6MAX           60915
l6BAspr         60850
l12amhd         59810
l12MAX          60931
l12BAspr        60810
l12mom122       60888
l12ivol_capm    60916
l12ivol_ff5     60916
l12beta_bw      60919
l12vol6m        60896
l12vol12m 

Number of features before transformation:  (57612, 44)
time to do feature proprocessing: 
Number of features after transformation:  (57612, 92)
mae of a constant model 7.844483176058157
R2 of a constant model 0.0
XGB train: 7.463082080313984 0.12777261074441615
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=

[32m[I 2022-08-25 21:25:49,601][0m A new study created in memory with name: no-name-03400079-7a3f-44ca-8908-de03a1096848[0m


XGB train: 7.698325588684248 0.05027393848590822 41.73289680480957


[32m[I 2022-08-25 21:25:58,855][0m Trial 0 finished with value: -0.017507152426829303 and parameters: {'n_estimators': 944, 'max_depth': 5, 'learning_rate': 0.02173842376668825, 'colsample_bytree': 0.7059334365524941, 'subsample': 0.4516252398816166, 'alpha': 19.95713097370607, 'lambda': 0.4772871198201621, 'gamma': 9.308233133556265e-05, 'min_child_weight': 6.005904061507015}. Best is trial 0 with value: -0.017507152426829303.[0m
[32m[I 2022-08-25 21:26:05,029][0m Trial 1 finished with value: 0.004789122681059692 and parameters: {'n_estimators': 932, 'max_depth': 4, 'learning_rate': 0.019002484298095177, 'colsample_bytree': 0.5365596807266103, 'subsample': 0.6844164371402591, 'alpha': 23.59544126090816, 'lambda': 38.07860697981912, 'gamma': 1.6254762635028171e-07, 'min_child_weight': 4.237638540528578}. Best is trial 1 with value: 0.004789122681059692.[0m
[32m[I 2022-08-25 21:26:11,215][0m Trial 2 finished with value: 0.0010626717378536907 and parameters: {'n_estimators': 652,

Total time for hypermarameter optimization  92.27102971076965
        n_estimators : 601
           max_depth : 3
       learning_rate : 0.015196251991589521
    colsample_bytree : 0.41612524983927524
           subsample : 0.764846491142368
               alpha : 0.5476846156741678
              lambda : 25.534537192933342
               gamma : 2.2101698854530036e-09
    min_child_weight : 1.540181610474194
best objective value : 0.011918020633176344
Optuna XGB train: 7.7165973749988765 0.04007504337618861 93.96981620788574
Min_prd:  675
Constant guess:  6.559515936576189 0.0
XGB test: 6.606500270058161 -0.025128448860597175
XGB GS test: 6.587640379139121 -0.010631009835597505
Optuna XGB test: 6.5926129951965695 -0.008785808551218466
3660.1989362239838     min_prd      xgbf     xgbgs      xgbo
0       100 -0.033926 -0.019374 -0.024648
1       125  0.001887  0.006944  0.004293
2       150  0.009361  0.018258  0.019385
3       175 -0.008671 -0.002783  -0.00082
4       200  0.012099   0

In [33]:
results

Unnamed: 0,min_prd,xgbf,xgbgs,xgbo
0,100,-0.033926,-0.019374,-0.024648
1,125,0.001887,0.006944,0.004293
2,150,0.009361,0.018258,0.019385
3,175,-0.008671,-0.002783,-0.00082
4,200,0.012099,0.01089,0.007637
5,225,0.024659,0.024404,0.024533
6,250,-0.004759,0.003918,0.005939
7,275,0.013317,0.017266,0.022634
8,300,-0.009404,-0.000589,-0.001855
9,325,-0.00607,0.004755,0.006592


In [31]:
print('Total time for a script: ', time.time()-time0)

Total time for a script:  3660.214448451996


In [34]:
results.iloc[:,1:].mean()

xgbf     0.004061
xgbgs    0.008791
xgbo     0.008833
dtype: float64

In [None]:
# 3yr window, trials=20, cv_reg=0.03: 0.88%. runs 1 hr.
