### This is a new version of MLEAP scripts, started in late Aug 2022.
It will combine IProject_MLEAP_ANN and IP_MLEAP script, while improving them.

#### Outline

1. Load libraries and data.
2. pEDA. Look at feature distribution, fix them if they do not look right.
3. Train-test split. Most likely a couple of years go into the test set (2015-2018?). Impute missing values.
4. Transform numerical features, add ohe for inds.
5. Fit classic models: ols as a baseline, then xgb.
6. Fit DL.


Notes:
ideally, I want to use time-based cross-validation.
since I have panel data, it is not a trivial task.
need to find some solution online.
e.g., https://towardsdatascience.com/time-based-cross-validation-d259b13d42b8.

For now, I will try a simple for loop.


In [53]:
# 0. Import libraries #

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, time, math, re, warnings, random, gc, dill, optuna, pickle
import statsmodels.api as sm
from random import sample

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNetCV
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.inspection import permutation_importance
from category_encoders import MEstimateEncoder
from xgboost import XGBRegressor

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

plt.style.use('seaborn-white')
warnings.simplefilter(action='ignore')
pd.set_option('display.max_columns', 110)
gc.enable()

In [54]:
### target encoding ###
# source: https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices/notebook

class CrossFoldEncoder:
    """Out-of-fold wrapper around a column encoder class.

    ``encoder`` is a class (not an instance) that is instantiated as
    ``encoder(cols=cols, **kwargs)`` and must provide ``fit(X, y)`` and
    ``transform(X)``. Rows are split with a 4-fold ``KFold``.
    """

    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        # Extra keyword arguments forwarded to every encoder instance.
        self.kwargs_ = kwargs
        self.cv_ = KFold(n_splits=4)

    def fit_transform(self, X, y, cols):
        """Encode ``cols`` of ``X`` fold by fold and return the encoded columns.

        For every split a fresh encoder is fitted on the first index set
        yielded by the splitter and used to transform the rows of the second
        index set, so no row is encoded by an encoder that saw its own target.
        The per-fold encoders are kept for later use by ``transform``.

        Returns a DataFrame holding only ``cols``, renamed ``<col>_encoded``.
        """
        self.fitted_encoders_ = []
        self.cols_ = cols
        encoded_parts = []
        for fit_rows, apply_rows in self.cv_.split(X):
            fold_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fold_encoder.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
            held_out = X.iloc[apply_rows, :]
            encoded_parts.append(fold_encoder.transform(held_out)[cols])
            self.fitted_encoders_.append(fold_encoder)
        stacked = pd.concat(encoded_parts)
        stacked.columns = [f"{name}_encoded" for name in stacked.columns]
        return stacked

    def transform(self, X):
        """Encode new data with the average of all per-fold encoders.

        Returns a DataFrame holding only the fitted columns, renamed
        ``<col>_encoded``.
        """
        from functools import reduce

        def _accumulate(running_total, fold_frame):
            # Missing cells count as zero when summing the fold encodings.
            return running_total.add(fold_frame, fill_value=0)

        per_fold = [
            fold_encoder.transform(X)[self.cols_]
            for fold_encoder in self.fitted_encoders_
        ]
        averaged = reduce(_accumulate, per_fold) / len(per_fold)
        averaged.columns = [f"{name}_encoded" for name in averaged.columns]
        return averaged

In [55]:
# Choose a tf.distribute strategy: a TPUStrategy when a TPU cluster resolver
# can be created, otherwise the strategy TensorFlow currently reports.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if not tpu:
    strategy = tf.distribute.get_strategy()
else:
    # Connect to and initialise the TPU system before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [56]:
# Rolling-window backtest: for every start period in min_prd_list, fit three
# XGBoost variants (fixed hyperparameters, grid search, Optuna) on the training
# window and record their R2 on the single period that follows the window.

time0 = time.time()

#min_prd_list = range(100, 676, 25)
min_prd_list = [125]
windows_width = 3*12     # number of training periods in each window
cv_regularizer=0.03      # weight of the (train R2 - CV R2) penalty in the Optuna objective
optuna_trials = 40

results = pd.DataFrame(columns = ['min_prd', 'xgbf', 'xgbgs', 'xgbo'])
results.min_prd = min_prd_list

# The input file does not change between windows, so read it once instead of
# once per iteration.
# NOTE: pickle.load can execute arbitrary code; only point this at a trusted file.
with open('../input/kaggle-46pkl/IMLEAP_v4.pkl', 'rb') as pickled_one:
    df_all = pickle.load(pickled_one)

for min_prd in min_prd_list:

    # Keep the periods from min_prd-1 to min_prd+windows_width+1; .copy() keeps
    # the in-place edits below away from the full data set.
    df = df_all[df_all.prd.isin(range(min_prd-1, min_prd+windows_width+2))].copy()
    # Drop columns that are populated in fewer than half of the rows.
    df_cnt = df.count()
    empty_cols = list(df_cnt[df_cnt<int(df.shape[0]/2)].index)
    df.drop(columns=empty_cols, inplace=True)
    display(df.shape, df.head(), df.year.describe(), df.count())

    # Trim extreme returns, then demean RET within each period.
    df = df[(df.RET>-50)&(df.RET<75)]
    meanret = df.groupby('prd').RET.mean().to_frame().reset_index().rename(columns={'RET':'mRET'})
    df = pd.merge(df, meanret, on='prd', how='left')
    df.RET = df.RET-df.mRET
    df.drop(columns='mRET', inplace=True)

    # Missing-value indicator columns, created before imputation hides the gaps.
    features_miss_dummies = ['amhd', 'BAspr']
    for col in features_miss_dummies:
        if col in df.columns:
            df[col+'_miss'] = df[col].isnull().astype(int)

    # Train on periods before min_prd+windows_width, test on that period.
    # drop() returns new frames, so the imputation below never writes into a
    # slice of df.
    temp_cols = ['PERMNO', 'year']
    train = df[df.prd<(min_prd+windows_width)].drop(columns=temp_cols)
    test = df[df.prd==(min_prd+windows_width)].drop(columns=temp_cols)

    time_prep = time.time()
    col_ignore = ['RET', 'prd']
    col_cat = ['ind']
    col_num = [x for x in train.columns if x not in col_ignore+col_cat]
    for col in col_num:
        # Impute both sets with the TRAIN median, computed once per column.
        col_median = train[col].median()
        train[col] = train[col].fillna(col_median)
        test[col] = test[col].fillna(col_median)
    for col in col_cat:
        train[col] = train[col].fillna(value=-1000)
        test[col] = test[col].fillna(value=-1000)

    X_train = train.copy()
    y_train = X_train.pop('RET')
    X_test = test.copy()
    y_test = X_test.pop('RET')

    # Scale numeric features, one-hot encode the categorical ones, and pass the
    # rest (prd) through untouched.
    # NOTE(review): `sparse` was renamed `sparse_output` in scikit-learn 1.2;
    # adjust if the environment is upgraded.
    feature_transformer = ColumnTransformer([('num', StandardScaler(), col_num),
                                            ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore", drop='if_binary'), col_cat)],
                                            remainder="passthrough")

    print('Number of features before transformation: ', X_train.shape)
    X_train_arr = feature_transformer.fit_transform(X_train)
    feature_names = feature_transformer.get_feature_names_out()
    X_train = pd.DataFrame(X_train_arr, columns=feature_names)
    X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_names)
    print('time to do feature proprocessing: ', time.time()-time_prep)
    print('Number of features after transformation: ', X_train.shape)

    # Copies that still contain the period column, kept for later inspection.
    X_train0 = X_train.copy()
    y_train0 = y_train.copy()

    # The period identifier is not used as a model feature.
    X_train.drop(columns=['remainder__prd'], inplace=True)
    X_test.drop(columns=['remainder__prd'], inplace=True)

    # Baseline on the whole windowed sample (df is not split here).
    const_pred_all = np.ones(df.shape[0])*(df.RET.mean())
    print('mae of a constant model', mean_absolute_error(df.RET, const_pred_all))
    print('R2 of a constant model', r2_score(df.RET, const_pred_all))

    # 1) XGB with fixed hyperparameters.
    xgb1 = XGBRegressor(tree_method = 'gpu_hist', n_estimators=300, max_depth=5, eta=0.03, colsample_bytree=0.6)
    xgb1.fit(X_train, y_train)
    pred_train = xgb1.predict(X_train)
    print('XGB train:', mean_absolute_error(y_train, pred_train), r2_score(y_train, pred_train))

    # 2) XGB tuned with a small grid search.
    time1 = time.time()
    xgb = XGBRegressor(tree_method = 'gpu_hist')
    param_grid = {'n_estimators':[400, 700], 'max_depth':[2,3,4], 'eta':[0.006, 0.012, 0.02], 'subsample':[0.6], 'colsample_bytree':[0.6]}
    xgbm = GridSearchCV(xgb, param_grid, cv=2, verbose=2, scoring='r2')
    xgbm.fit(X_train, y_train)
    print('XGB', xgbm.best_params_, xgbm.best_score_, time.time()-time1)
    pred_train = xgbm.predict(X_train)
    print('XGB train:', mean_absolute_error(y_train, pred_train), r2_score(y_train, pred_train), time.time()-time1)

    # 3) XGB tuned with Optuna.
    time1 = time.time()
    def objective(trial, cv_runs=1, n_splits=2, n_jobs=-1):
        """Optuna objective: shuffled K-fold CV R2 on the current training window.

        The returned score is the out-of-fold R2 minus ``cv_regularizer`` times
        the gap between in-fold and out-of-fold R2, averaged over ``cv_runs``
        repetitions. ``n_jobs`` is forwarded to ``XGBRegressor``.
        """

        params = {
        "tree_method": 'gpu_hist',
        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "n_estimators": trial.suggest_int("n_estimators", 500, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 5),
        "learning_rate": trial.suggest_uniform("learning_rate", 0.001, 0.05),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.1, 0.95),
        "subsample": trial.suggest_uniform("subsample", 0.3, 0.95),
        "alpha": trial.suggest_loguniform("alpha", 0.1, 30.0),
        "lambda": trial.suggest_loguniform("lambda", 0.1, 200.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-10, 10.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 0.1, 50)    }

        temp_out = []
        X_values = X_train.values
        y_values = y_train.values

        for i in range(cv_runs):

            # The original passed `njobs=-1`, which is not an XGBRegressor
            # parameter; the intended keyword is `n_jobs`.
            model = XGBRegressor(**params, n_jobs=n_jobs)
            rkf = KFold(n_splits=n_splits, shuffle=True)
            y_pred = np.zeros_like(y_values)
            y_pred_train = np.zeros_like(y_values)
            for train_index, test_index in rkf.split(X_values):
                X_A, X_B = X_values[train_index, :], X_values[test_index, :]
                y_A, y_B = y_values[train_index], y_values[test_index]
                model.fit(X_A, y_A, eval_set=[(X_B, y_B)], verbose = False)
                y_pred[test_index] = model.predict(X_B)
                y_pred_train[train_index] = model.predict(X_A)

            score_train = r2_score(y_values, y_pred_train)
            score_test = r2_score(y_values, y_pred)
            overfit = (score_train-score_test)
            temp_out.append(score_test-cv_regularizer*overfit)

        return (np.mean(temp_out))

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=optuna_trials)
    print('Total time for hypermarameter optimization ', time.time()-time1)
    hp = study.best_params
    for key, value in hp.items():
        print(f"{key:>20s} : {value}")
    print(f"{'best objective value':>20s} : {study.best_value}")
    # Refit on the full training window with the best trial's parameters.
    optuna_hyperpars = study.best_params
    optuna_hyperpars['tree_method']='gpu_hist'
    optuna_xgb = XGBRegressor(**optuna_hyperpars)
    optuna_xgb.fit(X_train, y_train)
    pred_train = optuna_xgb.predict(X_train)
    print('Optuna XGB train:',
          mean_absolute_error(y_train, pred_train), r2_score(y_train, pred_train), time.time()-time1)

    # Evaluate performance of XGB models (each test prediction computed once):
    pred_xgb1 = xgb1.predict(X_test)
    pred_xgbgs = xgbm.predict(X_test)
    pred_xgbo = optuna_xgb.predict(X_test)
    r2_xgb1 = r2_score(y_test, pred_xgb1)
    r2_xgbgs = r2_score(y_test, pred_xgbgs)
    r2_xgbo = r2_score(y_test, pred_xgbo)

    const_pred_test = np.ones(len(y_test))*y_test.mean()
    print('Min_prd: ', min_prd)
    print('Constant guess: ', mean_absolute_error(y_test, const_pred_test),
          r2_score(y_test, const_pred_test))
    print('XGB test:', mean_absolute_error(y_test, pred_xgb1), r2_xgb1)
    print('XGB GS test:', mean_absolute_error(y_test, pred_xgbgs), r2_xgbgs)
    print('Optuna XGB test:', mean_absolute_error(y_test, pred_xgbo), r2_xgbo)

    results.loc[results.min_prd==min_prd,'xgbf':'xgbo'] = r2_xgb1, r2_xgbgs, r2_xgbo

print(time.time()-time0, results)

(40682, 41)

Unnamed: 0,PERMNO,prd,mom482,mom242,year,RET,ind,bm,op,gp,inv,mom11,mom122,amhd,ivol_capm,ivol_ff5,beta_bw,MAX,vol1m,vol6m,vol12m,size,lbm,lop,lgp,linv,llme,l1amhd,l1MAX,l3amhd,l3MAX,l6amhd,l6MAX,l12amhd,l12MAX,l12mom122,l12ivol_capm,l12ivol_ff5,l12beta_bw,l12vol6m,l12vol12m
49,10006,124,57.022356,27.230111,1968,-10.5198,25.0,-0.22327,0.183384,0.269118,0.100395,3.4619,12.086336,1.536049,1.37765,1.078594,0.854703,4.982,1.945352,2.653852,2.174071,5.86758,-0.149515,0.173745,0.242714,0.119169,5.740148,1.573985,21.135115,1.636271,4.2086,1.653423,2.5316,1.833293,21.135115,6.066114,1.906794,1.54551,1.069805,1.668624,1.814489
50,10006,125,45.576895,32.149862,1968,-6.2596,25.0,-0.22327,0.183384,0.269118,0.100395,-10.5198,24.581595,1.493226,2.217285,1.948001,0.794842,2.7293,2.301335,2.7978,2.238881,5.751293,-0.149515,0.173745,0.242714,0.119169,5.660875,1.536049,4.982,1.641999,2.2022,1.585251,1.7344,1.792446,4.982,27.909142,1.503199,1.346875,1.007441,1.599164,1.760787
51,10006,126,16.380849,21.566933,1968,4.3219,25.0,-0.22327,0.183384,0.269118,0.100395,-6.2596,13.253947,1.469247,4.800074,4.663338,0.777908,7.2477,4.297812,3.251701,2.512787,5.691229,-0.149515,0.173745,0.242714,0.119169,5.648296,1.493226,2.7293,1.573985,21.135115,1.624355,2.1833,1.744677,2.7293,25.117139,1.051638,0.849621,0.989661,1.421761,1.677152
52,10006,127,32.954723,53.105785,1968,9.7618,25.0,-0.22327,0.183384,0.269118,0.100395,4.3219,11.098127,1.375395,1.477982,1.203894,0.787439,4.4095,1.764988,3.249921,2.531631,5.737749,-0.149515,0.173745,0.242714,0.119169,5.606947,1.469247,7.2477,1.536049,4.982,1.636271,4.2086,1.727351,7.2477,26.865911,1.174851,1.018713,1.014681,1.442506,1.555119
53,10006,128,34.099293,39.4152,1968,3.945,25.0,-0.22327,0.183384,0.269118,0.100395,9.7618,21.666549,1.246353,1.734429,1.621548,0.750551,5.4695,1.618184,3.253052,2.540064,5.82476,-0.149515,0.173745,0.242714,0.119169,5.549943,1.375395,4.4095,1.493226,2.7293,1.641999,2.2022,1.738743,4.4095,20.035787,1.903329,1.637691,0.996995,1.592287,1.605068


count    40682.000000
mean      1969.798437
std          0.978257
min       1968.000000
25%       1969.000000
50%       1970.000000
75%       1971.000000
max       1971.000000
Name: year, dtype: float64

PERMNO          40682
prd             40682
mom482          35826
mom242          40286
year            40682
RET             40682
ind             40682
bm              40682
op              40682
gp              40682
inv             40678
mom11           40682
mom122          40682
amhd            37268
ivol_capm       40681
ivol_ff5        40681
beta_bw         40682
MAX             40682
vol1m           40681
vol6m           40682
vol12m          40678
size            40682
lbm             40682
lop             40682
lgp             40682
linv            40682
llme            40682
l1amhd          37313
l1MAX           40682
l3amhd          37398
l3MAX           40682
l6amhd          37547
l6MAX           40682
l12amhd         37988
l12MAX          40682
l12mom122       40497
l12ivol_capm    40673
l12ivol_ff5     40673
l12beta_bw      40680
l12vol6m        40637
l12vol12m       40296
dtype: int64

Number of features before transformation:  (38133, 38)
time to do feature proprocessing: 
Number of features after transformation:  (38133, 82)
mae of a constant model 7.1541100803200415
R2 of a constant model 0.0
XGB train: 6.700516343240393 0.15021333381905422
Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.5s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=400, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=2, n_estimators=700, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.6, eta=0.006, max_depth=3, n_estimators=400, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.6, eta

[32m[I 2022-08-25 23:26:17,045][0m A new study created in memory with name: no-name-dcb1ae61-d0f9-4c94-925c-ba8c8561e892[0m


XGB train: 7.036265023358253 0.043365857443366185 34.69868850708008


[32m[I 2022-08-25 23:26:22,370][0m Trial 0 finished with value: 0.014243362118309039 and parameters: {'n_estimators': 918, 'max_depth': 4, 'learning_rate': 0.015818278138253396, 'colsample_bytree': 0.8705046290735095, 'subsample': 0.3580922111480277, 'alpha': 20.01355931834925, 'lambda': 65.5957786917176, 'gamma': 7.128270112596257e-10, 'min_child_weight': 0.9269185763770273}. Best is trial 0 with value: 0.014243362118309039.[0m
[32m[I 2022-08-25 23:26:25,497][0m Trial 1 finished with value: 0.013336363709325663 and parameters: {'n_estimators': 952, 'max_depth': 2, 'learning_rate': 0.027600597650666924, 'colsample_bytree': 0.46830375618901376, 'subsample': 0.32284844460025897, 'alpha': 0.42036573214548145, 'lambda': 0.17453470429992365, 'gamma': 1.3213762680631284e-09, 'min_child_weight': 33.29476183750863}. Best is trial 0 with value: 0.014243362118309039.[0m
[32m[I 2022-08-25 23:26:29,049][0m Trial 2 finished with value: 0.010077829269954212 and parameters: {'n_estimators': 6

Total time for hypermarameter optimization  187.6615331172943
        n_estimators : 970
           max_depth : 3
       learning_rate : 0.0050977294928359305
    colsample_bytree : 0.607679988114767
           subsample : 0.7333499650041201
               alpha : 21.598320214878488
              lambda : 165.2061301493154
               gamma : 1.9262607351514998e-09
    min_child_weight : 0.15652865250400633
best objective value : 0.01904475392832938
Optuna XGB train: 7.048069547071229 0.03465041881051134 189.68923211097717
Min_prd:  125
Constant guess:  6.801798451895731 0.0
XGB test: 6.690736270893462 0.001886781638528956
XGB GS test: 6.7089180299557585 0.006943571377436508
Optuna XGB test: 6.696254549506994 0.006086495549869619
226.8033630847931    min_prd      xgbf     xgbgs      xgbo
0      125  0.001887  0.006944  0.006086


In [57]:
# Show the per-window test-set R2 table filled in by the rolling-window loop.
results

Unnamed: 0,min_prd,xgbf,xgbgs,xgbo
0,125,0.001887,0.006944,0.006086


In [58]:
# Seconds elapsed since time0 was set at the start of the rolling-window cell.
print('Total time for a script: ', time.time()-time0)

Total time for a script:  226.83410501480103


In [59]:
# Column means of every column after the first (min_prd), i.e. each model's
# R2 averaged over the evaluated windows.
results.iloc[:,1:].mean()

xgbf     0.001887
xgbgs    0.006944
xgbo     0.006086
dtype: float64

In [60]:
# 3yr window, trials=20, cv_reg=0.03: 0.88%. runs 1 hr.
# 3yr, t=40, cv_reg=0.04: 0.96%.



In [61]:
# NOTE(review): results0 is not assigned anywhere in the visible notebook;
# presumably a saved copy of `results` from an earlier multi-window run - confirm.
results0

Unnamed: 0,min_prd,xgbf,xgbgs,xgbo
0,100,-0.033926,-0.019374,-0.02375
1,125,0.001887,0.006944,0.009033
2,150,0.009361,0.018258,0.018259
3,175,-0.008671,-0.002783,-0.000479
4,200,0.012099,0.01089,0.009292
5,225,0.024659,0.024404,0.025087
6,250,-0.004759,0.003918,0.006247
7,275,0.013317,0.017266,0.022251
8,300,-0.009404,-0.000589,-0.000884
9,325,-0.00607,0.004755,0.009447


In [65]:
# Inspect the passthrough period column kept in X_train0, the copy taken
# before 'remainder__prd' was dropped from X_train.
X_train0.remainder__prd


0        124.0
1        125.0
2        126.0
3        127.0
4        128.0
         ...  
38128    156.0
38129    157.0
38130    158.0
38131    159.0
38132    160.0
Name: remainder__prd, Length: 38133, dtype: float64

In [None]:
### try NN

# Widths of the hidden Dense layers. Each one is preceded by BatchNormalization
# and followed by Dropout(0.4); a single linear unit produces the prediction.
hidden_units = [64, 64, 32, 32, 16, 16, 8]

ann_layers = []
for layer_no, units in enumerate(hidden_units):
    if layer_no == 0:
        # The input dimension is the number of FEATURES (columns), shape[1].
        # The original used shape[0], the number of rows, which does not match
        # the data passed to fit().
        ann_layers.append(BatchNormalization(input_shape=(X_train.shape[1],)))
    else:
        ann_layers.append(BatchNormalization())
    # A fresh LeakyReLU instance per layer, as in the original stack.
    ann_layers.append(Dense(units, activation=tf.keras.layers.LeakyReLU(alpha=0.01)))
    ann_layers.append(Dropout(0.4))
ann_layers.append(Dense(1))

model_ann5_s = Sequential(ann_layers)

print(model_ann5_s.count_params())

In [None]:
# Compile and train the ANN, plot the loss curves, and report train/test R2.
early_stopping30 = EarlyStopping(patience=30)
time1 = time.time()
optimizer_adam = tf.keras.optimizers.Adam()
model_ann5_s.compile(loss= "mean_squared_error" , optimizer=optimizer_adam, metrics=["mean_squared_error"])
# Fixes relative to the original cell:
#  * the callback referenced early_stopping20, which is never defined;
#  * s_train_X / s_train_y / s_test_X / s_test_y are not defined in this
#    notebook, so the matching X_train / y_train / X_test / y_test are used.
# NOTE(review): the test period doubles as the validation set for early
# stopping, so the reported test R2 is not fully out-of-sample - confirm this
# is intended.
history = model_ann5_s.fit(X_train, y_train, validation_data=(X_test, y_test),
                           batch_size=8192, epochs=50, verbose=2, callbacks=[early_stopping30])
history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
print("Minimum Validation Loss: {:0.4f}".format(history_df['val_loss'].min()))

print([r2_score(y_train, model_ann5_s.predict(X_train)),
       r2_score(y_test, model_ann5_s.predict(X_test))])
print(time.time()-time1)