## HYPER-PARAMETER OPTIMIZATION

**Content**




*   Installing libraries & loading functions and data from repository
*   Download and transform stock data into input data
*   Load preprocessed input data and create candlestick pattern feature vectors
*   Hyper-parameter optimization for neural network models
*   Grid-search for random forest model
*   Grid-search for ridge/LASSO regression



\


**This code block contains**

1.   Installation of required libraries
2.   Import of helper functions
3.   Import of the pre-processed data











In [None]:
!pip install yfinance
!pip install h5py scikit-optimize

!git clone https://github.com/JakobsGit/MTMLmodels.git repo-dir

%cd repo-dir

import get_data_functions
from get_data_functions import *

import preprocessing_data_helpers
from preprocessing_data_helpers import *

import create_keras_models
from create_keras_models import *

import create_results
from create_results import *


Collecting yfinance
  Downloading yfinance-0.1.63.tar.gz (26 kB)
Collecting lxml>=4.5.1
  Downloading lxml-4.6.3-cp37-cp37m-manylinux2014_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 2.9 MB/s 
Building wheels for collected packages: yfinance
  Building wheel for yfinance (setup.py) ... [?25l[?25hdone
  Created wheel for yfinance: filename=yfinance-0.1.63-py2.py3-none-any.whl size=23918 sha256=3f010be3b977ad4785d81998e28b059d657293c1ff8717110488478fe6957935
  Stored in directory: /root/.cache/pip/wheels/fe/87/8b/7ec24486e001d3926537f5f7801f57a74d181be25b11157983
Successfully built yfinance
Installing collected packages: lxml, yfinance
  Attempting uninstall: lxml
    Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
Successfully installed lxml-4.6.3 yfinance-0.1.63
Collecting scikit-optimize
  Downloading scikit_optimize-0.8.1-py2.py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 1

Install ta-lib and define functions using the library

In [None]:
!wget https://launchpad.net/~mario-mariomedina/+archive/ubuntu/talib/+files/libta-lib0_0.4.0-oneiric1_amd64.deb -qO libta.deb
!wget https://launchpad.net/~mario-mariomedina/+archive/ubuntu/talib/+files/ta-lib0-dev_0.4.0-oneiric1_amd64.deb -qO ta.deb
!dpkg -i libta.deb ta.deb
!pip install ta-lib
import talib

def technicalsfeatures(dataset):
    """Add one TA-Lib candlestick-pattern column per pattern to ``dataset``.

    The pattern list is TA-Lib's 'Pattern Recognition' function group.  Each
    pattern is evaluated separately for every distinct value of the 'Stock'
    column, using that stock's 'Open', 'High', 'Low' and 'Close' series.
    Finally every column from position 12 onwards is divided by 100.

    The frame is modified in place and also returned.
    """
    pattern_names = talib.get_function_groups()['Pattern Recognition']

    # Create every pattern column up front, filled with 0.
    for name in pattern_names:
        dataset[name] = 0

    price_columns = ('Open', 'High', 'Low', 'Close')
    for ticker in np.unique(dataset['Stock']):
        rows = dataset.index[dataset['Stock'] == ticker]
        op, hi, lo, cl = [dataset.loc[rows, column] for column in price_columns]

        for name in pattern_names:
            pattern_func = getattr(talib, name)
            dataset.loc[rows, name] = pattern_func(op, hi, lo, cl)

    # Rescale all columns from position 12 onwards by 1/100.
    rescaled = dataset.iloc[:, 12:] / 100
    dataset.iloc[:, 12:] = rescaled
    return dataset


def create_Xlin(y_df, most_frequent_n):
    """Build the candlestick-pattern feature matrix from the most frequent patterns.

    ``y_df`` is copied, enriched with the pattern columns produced by
    ``technicalsfeatures`` and the columns from position 18 onwards are
    treated as pattern columns.  For each of them the number of rows with a
    non-zero value is counted on the rows dated before '2015-01-01'.  A
    pattern is kept when its count is at least the ``most_frequent_n``-th
    largest count, so ties at the threshold can yield more than
    ``most_frequent_n`` columns.

    Parameters
    ----------
    y_df : pandas.DataFrame
        Must provide a 'Date' column plus the columns needed by
        ``technicalsfeatures``; it is not modified.
    most_frequent_n : int
        Rank of the frequency used as selection threshold.

    Returns
    -------
    numpy.ndarray
        All rows of the selected pattern columns, in their original column
        order.

    Raises
    ------
    IndexError
        If ``most_frequent_n`` exceeds the number of pattern columns.
    """
    import numpy as np

    y_df_lin_data = technicalsfeatures(y_df.copy())

    all_patterns = y_df_lin_data.iloc[:, 18:]
    in_training_period = (y_df_lin_data.Date < '2015-01-01').values
    train_patterns = all_patterns[in_training_period]

    # Number of training rows on which each pattern is non-zero.
    pattern_freq = (train_patterns != 0).sum(axis=0).values

    threshold = np.sort(pattern_freq)[-most_frequent_n]

    # A boolean mask turned into integer positions keeps column 0 selectable
    # and hands .iloc proper integer indexers.
    selected_positions = np.flatnonzero(pattern_freq >= threshold)

    return np.asarray(all_patterns.iloc[:, selected_positions])


Selecting previously unselected package libta-lib0.
(Reading database ... 160837 files and directories currently installed.)
Preparing to unpack libta.deb ...
Unpacking libta-lib0 (0.4.0-oneiric1) ...
Selecting previously unselected package ta-lib0-dev.
Preparing to unpack ta.deb ...
Unpacking ta-lib0-dev (0.4.0-oneiric1) ...
Setting up libta-lib0 (0.4.0-oneiric1) ...
Setting up ta-lib0-dev (0.4.0-oneiric1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
Processing triggers for libc-bin (2.27-3ubuntu1.2) ...
/sbin/ldconfig.real: /usr/local/lib/python3.7/dist-packages/ideep4py/lib/libmkldnn.so.0 is not a symbolic link

Collecting ta-lib
  Downloading TA-Lib-0.4.21.tar.gz (270 kB)
[K     |████████████████████████████████| 270 kB 7.2 MB/s 
Building wheels for collected packages: ta-lib
  Building wheel for ta-lib (setup.py) ... [?25l[?25hdone
  Created wheel for ta-lib: filename=TA_Lib-0.4.21-cp37-cp37m-linux_x86_64.whl size=1444983 sha256=e27176582a91a5906507cf2c5380ff8c1b03

**Download and transform stock data into input data**

*Note: This takes quite some time. Alternatively, the preprocessed data can be loaded in the next step.*

*Note 2: This step requires the TA-Lib installation from the previous step.*


In [None]:
# Download the raw stock data and turn it into the model inputs X, y, y_df and X_lin.
# The helper functions called here are not defined in this notebook; they are
# presumably provided by the star-imported repository modules (TODO confirm).

#define input parameters
forecastdays = 1
approach = 240
timesteps = 20
n=1
returnfeature = 1

# get s&p 500 data with highest trade volume from yahoo finance and replace the "nan"
stockdata = getsp500data(numberofstocks=50,startdate='1999-12-31', enddate='2019-12-31')  
# return value is discarded, so replacenans presumably works in place (TODO confirm)
replacenans(stockdata)

#data preprocessing
dataset = createreturncolumn(stockdata,forecastdays,approach)
dataset = createtargetcolumn(dataset,approach)
dataset = deletedividendentries(dataset)

# create feature vectors
X, y, y_df  = createseries(dataset, timesteps, n, returnfeature)
# flatten the last two axes of the 3-d array X into one feature axis per sample
X = np.reshape(X, (X.shape[0], X.shape[1]*X.shape[2]))

# create candlestick pattern feature vectors with the most frequently occurring patterns
most_frequent_n = 20
X_lin = create_Xlin(y_df, most_frequent_n)

**Load preprocessed input data and create candlestick pattern feature vectors**


In [None]:
# Load the preprocessed inputs from files in the current working directory.

# price data set; the 'Unnamed: 0' column read from the file is dropped
dataset = pd.read_csv('dataset_50SP500_stocks.zip')
dataset = dataset.drop(columns='Unnamed: 0')

# data frame with features and targets; the 'Unnamed: 0' column is dropped as well
y_df = pd.read_csv('df_with_features.zip')
y_df = y_df.drop(columns='Unnamed: 0')

# target vector taken from the 'Target' column
y = np.asarray(y_df.Target)

# candlestick pattern features
X_lin = np.load('X_lin.npy')

# the feature array is stored in two files that are stacked along axis 0
X1 = np.load('X_array_part1.npy')
X2 = np.load('X_array_part2.npy')

X = np.concatenate((X1,X2), axis = 0)

**Hyper-parameter optimization for neural network models**

*   LSTM
*   GRU
*   Fusion ANN
*   Fusion LSTM
*   Fusion GRU


In [None]:
# -*- coding: utf-8 -*-

#import used libraries
import numpy as np
import math
import pandas as pd
import os

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Activation
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping

import skopt
from skopt import gp_minimize 
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args
from skopt import dump, load

#set random seeds
seed_value= 1

# NOTE: hash randomisation is fixed when the interpreter starts, so this only
# affects interpreters started after this point, not the running kernel.
os.environ['PYTHONHASHSEED']=str(seed_value)

import random
random.seed(seed_value)
np.random.seed(seed_value)

# Seed TensorFlow: try the TF2 name first, fall back to the TF1 name.
# Only a missing attribute triggers the fallback; other errors are not hidden.
try:
    tf.random.set_seed(seed_value)
except AttributeError:
    tf.set_random_seed(seed_value)

# Single-threaded session: try the TF1 API first, fall back to tf.compat.v1
# when the top-level TF1 names do not exist.
try:
  session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
  sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
  K.set_session(sess)
except AttributeError:
  session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
  sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
  tf.compat.v1.keras.backend.set_session(sess)


from tensorflow.keras.metrics import *
from sklearn.utils import class_weight
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, accuracy_score

#####################################################
# Input / Parameter Definition
#####################################################

# select approach, timesteps and arch
# arch: 'simpleLSTM', 'simpleGRU', 'fusionANN', 'fusionLSTM', 'fusionGRU'

# approach 240: classification
#----------------
# Feature vector: [R_{t-19}, R_{t-18},..., R_{t}], Open, Close, High and Low Prices
# where R_t = P_t / P_{t-1} -1 with  R_{t,m}: Return calculated with P_t closing prices 
# The features are standardized (std dev = 1, mean = 0) with the training data
#
# Target: if stock return >0 -> 1 else: 0
#

arch = 'simpleGRU'
approach = 240
timesteps = 20
forecastdays = 1
toppercentile=1

num_of_iterations = 50

# create folder for specific model results; an existing folder is fine, any
# other failure (e.g. missing permissions) is raised instead of being swallowed
os.makedirs(arch, exist_ok=True)

#fusion = 1 feeds models with additional X_lin input and creates additional outputs 
fusion = 1 if arch in ('fusionANN', 'fusionLSTM', 'fusionGRU') else 0


# define parameter space
dim_learning_rate = Real(low=1e-3, high=1e-1, prior='log-uniform', name='learning_rate')
dim_num_nodes = Integer(low=20, high=100, name='num_nodes')
dim_num_batch_size = Integer(low=7, high=15, name='num_batch') # batch_size= 2^num_batch
dim_dropout_rate = Real(low=0.1, high=0.5, prior='log-uniform', name='dropout_rate')


dimensions = [dim_learning_rate,
              dim_num_nodes,
              dim_num_batch_size,
              dim_dropout_rate]

# define default parameters to start the hyperparameter optimization
# (same order as `dimensions`)
lr = 0.03
nodes = 100
batch = 15
dropout = 0.1
default_parameters = [lr, nodes, batch, dropout]

# regularization parameter, passed as `l2reg` to the model factories
l2reg = 0.9

# initialization of global metric value; a parameter set is only saved when
# its average validation loss is below this value
best_metric = 1.0

# calculate foldsize to distribute input data on equally-sized folds
number_of_folds = 6
last_training_date = '2014-12-31'
datevec = np.unique(y_df.Date)
# position, within the sorted unique dates, of the first date on or after last_training_date
last_train_date_index = np.where(datevec==min(y_df.loc[y_df.Date >= last_training_date].Date))[0][0]
numberofdays = datevec.shape[0]
days_for_val_test = numberofdays-last_train_date_index
fold_size = int(days_for_val_test/number_of_folds)


#define parameter & performance df for all iterations
zeroph = np.zeros(1)
parameter_dict = {'auc':zeroph, 'lr':zeroph, 'node':zeroph, 'batch':zeroph, 'dropout':zeroph}
best_parameter_df = pd.DataFrame(parameter_dict, columns = ['auc','lr','node','batch', 'dropout'], index = range(0,1)) 

# define dataframe to save the performance on the test set during the hyperparameter optimization 
test_perf_dict = {'auc':np.zeros(1), 'acc':np.zeros(1),'balacc':np.zeros(1), 'RMSE':np.zeros(1), 'MSE':np.zeros(1),'MAE':np.zeros(1)}
test_perf_df = pd.DataFrame(test_perf_dict, columns = ['auc','acc','balacc','RMSE','MSE','MAE']) 

# run time series validation, returning the average validation loss (out-of-sample validation)
def timeseriesCV(dataset, X,X_lin, y,y_df,fold_size, numberofdays, timesteps, learning_rate, num_nodes, num_batch, dropout_rate, number_of_folds, approach, forecastdays, arch):
    """Train and evaluate one hyper-parameter set over several time-series folds.

    For every fold index in ``range(1, number_of_folds)`` the data is split
    with ``splitdata``, standardised with ``standardize_input``, a new model
    of type ``arch`` is trained, and predictions for the train, validation
    and test parts are collected.  The score of a fold is the minimum
    'val_binary_crossentropy' in the training history; the function returns
    the average of these scores, ignoring NaN folds.

    When the average is lower than the module-level ``best_metric``, that
    global is updated and the performance tables, per-fold predictions,
    fusion weights (only if ``fusion == 1``) and per-fold models are written
    into the ``arch`` folder.

    Module-level names used besides the arguments: ``last_train_date_index``,
    ``l2reg``, ``fusion``, ``best_metric``, ``test_perf_df`` and
    ``best_parameter_df``.

    Parameters
    ----------
    dataset, X, X_lin, y, y_df, fold_size, timesteps, forecastdays
        Forwarded to ``splitdata`` / ``standardize_input``.
    numberofdays
        Accepted for interface compatibility; not used in the body.
    learning_rate, num_nodes, dropout_rate
        Forwarded to the model factory.
    num_batch : int
        The batch size used for training is ``2**num_batch``.
    number_of_folds : int
        Fold indices ``1 .. number_of_folds - 1`` are evaluated.
    approach
        Only used to build output file names.
    arch : str
        One of 'simpleLSTM', 'simpleGRU', 'fusionANN', 'fusionLSTM',
        'fusionGRU'.

    Returns
    -------
    float
        Average (over folds) of the best validation binary cross-entropy.

    Raises
    ------
    ValueError
        If ``arch`` is not one of the supported architectures.
    """
    global best_metric

    known_archs = ('simpleLSTM', 'simpleGRU', 'fusionANN', 'fusionLSTM', 'fusionGRU')
    if arch not in known_archs:
        # Fail before any training instead of hitting an undefined model later.
        raise ValueError('unknown arch: ' + str(arch))

    needs_sequence_input = arch in ('fusionLSTM', 'fusionGRU', 'simpleLSTM', 'simpleGRU')

    # per-fold metrics
    metric_list = []
    test_auc = []
    test_acc = []
    test_balacc = []
    val_auc = []
    val_acc = []
    val_balacc = []

    # per-fold prediction frames and fusion weights
    test_frames = []
    val_frames = []
    train_frames = []
    fold_weights = []

    for foldindex in range(1, number_of_folds):

        # split input data into train, validation and test sets
        (X_train, X_val, X_test,
         X_train_lin, X_val_lin, X_test_lin,
         y_train_df, y_val_df, y_test_df,
         y_train, y_val, y_test) = splitdata(X, X_lin, y, y_df, fold_size, foldindex, last_train_date_index)

        # standardize data set based on training data
        X_train, X_val, X_test = standardize_input(y_train_df, dataset, X_train, X_val, X_test, timesteps, forecastdays)

        # reshape input for LSTM and GRU networks
        # NOTE(review): the (20, 5) shape is hard-coded; 20 presumably
        # corresponds to `timesteps` -- confirm before changing timesteps.
        if needs_sequence_input:
            X_train = np.reshape(X_train, (X_train.shape[0], 20, 5))
            X_val = np.reshape(X_val, (X_val.shape[0], 20, 5))
            X_test = np.reshape(X_test, (X_test.shape[0], 20, 5))

        # transform target vector into 2d-vector (softmax activation)
        y_train_2d = create2dy(y_train)
        y_val_2d = create2dy(y_val)
        y_test_2d = create2dy(y_test)

        # calculate class weights; keyword arguments are required by current
        # scikit-learn releases and are accepted by older ones as well
        balanced_weights = class_weight.compute_class_weight('balanced',
                                                             classes=np.unique(y_train),
                                                             y=y_train)
        class_weights = dict(enumerate(balanced_weights))

        # create model based on input choice
        model_params = dict(learning_rate=learning_rate,
                            num_nodes=num_nodes,
                            dropout_rate=dropout_rate,
                            l2reg=l2reg)
        if arch == 'simpleLSTM':
            model = create_simple_lstm_model(**model_params)
        elif arch == 'simpleGRU':
            model = create_simple_gru_model(**model_params)
        elif arch == 'fusionANN':
            model = create_ann_fusion_model(X_train_lin=X_train_lin, **model_params)
        elif arch == 'fusionLSTM':
            model = create_lstm_fusion_model(X_train_lin=X_train_lin, **model_params)
        else:  # 'fusionGRU'
            model = create_gru_fusion_model(X_train_lin=X_train_lin, **model_params)

        # training stops after a patience period
        es = EarlyStopping(monitor='val_binary_crossentropy',
                           mode='min',
                           verbose=0,
                           patience=10)
        # model with best validation loss is saved
        checkpointer = ModelCheckpoint(filepath="modweights.hdf5",
                                       monitor='val_binary_crossentropy',
                                       mode='min',
                                       verbose=0,
                                       save_best_only=True)

        # fusion models take the pattern features as an additional first input
        if fusion == 1:
            train_in = [X_train_lin, X_train]
            val_in = [X_val_lin, X_val]
            test_in = [X_test_lin, X_test]
            fit_extras = {'class_weight': class_weights}
        else:
            train_in, val_in, test_in = X_train, X_val, X_test
            # NOTE(review): class weights are only applied to fusion models;
            # kept as is to preserve training behaviour -- confirm intent.
            fit_extras = {}

        # fit model
        history = model.fit(x=train_in,
                            y=y_train_2d,
                            epochs=2,
                            batch_size=2**num_batch,
                            validation_data=(val_in, y_val_2d),
                            verbose=0,
                            callbacks=[es, checkpointer],
                            **fit_extras)

        # best model is loaded
        model.load_weights('modweights.hdf5')
        # best model is saved for each fold
        model.save(str(foldindex) + arch + str(approach) + 'model.h5')

        # prediction on the test, validation and train sets
        y_test_pred = model.predict(test_in)
        y_val_pred = model.predict(val_in)
        y_train_pred = model.predict(train_in)
        for frame, prediction in ((y_test_df, y_test_pred),
                                  (y_val_df, y_val_pred),
                                  (y_train_df, y_train_pred)):
            frame['Prediction'] = prediction[:, 0]
            frame['foldindex'] = foldindex

        test_frames.append(y_test_df)
        val_frames.append(y_val_df)
        train_frames.append(y_train_df)

        # keep the weights of one layer per fold for interpretability
        if arch == 'fusionANN':
            fold_weights.append(model.layers[8].get_weights()[0])
        elif arch in ('fusionLSTM', 'fusionGRU'):
            fold_weights.append(model.layers[4].get_weights()[0])

        # add validation loss to metrics list
        bce = np.min(history.history['val_binary_crossentropy'])
        metric_list.append(bce)

        # calculate validation and test metrics
        print('val bce: ', bce)
        test_auc.append(roc_auc_score(y_test_2d, y_test_pred))
        test_acc.append(accuracy_score(y_test, np.round(y_test_pred[:, 0])))
        test_balacc.append(balanced_accuracy_score(y_test, np.round(y_test_pred[:, 0])))

        val_auc.append(roc_auc_score(y_val_2d, y_val_pred))
        val_acc.append(accuracy_score(y_val, np.round(y_val_pred[:, 0])))
        val_balacc.append(balanced_accuracy_score(y_val, np.round(y_val_pred[:, 0])))

    metric_list = [x for x in metric_list if (not math.isnan(x))]
    av_metric = np.average(metric_list)

    print(" ")
    # best_metric holds the lowest average validation BCE seen so far
    print('best bce: ', best_metric)

    # nothing to save unless this parameter set beats the best one so far
    # (a NaN average compares False and is therefore never saved)
    if not (av_metric < best_metric):
        return av_metric

    # save hyperparameter, performance, weights, model for best performing set of hyper parameters
    best_metric = av_metric
    file_prefix = arch + '/' + arch
    file_suffix = str(approach) + '_d_' + str(forecastdays) + '.csv'

    print('Test AUC: ', np.average(test_auc))
    test_perf_df['auc'] = np.average(test_auc)
    best_test_accuray = np.average(test_acc)
    print('TEST Accuracy: ', best_test_accuray)
    test_perf_df['acc'] = best_test_accuray
    best_test_balaccuray = np.average(test_balacc)
    print('TEST Balcanced Accuracy: ', best_test_balaccuray)
    test_perf_df['balacc'] = best_test_balaccuray
    test_perf_df.to_csv(file_prefix + 'test_performance_' + file_suffix, index=False)

    print('Val AUC: ', np.average(val_auc))
    print('Val Accuracy: ', np.average(val_acc))
    print('Val Balcanced Accuracy: ', np.average(val_balacc))

    best_parameter_df.loc[0, 'auc'] = np.average(val_auc)
    best_parameter_df.loc[0, 'lr'] = learning_rate
    best_parameter_df.loc[0, 'node'] = num_nodes
    best_parameter_df.loc[0, 'batch'] = 2**num_batch
    best_parameter_df.loc[0, 'dropout'] = dropout_rate
    best_parameter_df.to_csv(file_prefix + 'val_performance_' + file_suffix, index=False)

    # predictions of all folds, stacked in fold order
    pd.concat(test_frames).to_csv(file_prefix + 'test_df_' + file_suffix, index=False)
    pd.concat(val_frames).to_csv(file_prefix + 'val_df_' + file_suffix, index=False)
    pd.concat(train_frames).to_csv(file_prefix + 'train_df_' + file_suffix, index=False)

    if fusion == 1:
        # weights of all folds side by side (concatenated along axis 1)
        allweights = np.concatenate(fold_weights, axis=1)
        np.savetxt(file_prefix + 'allweights_' + file_suffix, allweights, delimiter=",")

    # copy the per-fold models of this parameter set into the arch folder
    for fold in range(1, number_of_folds):
        bestmodel = tf.keras.models.load_model(str(fold) + arch + str(approach) + 'model.h5')
        bestmodel.save(arch + '/' + str(fold) + arch + str(approach) + 'bestmodel.h5')

    return av_metric


# optimization function for the hyperparameter optimization
@use_named_args(dimensions=dimensions)
def fitness(learning_rate,
            num_nodes,
            num_batch,
            dropout_rate):
    """Objective for the optimizer: average validation loss of one parameter set.

    Prints the candidate hyper-parameters, runs ``timeseriesCV`` with them
    (all other arguments are taken from module-level variables) and returns
    the average validation binary cross-entropy it reports.
    """
    # Report the candidate hyper-parameters.
    print(f'learning rate: {learning_rate:.1e}')
    reported = (
        ('num lstm nodes:', num_nodes),
        ('batch size:', 2**num_batch),
        ('dropout:', dropout_rate),
    )
    for label, value in reported:
        print(label, value)
    print()

    # Build, train and validate the model on all folds for this candidate.
    cv_loss = timeseriesCV(
        dataset, X, X_lin, y, y_df, fold_size, numberofdays, timesteps,
        learning_rate, num_nodes, num_batch, dropout_rate,
        number_of_folds, approach, forecastdays, arch,
    )

    print()
    print("Average bce: ", cv_loss)
    print()

    return cv_loss
    

##########################################################
# HYPERPARAMETER OPTIMIZATION
##########################################################


# Bayesian optimization: minimise `fitness` over `dimensions`, starting from
# the default parameter set
search_result = gp_minimize(
    func=fitness,
    dimensions=dimensions,
    acq_func='EI',  # Expected Improvement.
    n_calls=num_of_iterations,
    x0=default_parameters,
)

# build and show the result tables for the selected architecture
performance_df, finperformance_df = createresults(arch, approach, forecastdays, toppercentile)
for results_table in (performance_df, finperformance_df):
    print(results_table)

# prediction matrices are only created for the fusion architectures
if arch in ('fusionANN', 'fusionLSTM', 'fusionGRU'):
    bullishmatrix, bearishmatrix = createpredictionmatrices(arch, approach, forecastdays)


**Grid-search for random forest model**

In [None]:
#import used libraries
import numpy as np
import math
import pandas as pd

# set random seed
# seed_value seeds Python's and NumPy's generators here and is later passed
# as `seed` to the H2O random forest estimator
seed_value= 1

import os
# NOTE: hash randomisation is fixed at interpreter start-up, so this only
# affects interpreters started after this point, not the running kernel
os.environ['PYTHONHASHSEED']=str(seed_value)

import random
random.seed(seed_value)
np.random.seed(seed_value)

from sklearn.metrics import roc_auc_score, balanced_accuracy_score, accuracy_score
from sklearn.metrics import  mean_squared_error, mean_absolute_error, log_loss

'''
! apt-get install default-jre
!java -version
! pip install h2o
import h2o
h2o.init()
'''

!pip install -q bentoml "h2o>=3.24.0.2"

import h2o
h2o.init(min_mem_size = "6g")


from h2o.estimators.random_forest import H2ORandomForestEstimator

#####################################################
# Input / Parameter Definition
#####################################################

# select approach, timesteps and arch
# arch: 'RF'
# approach 240: RF approach (classification)
#----------------
# Feature vector: [R_{t-19}, R_{t-18},..., R_{t}], Open, Close, High and Low Prices
# where R_t = P_t / P_{t-1} -1 with  R_{t,m}: Return calculated with P_t closing prices 
# The features are standardized (std dev = 1, mean = 0) with the training data
#
# define approach parameters
arch = 'RF'
approach = 240
timesteps = 20
forecastdays=1
toppercentile=1

# create folder for specific model results; an existing folder is fine, any
# other failure (e.g. missing permissions) is raised instead of being swallowed
os.makedirs(arch, exist_ok=True)

# initialization of global metric value; a parameter set is only saved when
# its average validation loss is below this value
best_metric = 1.0

# calculate foldsize to distribute input data on equally-sized folds
number_of_folds = 6
last_training_date = '2014-12-31'
datevec = np.unique(y_df.Date)
# position, within the sorted unique dates, of the first date on or after last_training_date
last_train_date_index = np.where(datevec==min(y_df.loc[y_df.Date >= last_training_date].Date))[0][0]
numberofdays = datevec.shape[0]
days_for_val_test = numberofdays-last_train_date_index
fold_size = int(days_for_val_test/number_of_folds)
val_days = fold_size


# define dataframe to save the performance on the test set during the hyperparameter optimization 
test_perf_dict = {'auc':np.zeros(1), 'acc':np.zeros(1),'balacc':np.zeros(1), 'RMSE':np.zeros(1), 'MSE':np.zeros(1),'MAE':np.zeros(1)}
test_perf_df = pd.DataFrame(test_perf_dict, columns = ['auc','acc','balacc','RMSE','MSE','MAE']) 

# one-row table holding the best parameter set found by the grid search
zeroph = np.zeros(1)
parameter_dict = {'auc':zeroph, 'trees':zeroph, 'depth':zeroph}
best_parameter_df_RF = pd.DataFrame(parameter_dict, columns = ['auc','trees','depth'], index = range(0,1)) 

# transform input data into h2o data frames
def createh2oframes(X, y):
    """Combine a 2-d feature array and its targets into one H2OFrame.

    Feature columns are labelled ``-X.shape[1] .. -1`` and the target is
    appended as column 'y'.  When the module-level ``approach`` is 31 or 240
    the 'y' column is converted to a factor.
    """
    row_labels = np.arange(0, len(y))
    feature_labels = np.arange(-X.shape[1], 0, 1)

    frame = pd.DataFrame(data=X[0:, 0:], index=row_labels, columns=feature_labels)
    frame['y'] = y

    hf = h2o.H2OFrame(frame)

    needs_factor_target = approach in (31, 240)
    if needs_factor_target:
        hf['y'] = hf['y'].asfactor()

    return hf


# run time series validation, returning the average validation loss (out-of-sample validation)
def timeseriesCV_RF(dataset,X,X_lin,y,y_df,fold_size, numberofdays, timesteps,ntrees,max_depth, number_of_folds, approach, forecastdays, arch):
    """Evaluate one random-forest parameter set over several time-series folds.

    For every fold index in ``range(1, number_of_folds)`` the data is split
    with ``splitdata``, standardised with ``standardize_input``, converted
    with ``createh2oframes`` and an ``H2ORandomForestEstimator`` is trained.
    The score of a fold is the log loss of the 'p1' predictions on the
    validation part; the function returns the average of these scores,
    ignoring NaN folds.

    When the average is lower than the module-level ``best_metric``, that
    global is updated and the performance tables and per-fold predictions
    are written into the ``arch`` folder.

    Module-level names used besides the arguments: ``last_train_date_index``,
    ``seed_value``, ``best_metric``, ``test_perf_df`` and
    ``best_parameter_df_RF``.

    Parameters
    ----------
    dataset, X, X_lin, y, y_df, fold_size, timesteps, forecastdays
        Forwarded to ``splitdata`` / ``standardize_input``.
    numberofdays
        Accepted for interface compatibility; not used in the body.
    ntrees, max_depth
        Forest size and maximum tree depth; both are cast to ``int``.
    number_of_folds : int
        Fold indices ``1 .. number_of_folds - 1`` are evaluated.
    approach
        Only used to build output file names.
    arch : str
        Name of the output folder and file-name prefix.

    Returns
    -------
    float
        Average validation log loss over the folds.
    """
    global best_metric

    def predict_p1(model, frame):
        # 'p1' column of the model's prediction for `frame`, as a numpy array
        predictions = model.predict(frame).as_data_frame(use_pandas=True)
        return np.array(predictions['p1'])

    # per-fold metrics
    metric_list = []
    test_auc = []
    test_acc = []
    test_balacc = []
    val_auc = []
    val_acc = []
    val_balacc = []

    # per-fold prediction frames
    test_frames = []
    val_frames = []
    train_frames = []

    for foldindex in range(1, number_of_folds):

        # prepare data
        (X_train, X_val, X_test,
         X_train_lin, X_val_lin, X_test_lin,
         y_train_df, y_val_df, y_test_df,
         y_train, y_val, y_test) = splitdata(X, X_lin, y, y_df, fold_size, foldindex, last_train_date_index)
        X_train, X_val, X_test = standardize_input(y_train_df, dataset, X_train, X_val, X_test, timesteps, forecastdays)

        hf_train = createh2oframes(X_train, y_train)
        hf_val = createh2oframes(X_val, y_val)
        hf_test = createh2oframes(X_test, y_test)

        # the target is the last column of the frame, everything else is a feature
        ydata = "y"
        xdata = hf_train.columns[:-1]

        # fit model
        rf_fit = H2ORandomForestEstimator(model_id='rf_fit', ntrees=int(ntrees), max_depth=int(max_depth), seed=seed_value, balance_classes=True)
        rf_fit.train(x=xdata, y=ydata, training_frame=hf_train, validation_frame=hf_val)

        # predict target for test, validation and training sets
        y_test_pred = predict_p1(rf_fit, hf_test)
        y_val_pred = predict_p1(rf_fit, hf_val)
        y_train_pred = predict_p1(rf_fit, hf_train)

        for frame, prediction in ((y_test_df, y_test_pred),
                                  (y_val_df, y_val_pred),
                                  (y_train_df, y_train_pred)):
            frame['Prediction'] = prediction
            frame['foldindex'] = foldindex

        test_frames.append(y_test_df)
        val_frames.append(y_val_df)
        train_frames.append(y_train_df)

        logloss = log_loss(y_val, y_val_pred)
        metric_list.append(logloss)

        # calculate metrics for validation and test sets
        test_auc.append(roc_auc_score(y_test, y_test_pred))
        test_acc.append(accuracy_score(y_test, np.round(y_test_pred)))
        test_balacc.append(balanced_accuracy_score(y_test, np.round(y_test_pred)))

        val_auc.append(roc_auc_score(y_val, y_val_pred))
        val_acc.append(accuracy_score(y_val, np.round(y_val_pred)))
        val_balacc.append(balanced_accuracy_score(y_val, np.round(y_val_pred)))

        # delete model to create a new model for the next fold
        h2o.remove_all()

    metric_list = [x for x in metric_list if (not math.isnan(x))]
    av_metric = np.average(metric_list)

    print(" ")
    # best_metric holds the lowest average validation log loss seen so far
    print('best logloss: ', best_metric)

    # nothing to save unless this parameter set beats the best one so far
    # (a NaN average compares False and is therefore never saved)
    if not (av_metric < best_metric):
        return av_metric

    # save hyper parameter, performance and prediction for best hyper parameters
    best_metric = av_metric
    file_prefix = arch + '/' + arch
    file_suffix = str(approach) + '_d_' + str(forecastdays) + '.csv'

    print('Test AUC: ', np.average(test_auc))
    test_perf_df['auc'] = np.average(test_auc)
    best_test_accuray = np.average(test_acc)
    print('TEST Accuracy: ', best_test_accuray)
    test_perf_df['acc'] = best_test_accuray
    best_test_balaccuray = np.average(test_balacc)
    print('TEST Balcanced Accuracy: ', best_test_balaccuray)
    test_perf_df['balacc'] = best_test_balaccuray
    test_perf_df.to_csv(file_prefix + 'test_performance_' + file_suffix, index=False)

    print('Val AUC: ', np.average(val_auc))
    print('Val Accuracy: ', np.average(val_acc))
    print('Val Balcanced Accuracy: ', np.average(val_balacc))

    # the 'auc' column stores the average validation AUC (not the log loss),
    # matching what is printed above
    best_parameter_df_RF.loc[0, 'auc'] = np.average(val_auc)
    best_parameter_df_RF.loc[0, 'trees'] = ntrees
    best_parameter_df_RF.loc[0, 'depth'] = max_depth
    best_parameter_df_RF.to_csv(file_prefix + 'val_performance_' + file_suffix, index=False)

    # predictions of all folds, stacked in fold order
    pd.concat(test_frames).to_csv(file_prefix + 'test_df_' + file_suffix, index=False)
    pd.concat(val_frames).to_csv(file_prefix + 'val_df_' + file_suffix, index=False)
    pd.concat(train_frames).to_csv(file_prefix + 'train_df_' + file_suffix, index=False)

    return av_metric


##########################################################
# HYPERPARAMETER OPTIMIZATION
##########################################################

# grid search over every (number of trees, tree depth) combination;
# av_metric_list collects the average validation loss of each combination
# in the order they are evaluated
av_metric_list = []
for numtrees in [100,300,500,1000,2000]:
  for treedepth in [20,10,5]:
    av_metric = timeseriesCV_RF(dataset,X,X_lin,y,y_df,fold_size, numberofdays, timesteps,numtrees,treedepth, number_of_folds, approach, forecastdays, arch) # numtrees -> ntrees, treedepth -> max_depth
    av_metric_list.append(av_metric)
    print('trees: ', numtrees, 'depth: ', treedepth)
    print(av_metric_list)

# build the result tables for the RF model
performance_df, finperformance_df = createresults(arch, approach, forecastdays, toppercentile)

print(performance_df)
print(finperformance_df)



**Grid-search for ridge/LASSO regression**

In [None]:
# -*- coding: utf-8 -*-

#import used libraries
import numpy as np
#import math
import pandas as pd

#set random seed
# seed_value seeds Python's and NumPy's random number generators
seed_value= 1
import os
# NOTE: hash randomisation is fixed at interpreter start-up, so this only
# affects interpreters started after this point, not the running kernel
os.environ['PYTHONHASHSEED']=str(seed_value)
import random
random.seed(seed_value)
np.random.seed(seed_value)

from sklearn.metrics import roc_auc_score, balanced_accuracy_score, accuracy_score, log_loss

'''
! apt-get install default-jre
!java -version
! pip install h2o
'''
!pip install -q bentoml "h2o>=3.24.0.2"

import h2o
h2o.init(min_mem_size = "6g")

from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
#####################################################
# Input / Parameter Definition
#####################################################

# Select approach, timesteps and arch.
# arch: 'ridge' or 'lasso'
# approach 240: classification
#----------------
# Feature vector: [R_{t-19}, R_{t-18},..., R_{t}], Open, Close, High and Low Prices
# where R_t = P_t / P_{t-1} - 1 is the return calculated with the closing prices P_t.
# The features are standardized (std dev = 1, mean = 0) with the training data.
#

fusion = 0
arch = 'ridge'
approach = 240
timesteps = 20
forecastdays = 1
toppercentile = 1

# Create the folder for this model's results. An already existing folder is
# fine; any other failure (e.g. missing permissions) is raised right here
# instead of being swallowed and resurfacing later when the CSVs are written.
os.makedirs(arch, exist_ok=True)

# Initial value of the global best metric. timeseriesCV_reg compares its
# average validation log loss against it and lowers it on improvement.
best_metric = 1.0

# Calculate the fold size so that the days after the last training date are
# distributed over equally sized folds.
number_of_folds = 6
last_training_date = '2014-12-31'
datevec = np.unique(y_df.Date)
# Position, within the sorted unique dates, of the first date that is on or
# after `last_training_date`.
first_date_after_training = min(y_df.loc[y_df.Date >= last_training_date].Date)
last_train_date_index = np.where(datevec == first_date_after_training)[0][0]
numberofdays = datevec.shape[0]
days_for_val_test = numberofdays - last_train_date_index  # -val_days
fold_size = int(days_for_val_test / number_of_folds)
val_days = fold_size


# Single-row dataframe that stores the test-set performance of the best
# hyper-parameter setting found during the optimization.
test_perf_dict = {'auc':np.zeros(1), 'acc':np.zeros(1),'balacc':np.zeros(1), 'RMSE':np.zeros(1), 'MSE':np.zeros(1),'MAE':np.zeros(1)}
test_perf_df = pd.DataFrame(test_perf_dict, columns = ['auc','acc','balacc','RMSE','MSE','MAE'])

# Single-row dataframe that stores the best validation metric and the
# regularization strength (lambda) that achieved it.
zeroph = np.zeros(1)
parameter_dict = {'auc':zeroph, 'lambda_val':zeroph}
best_parameter_df_reg = pd.DataFrame(parameter_dict, columns = ['auc','lambda_val'], index = range(0,1))


def createh2oframes(X, y):
    """Combine a feature matrix and its targets into a single H2OFrame.

    The feature columns are labelled -X.shape[1], ..., -1 and the target is
    appended as the last column, named 'y'. When the module-level ``approach``
    is 31 or 240, the 'y' column is converted to a factor.

    Args:
        X: two-dimensional feature array (indexed as ``X[rows, cols]``).
        y: target values, one per row of ``X``.

    Returns:
        The H2OFrame built from the features plus the 'y' column.
    """
    n_rows = len(y)
    n_features = X.shape[1]

    frame = pd.DataFrame(
        data=X[:, :],
        index=np.arange(n_rows),
        columns=np.arange(-n_features, 0),
    )
    frame['y'] = y

    hf = h2o.H2OFrame(frame)
    # classification approaches need a categorical response column
    if approach in (31, 240):
        hf['y'] = hf['y'].asfactor()

    return hf


def _write_csv_reg(frame, path):
    """Write ``frame`` to ``path`` as CSV, omitting the index column."""
    with open(path, 'w') as csv_file:
        frame.to_csv(path_or_buf=csv_file, index=False)


# Run time-series cross-validation, returning the average validation loss
# (out-of-sample validation).
def timeseriesCV_reg(dataset,X,X_lin,y,y_df,fold_size, numberofdays, timesteps, lambda_val, number_of_folds, approach, forecastdays, arch):
    """Evaluate one regularization strength of a ridge/LASSO H2O GLM.

    For every fold index in ``range(1, number_of_folds)`` the data is split
    with ``splitdata``, standardized with ``standardize_input``, and an
    ``H2OGeneralizedLinearEstimator`` is trained on the training part with the
    validation part as validation frame. The validation log loss of each fold
    is collected and their average (ignoring NaN values) is returned.

    If the average is lower than the module-level ``best_metric``, the
    function updates ``best_metric``, ``test_perf_df`` and
    ``best_parameter_df_reg`` and writes the performance tables and the
    train/validation/test predictions as CSV files into the ``arch`` folder.

    Besides its arguments the function reads the module-level names
    ``last_train_date_index`` and ``seed_value``.

    Args:
        dataset: passed through to ``standardize_input``.
        X: feature array, passed to ``splitdata``.
        X_lin: second feature array, passed to ``splitdata``; its splits are
            not used here.
        y: targets, passed to ``splitdata``.
        y_df: dataframe passed to ``splitdata``; the returned per-split frames
            receive the columns 'Prediction' and 'foldindex'.
        fold_size: fold size handed to ``splitdata``.
        numberofdays: unused; kept for interface compatibility.
        timesteps: passed through to ``standardize_input``.
        lambda_val: regularization strength (``lambda_`` of the GLM).
        number_of_folds: upper bound (exclusive) of the fold indices.
        approach: used in the names of the written CSV files.
        forecastdays: passed to ``standardize_input`` and used in file names.
        arch: 'lasso' (GLM alpha = 1) or 'ridge' (GLM alpha = 0); also the
            output folder and file-name prefix.

    Returns:
        The average validation log loss over all folds (NaN if no fold
        produced a finite value).

    Raises:
        ValueError: if ``arch`` is neither 'ridge' nor 'lasso'.
    """
    # Validate `arch` before any expensive work. The elastic-net mixing
    # parameter does not depend on the fold, so it is chosen once.
    if arch == 'lasso':
        alpha_value = 1
    elif arch == 'ridge':
        alpha_value = 0
    else:
        raise ValueError("arch must be 'ridge' or 'lasso', got " + repr(arch))

    metric_list = []
    test_auc, test_acc, test_balacc = [], [], []
    val_auc, val_acc, val_balacc = [], [], []
    test_frames, val_frames, train_frames = [], [], []

    for foldindex in range(1, number_of_folds):

        # prepare data
        (X_train, X_val, X_test, _X_train_lin, _X_val_lin, _X_test_lin,
         y_train_df, y_val_df, y_test_df, y_train, y_val, y_test) = splitdata(
            X, X_lin, y, y_df, fold_size, foldindex, last_train_date_index)
        X_train, X_val, X_test = standardize_input(
            y_train_df, dataset, X_train, X_val, X_test, timesteps, forecastdays)

        hf_train = createh2oframes(X_train, y_train)
        hf_val = createh2oframes(X_val, y_val)
        hf_test = createh2oframes(X_test, y_test)

        # the response is the last column ('y'); all others are predictors
        ydata = "y"
        xdata = hf_train.columns[:-1]

        # create + fit model
        glm = H2OGeneralizedLinearEstimator(alpha=alpha_value,
                                            lambda_=lambda_val,
                                            seed=seed_value)
        glm.train(x=xdata, y=ydata, training_frame=hf_train, validation_frame=hf_val)

        # Predict the test, validation and training sets. Only the 'p1'
        # column of each H2O prediction frame is kept.
        y_test_pred = np.array(glm.predict(hf_test).as_data_frame(use_pandas=True)['p1'])
        y_val_pred = np.array(glm.predict(hf_val).as_data_frame(use_pandas=True)['p1'])
        y_train_pred = np.array(glm.predict(hf_train).as_data_frame(use_pandas=True)['p1'])

        y_test_df['Prediction'] = y_test_pred
        y_test_df['foldindex'] = foldindex

        y_val_df['Prediction'] = y_val_pred
        y_val_df['foldindex'] = foldindex

        y_train_df['Prediction'] = y_train_pred
        y_train_df['foldindex'] = foldindex

        # collect the per-fold frames; they are concatenated once, and only
        # if this hyper-parameter setting turns out to be the best so far
        test_frames.append(y_test_df)
        val_frames.append(y_val_df)
        train_frames.append(y_train_df)

        # validation log loss is the metric that is optimized
        metric_list.append(log_loss(y_val, y_val_pred))

        # calculate metrics for validation and test sets
        test_auc.append(roc_auc_score(y_test, y_test_pred))
        test_acc.append(accuracy_score(y_test, np.round(y_test_pred)))
        test_balacc.append(balanced_accuracy_score(y_test, np.round(y_test_pred)))

        val_auc.append(roc_auc_score(y_val, y_val_pred))
        val_acc.append(accuracy_score(y_val, np.round(y_val_pred)))
        val_balacc.append(balanced_accuracy_score(y_val, np.round(y_val_pred)))

        # remove all H2O objects so the next fold starts with a fresh model
        h2o.remove_all()

    # np.isnan is used because `math` is not imported in this cell
    metric_list = [loss for loss in metric_list if not np.isnan(loss)]
    av_metric = np.average(metric_list)

    global best_metric
    print(" ")
    # the label says "acc", but the value is the best average validation
    # log loss so far (lower is better); the output text is left unchanged
    print('best acc: ',best_metric )

    # save hyper parameter, performance and prediction for best hyper parameters
    if av_metric < best_metric:
        best_metric = av_metric

        file_prefix = arch + '/' + arch
        file_suffix = str(approach) + '_d_' + str(forecastdays) + '.csv'

        print('Test AUC: ', np.average(test_auc))
        test_perf_df['auc'] = np.average(test_auc)
        best_test_accuray = np.average(test_acc)
        print('TEST Accuracy: ', best_test_accuray)
        test_perf_df['acc'] = best_test_accuray
        best_test_balaccuray = np.average(test_balacc)
        print('TEST Balcanced Accuracy: ', best_test_balaccuray)
        test_perf_df['balacc'] = best_test_balaccuray
        _write_csv_reg(test_perf_df, file_prefix + 'test_performance_' + file_suffix)

        print('Val AUC: ', np.average(val_auc))
        print('Val Accuracy: ', np.average(val_acc))
        print('Val Balcanced Accuracy: ', np.average(val_balacc))

        # NOTE: the column is called 'auc' but holds the average validation
        # log loss; the name is kept because other code may read this file.
        best_parameter_df_reg.loc[0,'auc'] = av_metric
        best_parameter_df_reg.loc[0,'lambda_val'] = lambda_val
        _write_csv_reg(best_parameter_df_reg, file_prefix + 'val_performance_' + file_suffix)

        _write_csv_reg(pd.concat(test_frames), file_prefix + 'test_df_' + file_suffix)
        _write_csv_reg(pd.concat(val_frames), file_prefix + 'val_df_' + file_suffix)
        _write_csv_reg(pd.concat(train_frames), file_prefix + 'train_df_' + file_suffix)

    return av_metric


##########################################################
# HYPERPARAMETER OPTIMIZATION
##########################################################

# Grid search over the regularization strength (lambda) of the GLM: every
# value in the list is evaluated once with timeseriesCV_reg, which returns the
# average validation log loss and writes the result CSVs whenever a value
# improves on the global `best_metric`.
av_metric_list = []
for lambdaval in [0.0000001,0.000001,0.00001,0.0001,0.001,0.01,0.1,0.9]:

    av_metric = timeseriesCV_reg(dataset,X,X_lin,y,y_df,fold_size, numberofdays, timesteps, lambdaval, number_of_folds, approach, forecastdays, arch)
    # keep the metric of every lambda, in evaluation order
    av_metric_list.append(av_metric)

# NOTE(review): createresults is presumably provided by the star-import of
# create_results at the top of the notebook and presumably evaluates the CSV
# files written during the search -- confirm against that module.
performance_df, finperformance_df = createresults(arch, approach, forecastdays, toppercentile)

print(performance_df)
print(finperformance_df)
