In [35]:
import sys
from google.colab import drive
drive.mount('/content/drive')
# to use the packages in google drive
sys.path.append('/content/drive/My Drive/6000M_proj2/proj2')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
from tensorflow import keras

In [37]:
!pip install loguru

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [38]:
import yfinance as yf
import pandas as pd
import seaborn as sns
import numpy as np
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout, LeakyReLU, LSTM, BatchNormalization
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
from scipy.stats import skew, kurtosis
from os.path import join

from config import *
from src.universe import Universe
from src.utils import time_series_generator
from src.metrics import plot_mse

In [39]:
!pip install keras-tuner --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [40]:
import tensorflow as tf
tf.compat.v1.experimental.output_all_intermediates(True)

from tensorflow.keras import layers
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout, LeakyReLU, LSTM, BatchNormalization
from keras.layers import Dense, Input, LSTM, Layer, TimeDistributed, Lambda, Activation, Add
from keras.layers import GlobalMaxPooling1D, Flatten, GlobalAveragePooling1D, Reshape, concatenate
from tensorflow.keras.initializers import Ones, Zeros
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.initializers import glorot_normal
from tensorflow.keras import callbacks
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

# refer to # # https://www.kaggle.com/shujian/transformer-with-lstm
# define the multi-head transformer structure by hand
class LayerNormalization(Layer):
    def __init__(self, eps=1e-6, **kwargs):
        self._eps = eps
        super(LayerNormalization, self).__init__(**kwargs)

    def build(self, input_shape):
        self._gamma = self.add_weight(name='gamma', shape=input_shape[-1:],
                                      initializer=Ones(), trainable=True)
        self._beta = self.add_weight(name='beta', shape=input_shape[-1:],
                                     initializer=Zeros(), trainable=True)
        super(LayerNormalization, self).build(input_shape)

    def call(self, x):
        mean = K.mean(x, axis=-1, keepdims=True)
        std = K.std(x, axis=-1, keepdims=True)
        return self._gamma * (x - mean) / (std + self._eps) + self._beta

    def compute_output_shape(self, input_shape):
        return input_shape


class ScaledDotProductAttention():
    def __init__(self, d_model, attn_dropout=0.1):
        self._temper = np.sqrt(d_model)
        self._dropout = Dropout(attn_dropout)

    def __call__(self, q, k, v, mask):
        attn = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[2, 2]) / self._temper)([q, k])
        if mask is not None:
            mmask = Lambda(lambda x: (-1e+10) * (1 - x))(mask)
            attn = Add()([attn, mmask])
        attn = Activation('softmax')(attn)
        attn = self._dropout(attn)
        output = Lambda(lambda x: K.batch_dot(x[0], x[1]))([attn, v])
        return output, attn


class MultiHeadAttention():
    # mode 0 - big martixes, faster; mode 1 - more clear implementation
    def __init__(self, num_head, dim_model, dim_key, dim_value, dropout, mode=0, use_norm=True):
        self._head_mode = mode
        self._num_head = num_head
        self._dim_key = dim_key
        self._dim_value = dim_value
        self._dropout = dropout
        if self._head_mode == 0:
            self.qs_layer = Dense(num_head * dim_key, use_bias=False)
            self.ks_layer = Dense(num_head * dim_key, use_bias=False)
            self.vs_layer = Dense(num_head * dim_value, use_bias=False)
        elif self._head_mode == 1:
            self.qs_layers = []
            self.ks_layers = []
            self.vs_layers = []
            for _ in range(num_head):
                self.qs_layers.append(TimeDistributed(Dense(dim_key, use_bias=False)))
                self.ks_layers.append(TimeDistributed(Dense(dim_key, use_bias=False)))
                self.vs_layers.append(TimeDistributed(Dense(dim_value, use_bias=False)))
        self.attention = ScaledDotProductAttention(dim_model)
        self.layer_norm = LayerNormalization() if use_norm else None
        self.w_o = TimeDistributed(Dense(dim_model))

    def __call__(self, q, k, v, mask=None):
        from keras.layers import Concatenate
        if self._head_mode == 0:
            qs = self.qs_layer(q)
            ks = self.ks_layer(k)
            vs = self.vs_layer(v)

            def reshape1(x):
                shape = tf.shape(x)
                x = tf.reshape(x, [shape[0], shape[1], self._num_head, self._dim_key])
                x = tf.transpose(x, [2, 0, 1, 3])
                x = tf.reshape(x, [-1, shape[1], self._dim_key])
                return x

            qs = Lambda(reshape1)(qs)
            ks = Lambda(reshape1)(ks)
            vs = Lambda(reshape1)(vs)

            if mask is not None:
                mask = Lambda(lambda x: K.repeat_elements(x, self._num_head, 0))(mask)
            head, attn = self.attention(qs, ks, vs, mask=mask)

            def reshape2(x):
                shape = tf.shape(x)
                x = tf.reshape(x, [self._num_head, -1, shape[1], shape[2]])
                x = tf.transpose(x, [1, 2, 0, 3])
                x = tf.reshape(x, [-1, shape[1], self._num_head * self._dim_value])
                return x

            head = Lambda(reshape2)(head)
        elif self._head_mode == 1:
            heads = [];
            attns = []
            for i in range(self._num_head):
                qs = self.qs_layers[i](q)
                ks = self.ks_layers[i](k)
                vs = self.vs_layers[i](v)
                head, attn = self.attention(qs, ks, vs, mask)
                heads.append(head);
                attns.append(attn)
            head = Concatenate()(heads) if self._num_head > 1 else heads[0]
            attn = Concatenate()(attns) if self._num_head > 1 else attns[0]

        outputs = self.w_o(head)
        outputs = Dropout(self._dropout)(outputs)
        if not self.layer_norm: return outputs, attn
        # outputs = Add()([outputs, q]) # sl: fix
        return self.layer_norm(outputs), attn

In [41]:
class Transformer:
    def __init__(self, win_len, input_dim, hidden_dim=[128, 64], attn_mode=0, head_mode=0, scd_layer=True):
        self.name = 'Transformer'
        self._win_len = win_len
        self._input_dim = input_dim
        self._hidden_dim = hidden_dim
        # if attn_mode = 1, it will deploy the handmade multihead attention layer
        # if attn_mode = 0, it will deploy the keras multihead attention layer
        self._attn_mode = attn_mode
        self._head_mode = head_mode
        self._scd_layer = scd_layer

    def build_and_compile(self) -> None:
        inputs = Input(shape=(self._win_len, self._input_dim))
        x = LSTM(self._hidden_dim[0], return_sequences=True)(inputs)
        if self._scd_layer == True:
          x = LSTM(self._hidden_dim[1], return_sequences=True)(x)
        if self._attn_mode == 1:
            x, slf_attn = MultiHeadAttention(num_head=3, dim_model=300,
                                             dim_key=self._hidden_dim[1],
                                             dim_value=self._hidden_dim[1],
                                             dropout=0.1,
                                             mode=self._head_mode)(x, x, x)
        elif self._attn_mode == 0:
            x, slf_attn = layers.MultiHeadAttention(
                num_heads=3,
                key_dim=self._hidden_dim[1],
                value_dim=self._hidden_dim[1],
                dropout=0.1)(x, x, x,
                             return_attention_scores=True)
        else:
            print('Wrong attn_mode input. Attention layer will not be deployed.')
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        conc = concatenate([avg_pool, max_pool])
        dense = Dense(self._hidden_dim[1], activation="relu")(conc)
        dense_out = Dense(self._input_dim)(dense)

        model = Model(inputs=inputs, outputs=dense_out)

        model.compile(
            loss='mse',
            metrics=['mse'],
        )

        self._model = model

    def get_model(self) -> Model:
        return self._model

    def summary(self) -> None:
        self._model.summary()

    def fit(self, **kwargs) -> None:
        self._model.fit(**kwargs)

    def predict(self, X_test):
        return self._model.predict(X_test)
        

In [42]:
import keras_tuner

def build_model(hp):
    hid1 = hp.Int("hidden1", min_value=32, max_value=512, step=32)
    hid2 = hp.Int("hidden2", min_value=32, max_value=512, step=32)
    attn_mode = hp.Int("attn", min_value=0, max_value=1, step=1)
    head_mode = hp.Int("head", min_value=0, max_value=1, step=1)
    scd_layer = hp.Boolean("Second Layer")
    hidden_dim = [hid1, hid2]
    build_transformer = Transformer(30, 2500, hidden_dim=hidden_dim, attn_mode=attn_mode, head_mode=head_mode, scd_layer=scd_layer)
    build_transformer.build_and_compile()
    model = build_transformer.get_model()
    return model


build_model(keras_tuner.HyperParameters())

<keras.engine.functional.Functional at 0x7f923fed19d0>

In [43]:
tuner = keras_tuner.RandomSearch(
    hypermodel=build_model,
    objective="val_loss",
    max_trials=10,
    executions_per_trial=10,
    overwrite=True,
    directory="my_dir",
    project_name="Transformer",
)

In [44]:
tuner.search_space_summary()

Search space summary
Default search space size: 5
hidden1 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
hidden2 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
attn (Int)
{'default': None, 'conditions': [], 'min_value': 0, 'max_value': 1, 'step': 1, 'sampling': 'linear'}
head (Int)
{'default': None, 'conditions': [], 'min_value': 0, 'max_value': 1, 'step': 1, 'sampling': 'linear'}
Second Layer (Boolean)
{'default': False, 'conditions': []}


In [45]:
inception_date = '2022-03-31'
ONE_YEAR_TRADE_DAYS = 252
TWO_YEAR_TRADE_DAYS = ONE_YEAR_TRADE_DAYS * 2
WIN_LEN = 30
UNIVERSE_SIZE = 2500
EPOCH = 20
BATCH_SIZE = 16
training_path = Path(join(data_path, 'train_set'))
ret_train = pd.read_csv(join(training_path, '2022-03-31.csv'), index_col=0)
X, y = time_series_generator(ret_train, WIN_LEN)
train_index = ret_train.loc[:inception_date].iloc[-TWO_YEAR_TRADE_DAYS:].index
X_train, X_val, y_train, y_val = train_test_split(X,y)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((378, 30, 2500), (126, 30, 2500), (378, 2500), (126, 2500))

In [46]:
tuner.search(X_train, y_train, epochs=2, validation_data=(X_val, y_val))

Trial 10 Complete [00h 02m 13s]
val_loss: 0.009882439622685077

Best val_loss So Far: 0.002595622382969374
Total elapsed time: 00h 37m 56s


In [47]:
models = tuner.get_best_models(num_models=2)
best_model = models[0]

In [48]:
best_model.build(input_shape=(None, 30, 2500))

In [49]:
best_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 30, 2500)]   0           []                               
                                                                                                  
 lstm (LSTM)                    (None, 30, 480)      5723520     ['input_1[0][0]']                
                                                                                                  
 multi_head_attention (MultiHea  ((None, 30, 480),   738912      ['lstm[0][0]',                   
 dAttention)                     (None, 3, 30, 30))               'lstm[0][0]',                   
                                                                  'lstm[0][0]']                   
                                                                                              

In [50]:
tuner.results_summary()

Results summary
Results in my_dir/Transformer
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 04 summary
Hyperparameters:
hidden1: 480
hidden2: 128
attn: 0
head: 0
Second Layer: False
Score: 0.002595622382969374

Trial 00 summary
Hyperparameters:
hidden1: 64
hidden2: 64
attn: 0
head: 1
Second Layer: False
Score: 0.002595659167993636

Trial 03 summary
Hyperparameters:
hidden1: 352
hidden2: 128
attn: 0
head: 0
Second Layer: True
Score: 0.002595840852027611

Trial 05 summary
Hyperparameters:
hidden1: 160
hidden2: 192
attn: 0
head: 1
Second Layer: True
Score: 0.0025962796775505894

Trial 02 summary
Hyperparameters:
hidden1: 320
hidden2: 192
attn: 0
head: 1
Second Layer: False
Score: 0.002596281621132105

Trial 01 summary
Hyperparameters:
hidden1: 32
hidden2: 512
attn: 0
head: 0
Second Layer: True
Score: 0.0025963249390885705

Trial 07 summary
Hyperparameters:
hidden1: 64
hidden2: 256
attn: 0
head: 1
Second Layer: False
Score: 0.002596362462149016

Trial 06 summary