In [1]:
import keras
import tensorflow as tf
from tensorflow.keras.utils import Sequence
from tensorflow.keras import layers
from tensorflow.keras.callbacks import CSVLogger
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
import json
from dataclasses import dataclass, asdict
import ast
import os
from typing import Union, List
from datetime import datetime, timedelta
import time

In [2]:
from sklearn.preprocessing import StandardScaler

In [3]:
import stride_data

In [4]:
import pandas as pd
import numpy as np

In [5]:
from collections import namedtuple
# Bundle of the three time-indexed DataFrames that are loaded and sliced together:
# `lob` (rows of the data file), `factor` (factor columns) and `label` (targets).
Chunk = namedtuple('Chunk', ['lob', 'factor', 'label'])

In [6]:
def get_factor_itr(factor_file_list=['factor_22.csv', 'factor_23.csv'], base_folder='factor',
                   chunksize=500000):
    """Stream several factor CSV files side by side, one combined frame per chunk.

    Each file under ``base_folder`` is read in pieces of ``chunksize`` rows.
    The n-th piece of every file is indexed by its parsed ``time`` column,
    sorted, stripped of repeated timestamps (first occurrence kept) and the
    pieces are then concatenated column-wise.

    Iteration stops as soon as the shortest file runs out of chunks.

    Args:
        factor_file_list: file names to read, relative to ``base_folder``.
        base_folder: directory holding the factor files.
        chunksize: number of CSV rows per chunk, passed to ``pd.read_csv``.

    Yields:
        pandas.DataFrame indexed by ``time`` holding the columns of all files.
    """
    readers = [pd.read_csv(fr'{base_folder}/{name}', chunksize=chunksize)
               for name in factor_file_list]

    def _prepare(raw):
        # Index by timestamp, order chronologically, keep the first row per timestamp.
        raw['time'] = pd.to_datetime(raw['time'])
        indexed = raw.set_index('time').sort_index()
        return indexed[~indexed.index.duplicated()]

    for aligned_parts in zip(*readers):
        yield pd.concat([_prepare(part) for part in aligned_parts], axis=1)

In [7]:
# Rows read per CSV chunk. The same value is passed to all three readers below,
# which are later consumed in lock-step with zip() by get_chunk_from_file_gen.
chunksize = 100000

# NOTE: these are one-shot, module-level iterators. Every generator built from
# get_chunk_from_file_gen() advances the same underlying readers, so this cell
# must be re-run to start reading from the beginning of the files again.
data_itr = pd.read_csv(r'data/data_night_shifted_au.csv.gz', chunksize=chunksize)
label_itr = pd.read_csv(r'label/5_min_tp4_sl2_10yuan_target.csv', chunksize=chunksize)
# Overrides the default file list of get_factor_itr (factor_24 instead of factor_22).
factor_itr = get_factor_itr(['factor_24.csv', 'factor_23.csv'], chunksize=chunksize)

# Five levels of the limit order book: price and size on each side, in the
# order level 1 -> level 5 (bid, bid size, ask, ask size).
lob_feature = ['bid_1', 'bid_size_1', 'ask_1',
               'ask_size_1', 'bid_2', 'bid_size_2', 'ask_2', 'ask_size_2', 'bid_3',
               'bid_size_3', 'ask_3', 'ask_size_3', 'bid_4', 'bid_size_4', 'ask_4',
               'ask_size_4', 'bid_5', 'bid_size_5', 'ask_5', 'ask_size_5']

# Every raw column used as a model input: traded volume, the order book, vwap.
# Built from lob_feature so the two lists cannot drift apart. The previous
# hand-written list contained 'bid_1' twice, which made `.loc[:, all_feature]`
# select that column twice and feed a duplicated feature to the model.
# NOTE(review): if the second 'bid_1' was a typo for another column, add it here.
all_feature = ['volume', *lob_feature, 'vwap']

def get_chunk_from_file_gen():
    """Read the data, factor and label sources in lock-step, one chunk at a time.

    Consumes the module-level iterators ``data_itr``, ``factor_itr`` and
    ``label_itr``. The data and label frames are indexed by their parsed
    ``time`` column and sorted; all three frames have repeated timestamps
    removed (first occurrence kept).

    Yields:
        ``(Chunk(lob, factor, label), newest_label_timestamp)``.

    Raises:
        RuntimeError: if the three frames of a chunk do not end on the same
            timestamp.
    """
    def _index_by_time(frame):
        frame['time'] = pd.to_datetime(frame['time'])
        return frame.set_index('time').sort_index()

    for raw_lob, raw_factor, raw_label in zip(data_itr, factor_itr, label_itr):
        lob = _index_by_time(raw_lob)
        labels = _index_by_time(raw_label)

        lob = lob.loc[~lob.index.duplicated()]
        factors = raw_factor.loc[~raw_factor.index.duplicated()]
        labels = labels.loc[~labels.index.duplicated()]

        # The three sources are only usable together if they cover the same span.
        newest_factor = factors.index.max()
        if not (newest_factor == labels.index.max() and newest_factor == lob.index.max()):
            raise RuntimeError("Index do not match!")

        yield Chunk(lob, factors, labels), labels.index.max()

def concat_chunk(*chunk_tuple):
    """Merge several ``Chunk`` objects into one.

    The frames in the same position of every chunk are stacked row-wise,
    sorted by index and stripped of repeated index values (first occurrence
    kept).

    Args:
        *chunk_tuple: the chunks to merge, in any order.

    Returns:
        A single ``Chunk`` holding the merged frames.
    """
    def _stack(frames):
        stacked = pd.concat(frames).sort_index()
        return stacked.loc[~stacked.index.duplicated()]

    # zip(*chunks) regroups the frames field by field (lob, factor, label).
    return Chunk(*map(_stack, zip(*chunk_tuple)))

def get_weekly_data_gen(skip_to=None):
    """Yield one ``Chunk`` per week, assembled from the file chunks.

    File chunks are skipped until one is found that (a) ends at or after
    ``skip_to`` and (b) contains at least one Monday row. The earliest Monday
    date in that chunk becomes the start of the first week; each week covers
    ``start`` to ``start + 5 days``. Further file chunks are accumulated and a
    week is emitted whenever the newest loaded timestamp passes the end of the
    current window.

    Data loaded after the last completed window is not yielded.

    Args:
        skip_to: optional timestamp; file chunks whose newest label timestamp
            is earlier than this are discarded.

    Yields:
        ``Chunk`` whose frames are sliced to one weekly window.
    """
    print('Data loading started')
    # A single generator is sufficient: the readers behind it are module-level
    # iterators, so re-creating the generator never rewound them anyway.
    chunk_gen = get_chunk_from_file_gen()

    for chunk, latest in chunk_gen:
        print("                                         ", end='\r')
        time.sleep(0)
        print(f"{latest=}", end='\r')
        time.sleep(0)
        if skip_to is not None and latest < skip_to:
            continue

        is_monday = chunk.label.index.day_of_week == 0
        if is_monday.sum() > 0:
            start = pd.Timestamp(chunk.label[is_monday].index.date.min())
            end = start + pd.Timedelta(5, 'd')  # end on friday midnight
            break
    else:
        # Sources exhausted before a usable Monday was found. Calling next()
        # here used to surface as "RuntimeError: generator raised
        # StopIteration" (PEP 479); ending the generator is the correct signal.
        return

    chunk = Chunk(*[data[data.index >= start] for data in chunk])
    for new_chunk, latest in chunk_gen:
        chunk = concat_chunk(chunk, new_chunk)
        # One file chunk may complete more than one weekly window.
        while latest > end:
            print("                                             ", end='\r')
            time.sleep(0)
            print(f"{latest=}", end='\r')
            time.sleep(0)
            to_yield = Chunk(*(df.loc[start:end] for df in chunk))
            # Keep everything from `end` onwards for the following weeks.
            chunk = Chunk(*(df.loc[end:] for df in chunk))
            start = start + pd.Timedelta(7, 'd')
            end = end + pd.Timedelta(7, 'd')
            yield to_yield

def get_chunk_list_gen(list_len=2, skip_to=None, end=None):
    """Yield sliding windows of ``list_len`` consecutive weekly chunks.

    Args:
        list_len: number of weekly chunks in every yielded window.
        skip_to: forwarded to ``get_weekly_data_gen``.
        end: optional timestamp; iteration stops at the first week whose newest
            label timestamp is later than this.

    Yields:
        A new list of ``list_len`` weekly ``Chunk`` objects, oldest first. Each
        window advances by one week relative to the previous one.
    """
    out_list = []
    weekly_gen = get_weekly_data_gen(skip_to)
    for week_chunk in weekly_gen:
        if (end is not None) and (week_chunk.label.index.max() > end):
            break
        out_list.append(week_chunk)
        if len(out_list) < list_len:
            continue
        if len(out_list) > list_len:
            out_list = out_list[1:]  # basically, pop left
        # Hand out a copy: the internal list is appended to on the next
        # iteration, which previously changed a window the caller still held.
        yield list(out_list)

def split_market(chunk, avoid_market_edge = timedelta(minutes=5)):
    """Split every frame of ``chunk`` into morning, afternoon and night sessions.

    Rows are selected by time of day only. Each session window is shrunk by
    ``avoid_market_edge`` on both sides:

    * morning:   09:00 - 11:30
    * afternoon: 13:30 - 15:00
    * night:     21:00 - 02:30 (wraps around midnight)

    Args:
        chunk: iterable of time-indexed DataFrames (a ``Chunk``).
        avoid_market_edge: margin removed after each open and before each close.

    Returns:
        Tuple ``(morning_chunk, afternoon_chunk, night_chunk)``.
    """
    def _clock(hour, minute, shift):
        # Time of day obtained by shifting hour:minute by `shift`.
        return (datetime(1970, 1, 1, hour, minute, 0) + shift).time()

    morning_from, morning_to = _clock(9, 0, avoid_market_edge), _clock(11, 30, -avoid_market_edge)
    afternoon_from, afternoon_to = _clock(13, 30, avoid_market_edge), _clock(15, 0, -avoid_market_edge)
    night_from, night_to = _clock(21, 0, avoid_market_edge), _clock(2, 30, -avoid_market_edge)

    sessions_per_frame = []
    for df in chunk:
        time_of_day = df.index.time
        morning_df = df.loc[(time_of_day >= morning_from) & (time_of_day <= morning_to)]
        afternoon_df = df.loc[(time_of_day >= afternoon_from) & (time_of_day <= afternoon_to)]
        # The night session crosses midnight, hence "or" instead of "and".
        night_df = df.loc[(time_of_day >= night_from) | (time_of_day <= night_to)]
        sessions_per_frame.append((morning_df, afternoon_df, night_df))

    # Regroup from "per frame" to "per session" and rebuild one Chunk per session.
    return tuple(Chunk(*session_frames) for session_frames in zip(*sessions_per_frame))

In [8]:

# training_week = 8
# skip_to = pd.Timestamp('2022-11-01 00:00:00.000') #use this to skip to the period we want
# # skip_to = None
# # end = pd.Timestamp('2022-07-15 23:59:59.500') #use this to fix when to stop training
# end = pd.Timestamp('2023-03-01 00:00:00')

# chunk_list_gen = get_chunk_list_gen(training_week+1, skip_to=skip_to, end=end)
# for i, chunk_list in enumerate(chunk_list_gen):
#     break

# train_chunk = concat_chunk(*chunk_list[:training_week])
# walk_forward_chunk = chunk_list[-1]

# morning_train_chunk, afternoon_train_chunk, night_train_chunk = split_market(train_chunk)
# morning_wf_chunk, afternoon_wf_chunk, night_wf_chunk = split_market(walk_forward_chunk)

In [9]:
# lookback = 50 #this number of timestep per batch
# batch_size = 1000 #this number of samples in a batch

# chunk = morning_train_chunk

# sp_list_list = []
# for chunk in [morning_train_chunk, afternoon_train_chunk, night_train_chunk]:

#     X = chunk.lob.loc[:, all_feature].join(chunk.factor)
#     y = chunk.label

#     sp_list : List[stride_data.SequencePair] = stride_data.create_train_val_sequence_cv(X, y.iloc[:, 0], cv=4, lookback=lookback, 
#                                             batch_size=batch_size, batch_no=None, 
#                                             shuffle=False)

#     sp_list_list.append(sp_list)

# sequence_pair_list_per_cv = list(zip(*sp_list_list))
# sequence_list_per_cv = [stride_data.CombinedSequence(*[sequence_pair_list_per_cv[i][j].train_sequence 
#                         for j in range(len(sequence_pair_list_per_cv[i]))])
#                         for i in range(len(sequence_pair_list_per_cv))]

In [10]:
# Scratch cell: print the Conv2D documentation (argument order of filters,
# kernel_size and strides). Not needed for the rest of the notebook.
help(layers.Conv2D)

Help on class Conv2D in module keras.src.layers.convolutional.conv2d:

class Conv2D(keras.src.layers.convolutional.base_conv.Conv)
 |  Conv2D(filters, kernel_size, strides=(1, 1), padding='valid', data_format=None, dilation_rate=(1, 1), groups=1, activation=None, use_bias=True, kernel_initializer='glorot_uniform', bias_initializer='zeros', kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None, kernel_constraint=None, bias_constraint=None, **kwargs)
 |  
 |  2D convolution layer (e.g. spatial convolution over images).
 |  
 |  This layer creates a convolution kernel that is convolved
 |  with the layer input to produce a tensor of
 |  outputs. If `use_bias` is True,
 |  a bias vector is created and added to the outputs. Finally, if
 |  `activation` is not `None`, it is applied to the outputs as well.
 |  
 |  When using this layer as the first layer in a model,
 |  provide the keyword argument `input_shape`
 |  (tuple of integers or `None`, does not include the sampl

In [11]:
# Scratch cell: construct a layer with the same arguments as the first Conv2D of
# 'stride2_depth_simple' (16 filters, 1x2 kernel, 1x2 strides) to check that the
# constructor accepts them. The instance itself is discarded.
layers.Conv2D(16, (1,2), (1,2), activation= 'leaky_relu')

<keras.src.layers.convolutional.conv2d.Conv2D at 0x126a553af70>

In [12]:
# Catalogue of model bodies, selected by name through HyperParameters.schematic.
# Every entry is a list of (layer class, constructor kwargs) pairs that are
# instantiated in order and added to a Sequential model; the output layer is
# appended separately when the model is built, which is why the trailing
# `Dense(output_count)` lines are commented out.
#
# Keys must be unique: a repeated key in a dict literal silently discards the
# earlier definition. The 2-LSTM dropout variant used to share the key
# 'midnight_6_6' with the 3-LSTM variant and was therefore unreachable; it is
# now stored as 'midnight_6_6_shallow'. 'midnight_6_6' still resolves to the
# same (3-LSTM) definition as before.
schematic_dict = {
    'stride2_depth_simple': [
        (layers.Reshape, {'target_shape': (50,-1,1)}), #50 lookback
        (layers.Conv2D, {'filters': 16, 'kernel_size': (1, 2), 'strides': (1, 2), 'activation': 'leaky_relu'}),
        # (layers.Conv2D, {'filters': 8, 'kernel_size': (5, 1), 'padding': 'same', 'activation': 'leaky_relu'}),
        (layers.BatchNormalization, {}),
        (layers.Conv2D, {'filters': 16, 'kernel_size': (1, 2), 'strides': (1, 2), 'activation': 'leaky_relu'}),
        (layers.Conv2D, {'filters': 16, 'kernel_size': (5, 1), 'padding': 'same', 'activation': 'leaky_relu'}),
        (layers.BatchNormalization, {}),
        (layers.Conv2D, {'filters': 16, 'kernel_size': (1, 5), 'activation': 'leaky_relu'}),
        # (layers.Conv2D, {'filters': 8, 'kernel_size': (5, 1), 'padding': 'same', 'activation': 'leaky_relu'}),
        (layers.Reshape, {'target_shape': (50,-1)}), #50 lookback
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 32, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
    ],
    'midnight_1': [ # nope
        (layers.LSTM, {'units': 10, 'kernel_regularizer': 'l2'}),
        (layers.Dense, {'units': 10, 'kernel_regularizer': 'l2'}),
        (layers.Dense, {'units': 5, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_2': [
        (layers.LSTM, {'units': 10, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units': 10, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units': 5, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_3': [ # nope
        (layers.LSTM, {'units': 10, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.LSTM, {'units': 10, 'kernel_regularizer': 'l2'}),
        (layers.Dense, {'units': 5, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_4': [
        (layers.LSTM, {'units': 10, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 10, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units': 5, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_5': [ # nope
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2'}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_6': [
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_6_rnn': [
        (layers.SimpleRNN, {'units': 20, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.SimpleRNN, {'units': 20, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_6_1': [
        (layers.LSTM, {'units': 30, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_6_2': [
        (layers.LSTM, {'units': 30, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_6_2_rnn': [
        (layers.SimpleRNN, {'units': 30, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.SimpleRNN, {'units': 20, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_6_3': [
        (layers.LSTM, {'units': 40, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_6_4': [
        (layers.LSTM, {'units': 128, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_6_4_mini': [
        (layers.LSTM, {'units': 32, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_6_5': [
        (layers.LSTM, {'units': 128, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 64, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    # Two stacked LSTMs with dropout. Previously keyed 'midnight_6_6' and
    # overwritten by the next entry, so it could never be selected.
    'midnight_6_6_shallow': [
        (layers.LSTM, {'units': 128, 'kernel_regularizer': 'l2', 'recurrent_dropout': 0.2, 'return_sequences': True}),
        (layers.Dropout, {'rate': 0.2}),
        (layers.LSTM, {'units': 64, 'kernel_regularizer': 'l2', 'recurrent_dropout': 0.2}),
        (layers.Dropout, {'rate': 0.2}),
        (layers.Dense, {'units':32, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_6_6': [
        (layers.LSTM, {'units': 128, 'kernel_regularizer': 'l2', 'recurrent_dropout': 0.2, 'return_sequences': True}),
        (layers.Dropout, {'rate': 0.2}),
        (layers.LSTM, {'units': 64, 'kernel_regularizer': 'l2', 'recurrent_dropout': 0.2, 'return_sequences': True}),
        (layers.Dropout, {'rate': 0.2}),
        (layers.LSTM, {'units': 64, 'kernel_regularizer': 'l2', 'recurrent_dropout': 0.2}),
        (layers.Dropout, {'rate': 0.2}),
        (layers.Dense, {'units':32, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_6_7': [
        (layers.LSTM, {'units': 128, 'kernel_regularizer': 'l2', 'recurrent_dropout': 0.2, 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 64, 'kernel_regularizer': 'l2', 'recurrent_dropout': 0.2, 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 64, 'kernel_regularizer': 'l2', 'recurrent_dropout': 0.2}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':32, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_7': [ # nope
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'recurrent_dropout':0.2, 'return_sequences': True}),
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'recurrent_dropout':0.2}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2', }),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_8': [ # nope
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'recurrent_dropout':0.4, 'return_sequences': True}),
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'recurrent_dropout':0.4}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2', }),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_9': [ # out
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'recurrent_dropout':0.5, 'return_sequences': True}),
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'recurrent_dropout':0.5}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2', }),
#         (layers.Dense, {'units': output_count})
    ],
    'midnight_10': [
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2', 'activation': 'relu'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'saturday_1': [
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 10, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2', 'activation': 'relu'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'friday_1': [
        (layers.LSTM, {'units': 25, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 10, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 10, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2', 'activation': 'relu'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'friday_2': [
        (layers.LSTM, {'units': 32, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 24, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 16, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 16, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2', 'activation': 'relu'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'tuesday_1': [
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'return_sequences': True, 'activation': 'relu'}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'tuesday_2': [
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'return_sequences': True}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2', 'activation': 'relu'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
#         (layers.Dense, {'units': output_count})
    ],
    'cnn_1':[
        (layers.Conv1D, {'filters': 20, 'kernel_size':10, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.LSTM, {'units': 20, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
    ],
    'cnn_2':[
        (layers.Conv1D, {'filters': 20, 'kernel_size':10, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Conv1D, {'filters': 20, 'kernel_size':10, 'kernel_regularizer': 'l2'}),
        (layers.Flatten, {}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
    ],
    'cnn_3':[
        (layers.Conv1D, {'filters': 20, 'kernel_size':10, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Conv1D, {'filters': 20, 'kernel_size':10, 'kernel_regularizer': 'l2'}),
        (layers.Flatten, {}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':20, 'kernel_regularizer': 'l2'}),
        (layers.BatchNormalization, {}),
        (layers.Dense, {'units':10, 'kernel_regularizer': 'l2'}),
    ]
}

# Optimizer classes (not instances) selectable by name through
# HyperParameters.opt; the class is instantiated with a learning rate when a
# model is built.
opt_dict = {
    'Adam': tf.keras.optimizers.Adam,
    'Nadam': tf.keras.optimizers.Nadam,
}

# Smoke test: instantiate every layer of every schematic once so that a
# misspelled keyword argument fails here rather than in the training loop.
# The layer instances are thrown away.
for layer_specs in schematic_dict.values():
    for layer_cls, layer_kwargs in layer_specs:
        layer_cls(**layer_kwargs)

In [17]:
from sklearn.model_selection import ParameterGrid

# Metric objects keyed by short name.
# NOTE(review): not referenced by the compile() call visible in this notebook,
# which passes metrics=[]; confirm whether it is still needed.
metric_dict = {
    # 'mse': tf.keras.metrics.MeanSquaredError(name='mean_squared_error', dtype=None),
    # 'mae': tf.keras.metrics.MeanAbsoluteError(name='mean_absolute_error', dtype=None),
    'coss': tf.keras.metrics.CosineSimilarity(name='cosine_similarity', dtype=None),
    'ce': tf.keras.metrics.CategoricalCrossentropy(name='categorical_ce', dtype=None)
}

# Stand-alone loss instances; only the commented-out combined losses in
# loss_dict below refer to them.
cosine = tf.keras.losses.CosineSimilarity(name='cosine_similarity')
mse = tf.keras.losses.MeanSquaredError(name='mean_squared_error')
mae = tf.keras.losses.MeanAbsoluteError(name='mean_absolute_error')

ce = tf.keras.losses.CategoricalCrossentropy(name='categorical_ce')
fc = tf.keras.losses.CategoricalFocalCrossentropy()

# Loss objects selectable by name through HyperParameters.loss.
loss_dict = {
    # 'mse': tf.keras.losses.MeanSquaredError(name='mean_squared_error', reduction="auto"),
    # 'mae': tf.keras.losses.MeanAbsoluteError(name='mean_absolute_error', reduction="auto"),
    'coss': tf.keras.losses.CosineSimilarity(name='cosine_similarity', reduction="auto"),
    'ce': tf.keras.losses.CategoricalCrossentropy(name='categorical_ce', reduction='auto'),
    'fc': tf.keras.losses.CategoricalFocalCrossentropy(),
    # 'ce': tf.keras.losses.BinaryFocalCrossentropy(name='categorical_ce', reduction='auto'),
    # 'coss-mse': lambda y, yhat: mse(y, yhat) + cosine(y, yhat),
    # 'coss-mae': lambda y, yhat: mae(y, yhat) + cosine(y, yhat),
}

#           'features' :  [['High', 'Low', 'Open', 'Close', 
#                          'vix_forward_5_historical', 'vix_forward_10_historical', 'vix_forward_15_historical']],

# Search space: each key is a HyperParameters field and each value the list of
# candidates to try. Only one combination is active at the moment; the
# commented-out names are other schematic_dict keys that can be switched in.
p_grid = {'batch_size' : [100],
          'features' :  [lob_feature],
          'init_learning_rate' : [0.25],
          'lr_decay' : [0.95],
          'loss': ['fc'],
          'schematic' : [
#                          'cnn_1',
                        #  'cnn_2',
#                          'cnn_3',
#                          'tuesday_1',
#                          'tuesday_2', 
                        #  'midnight_1',]}
                         'stride2_depth_simple',]}
#                          'midnight_6_2_rnn',
                        #  'midnight_6_rnn',]}
#                          'midnight_7',
#                          'midnight_8',
#                          'midnight_9',
#                          'friday_1',
#                          'friday_2',
#                          'midnight_6_3',
#                          'midnight_6_4',
#                          'midnight_6_5',]}
#                          'midnight_6_6',]}
#                          'midnight_6_7']}

# From here on `p_grid` is the expanded grid (an iterable of dicts), no longer
# the dict of candidate lists defined above.
p_grid = ParameterGrid(p_grid)
# Number of hyper-parameter combinations that will be run.
len(p_grid)

1

In [18]:
# Display the order-book columns that the active parameter grid uses as features.
lob_feature

['bid_1',
 'bid_size_1',
 'ask_1',
 'ask_size_1',
 'bid_2',
 'bid_size_2',
 'ask_2',
 'ask_size_2',
 'bid_3',
 'bid_size_3',
 'ask_3',
 'ask_size_3',
 'bid_4',
 'bid_size_4',
 'ask_4',
 'ask_size_4',
 'bid_5',
 'bid_size_5',
 'ask_5',
 'ask_size_5']

In [19]:
def make_if_not_exist(folder_name, base_folder='output'):
    """Create ``base_folder/folder_name`` (and missing parents) if it is absent.

    Callers in this notebook build sub-paths with backslashes. Those are mapped
    to the platform separator so the same nested folders are created on every
    operating system; on Windows the resulting path is unchanged.

    Args:
        folder_name: sub-folder to create, possibly several levels deep.
        base_folder: folder under which ``folder_name`` is created.

    Raises:
        OSError: if the folder cannot be created for any reason other than
            already existing (for example a permission problem, or a regular
            file in the way). Such errors were previously swallowed silently.
    """
    target = fr'{base_folder}\{folder_name}'.replace('\\', os.sep)
    os.makedirs(target, exist_ok=True)

In [20]:
from dataclasses import field, dataclass

@dataclass
class HyperParameters:
    """All tunable settings of one training run.

    Instances are created from the entries of the parameter grid, so every
    grid key must be the name of a field declared here.
    """
    # Columns selected from the order-book frame as model inputs
    # (default: a copy of `all_feature`).
    features: list = field(default_factory=lambda: list(all_feature))
    # Number of time steps per sample, passed to stride_data as `lookback`.
    lookback: int = 50
    # Maximum number of epochs passed to model.fit.
    epochs: int = 100
    # Number of cross-validation folds requested from stride_data.
    cv: int = 4
    # Samples per batch, passed to stride_data as `batch_size`.
    batch_size: int = 100
    batch_no: tuple = (230, 130, 520) #per market period: morning, afternoon, night.
    # Passed to stride_data as `shuffle`.
    shuffle: bool = True
    # Initial value, decay rate and decay interval of the exponential
    # learning-rate schedule (see lr_decay and decay_steps below).
    init_learning_rate: float = 2.5e-1
    # Intended RNG seed. NOTE(review): confirm that it is applied before training.
    seed: int = 420
    lr_decay: float = 0.99
    # 'both' is replaced by the tuple (lookback, lookback) before the value is
    # passed to stride_data, so the field holds a str or a tuple at run time.
    trim: str = 'both'
    decay_steps: int = 50
    # Key into schematic_dict selecting the model body.
    schematic: str = 'midnight_1'
    # Key into opt_dict selecting the optimizer class.
    opt: str = 'Nadam'
    # Passed to stride_data as `replace`.
    replace: bool = False
    # Key into loss_dict selecting the loss.
    loss: str = 'ce'
    # Number of weekly chunks concatenated into the training set; one further
    # week is loaded for walk-forward evaluation.
    training_week: int = 8
    # skip_to = pd.Timestamp('2022-07-02 11:24:58.000') #use this to skip to the period we want
    skip_to: pd.Timestamp = pd.Timestamp('2022-11-01 00:00:00.000')
    # end = pd.Timestamp('2022-07-15 23:59:59.500') #use this to fix when to stop training
    end: pd.Timestamp = pd.Timestamp('2023-03-01 00:00:00')

# One HyperParameters instance per combination of the parameter grid.
hp_list = [HyperParameters(**grid_point) for grid_point in p_grid]

# Root folder for every artefact and the label of this run.
base_output_folder = 'output'
run_prefix = 'test'

model_dict = {}

# Folders the run writes to, all created underneath base_output_folder.
for run_subfolder in (
    fr"plots\{run_prefix}",
    fr"output\{run_prefix}",
    fr"callback_logs\{run_prefix}",
    fr"callback_logs\{run_prefix}\_temp",
    fr"run_summary\{run_prefix}",
):
    make_if_not_exist(run_subfolder, base_output_folder)
    
# if run_prefix not in [d for d in os.listdir('output\\') if os.path.isdir('output\\' + d)]:
#     os.makedirs('output\\' + run_prefix)

In [21]:
# Index of the first grid entry to run and the index at which to stop.
continue_loop = 0
end_loop = 9999

# Cut-down version of the full run loop: it stops after the first entry, so
# afterwards `hp` is the hyper-parameter set used by all following cells.
for run_count, hp in enumerate(hp_list[continue_loop:], start=continue_loop):
    if run_count >= end_loop:
        break
    print(hp)
    # 'both' is shorthand for trimming `lookback` samples at each end.
    if hp.trim == 'both':
        hp.trim = (hp.lookback, hp.lookback)
    break



HyperParameters(features=['bid_1', 'bid_size_1', 'ask_1', 'ask_size_1', 'bid_2', 'bid_size_2', 'ask_2', 'ask_size_2', 'bid_3', 'bid_size_3', 'ask_3', 'ask_size_3', 'bid_4', 'bid_size_4', 'ask_4', 'ask_size_4', 'bid_5', 'bid_size_5', 'ask_5', 'ask_size_5'], lookback=50, epochs=100, cv=4, batch_size=100, batch_no=(230, 130, 520), shuffle=True, init_learning_rate=0.25, seed=420, lr_decay=0.95, trim='both', decay_steps=50, schematic='stride2_depth_simple', opt='Nadam', replace=False, loss='fc', training_week=8, skip_to=Timestamp('2022-11-01 00:00:00'), end=Timestamp('2023-03-01 00:00:00'))


In [22]:
# Each window holds `training_week` weeks for training plus one for walk-forward.
chunk_list_gen = get_chunk_list_gen(hp.training_week+1, skip_to=hp.skip_to, end=hp.end)

# Only the first window is used. Fail loudly when there is none: the previous
# `for ...: break` left `chunk_list` untouched in that case, so a re-run could
# silently continue with the data of an earlier execution.
chunk_list = next(chunk_list_gen, None)
if chunk_list is None:
    raise RuntimeError(
        f"No window of {hp.training_week + 1} weeks available between "
        f"{hp.skip_to} and {hp.end}; re-run the data-loading cell or widen the range."
    )

train_chunk = concat_chunk(*chunk_list[:hp.training_week])
walk_forward_chunk = chunk_list[-1]

morning_train_chunk, afternoon_train_chunk, night_train_chunk = split_market(train_chunk)
morning_wf_chunk, afternoon_wf_chunk, night_wf_chunk = split_market(walk_forward_chunk)

Data loading started
latest=Timestamp('2023-01-05 09:32:26.500000')

In [23]:
def make_class(label, epsilon=1.0001):
    """Turn long/short wealth labels into a one-hot short / neutral / long target.

    A row is
      * long  (1)  when ``long_wealth`` beats ``short_wealth`` and exceeds ``epsilon``,
      * short (-1) when it does not and ``short_wealth`` exceeds ``epsilon``,
      * neutral (0) otherwise.
    Rows with a missing value in any label column are dropped.

    Args:
        label: DataFrame with at least ``long_wealth`` and ``short_wealth``.
        epsilon: wealth threshold a side must exceed to count as a signal.

    Returns:
        Integer DataFrame indexed like the kept rows with exactly the columns
        ``[-1, 0, 1]`` (short, neutral, long), each row summing to 1.
    """
    # Only rows without any missing label may receive a class. Without this
    # guard a row with a missing long_wealth but short_wealth > epsilon was
    # labelled short instead of being dropped.
    valid = label.notna().all(axis=1)
    long_wins = label['long_wealth'] > label['short_wealth']

    class_label = pd.Series(np.nan, index=label.index)
    class_label.loc[valid] = 0
    class_label.loc[valid & long_wins & (label['long_wealth'] > epsilon)] = 1
    class_label.loc[valid & ~long_wins & (label['short_wealth'] > epsilon)] = -1
    class_label = class_label.dropna().astype(int)

    # Always emit all three columns in a fixed order, even when a class does
    # not occur in this slice; the model output layer has a fixed width of 3.
    # (The former `columns=` argument of get_dummies is ignored for a Series.)
    one_hot = pd.get_dummies(class_label).astype(int)
    return one_hot.reindex(columns=[-1, 0, 1], fill_value=0)

# Class balance of the night-session training labels: the column means of the
# one-hot frame are the share of short (-1), neutral (0) and long (1) rows.
make_class(night_train_chunk.label, 1.0001).mean()

-1    0.161957
 0    0.666907
 1    0.171136
dtype: float64

In [24]:
# Wealth threshold used by make_class for all label construction below.
epsilon = 1.0001

# Per-session feature and target frames of the training period.
X_list, y_list = [], []
for period_chunk in (morning_train_chunk, afternoon_train_chunk, night_train_chunk):
    period_y = make_class(period_chunk.label, epsilon)
    # Order-book features joined with the factor columns, restricted to labelled rows.
    period_X = period_chunk.lob.loc[:, hp.features].join(period_chunk.factor).reindex(period_y.index)
    X_list.append(period_X)
    y_list.append(period_y.reindex(period_X.index))

# One scaler for all sessions, fitted on training data only.
scaler = StandardScaler()
scaler.fit(pd.concat(X_list));

In [25]:
# Number of fully populated label rows per session, followed by their total.
period_label_counts = [
    period_chunk.label.dropna().shape[0]
    for period_chunk in (morning_train_chunk, afternoon_train_chunk, night_train_chunk)
]
for period_count in period_label_counts:
    print(period_count)
print(sum(period_label_counts))

228842
132460
519114
880416


In [26]:
# Build the cross-validation training sequences, one list of folds per market
# session (morning, afternoon, night).
sp_list_list = []
for i, chunk in enumerate([morning_train_chunk, afternoon_train_chunk, night_train_chunk]):

    # Same feature construction as in the scaler-fitting cell, now scaled with
    # the scaler fitted there.
    X = chunk.lob.loc[:, hp.features].join(chunk.factor)
    X = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)
    y = make_class(chunk.label, epsilon)
    # Keep only the rows that received a class label.
    X = X.reindex(y.index)
    y = y.reindex(X.index)
    #transform y here

    # hp.batch_no holds one entry per session, hence the index i.
    sp_list : List[stride_data.SequencePair] = stride_data.create_train_val_sequence_cv(X, y, cv=hp.cv, lookback=hp.lookback, 
                                            batch_size=hp.batch_size, batch_no=hp.batch_no[i], 
                                            shuffle=hp.shuffle, trim=hp.trim, replace=hp.replace)

    sp_list_list.append(sp_list)

# Transpose from "per session -> per fold" to "per fold -> per session", then
# merge the three session training sequences of every fold into one sequence.
sequence_pair_list_per_cv = list(zip(*sp_list_list))
sequence_list_per_cv = [stride_data.CombinedSequence(*[sequence_pair_list_per_cv[i][j].train_sequence 
                        for j in range(len(sequence_pair_list_per_cv[i]))])
                        for i in range(len(sequence_pair_list_per_cv))]

# Validation data per fold: the test tuples of the three sessions are split
# into their two components and each component is stacked row-wise.
# NOTE: X_list / y_list are re-bound here and no longer hold the frames
# collected in the scaler-fitting cell.
cv_test_tuple_dict = {}
for cv, sp_list in enumerate(sequence_pair_list_per_cv):
    X_list, y_list = list(zip(*[sp.test_tuple for sp in sp_list]))
    test_tuple = np.vstack(X_list), np.vstack(y_list)
    cv_test_tuple_dict[cv] = test_tuple

In [27]:
# Walk-forward (out-of-sample) sequences, one per market session, built from
# the week that follows the training period.
wf_sequence_list = []
for i, chunk in enumerate([morning_wf_chunk, afternoon_wf_chunk, night_wf_chunk]):

    # Scale with the scaler fitted on the training period; it is not re-fitted here.
    X = chunk.lob.loc[:, hp.features].join(chunk.factor)
    X = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)
    y = make_class(chunk.label, epsilon)
    # Keep only the rows that received a class label.
    X = X.reindex(y.index)
    y = y.reindex(X.index)
    #transform y here

    # batch_size is set to the number of labelled rows minus the lookback and
    # shuffling is off - presumably so that the whole session fits into a
    # single, chronologically ordered batch (TODO confirm against stride_data).
    wf_sequence : stride_data.StrideData = stride_data.StrideData(X, y, lookback=hp.lookback, 
                                            batch_size=y.shape[0] - hp.lookback, batch_no=None, 
                                            shuffle=False, replace=False)

    wf_sequence_list.append(wf_sequence)

In [28]:
# Make the run repeatable: hp.seed was configured but never applied, so weight
# initialisation differed on every execution. set_random_seed seeds the
# Python, NumPy and TensorFlow generators in one call.
tf.keras.utils.set_random_seed(hp.seed)

# Stop a fold once the validation loss has not improved for 15 epochs and roll
# the model back to its best weights.
early_stopper = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=15,
    verbose=1,
    mode="auto",
    baseline=None,
    restore_best_weights=True,
)

for cv_count, sq in enumerate(sequence_list_per_cv):
    print('running cv' + str(cv_count))
    # Per-epoch training log of this fold.
    csv_logger = CSVLogger(fr'{base_output_folder}\callback_logs\{run_prefix}\_temp\{run_prefix}_{cv_count}.csv')
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=hp.init_learning_rate,
        decay_steps=hp.decay_steps,
        decay_rate=hp.lr_decay)
    try:
        opt = opt_dict[hp.opt](learning_rate=lr_schedule)
    except Exception:
        # Fall back to a constant learning rate if the optimizer rejects a schedule.
        opt = opt_dict[hp.opt](learning_rate=hp.init_learning_rate)

    # Validation data of this fold.
    test_tuple = cv_test_tuple_dict[cv_count]

    # Assemble the model body from the selected schematic.
    model = tf.keras.Sequential()
    for l, p in schematic_dict[hp.schematic]:
        model.add(l(**p))

    model.add(layers.Dense(3, activation='softmax')) # add output node

    model.compile(loss=loss_dict[hp.loss], optimizer=opt, metrics=[], weighted_metrics=[])
    model.fit(x=sq,
              use_multiprocessing=False,
              validation_data=test_tuple,
              epochs=hp.epochs,
              verbose=1,
              callbacks=[csv_logger, early_stopper])
    # Only the first fold is trained for now.
    break

running cv0
Epoch 1/100
Epoch 2/100
Epoch 3/100
 74/880 [=>............................] - ETA: 1:03 - loss: 0.0852

KeyboardInterrupt: 

In [29]:
# Predict on the walk-forward data of the first session in wf_sequence_list
# (built from morning_wf_chunk): first batch, first element of that batch.
# Assumes a batch is an (X, y) pair - TODO confirm against stride_data.
output = model.predict(wf_sequence_list[0][0][0])



In [30]:
# Second element of the same batch, taken as the true one-hot labels.
# Column names follow the -1 / 0 / 1 column order produced by make_class.
true_df = pd.DataFrame(wf_sequence_list[0][0][1], columns=['true_short','true_neutral','true_long'])

In [31]:
# Predicted class probabilities next to the true one-hot labels, row by row
# (both frames carry a default integer index, so the join is positional).
predicted = pd.DataFrame(output, columns=['short','neutral','long']).join(true_df)
predicted

Unnamed: 0,short,neutral,long,true_short,true_neutral,true_long
0,0.212784,0.53729,0.249926,0,1,0
1,0.212784,0.53729,0.249925,0,1,0
2,0.212784,0.53729,0.249925,0,1,0
3,0.212784,0.53729,0.249926,0,1,0
4,0.212784,0.53729,0.249926,0,1,0
...,...,...,...,...,...,...
34990,0.212784,0.53729,0.249926,0,1,0
34991,0.212784,0.53729,0.249925,0,1,0
34992,0.212784,0.53729,0.249926,0,1,0
34993,0.212784,0.53729,0.249926,0,1,0


In [33]:
# Among the rows where 'neutral' has the highest predicted probability, the
# share of each true class (mean of the true one-hot columns).
predicted[predicted['neutral'] == predicted.iloc[:, :3].max(1)].iloc[:,3:].mean()

true_short      0.126618
true_neutral    0.688413
true_long       0.184969
dtype: float64

In [67]:
# Layer-by-layer summary of the current model.
# NOTE(review): the saved output of this cell lists a single LSTM(32) followed
# by batch normalisation, which does not match 'stride2_depth_simple'; it
# appears to come from an earlier run with a different schematic. Re-run after
# training to refresh it.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_53 (LSTM)              (None, 32)                7040      
                                                                 
 batch_normalization_54 (Bat  (None, 32)               128       
 chNormalization)                                                
                                                                 
 dense_36 (Dense)            (None, 3)                 99        
                                                                 
Total params: 7,267
Trainable params: 7,203
Non-trainable params: 64
_________________________________________________________________
