## Modules

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import datetime
# import tensorflow as tf
# import tensorflow_addons as tfa

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, Input, RNN, GRU, Bidirectional, concatenate, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler


gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)
    
TF_ENABLE_ONEDNN_OPTS=0

## Data

In [6]:
file_path = "data_with_features_ver01.csv"
df = pd.read_csv(file_path)
df

Unnamed: 0.1,Unnamed: 0,Time_UTC_Start,open,high,low,close,volume,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,...,smoothed_minus_DM72,mad72,ATR72,CCI72,plus_DI72,minus_DI72,DX72,ADX72,BOP72,MFI72
0,0,2018-07-24 09:00:00,7991.55,7995.89,7991.52,7994.86,31.547768,2.521824e+05,114.0,17.411796,...,728.673056,161.526701,8.191528,174.835387,16870.670917,8895.447532,30.952366,11.256566,0.141162,55.823217
1,1,2018-07-24 10:00:00,8022.05,8032.85,8022.05,8032.85,11.358339,9.117570e+04,136.0,7.730805,...,689.193056,165.613529,8.218889,180.754117,17529.241584,8385.477220,35.284058,11.590281,0.147690,56.511624
2,2,2018-07-24 11:00:00,8140.18,8145.00,8135.00,8135.00,61.441457,5.001102e+05,296.0,29.340389,...,667.261944,170.471458,8.304167,213.721125,19586.447566,8035.266767,41.819203,12.010127,0.133587,56.879834
3,3,2018-07-24 12:00:00,8257.98,8266.61,8255.02,8263.01,36.020530,2.976317e+05,317.0,16.279052,...,667.261944,176.651978,8.305417,247.779179,20948.826903,8034.057426,44.559987,12.462209,0.138612,57.654664
4,4,2018-07-24 13:00:00,8218.71,8228.68,8216.45,8220.00,59.458543,4.887918e+05,284.0,31.777879,...,743.866250,181.885078,8.400694,221.563167,19001.025048,8854.818550,36.423979,12.795011,0.126266,56.079662
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33186,33186,2022-05-07 03:00:00,35953.63,35953.64,35943.60,35951.24,6.376430,2.292318e+05,339.0,3.818980,...,7035.463472,1482.198889,45.395556,-79.503234,9943.275773,15498.132771,21.833921,13.697214,0.042153,61.134462
33187,33187,2022-05-07 04:00:00,35812.22,35826.14,35812.21,35826.14,6.066650,2.173069e+05,348.0,5.129450,...,7222.968611,1503.375019,45.295139,-82.701137,9828.032810,15946.454274,23.738286,13.836674,0.044086,60.964637
33188,33188,2022-05-07 05:00:00,35746.62,35768.06,35722.24,35768.05,34.156320,1.220607e+06,993.0,19.901990,...,7251.375139,1524.164087,45.760833,-83.214330,9728.015831,15846.248004,23.923395,13.976767,0.055374,61.167920
33189,33189,2022-05-07 06:00:00,35898.96,35903.80,35890.03,35900.00,16.707700,5.997658e+05,580.0,10.902570,...,7161.405139,1539.731304,45.436111,-74.759128,10145.099652,15761.483463,21.679369,14.083748,0.045734,61.377534


In [7]:
df.columns

Index(['Unnamed: 0', 'Time_UTC_Start', 'open', 'high', 'low', 'close',
       'volume', 'quote_asset_volume', 'number_of_trades',
       'taker_buy_base_asset_volume',
       ...
       'smoothed_minus_DM72', 'mad72', 'ATR72', 'CCI72', 'plus_DI72',
       'minus_DI72', 'DX72', 'ADX72', 'BOP72', 'MFI72'],
      dtype='object', length=179)

In [8]:
df.drop(columns=["Unnamed: 0"], inplace=True)

## Additional preprocessing

In [9]:
df["day"] = pd.to_datetime(df["Time_UTC_Start"]).dt.day/31
df["month"] = pd.to_datetime(df["Time_UTC_Start"]).dt.month/12
df["year"] = pd.to_datetime(df["Time_UTC_Start"]).dt.year/2018
df["hour"] = pd.to_datetime(df["Time_UTC_Start"]).dt.hour/24

In [10]:
df.drop(columns=["Time_UTC_Start"], inplace=True)

In [11]:
df

Unnamed: 0,open,high,low,close,volume,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,label,...,plus_DI72,minus_DI72,DX72,ADX72,BOP72,MFI72,day,month,year,hour
0,7991.55,7995.89,7991.52,7994.86,31.547768,2.521824e+05,114.0,17.411796,139194.088209,1.0,...,16870.670917,8895.447532,30.952366,11.256566,0.141162,55.823217,0.774194,0.583333,1.000000,0.375000
1,8022.05,8032.85,8022.05,8032.85,11.358339,9.117570e+04,136.0,7.730805,62059.658337,1.0,...,17529.241584,8385.477220,35.284058,11.590281,0.147690,56.511624,0.774194,0.583333,1.000000,0.416667
2,8140.18,8145.00,8135.00,8135.00,61.441457,5.001102e+05,296.0,29.340389,238843.014408,0.0,...,19586.447566,8035.266767,41.819203,12.010127,0.133587,56.879834,0.774194,0.583333,1.000000,0.458333
3,8257.98,8266.61,8255.02,8263.01,36.020530,2.976317e+05,317.0,16.279052,134510.633427,1.0,...,20948.826903,8034.057426,44.559987,12.462209,0.138612,57.654664,0.774194,0.583333,1.000000,0.500000
4,8218.71,8228.68,8216.45,8220.00,59.458543,4.887918e+05,284.0,31.777879,261222.842840,1.0,...,19001.025048,8854.818550,36.423979,12.795011,0.126266,56.079662,0.774194,0.583333,1.000000,0.541667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33186,35953.63,35953.64,35943.60,35951.24,6.376430,2.292318e+05,339.0,3.818980,137290.253366,0.0,...,9943.275773,15498.132771,21.833921,13.697214,0.042153,61.134462,0.225806,0.416667,1.001982,0.125000
33187,35812.22,35826.14,35812.21,35826.14,6.066650,2.173069e+05,348.0,5.129450,183735.644012,1.0,...,9828.032810,15946.454274,23.738286,13.836674,0.044086,60.964637,0.225806,0.416667,1.001982,0.166667
33188,35746.62,35768.06,35722.24,35768.05,34.156320,1.220607e+06,993.0,19.901990,711168.120653,1.0,...,9728.015831,15846.248004,23.923395,13.976767,0.055374,61.167920,0.225806,0.416667,1.001982,0.208333
33189,35898.96,35903.80,35890.03,35900.00,16.707700,5.997658e+05,580.0,10.902570,391364.049595,1.0,...,10145.099652,15761.483463,21.679369,14.083748,0.045734,61.377534,0.225806,0.416667,1.001982,0.250000


In [12]:
# Convert all dataframes to numpy
train_df = df.iloc[0:int(len(df)*0.7),:].copy()
train_data = train_df.to_numpy()
num_rows_train, num_cols_train = train_data.shape

val_df = df.iloc[int(len(df)*0.7):int(len(df)*0.8),:].copy()
val_data = val_df.to_numpy()
num_rows_val, num_cols_val = val_data.shape

test_df = df.iloc[int(len(df)*0.8):].copy()
test_data = test_df.to_numpy()
num_rows_test, num_cols_test = test_data.shape

In [13]:
steps = 32

# Make copies of test_data and train_data
test_data_cpy = test_data.copy()
train_data_cpy = train_data.copy()
val_data_cpy = val_data.copy()

# StandardScaler for training data, we take only {steps} row before to apply StandardScaler() in these rows
for i in range(steps, num_rows_train):
    scaler = StandardScaler().fit(train_data_cpy[i-steps:i])
    train_data[i] = scaler.transform(train_data_cpy[i].reshape(1, -1))
    del scaler

# StandardScaler for validation data, we take only {steps} row before to apply StandardScaler() in these rows
for i in range(steps, num_rows_val):
    scaler = StandardScaler().fit(val_data_cpy[i-steps:i])
    val_data[i] = scaler.transform(val_data_cpy[i].reshape(1, -1))
    del scaler

train_data = train_data[steps+1:]
train_df = train_df.iloc[steps+1:,:]

In [14]:
# StandardScaler for test data, we take only {steps} row before to apply StandardScaler() in these rows
for i in range(num_rows_test):
    if i > steps:
        scaler = StandardScaler().fit(test_data_cpy[i-steps:i])
        test_data[i] = scaler.transform(test_data_cpy[i].reshape(1, -1))
        del scaler
    else:
        scaler = StandardScaler().fit(np.array(train_data_cpy[-steps:].tolist() + test_data_cpy[:i].tolist()))
        test_data[i] = scaler.transform(test_data_cpy[i].reshape(1, -1))
        del scaler

In [15]:
def split_sequences(data, window_size, price_col_idx):
    x = []
    y = []
    for i in range(0,len(data)-window_size):
        x.append(data[i:i+window_size,:]) # Take window_size rows data before
        y.append(data[i+window_size,price_col_idx]) # To predict the current value of 'difference' column
    return np.array(x), np.array(y)

In [16]:
train_data = np.asarray(train_data).astype('float32')
val_data = np.asarray(val_data).astype('float32')
test_data = np.asarray(test_data).astype('float32')

x_train, y_train = split_sequences(train_data, steps, df.columns.get_loc("close"))
x_val, y_val = split_sequences(val_data, steps, df.columns.get_loc("close"))
x_test, y_test = split_sequences(test_data, steps, df.columns.get_loc("close"))

In [17]:
num_of_outputs = 32

In [18]:
print('Build model %s...')

class LSTM4_with_t2v(Model):

    def __init__(self, vol_idx, close_idx):
        super().__init__()
        self.vol_idx = vol_idx
        self.close_idx = close_idx

        self.t2v_first_col = Dense(1, activation=None)
        self.t2v_others_col = Dense(steps-1, activation=None)

        self.vol2v_first_col = Dense(1, activation=None)
        self.vol2v_others_col = Dense(steps-1, activation=None)

        self.close2v_first_col = Dense(1, activation=None)
        self.close2v_others_col = Dense(steps-1, activation=None)

        self.LSTM1 = LSTM(64, return_sequences=True, recurrent_dropout=0.1)
        self.LSTM2 = LSTM(64, return_sequences=True, recurrent_dropout=0.1)
        self.LSTM3 = LSTM(64, return_sequences=False, recurrent_dropout=0.1)

        self.batch_norm = BatchNormalization()
        self.out = Dense(1)


    def call(self, inputs):
        t2v_x1 = self.t2v_first_col(inputs[:,:,-4:])
        t2v_x2 = tf.sin(self.t2v_others_col(inputs[:,:,:-4]))
        t2v = tf.concat([t2v_x1,t2v_x2],2)

        vol2v_x1 = self.vol2v_first_col(inputs[:,:,self.vol_idx:self.vol_idx+1])
        vol2v_x2 = tf.sin(self.vol2v_others_col(inputs[:,:,self.vol_idx:self.vol_idx+1]))
        vol2v = tf.concat([vol2v_x1,vol2v_x2],2)

        close2v_x1 = self.close2v_first_col(inputs[:,:,self.close_idx:self.close_idx+1])
        close2v_x2 = tf.sin(self.close2v_others_col(inputs[:,:,self.close_idx:self.close_idx+1]))
        close2v = tf.concat([close2v_x1,close2v_x2],2)

        new_input = tf.concat([inputs[:,:,:-4], t2v], 2)
        new_input = tf.concat([new_input, vol2v], 2)
        new_input = tf.concat([new_input, close2v], 2)
        
        x1 = self.LSTM1(new_input)
        x1 = self.batch_norm(x1)

        x2 = self.LSTM2(x1)
        x2 = self.batch_norm(x2)

        x3 = self.LSTM3(x2)

        return self.out(x3) 

model = LSTM4_with_t2v(df.columns.get_loc("volume"), df.columns.get_loc("close"))

opt = tf.keras.optimizers.Adam(
    learning_rate=0.001
)
model.compile(loss='mean_absolute_error', optimizer=opt)

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=3, 
                        verbose=1, mode='auto', restore_best_weights=True)

# First test fit:
print("First fit:")
model.fit(x_train, y_train, batch_size=128, epochs=1, validation_data=(x_val, y_val))
print("Done!")

Build model %s...


2022-06-12 19:21:28.437077: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-12 19:21:28.438134: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-12 19:21:28.438311: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-12 19:21:28.438386: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so ret

First fit:
  2/181 [..............................] - ETA: 17s - loss: 1.1217  

2022-06-12 19:21:34.660473: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Done!


In [19]:
print('Train...')

opt = tf.keras.optimizers.Adam(
    learning_rate=0.001
)

model.compile(loss='mean_absolute_error', optimizer=opt)

for i in range(1,3):
    print("Load %i:"%i)
    model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_val, y_val), callbacks=[monitor])

Train...
Load 1:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 7: early stopping
Load 2:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 4: early stopping


In [20]:
print('Train...')

opt = tf.keras.optimizers.Adam(
    learning_rate=0.0001
)

model.compile(loss='mean_absolute_error', optimizer=opt)

for i in range(1,3):
    print("Load %i:"%i)
    model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_val, y_val), callbacks=[monitor])

Train...
Load 1:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 4: early stopping
Load 2:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 4: early stopping


In [21]:
model.summary()

Model: "lstm4_with_t2v"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               multiple                  5         
                                                                 
 dense_1 (Dense)             multiple                  5518      
                                                                 
 dense_2 (Dense)             multiple                  2         
                                                                 
 dense_3 (Dense)             multiple                  62        
                                                                 
 dense_4 (Dense)             multiple                  2         
                                                                 
 dense_5 (Dense)             multiple                  62        
                                                                 
 lstm (LSTM)                 multiple               

In [22]:
y_pred = model.predict(x_test)
pred_data = [0 for _ in range(y_pred.shape[0])]

for i in range(y_pred.shape[0]):
    if i > steps:
        scaler = StandardScaler().fit(test_df["close"][i-steps:i].to_numpy().reshape(-1,1))
        pred_data[i] = float(scaler.inverse_transform(y_pred[i].reshape(1, -1)).reshape(-1))
        del scaler
    else:
        scaler = StandardScaler().fit(np.array(test_df["close"][:i].tolist() + train_df["close"][-steps:].tolist()).reshape(-1,1))
        pred_data[i] = float(scaler.inverse_transform(y_pred[i].reshape(1, -1)).reshape(-1))
        del scaler


y_pred = pd.DataFrame(pred_data)
y_test = pd.DataFrame(test_df["close"].to_numpy())
((y_pred - y_test.shift(1))*(y_test - y_test.shift(1))>=0).sum()/y_test.shape[0]



0    0.545112
dtype: float64