# Train a SCINet Model for Live Trading Simulation

In [35]:
import os
import sys

# Walk two levels up from the working directory to reach the project root.
cwd = os.getcwd()
parent_dir = os.path.split(cwd)[0]
BASE_DIR = os.path.split(parent_dir)[0]
print(BASE_DIR)

# Put the project root first on the module search path so the `base` package can be imported.
sys.path.insert(0, BASE_DIR)

d:\Studie\Universiteit\Studie\Jaar 4\Advances in Deep Learning\SCINet


In [36]:
from base.train_scinet import train_scinet
from base.preprocess_data import preprocess

In [37]:
import numpy as np
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from time import time
import os
import sys

## Load data

Load data from the `data/` folder. Each file is expected to follow the OHLCV format that is common for stock market data. The volume column is not used: for demonstration purposes the live data uses a timescale of seconds instead of minutes, and input this sparse has a volume of 0 most of the time.

In [30]:
# Column layout expected for the data of each individual pair.
data_format = ["timestamp", "open", "high", "low", "close"]

# Fraction of the dataset that is used (may be 1; note that the first samples in the dataset are used).
fraction_used = 0.01

# How the used data is divided over the train / validation / test sets.
train_frac, val_frac, test_frac = 0.7, 0.2, 0.1

# The next Y_LEN values are predicted from the previous X_LEN values.
X_LEN, Y_LEN = 240, 24

OVERLAPPING, STANDARDIZE = True, True

RANDOM_SEED = None

# Seed every random number generator this notebook imports so that a run is
# repeatable; when RANDOM_SEED is left at None nothing is seeded.
if RANDOM_SEED is not None:
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)

# Names of the pairs to load; each one is read from data/<pair>.csv.
pairs = ["BTCUSD"]
data = {}

for pair in pairs:
    pair_frame = pd.read_csv(f"data/{pair}.csv")
    # The volume column is not used, so it is dropped for every loaded pair
    # (not only BTCUSD); errors="ignore" keeps files without that column working.
    data[pair] = pair_frame.drop(columns="volume", errors="ignore")

## Preprocessing

Before the network is trained, the input data is preprocessed to standardize the input samples and to make the train-validation-test split.

In [31]:
# Gather the preprocessing configuration in one place before handing it over.
preprocess_kwargs = dict(
    data=data,
    symbols=pairs,
    data_format=data_format,
    fraction=fraction_used,
    train_frac=train_frac,
    val_frac=val_frac,
    test_frac=test_frac,
    X_LEN=X_LEN,
    Y_LEN=Y_LEN,
    OVERLAPPING=OVERLAPPING,
    STANDARDIZE=STANDARDIZE,
    standardization_settings={"per_sample": True},
)
results = preprocess(**preprocess_kwargs)

# Report the shape of every entry that came back.
for split_name in results:
    print(f"{split_name}: {results[split_name].shape}")

Starting data preprocessing...
     open    high     low   close
0   93.25   93.30   93.25   93.30
1  100.00  100.00  100.00  100.00
2   93.30   93.30   93.30   93.30
3   93.35   93.47   93.35   93.47
4   93.47   93.47   93.47   93.47 (34653, 4)
Making train/validation/test splits...
Making samples...


100%|██████████| 23993/23993 [00:02<00:00, 9315.17it/s] 
  samples = np.array(samples)


Making samples...


100%|██████████| 6666/6666 [00:00<00:00, 8679.81it/s] 


Making samples...


100%|██████████| 3202/3202 [00:00<00:00, 6994.04it/s] 


Making X-y splits...
X_train: (23993, 240, 4)
y_train: (23993, 24, 4)
X_val: (6666, 240, 4)
y_val: (6666, 24, 4)
X_test: (3202, 240, 4)
y_test: (3202, 24, 4)


## Train model

The code below trains a SCINet model using the preprocessed data above. All the arguments are explained below in comments.

In [33]:
EPOCHS = 16
BATCH_SIZE = 64

# Number of SCINet stacks; the per-stack argument lists below are sized with it.
N_BLOCKS = 2

# Fractions used to combine the SCINet losses (must sum up to 1).
# NOTE(review): presumably one weight per stack, like Y_LEN and output_dim -- confirm against train_scinet.
LOSS_WEIGHTS = [0.4, 0.6]

# Fail fast when N_BLOCKS is changed without updating the loss weights.
assert len(LOSS_WEIGHTS) == N_BLOCKS, "LOSS_WEIGHTS needs one entry per SCINet stack (N_BLOCKS)"
assert abs(sum(LOSS_WEIGHTS) - 1.0) < 1e-9, "LOSS_WEIGHTS must sum up to 1"

training_result = train_scinet(
    X_train=results["X_train"],  # samples of training set
    y_train=results["y_train"],  # labels of training set
    X_val=results["X_val"],  # samples of validation set
    y_val=results["y_val"],  # labels of validation set
    X_test=results["X_test"],  # samples of test set
    y_test=results["y_test"],  # labels of test set
    X_LEN=X_LEN,  # input sample length
    Y_LEN=[Y_LEN] * N_BLOCKS,  # model output sequence length (per scinet stack)
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    output_dim=[results["X_train"].shape[2]] * N_BLOCKS,  # output dimensions (per scinet stack)
    selected_columns=None,  # select on which column is trained, None means all
    hid_size=32,  # hidden size
    num_levels=3,  # number of scinet levels
    kernel=5,  # kernel size
    dropout=0.5,  # dropout probability
    loss_weights=LOSS_WEIGHTS,  # combine scinet losses with fractions (must sum up to 1)
    probabilistic=False,  # probabilistic output
)

# The first element of the training result is the model whose weights are saved.
model = training_result[0]

# The weights are stored under a name built from the pair names and the current Unix time.
model.save_weights(f"model_weights/{'_'.join(pairs)}_{int(time())}")

Initializing training with data:
X_train: (23993, 240, 4), y_train: (23993, 24, 4)
X_val: (6666, 240, 4), y_val: (6666, 24, 4)
X_test: (3202, 240, 4), y_test: (3202, 24, 4)
Building model...
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 240, 4)]     0                                            
__________________________________________________________________________________________________
Block_0 (SCINet)                (None, 24, 4)        259440      input_1[0][0]                    
__________________________________________________________________________________________________
tf.concat (TFOpLambda)          (None, 264, 4)       0           input_1[0][0]                    
                                                                 Block_0[0][0]                    
__