# Constants:

In [1]:
# Number of consecutive candles (rows) in each input sequence (previously 240).
SEQ_LEN = 180
# Step, in candles, between the starts of consecutive sequences (previously 5).
CANDLES_SHIFT = 2
# Run/dataset identifier; embedded in the pickle, checkpoint and log paths below.
# NOTE(review): the name appears to encode CANDLES_SHIFT and SEQ_LEN -- keep it
# in sync by hand when those values change.
NAME = "pp1_m5_ov40th004p_shift2_seq180"
# Fraction of rows, taken from the end of the data, held out for validation.
VALIDATION_PCT = 0.2

# Functions:

sequence split

In [2]:
import numpy as np

def splitDf_new(df, seq_len=None, shift=None):
    """Cut ``df`` into overlapping, fixed-length windows.

    Window ``k`` holds rows ``k*shift`` .. ``k*shift + seq_len - 1`` of ``df``.
    Trailing rows that cannot fill a complete window are ignored. Every window
    is an independent copy whose index is renumbered ``0 .. seq_len-1``; the
    input frame itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in chronological order.
    seq_len : int, optional
        Rows per window. Defaults to the module-level ``SEQ_LEN``.
    shift : int, optional
        Rows between the starts of consecutive windows. Defaults to the
        module-level ``CANDLES_SHIFT``.

    Returns
    -------
    list of pandas.DataFrame
        The windows, in order of their start row.

    Raises
    ------
    ValueError
        If ``seq_len`` or ``shift`` is not positive (a zero shift would never
        advance through the frame).
    """
    if seq_len is None:
        seq_len = SEQ_LEN
    if shift is None:
        shift = CANDLES_SHIFT
    if seq_len <= 0 or shift <= 0:
        raise ValueError("seq_len and shift must be positive")

    res = []
    print("")
    print("splitDf")
    # Slice every window straight out of the original frame. Re-slicing and
    # re-indexing the whole remainder on each step costs O(len(df)) per window.
    for start in range(0, len(df) - seq_len + 1, shift):
        window = df.iloc[start:start + seq_len].copy()
        window.index = np.arange(0, seq_len)
        res.append(window)

    print("-done")
    print("")
    return res

balance

In [3]:
def balance(dfs):
    """Equalise the number of buy, sell and hold windows.

    A window's class is the 'target' value in its last row, looked up by the
    label ``len(df) - 1`` (so windows must be indexed 0..n-1): 0 is a sell,
    1 is a buy, anything else is a hold. Each class is cut down to the size of
    the smallest class, keeping the earliest windows in input order.

    Returns a list with the kept buys first, then the sells, then the holds.
    """
    buys, sells, holds = [], [], []
    for window in dfs:
        last_target = window.at[len(window) - 1, 'target']
        if last_target == 0:
            bucket = sells
        elif last_target == 1:
            bucket = buys
        else:
            bucket = holds
        bucket.append(window)

    print("before balancing:")
    print("buys:", len(buys), ", sells:", len(sells), ", holds:", len(holds))

    keep = min(map(len, (buys, sells, holds)))
    return buys[:keep] + sells[:keep] + holds[:keep]

preprocessing

In [4]:
from sklearn.preprocessing import StandardScaler

# Function to preprocess data
def preprocess1_train(df, scaler_factory=None):
    """Fit the pre-sequencing transform on the training frame and apply it.

    ``df`` is modified in place and also returned. Steps:

    1. every column other than 'target', 'quantity_baseUnits' and 'hl_percent'
       is treated as a price column and replaced by its percentage change;
    2. rows containing NaN are dropped once (normally only the first row,
       which has no predecessor to compare against);
    3. every non-target column is scaled by its own freshly fitted scaler
       (zero mean, unit variance with the default scaler);
    4. the index is renumbered 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows in chronological order, including a 'target' column.
    scaler_factory : callable, optional
        Zero-argument callable returning an object with ``fit_transform``.
        Defaults to ``StandardScaler``.

    Returns
    -------
    (pandas.DataFrame, dict)
        The transformed frame and a ``{column: fitted scaler}`` mapping to
        pass to ``apply_preprocess1_val``.
    """
    if scaler_factory is None:
        scaler_factory = StandardScaler

    feature_cols = [col for col in df.columns if col != 'target']
    price_cols = [col for col in feature_cols
                  if col != 'quantity_baseUnits' and col != 'hl_percent']

    for col in price_cols:
        df[col] = df[col].pct_change()
    if price_cols:
        # Drop NaN rows once, after *all* price columns are transformed.
        # Dropping inside the per-column loop threw away one additional valid
        # row for every price column beyond the first.
        df.dropna(inplace=True)

    scaler_dict = {}
    for col in feature_cols:
        scaler = scaler_factory()
        # Scalers work on 2-D input; flatten the (n, 1) result for the column.
        df[col] = np.ravel(scaler.fit_transform(df[col].values.reshape(-1, 1)))
        scaler_dict[col] = scaler
    df.index = np.arange(0, len(df))
    return df, scaler_dict

# Function to apply saved preprocessing to new data
def apply_preprocess1_val(df, scaler_dict):
    """Apply an already fitted pre-sequencing transform to new data.

    Mirrors ``preprocess1_train`` but reuses the scalers in ``scaler_dict``
    instead of fitting new ones. ``df`` is modified in place and also returned.
    Steps:

    1. every column other than 'target', 'quantity_baseUnits' and 'hl_percent'
       is replaced by its percentage change;
    2. rows containing NaN are dropped once (normally only the first row);
    3. every non-target column is transformed with ``scaler_dict[column]``;
    4. the index is renumbered 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in chronological order, including a 'target' column.
    scaler_dict : dict
        ``{column: scaler}`` where each scaler exposes ``transform``.

    Returns
    -------
    pandas.DataFrame
        The transformed frame (the same object as ``df``).

    Raises
    ------
    KeyError
        If a non-target column has no entry in ``scaler_dict``.
    """
    feature_cols = [col for col in df.columns if col != 'target']
    price_cols = [col for col in feature_cols
                  if col != 'quantity_baseUnits' and col != 'hl_percent']

    for col in price_cols:
        df[col] = df[col].pct_change()
    if price_cols:
        # Drop NaN rows once, after *all* price columns are transformed.
        # Dropping inside the per-column loop threw away one additional valid
        # row for every price column beyond the first.
        df.dropna(inplace=True)

    for col in feature_cols:
        scaler = scaler_dict[col]
        # Scalers work on 2-D input; flatten the (n, 1) result for the column.
        df[col] = np.ravel(scaler.transform(df[col].values.reshape(-1, 1)))
    df.index = np.arange(0, len(df))
    return df

sequences

In [5]:
def buildSequences(dfs, seq_len=None):
    """Convert windows into ``[feature_array, label]`` pairs.

    Windows whose length differs from ``seq_len`` are skipped. For every other
    window the label is the 'target' value of its last row and the features
    are all remaining columns, in their original order.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Windows, each containing a 'target' column.
    seq_len : int, optional
        Required window length. Defaults to the module-level ``SEQ_LEN``.

    Returns
    -------
    list
        One ``[numpy.ndarray of shape (seq_len, n_features), label]`` entry
        per accepted window.
    """
    if seq_len is None:
        seq_len = SEQ_LEN

    sequences = []
    for window in dfs:
        if len(window) != seq_len:
            continue
        label = window['target'].iloc[-1]
        # Remove the label column by name rather than by position, so the
        # target can never end up among the features if columns are reordered.
        features = window.drop(columns='target')
        sequences.append([features.to_numpy(copy=True), label])

    return sequences

X y split

In [6]:
def xySplit(seqWithTarget):
    """Separate ``(sequence, target)`` pairs into two stacked arrays.

    Returns ``(X, y)``: ``X`` stacks the sequences along a new first axis and
    ``y`` holds the matching targets in the same order.
    """
    pairs = list(seqWithTarget)
    features = [seq for seq, _ in pairs]
    labels = [target for _, target in pairs]
    return np.array(features), np.array(labels)

# DF manipulation, build training sets:

In [7]:
import pandas as pd
# Load the labeled 5-minute candles and keep only the model inputs plus the label.
# Column order is significant: it fixes the feature order of the arrays built
# later, and 'target' is kept as the last column.
df = pd.read_csv(
    "../historicalData/labeled/HistoricalDataLabeled_BTC_USDT_01072016_01072023_MINUTE_5_ov40_th004p.csv"
)[['close', 'hl_percent', 'quantity_baseUnits', 'target']]
df

Unnamed: 0,close,hl_percent,quantity_baseUnits,target
0,681.84,0.005617,0.488352,0
1,683.40,0.007915,0.239165,0
2,681.95,0.002078,0.103008,0
3,678.86,0.008804,9.084392,0
4,684.39,0.000497,0.236453,0
...,...,...,...,...
736051,30408.88,0.000332,0.091832,0
736052,30413.16,0.000211,0.128622,0
736053,30414.06,0.000341,0.090499,0
736054,30426.45,0.000526,0.205135,0


In [8]:
# Split data into train and validation sets
# Chronological split: the first (1 - VALIDATION_PCT) of the rows train the
# model, the remainder validates it. Copies keep later in-place edits away from `df`.
train_size = int(len(df) * (1 - VALIDATION_PCT))
train_df = df.iloc[0:train_size].copy()
val_df = df.iloc[train_size:len(df)].copy()

In [9]:
# Fit the per-column scalers on the training split only and keep them in
# scaler_dict. Both helpers modify the frame they are given and return it.
train_df, scaler_dict = preprocess1_train(train_df)
# Reuse the training scalers unchanged on the validation split, so no
# validation statistics are used for fitting.
val_df = apply_preprocess1_val(val_df, scaler_dict)

split into dfs with SEQ_LEN rows

In [10]:
# Cut each split into overlapping SEQ_LEN-row windows whose starts are
# CANDLES_SHIFT rows apart. The splits are windowed separately, so no window
# mixes training and validation rows.
train_dfs = splitDf_new(train_df)
val_dfs = splitDf_new(val_df)


splitDf
-done


splitDf
-done



balance buys/sells/holds

In [11]:
# balance buys, sells, and holds
import random

# Seed Python's RNG so the shuffles are reproducible. balance() keeps the
# first windows of each class, so the shuffle order decides which windows
# survive -- without a seed every run produces a different dataset.
SHUFFLE_SEED = 42
random.seed(SHUFFLE_SEED)

# Shuffle first so the windows kept by balance() are a random sample of each class.
random.shuffle(train_dfs)
random.shuffle(val_dfs)
train_dfs = balance(train_dfs)
val_dfs = balance(val_dfs)
# balance() returns the classes grouped together; shuffle again to mix them.
random.shuffle(train_dfs)
random.shuffle(val_dfs)

before balancing:
buys: 104438 , sells: 101495 , holds: 88399
before balancing:
buys: 24698 , sells: 24908 , holds: 23910


build sequences

In [12]:
# Turn every window into a [feature_array, label] pair ...
sequencesTraining = buildSequences(train_dfs)
sequencesValidation = buildSequences(val_dfs)
# ... and shuffle the pairs in place (features and label stay together).
random.shuffle(sequencesTraining)
random.shuffle(sequencesValidation)

split sequence from label

In [13]:
# Separate the pairs into a stacked feature array (x) and a label array (y).
train_x, train_y = xySplit(sequencesTraining)
validation_x, validation_y = xySplit(sequencesValidation)

In [14]:
# Sanity check: container type and shape of both feature arrays.
for arr in (train_x, validation_x):
    print(type(arr))
    print(arr.shape)

<class 'numpy.ndarray'>
(265197, 180, 3)
<class 'numpy.ndarray'>
(71730, 180, 3)


In [15]:
import pickle

# Cache the four arrays on disk as ../trainData/<NAME>_<suffix>.pkl so that
# training can be re-run without rebuilding the dataset.
for suffix, array in (("train_x", train_x),
                      ("train_y", train_y),
                      ("validation_x", validation_x),
                      ("validation_y", validation_y)):
    with open(f'../trainData/{NAME}_{suffix}.pkl', 'wb') as file:
        pickle.dump(array, file)

In [3]:
import pickle

# Reload the cached arrays from ../trainData/<NAME>_*.pkl, the same paths the
# save cell writes. Requires NAME to be defined.
# NOTE(review): this cell's execution count is out of sequence, which suggests
# it was run after a kernel restart as an alternative to rebuilding the data.
# SECURITY: pickle.load can execute arbitrary code; only load files this
# notebook produced itself.
with open(f"../trainData/{NAME}_train_x.pkl", "rb") as file:
    train_x = pickle.load(file)
with open(f"../trainData/{NAME}_train_y.pkl", "rb") as file:
    train_y = pickle.load(file)
with open(f"../trainData/{NAME}_validation_x.pkl", "rb") as file:
    validation_x = pickle.load(file)
with open(f"../trainData/{NAME}_validation_y.pkl", "rb") as file:
    validation_y = pickle.load(file)

# Model:

In [16]:
import tensorflow as tf
# Report how many GPUs TensorFlow can see. Uses the stable
# tf.config.list_physical_devices API instead of its `experimental` alias.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  1


Hyperparameters (fixed values for this run)

In [4]:
# Number of passes over the training set.
EPOCHS = 60

# Samples per gradient update.
batchSize = 96
# Number of stacked LSTM layers.
layers = 2
# Units in each LSTM layer.
nodes = 256
# Units in the fully connected layer placed before the softmax output.
denseNodes = 128

# Rate passed to every Dropout layer (earlier trials: 0.92, 0.88).
# NOTE(review): Keras' Dropout takes the fraction of units to DROP, so 0.8
# discards 80% of activations -- confirm this is not meant as a keep-probability.
dropOut = 0.8
# Dropout applied to the recurrent state inside each LSTM.
rec_dropout = 0
# Regularisation strength: used as the L2 factor, with L1 set to a tenth of it
# (earlier trials: 1e-5, 1e-3). 0 disables regularisation.
l1l2_reg = 0

# Learning rate handed to the Adam optimizer.
learningRate = 0.00001
# NOTE(review): recorded in hp_suffix but not passed to the optimizer anywhere
# in this notebook.
decay = 0

# Human-readable summary of the settings above, stored alongside the run name.
hp_suffix = f"bs({batchSize})_layers({layers})_noded({nodes})_dNodes({denseNodes})_do({int(dropOut*100)}%)_recdo({rec_dropout})_l1l2({l1l2_reg})_lr({learningRate})_decay({decay})"

In [5]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
import os
from tqdm.keras import TqdmCallback
import datetime
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras import regularizers
import json
# ---- model -------------------------------------------------------------------
# Stacked LSTM classifier: `layers` LSTM blocks, one dense block, then a
# 3-way softmax output. Every block is followed by dropout.
model = Sequential()

lstm_count = max(layers, 1)  # at least one LSTM is always built
for layer_idx in range(lstm_count):
    # Only the first layer needs the input shape; later layers infer theirs.
    first_layer_kwargs = {"input_shape": train_x.shape[1:]} if layer_idx == 0 else {}
    model.add(LSTM(nodes,
                   activation="tanh",
                   recurrent_activation="sigmoid",
                   recurrent_dropout=rec_dropout,
                   unroll=False,
                   use_bias=True,
                   # Every LSTM except the last returns the whole sequence so
                   # that the next LSTM receives 3-D input.
                   return_sequences=layer_idx < lstm_count - 1,
                   kernel_regularizer=regularizers.l1_l2(l1=l1l2_reg/10, l2=l1l2_reg),
                   activity_regularizer=regularizers.l2(l1l2_reg),
                   **first_layer_kwargs))
    model.add(Dropout(dropOut))

model.add(Dense(denseNodes,
                activation="relu",
                kernel_regularizer=regularizers.l1_l2(l1=l1l2_reg/10, l2=l1l2_reg),
                activity_regularizer=regularizers.l2(l1l2_reg)))
model.add(Dropout(dropOut))

model.add(Dense(3, activation="softmax"))

# ---- optimizer / compile -----------------------------------------------------
# `learning_rate` replaces the deprecated `lr` alias.
# NOTE: `decay` is recorded in hp_suffix but is not applied to the optimizer.
opt = tf.keras.optimizers.Adam(learning_rate=learningRate)

# Labels are integer class ids, hence the sparse variant of the loss.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=opt,
              metrics=["accuracy"])

# ---- callbacks ---------------------------------------------------------------
# One timestamp per run, so the TensorBoard directory and the hyper-parameter
# record share exactly the same identifier.
run_id = f"{NAME}" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Save the full model after every epoch; Keras fills in {epoch:02d} itself.
filename = "" + NAME + "-{epoch:02d}"
filepath = f"models/{filename}.h5"
os.makedirs(os.path.dirname(filepath), exist_ok=True)
# `save_freq='epoch'` replaces the deprecated `period=1`.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', save_freq='epoch')

# Prepare TensorBoard callback
log_dir = "../logs/fit/" + run_id
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# ---- store hyper-parameters --------------------------------------------------
# Append this run's settings to a JSON file keyed by run id. The file and its
# directory are created on first use instead of being assumed to exist.
file_path = "models/hps/hps.json"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
if os.path.exists(file_path):
    with open(file_path, "r") as json_file:
        data = json.load(json_file)
else:
    data = {}
data[run_id] = hp_suffix
with open(file_path, "w") as json_file:
    json.dump(data, json_file, indent=4)

# ---- train -------------------------------------------------------------------
history = model.fit(
    train_x, train_y,
    batch_size=batchSize,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[checkpoint, TqdmCallback(verbose=0), tensorboard_callback])

  from .autonotebook import tqdm as notebook_tqdm




  super().__init__(name, **kwargs)
  0%|          | 0/60 [00:00<?, ?epoch/s]

Epoch 1/60

  2%|▏         | 1/60 [01:05<1:04:26, 65.53s/epoch, loss=1.1, accuracy=0.365, val_loss=1.07, val_accuracy=0.462]

Epoch 2/60

  3%|▎         | 2/60 [02:06<1:00:56, 63.05s/epoch, loss=1.06, accuracy=0.416, val_loss=1.04, val_accuracy=0.465]

Epoch 3/60

  5%|▌         | 3/60 [03:08<59:10, 62.29s/epoch, loss=1.05, accuracy=0.427, val_loss=1.04, val_accuracy=0.461]  

Epoch 4/60

  7%|▋         | 4/60 [04:09<57:47, 61.92s/epoch, loss=1.05, accuracy=0.437, val_loss=1.03, val_accuracy=0.474]

Epoch 5/60

  8%|▊         | 5/60 [05:10<56:34, 61.72s/epoch, loss=1.04, accuracy=0.443, val_loss=1.03, val_accuracy=0.468]

Epoch 6/60

 10%|█         | 6/60 [06:12<55:25, 61.58s/epoch, loss=1.04, accuracy=0.449, val_loss=1.02, val_accuracy=0.478]

Epoch 7/60

 12%|█▏        | 7/60 [07:13<54:18, 61.48s/epoch, loss=1.03, accuracy=0.453, val_loss=1.02, val_accuracy=0.478]

Epoch 8/60

 13%|█▎        | 8/60 [08:14<53:13, 61.41s/epoch, loss=1.03, accuracy=0.456, val_loss=1.02, val_accuracy=0.479]

Epoch 9/60

 15%|█▌        | 9/60 [09:16<52:10, 61.39s/epoch, loss=1.03, accuracy=0.459, val_loss=1.01, val_accuracy=0.48] 

Epoch 10/60

 17%|█▋        | 10/60 [10:17<51:09, 61.39s/epoch, loss=1.02, accuracy=0.461, val_loss=1.01, val_accuracy=0.48]

Epoch 11/60

 18%|█▊        | 11/60 [11:19<50:12, 61.48s/epoch, loss=1.02, accuracy=0.463, val_loss=1.01, val_accuracy=0.481]

Epoch 12/60

 20%|██        | 12/60 [12:24<50:11, 62.74s/epoch, loss=1.02, accuracy=0.465, val_loss=1.01, val_accuracy=0.482]

Epoch 13/60

 22%|██▏       | 13/60 [13:26<48:49, 62.34s/epoch, loss=1.02, accuracy=0.466, val_loss=1.01, val_accuracy=0.482]

Epoch 14/60

 23%|██▎       | 14/60 [14:27<47:35, 62.08s/epoch, loss=1.02, accuracy=0.467, val_loss=1.01, val_accuracy=0.483]

Epoch 15/60

 25%|██▌       | 15/60 [15:28<46:22, 61.83s/epoch, loss=1.02, accuracy=0.467, val_loss=1.01, val_accuracy=0.483]

Epoch 16/60

 27%|██▋       | 16/60 [16:29<45:02, 61.43s/epoch, loss=1.02, accuracy=0.469, val_loss=1.01, val_accuracy=0.481]

Epoch 17/60

 28%|██▊       | 17/60 [17:28<43:33, 60.77s/epoch, loss=1.01, accuracy=0.47, val_loss=1.01, val_accuracy=0.482] 

Epoch 18/60

 30%|███       | 18/60 [18:27<42:12, 60.29s/epoch, loss=1.01, accuracy=0.47, val_loss=1.01, val_accuracy=0.484]

Epoch 19/60

 32%|███▏      | 19/60 [19:27<41:00, 60.02s/epoch, loss=1.01, accuracy=0.471, val_loss=1.01, val_accuracy=0.485]

Epoch 20/60

 33%|███▎      | 20/60 [20:26<39:51, 59.78s/epoch, loss=1.01, accuracy=0.471, val_loss=1.01, val_accuracy=0.485]

Epoch 21/60

 35%|███▌      | 21/60 [21:25<38:45, 59.64s/epoch, loss=1.01, accuracy=0.473, val_loss=1.01, val_accuracy=0.485]

Epoch 22/60

 37%|███▋      | 22/60 [22:25<37:42, 59.53s/epoch, loss=1.01, accuracy=0.473, val_loss=1.01, val_accuracy=0.486]

Epoch 23/60

 38%|███▊      | 23/60 [23:24<36:40, 59.47s/epoch, loss=1.01, accuracy=0.473, val_loss=1.01, val_accuracy=0.484]

Epoch 24/60

 40%|████      | 24/60 [24:23<35:37, 59.38s/epoch, loss=1.01, accuracy=0.474, val_loss=1, val_accuracy=0.484]   

Epoch 25/60

 42%|████▏     | 25/60 [25:23<34:39, 59.41s/epoch, loss=1.01, accuracy=0.474, val_loss=1.01, val_accuracy=0.484]

Epoch 26/60

 43%|████▎     | 26/60 [26:22<33:39, 59.41s/epoch, loss=1.01, accuracy=0.474, val_loss=1.01, val_accuracy=0.486]

Epoch 27/60

 45%|████▌     | 27/60 [27:21<32:40, 59.40s/epoch, loss=1.01, accuracy=0.475, val_loss=1, val_accuracy=0.484]   

Epoch 28/60

 47%|████▋     | 28/60 [28:21<31:38, 59.33s/epoch, loss=1.01, accuracy=0.475, val_loss=1, val_accuracy=0.486]

Epoch 29/60

 48%|████▊     | 29/60 [29:20<30:36, 59.25s/epoch, loss=1.01, accuracy=0.476, val_loss=1, val_accuracy=0.485]

Epoch 30/60

 50%|█████     | 30/60 [30:19<29:36, 59.20s/epoch, loss=1.01, accuracy=0.477, val_loss=1, val_accuracy=0.482]

Epoch 31/60

 52%|█████▏    | 31/60 [31:18<28:36, 59.18s/epoch, loss=1.01, accuracy=0.477, val_loss=1, val_accuracy=0.486]

Epoch 32/60

 53%|█████▎    | 32/60 [32:17<27:36, 59.16s/epoch, loss=1.01, accuracy=0.477, val_loss=1.01, val_accuracy=0.484]

Epoch 33/60

 55%|█████▌    | 33/60 [33:16<26:37, 59.17s/epoch, loss=1.01, accuracy=0.479, val_loss=1, val_accuracy=0.485]   

Epoch 34/60

 57%|█████▋    | 34/60 [34:15<25:38, 59.16s/epoch, loss=1, accuracy=0.478, val_loss=1, val_accuracy=0.485]   

Epoch 35/60

 58%|█████▊    | 35/60 [35:14<24:38, 59.13s/epoch, loss=1, accuracy=0.478, val_loss=1, val_accuracy=0.484]

Epoch 36/60

 60%|██████    | 36/60 [36:14<23:40, 59.17s/epoch, loss=1, accuracy=0.479, val_loss=1.01, val_accuracy=0.484]

Epoch 37/60

 62%|██████▏   | 37/60 [37:13<22:41, 59.18s/epoch, loss=1, accuracy=0.478, val_loss=1, val_accuracy=0.485]   

Epoch 38/60

 63%|██████▎   | 38/60 [38:12<21:41, 59.15s/epoch, loss=1, accuracy=0.479, val_loss=1, val_accuracy=0.486]

Epoch 39/60

 65%|██████▌   | 39/60 [39:11<20:41, 59.14s/epoch, loss=1, accuracy=0.479, val_loss=1, val_accuracy=0.486]

Epoch 40/60

 67%|██████▋   | 40/60 [40:10<19:43, 59.16s/epoch, loss=1, accuracy=0.479, val_loss=1.01, val_accuracy=0.485]

Epoch 41/60

 68%|██████▊   | 41/60 [41:09<18:44, 59.17s/epoch, loss=1, accuracy=0.479, val_loss=1, val_accuracy=0.486]   

Epoch 42/60

 70%|███████   | 42/60 [42:09<17:45, 59.20s/epoch, loss=1, accuracy=0.48, val_loss=1, val_accuracy=0.487] 

Epoch 43/60

 72%|███████▏  | 43/60 [43:08<16:46, 59.20s/epoch, loss=1, accuracy=0.479, val_loss=1, val_accuracy=0.485]

Epoch 44/60

 73%|███████▎  | 44/60 [44:07<15:47, 59.22s/epoch, loss=1, accuracy=0.479, val_loss=1, val_accuracy=0.486]

Epoch 45/60

 75%|███████▌  | 45/60 [45:06<14:48, 59.23s/epoch, loss=1, accuracy=0.479, val_loss=1.01, val_accuracy=0.485]

Epoch 46/60

 77%|███████▋  | 46/60 [46:06<13:48, 59.21s/epoch, loss=1, accuracy=0.481, val_loss=1.01, val_accuracy=0.486]

Epoch 47/60

 78%|███████▊  | 47/60 [47:05<12:49, 59.21s/epoch, loss=1, accuracy=0.48, val_loss=1, val_accuracy=0.485]    

Epoch 48/60

 80%|████████  | 48/60 [48:04<11:50, 59.21s/epoch, loss=1, accuracy=0.481, val_loss=1, val_accuracy=0.485]

Epoch 49/60

 82%|████████▏ | 49/60 [49:03<10:51, 59.23s/epoch, loss=1, accuracy=0.481, val_loss=1, val_accuracy=0.486]

Epoch 50/60

 83%|████████▎ | 50/60 [50:02<09:52, 59.24s/epoch, loss=0.999, accuracy=0.482, val_loss=1, val_accuracy=0.487]

Epoch 51/60

 85%|████████▌ | 51/60 [51:02<08:53, 59.26s/epoch, loss=0.999, accuracy=0.481, val_loss=1, val_accuracy=0.487]

Epoch 52/60

 87%|████████▋ | 52/60 [52:01<07:53, 59.24s/epoch, loss=0.999, accuracy=0.481, val_loss=1.01, val_accuracy=0.484]

Epoch 53/60

 88%|████████▊ | 53/60 [53:00<06:54, 59.25s/epoch, loss=0.999, accuracy=0.481, val_loss=1, val_accuracy=0.486]   

Epoch 54/60

 90%|█████████ | 54/60 [54:00<05:55, 59.25s/epoch, loss=0.998, accuracy=0.481, val_loss=1, val_accuracy=0.486]

Epoch 55/60

 92%|█████████▏| 55/60 [54:59<04:56, 59.26s/epoch, loss=0.998, accuracy=0.483, val_loss=1, val_accuracy=0.488]

Epoch 56/60

 93%|█████████▎| 56/60 [55:58<03:57, 59.26s/epoch, loss=0.998, accuracy=0.482, val_loss=1.01, val_accuracy=0.487]

Epoch 57/60

 95%|█████████▌| 57/60 [56:57<02:57, 59.28s/epoch, loss=0.998, accuracy=0.482, val_loss=1, val_accuracy=0.483]   

Epoch 58/60

 97%|█████████▋| 58/60 [57:57<01:58, 59.31s/epoch, loss=0.998, accuracy=0.482, val_loss=1, val_accuracy=0.487]

Epoch 59/60

 98%|█████████▊| 59/60 [58:56<00:59, 59.27s/epoch, loss=0.998, accuracy=0.482, val_loss=1, val_accuracy=0.485]

Epoch 60/60

100%|██████████| 60/60 [59:55<00:00, 59.22s/epoch, loss=0.997, accuracy=0.482, val_loss=1, val_accuracy=0.486]



100%|██████████| 60/60 [59:55<00:00, 59.93s/epoch, loss=0.997, accuracy=0.482, val_loss=1, val_accuracy=0.486]
