# Constants:

In [1]:
SEQ_LEN = 240   # how many past candles to use to predict
CANDLES_SHIFT = 2 # how many candles to shift between sequences
NAME = "r20t0"
VALIDATION_PCT = 0.2

DF initialisation:

In [2]:
import pandas as pd
main_df = pd.read_csv("HistoricalDataClassified.csv")
main_df

Unnamed: 0,BTC_close,BTC_low,BTC_high,BTC_volume,BTC_average,BTC_HLPercent,target
0,675.500000,675.500000,675.500000,0.000100,675.500000,0.000100,0
1,667.000000,667.000000,671.000000,4.137774,668.041259,0.005961,2
2,670.100000,670.100000,672.000000,13.203878,671.999856,0.002827,2
3,667.000007,667.000007,672.000000,0.304313,671.903346,0.007440,2
4,667.000007,667.000007,667.000007,0.000100,667.000007,0.000100,2
...,...,...,...,...,...,...,...
403218,8824.457823,8814.000000,8828.100000,2.099438,8825.360378,0.001597,0
403219,8780.138300,8772.888273,8823.000000,18.767512,8794.423289,0.005680,0
403220,8776.869900,8773.414687,8790.000000,4.273303,8783.317655,0.001887,0
403221,8758.555000,8732.293856,8772.888415,11.747596,8753.805186,0.004627,0


# Functions:

split df

In [3]:
def splitDf(df):
    
    res = []
    print("")
    print("splitDf")
    while len(df) >= SEQ_LEN + len(df.columns) -1:
        first = df.head(SEQ_LEN + len(df.columns) -1).copy()
        first.index = np.arange(0, len(first))
        res.append(first)
        df = df.tail(len(df) - CANDLES_SHIFT)
        df.index = np.arange(0, len(df))

    print("-done")
    print("")
    return res

balance

In [4]:
def balance(dfs):
    
    buys = []
    sells = []
    holds = []
    for df in dfs:
        if df.at[len(df)-1, 'target'] == 0:
            sells.append(df)
        elif df.at[len(df)-1, 'target'] == 1:
            buys.append(df)
        else:
          holds.append(df)

    print("before balancing:")
    print("buys:", len(buys), ", sells:", len(sells), ", holds:", len(holds))

    smallest = min(len(buys), len(sells), len(holds))
    buys = buys[:smallest]
    sells = sells[:smallest]
    holds = holds[:smallest]

    dfsBalanced = buys+sells+holds
    return dfsBalanced

preprocessing

In [5]:
from sklearn import preprocessing
from tqdm import tqdm

def preprocess(dfs):
    
    for df in dfs:
        for col in df.columns:
            if col != "target":
                df[col] = df[col].pct_change()
                df.dropna(inplace=True)
                df[col] = preprocessing.scale(df[col].values)
                df.index = np.arange(0, len(df))

    return dfs

sequences

In [6]:
def buildSequences(dfs):
    
    sequences = []
    for df in dfs:
        if(len(df) == SEQ_LEN):
            label = df.at[SEQ_LEN-1, 'target']
            df = df.iloc[:, :-1]
            dfArray = df.values.tolist()
            sequences.append([np.array(dfArray), label])
    
    return sequences

split

In [7]:
def split(seqWithTarget):

    X = []
    y = []
    for seq, target in seqWithTarget:
        X.append(seq)
        y.append(target)

    return np.array(X),np.array(y)

# DF manipulation, build training sets:

split into dfs with SEQ_LEN rows

In [8]:
import numpy as np
splittedDfs = splitDf(main_df)


splitDf
-done



seperate training and validation

In [9]:
dfsTraining = splittedDfs[:(int(len(splittedDfs) * (1-VALIDATION_PCT)))].copy()
dfsValidation = splittedDfs[(int(len(splittedDfs) * (1-VALIDATION_PCT))):].copy()

shuffle

In [10]:
import random

random.shuffle(dfsTraining)
random.shuffle(dfsValidation)

balance buys/sells/holds

In [11]:
dfsTrainingBalanced = balance(dfsTraining)
dfsValidationBalanced = dfsValidation # balance(dfsValidation) , validation data does not have to be balanced   

buys: 64588 , sells: 63234 , holds: 33369


shuffle

In [12]:
random.shuffle(dfsTrainingBalanced)
random.shuffle(dfsValidationBalanced)

preprocessing

In [None]:
dfsTrainingPreprocessed = preprocess(dfsTrainingBalanced)
dfsValidationPreprocessed = preprocess(dfsValidationBalanced)



build sequences

In [None]:
sequencesTraining = buildSequences(dfsTrainingPreprocessed)
sequencesValidation = buildSequences(dfsValidationPreprocessed)

shuffle

In [None]:
random.shuffle(sequencesTraining)
random.shuffle(sequencesValidation)

split sequence from label

In [None]:
train_x, train_y = split(sequencesTraining)
validation_x, validation_y = split(sequencesValidation)

# Model:

hyper parameters bounds

In [None]:
EPOCHS = 50

batchSize = 32
layers = 2
nodes = 256
denseNodes = 128
dropOut = 0.8
learningRate = 0.0001
decay = 1e-06


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint
import os



# model
model = Sequential()

for _ in range(layers-1):
  model.add(LSTM(nodes, activation="tanh", recurrent_activation = 'sigmoid', recurrent_dropout = 0, unroll = False, use_bias = True, input_shape=(train_x.shape[1:]), return_sequences=True))
  model.add(Dropout(dropOut))
  model.add(BatchNormalization())

model.add(LSTM(nodes, activation="tanh", recurrent_activation = 'sigmoid', recurrent_dropout = 0, unroll = False, use_bias = True, input_shape=(train_x.shape[1:])))
model.add(Dropout(dropOut))
model.add(BatchNormalization())

model.add(Dense(denseNodes, activation="relu"))
model.add(Dropout(dropOut))

model.add(Dense(3, activation="softmax"))



# opt  
opt = tf.keras.optimizers.Adam(lr=learningRate, decay=decay)


model.compile(loss="sparse_categorical_crossentropy",
              optimizer = opt,
              metrics=["accuracy"])


filename = NAME + "-{epoch:02d}"
filepath = f"models/{filename}.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1) # saves every epoch


# train
history = model.fit(
  train_x, train_y,
  batch_size = batchSize,
  epochs = EPOCHS,
  validation_data=(validation_x, validation_y),
  callbacks=[checkpoint])