## **Modules**

In [None]:
import os

# Set in its own cell ahead of the `import tensorflow` cell below.
# NOTE(review): level '1' is expected to hide INFO-level messages from TensorFlow's
# C++ backend while keeping warnings and errors — confirm against the TF version in use.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from pandas import DataFrame

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, LSTM, BatchNormalization

from sklearn.metrics import accuracy_score
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Binarizer

# Turn on memory growth for every physical GPU TensorFlow reports.
# On a machine without GPUs the list is empty and the loop does nothing.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

## **Data**

**Import the normalized data**

In [None]:
# Path is relative to the notebook's working directory.
file_path = "normalized_data.csv"
df = pd.read_csv(file_path)
# Bare expression: shows the loaded frame as the cell output.
df

**Define the periods of time that the features have in the data**

For example, the features contain SMA2, the smoothed moving average over 2 days, so 2 must appear in the periods list.

In [None]:
# Look-back periods of the engineered features (e.g. 2 for SMA2).
periods = [2,4,8,12,24,48,96,192]

**Label value counts**

In [None]:
# Number of rows per label value, to check how balanced the target is.
df["label"].value_counts()

### **Data Preprocessing**

**Drop useless column**

In [None]:
# "Unnamed: 0" is the index column that was written into the CSV; it carries no signal.
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a second
# time (or loading a CSV saved without the index) raises KeyError because the column
# is already gone.
df.drop(columns=["Unnamed: 0"], inplace=True, errors="ignore")

**Drop NaN values if any exist**

In [None]:
# Total number of NaN cells in the whole frame:
# the first sum counts NaNs per column, the second adds those counts together.
df.isna().sum().sum()

Alternatively, forward-fill before dropping. Forward filling only propagates past values, so it avoids data leakage.

In [None]:
# Set to True to propagate the last known value forward before dropping rows.
forward_fill = False

if forward_fill:
    # Forward fill only copies values from earlier rows, so no future information leaks in.
    df.ffill(inplace=True)

# Always drop what is still missing. After a forward fill this removes the leading
# rows that have no earlier value to copy from (they would otherwise reach the model
# as NaN); without a forward fill it removes every row containing a NaN, as before.
df.dropna(inplace=True)

**Create train set, validation set and test set**

In [None]:
# Chronological 70% / 15% / 15% split (row order is preserved, nothing is shuffled),
# followed by conversion of each part to a numpy array.
n_rows = len(df)
train_end = int(n_rows * 0.7)
val_end = int(n_rows * 0.85)

train_df = df.iloc[:train_end].copy()
val_df = df.iloc[train_end:val_end].copy()
test_df = df.iloc[val_end:].copy()

train_data_np = train_df.to_numpy()
val_data_np = val_df.to_numpy()
test_data_np = test_df.to_numpy()

**Define window size**

This is the number of rows (the current row plus the ones preceding it) fed to the LSTM model to predict the current row's label.

In [None]:
# Number of consecutive rows in each input sequence built by split_sequences.
window_size = 24

**Split sequence function**

This function creates the feature windows and their labels. Note that the label is already shifted: each row's label is the truth value (True or False) of 'next row close price > current row close price'.

In [None]:
def split_sequences(data, window_size, label_col_idx):
    """Build sliding-window samples and their labels from a 2-D array.

    Every run of ``window_size`` consecutive rows becomes one sample. The label
    column is removed from the features, and each window is rescaled on its own
    to the range [-1, 1] with a fresh ``MinMaxScaler``. The label of a sample is
    the label value of the window's last row.

    Args:
        data: 2-D numpy array whose rows are time steps.
        window_size: number of consecutive rows per sample.
        label_col_idx: position of the label column inside ``data``.

    Returns:
        A pair ``(x, y)`` of numpy arrays: the scaled windows and one label per
        window. Both are empty when ``data`` has fewer than ``window_size`` rows.
    """
    # Feature matrix = every column except the label one.
    features = np.concatenate(
        (data[:, :label_col_idx], data[:, label_col_idx + 1:]), axis=1
    )
    labels = data[:, label_col_idx]

    windows = []
    targets = []
    # `end` is the exclusive upper bound of the window; the window's last row is end - 1.
    for end in range(window_size, len(data) + 1):
        start = end - window_size

        # Scale each window independently, using only the rows it contains.
        window_scaler = MinMaxScaler(feature_range=(-1, 1))
        windows.append(window_scaler.fit_transform(features[start:end, :]))

        # The label is already shifted, so the last row's label is the prediction target.
        targets.append(labels[end - 1])

    return np.array(windows), np.array(targets)

In [None]:
# Position of the target column; identical for all three splits.
label_col_idx = df.columns.get_loc("label")

# Cast every split to float32 before windowing.
train_data_np, val_data_np, test_data_np = (
    np.asarray(split).astype('float32')
    for split in (train_data_np, val_data_np, test_data_np)
)

x_train, y_train = split_sequences(train_data_np, window_size, label_col_idx)
x_val, y_val = split_sequences(val_data_np, window_size, label_col_idx)
x_test, y_test = split_sequences(test_data_np, window_size, label_col_idx)

## **Model and training**

### Build model

In [None]:
print('Build model...')

class LSTM_model(Model):
    """Three stacked LSTM layers followed by a small dense head.

    The first two LSTM layers return full sequences and are each followed by
    their own batch normalization; the third returns only its final state,
    which goes through a 16-unit ReLU layer and a single sigmoid output unit.

    Args:
        num_of_outputs: number of units in each of the three LSTM layers.
    """

    def __init__(self, num_of_outputs:int=32):
        super().__init__()

        self.LSTM1 = LSTM(num_of_outputs, return_sequences=True, recurrent_dropout=0.3)
        self.LSTM2 = LSTM(num_of_outputs, return_sequences=True, recurrent_dropout=0.3)
        self.LSTM3 = LSTM(num_of_outputs, return_sequences=False, recurrent_dropout=0.3)

        # One BatchNormalization layer per position. Re-using a single instance after
        # both LSTM1 and LSTM2 would make the two positions share the same gamma/beta
        # and the same moving mean/variance, even though their activations differ.
        self.batch_norm = BatchNormalization()
        self.batch_norm2 = BatchNormalization()

        self.dense = Dense(16, activation='relu')

        self.out = Dense(1, activation='sigmoid')


    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: batch of input sequences.
            training: forwarded to the LSTM and batch-normalization layers, whose
                behavior differs between training and inference.

        Returns:
            The output of the final sigmoid unit for each sample.
        """
        x1 = self.LSTM1(inputs, training=training)
        x1 = self.batch_norm(x1, training=training)
        x2 = self.LSTM2(x1, training=training)
        x2 = self.batch_norm2(x2, training=training)
        x3 = self.LSTM3(x2, training=training)
        x4 = self.dense(x3)

        return self.out(x4)

# Instantiate with the default width (32 units per LSTM layer).
model = LSTM_model()

# Stop when val_loss has not improved by at least 1e-4 for 15 epochs,
# then restore the best weights seen.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=15, 
                        verbose=1, mode='auto', restore_best_weights=True)

print('Build model successfully')

print("First fit before printing model summary: ")

# A subclassed model has no weights until it has been called on data, so one
# warm-up epoch is run here to let summary() below report the layers.
# NOTE(review): the weights updated by this epoch are kept — the Train cell
# re-compiles but continues with this same `model` instance.
opt = Adam()
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['binary_crossentropy','accuracy'])
model.fit(x_train, y_train, batch_size=32, epochs=1, validation_data=(x_val, y_val))

print("Model summary:")

model.summary()

**Train**

+ Loss: Binary Cross Entropy
+ Optimizer: Adam
+ Metrics: Binary Cross Entropy and Accuracy

In [None]:
print('Train...')

# Re-compile with a fresh Adam optimizer before the real training run.
opt = Adam()
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['binary_crossentropy','accuracy'])

# `callbacks` is documented as a list of Callback instances, so the early-stopping
# monitor is wrapped in a list instead of being passed bare.
model.fit(x_train, y_train, batch_size=32, epochs=100, validation_data=(x_val, y_val), callbacks=[monitor])

print('End of training phase')

**Evaluation**

In [None]:
# Report loss and metrics on each split in turn: train, validation, test.
# The trailing `;` keeps the returned metric values from being echoed as cell output.
model.evaluate(x_train, y_train);
model.evaluate(x_val, y_val);
model.evaluate(x_test, y_test);

**Threshold tuning**

In [None]:
# Predicted probabilities on the validation set. Keep an untouched copy, because
# `pred_val` is overwritten with binarized predictions inside the loop.
pred_val = model.predict(x_val)
tmp_pred_val = pred_val.copy()

# Sweep cut-offs from 0.4000 to 0.5999 in steps of 0.0001 and record the
# validation accuracy obtained with each one.
candidate_thresholds = [i / 10000 for i in range(4000, 6000)]
l = []
for candidate in candidate_thresholds:
    pred_val = (tmp_pred_val >= candidate) * 1
    l.append(accuracy_score(y_val, pred_val))

# The threshold is the cut-off that produced the best accuracy, not the accuracy
# value itself. On ties the smallest such cut-off is kept.
best_accuracy = max(l)
threshold = candidate_thresholds[l.index(best_accuracy)]
print(best_accuracy)
print("Best validation threshold:", threshold)

In [None]:
# Binarize the test-set probabilities with `threshold` from the threshold-tuning cell.
pred_test = model.predict(x_test)
pred_test = (pred_test >= threshold) * 1
print("Final testing accuracy:")
# Bare expression: the accuracy is shown as the cell output.
accuracy_score(y_test, pred_test)

**Note:**

The result depends on the early stopping strategy. In practice, we train the model multiple times and choose the one that gives the best accuracy on the validation set.