# Algo-trading: data preparation


> Author: <b>Nicola Bini</b>
>
> Date:   <b>06/17/2021</b>
>
>
> <b>Team 2:</b>
> <i>
> <br>Nicola Bini
> <br>Felipe Domingues
> <br>Tri Dung Dinh
> <br>Manuel Echazarra
> </i>

# Summary

This is a Keras LSTM model to predict whether or not the stock will close at higher price compared to the closing
price of the previous day.
Its predictions are then saved in a .csv file as a signal for the bt library.

In [12]:
### Import libraries ###
from keras.models import Sequential       # Sequential model
from keras.layers import Dense            # Dense layer
from keras.layers import LSTM             # LSTM layer
from keras.layers import Dropout          # Dropout layer
import keras
from keras.utils.vis_utils import plot_model
#from tensorflow   import set_random_seed  # set random seed

import pandas            as pd
import numpy             as np
import matplotlib.pyplot as plt

# Set random seed for reproducible results
np.random.seed(42)

In [2]:
# Load data
data = pd.read_csv("./final_data.csv")
data = data.rename(columns = {"Unnamed: 0":"date"})
data = data.set_index("date")
data

Unnamed: 0_level_0,DNN,HGEN,ICLN,OCGN,VGT,VUZI,XLV,^NBI,price%_DNN,price%_HGEN,...,sma_XLV,sma_^NBI,profit_DNN,profit_HGEN,profit_ICLN,profit_OCGN,profit_VGT,profit_VUZI,profit_XLV,profit_^NBI
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-09-18,0.44,15.600000,9.390000,357.000000,102.790001,5.150000,70.910004,3732.669922,0.000000,-0.031056,...,1.014546,1.053607,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0
2015-09-21,0.42,14.300000,9.410000,342.600006,103.639999,4.880000,69.970001,3568.250000,-0.045455,-0.083333,...,1.013110,1.051290,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2015-09-22,0.42,13.200000,9.200000,315.000000,102.019997,4.780000,69.540001,3508.300049,0.000000,-0.076923,...,1.011404,1.048319,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
2015-09-23,0.41,13.700000,9.060000,298.200012,101.180000,4.780000,69.470001,3488.520020,-0.023810,0.037879,...,1.009513,1.044806,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
2015-09-24,0.41,12.150000,9.020000,282.000000,101.080002,5.230000,68.760002,3418.729980,0.000000,-0.113139,...,1.007465,1.040933,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-02,1.26,19.360001,22.500000,9.455000,372.720001,19.990000,121.330002,4749.950195,-0.045455,0.039184,...,1.067943,1.019637,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
2021-06-03,1.28,18.240000,22.090000,8.890000,368.730011,18.870001,121.699997,4744.620117,0.015873,-0.057851,...,1.068541,1.019038,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2021-06-04,1.37,18.580000,22.299999,8.720000,375.700012,18.730000,122.089996,4803.450195,0.070313,0.018640,...,1.069211,1.019199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-06-07,1.44,20.680000,22.420000,10.300000,376.220001,19.400000,122.529999,4977.330078,0.051095,0.113025,...,1.069835,1.019640,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0


In [3]:
# DNN and ICLN Energy
# OCGN and ^NBI Biotechonlogy
# HGEN and XLV Healthcare
# VUGI and VGT Vision 


ticket     = 'OCGN'
index_fund = '^NBI'

lookback = 15 # Days
split_date = '2018-06-01' # Data goes from '2015-09-18' to '2021-06-08'

drop_columns = ["profit_" + index_fund,ticket,index_fund]
for col in data.columns:
    #print(col)
     if ticket not in col and index_fund not in col:
            drop_columns.append(col)

        
df_data = data.drop(drop_columns, axis=1)

df_train = df_data[df_data.index < split_date]
df_test  = df_data[df_data.index > split_date]

train_date = df_train.reset_index()
train_date = train_date['date']
train_date
test_date = df_test.reset_index()
test_date = test_date['date']
test_date

0      2018-06-04
1      2018-06-05
2      2018-06-06
3      2018-06-07
4      2018-06-08
          ...    
754    2021-06-02
755    2021-06-03
756    2021-06-04
757    2021-06-07
758    2021-06-08
Name: date, Length: 759, dtype: object

In [4]:
drop_columns

['profit_^NBI',
 'OCGN',
 '^NBI',
 'DNN',
 'HGEN',
 'ICLN',
 'VGT',
 'VUZI',
 'XLV',
 'price%_DNN',
 'price%_HGEN',
 'price%_ICLN',
 'price%_VGT',
 'price%_VUZI',
 'price%_XLV',
 'RSI_DNN',
 'RSI_HGEN',
 'RSI_ICLN',
 'RSI_VGT',
 'RSI_VUZI',
 'RSI_XLV',
 'sma_DNN',
 'sma_HGEN',
 'sma_ICLN',
 'sma_VGT',
 'sma_VUZI',
 'sma_XLV',
 'profit_DNN',
 'profit_HGEN',
 'profit_ICLN',
 'profit_VGT',
 'profit_VUZI',
 'profit_XLV']

In [5]:
x_train = df_train.drop(["profit_" + ticket], axis=1)
y_train = df_train['profit_' + ticket]

x_test = df_test.drop(["profit_" + ticket], axis=1)
y_test = df_test['profit_' + ticket]

In [6]:
def create_dataset_with_lookback(data, lookback):
    
    data = np.array(data)
    
    data_w_timesteps = np.zeros((data.shape[0]-lookback, lookback, data.shape[1]), dtype=np.float)
    #print("data_w_timesteps shape: ", data_w_timesteps.shape)
    
    for i in range(data_w_timesteps.shape[0]):
        for i2 in range(10):
            data_w_timesteps[i][i2] = data[i + i2]
    
    return data_w_timesteps
    
# Data with time steps
x_train_tm = create_dataset_with_lookback(x_train, lookback)
x_test_tm = create_dataset_with_lookback(x_test, lookback)

In [7]:
train_date = train_date.tail(y_train.shape[0]-lookback-1)
test_date  = test_date.tail(y_test.shape[0]-lookback)

x_train    = x_train[1:]
y_train_tm = y_train.tail(y_train.shape[0]-lookback-1)
y_test_tm  = y_test.tail(y_test.shape[0]-lookback)

x_train_tm = create_dataset_with_lookback(x_train, lookback)
x_test_tm = create_dataset_with_lookback(x_test, lookback)

In [8]:
print("x_train shape: ", x_train_tm.shape)
print("y_train shape: ", y_train_tm.shape)
print("x_test shape: ",  x_test_tm.shape)
print("y_test shape: ",  y_test_tm.shape)

x_train shape:  (664, 15, 6)
y_train shape:  (664,)
x_test shape:  (744, 15, 6)
y_test shape:  (744,)


In [9]:
y_train_tm.value_counts()

0.0    337
1.0    327
Name: profit_OCGN, dtype: int64

In [13]:
### LSTM model ###

model = Sequential()

model.add(LSTM(units = 250, return_sequences = True, input_shape=(lookback, x_train_tm.shape[2])))
model.add(Dropout(0.2))

model.add(LSTM(units = 250, return_sequences = True))
model.add(Dropout(0.2))

model.add(Dense(units = 50))
model.add(Dropout(0.2))

model.add(Dense(units = 1, activation='sigmoid'))
optimizer = keras.optimizers.Adam(learning_rate=0.05)
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])

#model.build()
history = model.fit(x_train_tm, 
                    y_train_tm,
                    epochs           = 100, 
                    batch_size       = 16,
                    shuffle          = True,
                    validation_split = 0.2,
                    verbose          = 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [15]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 15, 250)           257000    
_________________________________________________________________
dropout_3 (Dropout)          (None, 15, 250)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 15, 250)           501000    
_________________________________________________________________
dropout_4 (Dropout)          (None, 15, 250)           0         
_________________________________________________________________
dense_2 (Dense)              (None, 15, 50)            12550     
_________________________________________________________________
dropout_5 (Dropout)          (None, 15, 50)            0         
_________________________________________________________________
dense_3 (Dense)              (None, 15, 1)            

In [None]:
#0.4578
print(history.history.keys())

plt.plot(history.history['loss'], label="loss")
plt.plot(history.history['accuracy'], label="accuracy")
#plt.plot(history.history['val_loss'], label="val_loss")
#plt.plot(history.history['val_accuracy'],label="val_accuracy")
plt.legend()
plt.show()

In [None]:
### Save train ###

y_train_preds_intervals = model.predict_classes(x_train_tm)
y_train_preds           = []
for pred_interval in y_train_preds_intervals:
    p0 = 0
    p1 = 0
    for pred in pred_interval:
        if pred == 0:
            p0 = p0 + 1
        elif pred == 1:
            p1 = p1 + 1
        else:
            print("Error!!")
            break
    if p1 >= p0:
        y_train_preds.append(1)
    else:
        y_train_preds.append(0)

train_preds = pd.DataFrame({ticket : y_train_preds}, index = train_date)
print(train_preds.value_counts())
train_preds.to_csv("train_signal_" + ticket + ".csv", index=True)


### Save test ### 

y_test_preds_intervals = model.predict_classes(x_test_tm)
y_test_preds           = []
for pred_interval in y_test_preds_intervals:
    p0 = 0
    p1 = 0
    for pred in pred_interval:
        if pred == 0:
            p0 = p0 + 1
        elif pred == 1:
            p1 = p1 + 1
        else:
            print("Error!!")
            break
    if p1 >= p0:
        y_test_preds.append(1)
    else:
        y_test_preds.append(0)

test_preds = pd.DataFrame({ticket : y_test_preds}, index = test_date)
print(test_preds.value_counts())
test_preds.to_csv("test_signal_" + ticket + ".csv", index=True)