In [56]:
import pandas as pd
from collections import deque
import random
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM,BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint

In [57]:
df = pd.read_csv("stock_data.csv")

In [58]:
df.head()

Unnamed: 0,Date,SPY,SPY_vol,AAPl,AAPl_vol,MSFT,MSFT_vol
0,2000-01-03,98.96,8164300,3.46,133949200,37.29,53228400
1,2000-01-04,95.09,8089800,3.17,128094400,36.03,54119000
2,2000-01-05,95.26,12177900,3.22,194580400,36.41,64059600
3,2000-01-06,93.73,6227200,2.94,191993200,35.19,54976600
4,2000-01-07,99.17,8066500,3.08,115183600,35.65,62013600


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5134 entries, 0 to 5133
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      5134 non-null   object 
 1   SPY       5134 non-null   float64
 2   SPY_vol   5134 non-null   int64  
 3   AAPl      5134 non-null   float64
 4   AAPl_vol  5134 non-null   int64  
 5   MSFT      5134 non-null   float64
 6   MSFT_vol  5134 non-null   int64  
dtypes: float64(3), int64(3), object(1)
memory usage: 280.9+ KB


In [60]:
df.set_index("Date",inplace=True)


In [61]:
SEQ_LEN = 20  
FUTURE_PERIOD_PREDICT = 3  
RATIO_TO_PREDICT = "MSFT"
EPOCHS = 10
BATCH_SIZE = 64
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}" 

def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0
    
df['future'] = df[f'{RATIO_TO_PREDICT}'].shift(-FUTURE_PERIOD_PREDICT)
df['target'] = list(map(classify, df[f'{RATIO_TO_PREDICT}'], df['future']))

In [62]:
times = sorted(df.index.values)  
last_5pct = sorted(df.index.values)[-int(0.05*len(times))] 

validation_main_df = df[(df.index >= last_5pct)]  
main_df = df[(df.index < last_5pct)]  

In [63]:
main_df.head(20)

Unnamed: 0_level_0,SPY,SPY_vol,AAPl,AAPl_vol,MSFT,MSFT_vol,future,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-01-03,98.96,8164300,3.46,133949200,37.29,53228400,35.19,0
2000-01-04,95.09,8089800,3.17,128094400,36.03,54119000,35.65,0
2000-01-05,95.26,12177900,3.22,194580400,36.41,64059600,35.91,0
2000-01-06,93.73,6227200,2.94,191993200,35.19,54976600,34.99,0
2000-01-07,99.17,8066500,3.08,115183600,35.65,62013600,33.85,0
2000-01-10,99.51,5741700,3.02,126266000,35.91,44963600,34.49,0
2000-01-11,98.32,7503700,2.87,110387200,34.99,46743600,35.91,1
2000-01-12,97.34,6907700,2.7,244017200,33.85,66532400,36.89,1
2000-01-13,98.66,5158300,2.99,258171200,34.49,83144000,34.23,0
2000-01-14,100.0,7437300,3.11,97594000,35.91,73416400,33.91,0


In [64]:
def preprocess_df(df):
    df = df.drop("future", 1)  

    for col in df.columns:  
        if col != "target":  
            df[col] = df[col].pct_change() 
            df.dropna(inplace=True)  

            scaler = preprocessing.MinMaxScaler()
            df[col] = scaler.fit_transform(df[col].values.reshape(-1,1))
            #df[col] = preprocessing.scale(df[col].values)  #
    
    df.dropna(inplace=True) 
    

    sequential_data = []  
    prev_days = deque(maxlen=SEQ_LEN)  
    
    for i in df.values:  
        prev_days.append([n for n in i[:-1]]) 
        if len(prev_days) == SEQ_LEN: 
            sequential_data.append([np.array(prev_days), i[-1]])  
            
    random.shuffle(sequential_data)  

    buys = []  
    sells = []  

    for seq, target in sequential_data:  
        if target == 0:  
            sells.append([seq, target])  
        elif target == 1:  
            buys.append([seq, target])  

    random.shuffle(buys)  
    random.shuffle(sells)  

    lower = min(len(buys), len(sells))  

    buys = buys[:lower]  
    sells = sells[:lower]  

    sequential_data = buys+sells  
    random.shuffle(sequential_data)  

    X = []
    y = []

    for seq, target in sequential_data: 
        X.append(seq)  
        y.append(target)  

    return np.array(X), np.array(y)  


In [65]:
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

In [66]:
print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {len(train_y)-np.count_nonzero(train_y)}, buys: {np.count_nonzero(train_y)}")
print(f"VALIDATION Dont buys: {len(validation_y)-np.count_nonzero(validation_y)}, buys: {np.count_nonzero(validation_y)}")

train data: 4640 validation: 178
Dont buys: 2320, buys: 2320
VALIDATION Dont buys: 89, buys: 89


In [67]:
model = Sequential()
model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

In [68]:

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

In [69]:
tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))

filepath = "RNN_Final-{epoch:02d}-{val_accuracy:.2f}"  # unique file name that will include the epoch and the validation acc for that epoch
checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')) # saves only the best ones



In [70]:
history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard, checkpoint],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [71]:
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
model.save("models/{}".format(NAME))

Test loss: 0.6934525370597839
Test accuracy: 0.5
INFO:tensorflow:Assets written to: models/20-SEQ-3-PRED-1595859422/assets
