In [58]:
import pandas as pd
import numpy as np
from random import random
from numpy.random import shuffle
from sklearn import preprocessing 
from collections import deque
# from tensorflow.contrib.rnn import *

# df = pd.read_table("E:/Stock Data/30 min data/QQQ.txt", delimiter=',', 
#                    names=['date', 'time', 'low', 'high', 'open', 'close', 'volume'])

SEQ_LEN = 30  # how long of a preceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 7  # how far into the future are we trying to predict?
RATIO_TO_PREDICT = "QQQ"

def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0

def preprocess_df(df):
    df = df.drop("future", 1)  # don't need this anymore.

#     for col in df.columns:  # go through all of the columns
#         if col != "target":  # normalize all ... except for the target itself!
#             df[col] = df[col].pct_change()  # pct change "normalizes" 
#             df.dropna(inplace=True)  # remove the nas created by pct_change
#             df[col] = preprocessing.scale(df[col].values)  # scale between 0 and 1.
    
    df.dropna(inplace=True)  # cleanup
    
    sequential_data = []  # this is a list that will CONTAIN the sequences
    prev_days = deque(maxlen=SEQ_LEN)  #They are made with deque, which keeps the maximum length by popping out older values as new ones come in
 
    for i in df.values:  # iterate over the values
        prev_days.append([n for n in i[:-1]])  # store all but the target
        if len(prev_days) == SEQ_LEN:  # check for 60 sequences
            sequential_data.append([np.array(prev_days), i[-1]])  
            
    np.random.shuffle(sequential_data)  # shuffle for good measure.
    X = []
    y = []

    for seq, target in sequential_data:  # going over our new sequential data
        X.append(seq)  # X is the sequences
        y.append(target)  # y is the targets/labels (buys vs sell/notbuy)
  
    return np.array(X), y  # return X and y...and make X a numpy array

main_df = pd.DataFrame() # begin empty

ratios = ["QQQ", "SPY", "DIA", "IWM"]  # the 4 ETFs we want to consider
for ratio in ratios:  # begin iteration
    print(ratio)
    dataset = f'E:/Stock Data/{ratio}.txt'  # get the full path to the file.
    df = pd.read_csv(dataset, names=['date', 'low', 'high', 'open', 'close', 'volume'])  # read in specific file
    df['21ma'] = df['close'].rolling(window=21, min_periods=0).mean()
    df['7ma'] = df['close'].rolling(window=7, min_periods=0).mean()
    df['ratio_ma'] = df['21ma'] / df['7ma']
    m,_  = np.polyfit(df.index, df['21ma'], deg=1) # is there a way to create a rolling slope based
    #  rename volume and close to include the ETF name:
    df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume", "21ma": f"{ratio}_21ma", "7ma": f"{ratio}_7ma", "ratio_ma": f"{ratio}_ratio_ma"}, inplace=True)

    df.set_index("date", inplace=True)  # set time as index 
    df = df[[f"{ratio}_close", f"{ratio}_volume", f"{ratio}_21ma", f"{ratio}_7ma", f"{ratio}_ratio_ma"]]  # ignore the other columns

    if len(main_df)==0:  # if the dataframe is empty
        main_df = df  # then it's just the current df
    else:  # otherwise, join this data to the main one
        main_df = main_df.join(df)

main_df.fillna(method="ffill", inplace=True)  # if there are gaps in data, use previously known values
main_df.dropna(inplace=True)

# print(main_df.head())  

print(m21)
    
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))

print(main_df[[f'{RATIO_TO_PREDICT}_close', 'future', 'target']].head(10))
# print(main_df.future)
print(main_df.head())
sequential_data = []  # this is a list that will CONTAIN the sequences
prev_days = deque(maxlen=SEQ_LEN)  # These will be our actual sequences. They are made with deque, which keeps the maximum length by popping out older values as new ones come in

times = sorted(main_df.index.values)  # get the times
last_5pct = sorted(main_df.index.values)[-int(0.05*len(times))]  # get the last 5% of the times

validation_main_df = main_df[(main_df.index >= last_5pct)]  # make the validation data where the index is in the last 5%
main_df = main_df[(main_df.index < last_5pct)]  # now the main_df is all the data up to the last 5%

buys = []  # list that will store our buy sequences and targets
sells = []  # list that will store our sell sequences and targets

for seq, target in sequential_data:  # iterate over the sequential data
    if target == 0:  # if it's a "not buy"
        sells.append([seq, target])  # append to sells list
    elif target == 1:  # otherwise if the target is a 1...
        buys.append([seq, target])  # it's a buy!

np.random.shuffle(buys)  # shuffle the buys
np.random.shuffle(sells)  # shuffle the sells!

lower = min(len(buys), len(sells))  # what's the shorter length?

buys = buys[:lower]  # make sure both lists are only up to the shortest length.
sells = sells[:lower]  # make sure both lists are only up to the shortest length.

sequential_data = buys+sells  # add them together
np.random.shuffle(sequential_data)  # another shuffle, so the model doesn't get confused with all 1 class then the other

train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

print (len(validation_y))
import time

EPOCHS = 100
BATCH_SIZE = 512
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"  # a unique name for the model

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint

model = Sequential()
model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())  #normalizes activation outputs, same reason you want to normalize your input data.

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

# model.add(LSTM(256, return_sequences=True))
# model.add(Dropout(0.1))
# model.add(BatchNormalization())

model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)
# TensorBoard callback:

tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))


filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')) # saves only the best one

# Train model
history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard, checkpoint],
    )
# Score model
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
model.save("models/{}".format(NAME))


QQQ
SPY
DIA
IWM
0.02208672341597145
            QQQ_close    future  target
date                                   
05/26/2000   77.62480  92.94000       1
05/30/2000   85.00000  92.75000       1
05/31/2000   83.09375  93.68750       1
06/01/2000   87.62500  90.86135       1
06/02/2000   93.50050  94.06600       1
06/05/2000   93.86000  91.93440       0
06/06/2000   91.18680  93.65710       1
06/07/2000   92.94000  94.43800       1
06/08/2000   92.75000  98.05319       1
06/09/2000   93.68750  97.93800       1
            QQQ_close  QQQ_volume   QQQ_21ma    QQQ_7ma  QQQ_ratio_ma  \
date                                                                    
05/26/2000   77.62480    18086776  85.820114  79.896586      1.074140   
05/30/2000   85.00000    30061564  85.367690  79.798543      1.069790   
05/31/2000   83.09375    29029600  84.772455  80.013364      1.059479   
06/01/2000   87.62500    28897100  84.644121  80.870507      1.046662   
06/02/2000   93.50050    32000192  84.856955  

KeyboardInterrupt: 