NAME - **`ISHIKA SHARMA`**

E-mail ID - **`ishikasharma.aug2001@gmail.com`**

## DATA SCIENCE Major Project

In [1]:
import pandas as pd
import os
from sklearn import preprocessing
from collections import deque
import numpy as np
import random
import time

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

In [2]:
import zipfile

In [3]:
local_zip = '/content/drive/MyDrive/crypto_data.zip'
zip_ref = zipfile.ZipFile(local_zip, 'r')
zip_ref.extractall('/MyDrive')
zip_ref.close()

In [26]:
#Constants Defined
SEQ_LEN = 60       # how long of a preceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 3   # how far into the future are we trying to predict?
RATIO_TO_PREDICT = "BCH-USD"
EPOCHS = 4
BATCH_SIZE = 64
NAME = f"{RATIO_TO_PREDICT}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

In [5]:
# Creating the actual targets and rule for targets
def classify(current, future):
  if float(future) > float(current):  # if the future price is higher than the current, that's a buy, or a 1
    return 1
  else:
    return 0


In [6]:
def preprocess_df(df):
  df = df.drop('future', 1)

  for col in df.columns:
    if col != "target":   # normalize all ... except target

      df[col] = df[col].pct_change()  # pct change "normalizes" the different currencies 
      #(each crypto coin has vastly diff values, we're really more interested in the other coin's movements)

      df.dropna(inplace=True)  # remove the nas created by pct_change

      df[col] = preprocessing.scale(df[col].values)

  df.dropna(inplace=True)

  sequential_data = []    # list to contain the sequences
  prev_days = deque(maxlen=SEQ_LEN)      # Our actual sequences.
                          # deque keeps the maximum length by 
                          # popping out older values as new ones come in.

  for i in df.values:
    prev_days.append([n for n in i[:-1]])    # store all but the target

    # Make sure we have 60 Sequences
    if len(prev_days) == SEQ_LEN:        
      sequential_data.append([np.array(prev_days), i[-1]])

  random.shuffle(sequential_data)

  buys = []    # list that will store our buy sequences and targets
  sells = []   # list that will store our sell sequences and targets

  for seq, target in sequential_data:
    if target == 0:   # not a buy
      sells.append([seq, target])
    elif target == 1:
      buys.append([seq, target])

  random.shuffle(buys)
  random.shuffle(sells)

  lower = min(len(buys), len(sells))

  # make sure both lists are only up to the shortest length.
  buys = buys[:lower]
  sells = sells[:lower]

  sequential_data = buys + sells
  random.shuffle(sequential_data)

  x = []
  y = []

  for seq, target in sequential_data:
    x.append(seq)
    y.append(target)

  return np.array(x), y



In [27]:
main_df = pd.DataFrame()

ratios = ["BTC-USD", "LTC-USD", "ETH-USD", "BCH-USD"]
for ratio in ratios:

  ratio = ratio.split('.csv')[0]  # split away the ticker from the file-name
  print(ratio)
  dataset = f"/MyDrive/crypto_data/{ratio}.csv"

  df = pd.read_csv(dataset, names=["time", "low", "high", "open", "close", "volume"])

  # rename volume and close to include the ticker so we can still which close/volume is which:
  df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"}, inplace=True)

  df.set_index("time", inplace=True)
  df = df[[f"{ratio}_close", f"{ratio}_volume"]]

  if len(main_df) == 0:   #  dataframe empty
    main_df = df
  else:
    main_df = main_df.join(df)


BTC-USD
LTC-USD
ETH-USD
BCH-USD


In [28]:
main_df.fillna(method="ffill", inplace=True)  # if there are gaps in data, use previously known values
main_df.dropna(inplace=True)

main_df['future'] = main_df[f"{RATIO_TO_PREDICT}_close"].shift(FUTURE_PERIOD_PREDICT)
main_df['target'] = list(map(classify, main_df[f"{RATIO_TO_PREDICT}_close"], main_df["future"]))

main_df.dropna(inplace=True)

#print(main_df[[f"{RATIO_TO_PREDICT}_close", "future"]].head(10))



In [29]:
# here, split away some slice of the future data from the main main_df.
times = sorted(main_df.index.values)
last_5pct = sorted(main_df.index.values)[-int(0.05*len(times))]

# print(last_5pct)

In [30]:
# Split up the data
validation_main_df = main_df[(main_df.index >= last_5pct)]
main_df = main_df[(main_df.index < last_5pct)]

In [31]:
#preprocess_df(main_df)
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

In [32]:
print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

train data: 81428 validation: 3774
Dont buys: 40714, buys: 40714
VALIDATION Dont buys: 1887, buys: 1887


In [33]:
train_y = np.array(train_y, dtype='float64')
validation_y = np.array(validation_y, dtype='float64')

In [34]:
model = Sequential()

model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'])

tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))

 # unique file name that will include the epoch and the validation acc for that epoch
filepath = "RNN_Final-{epoch:02d}-{val_accuracy:.3f}" 

#Saving the best one:
checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')) 

# Train model
history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard, checkpoint]
)



Epoch 1/4




INFO:tensorflow:Assets written to: models/RNN_Final-01-0.824.model/assets


INFO:tensorflow:Assets written to: models/RNN_Final-01-0.824.model/assets


Epoch 2/4




INFO:tensorflow:Assets written to: models/RNN_Final-02-0.869.model/assets


INFO:tensorflow:Assets written to: models/RNN_Final-02-0.869.model/assets


Epoch 3/4




INFO:tensorflow:Assets written to: models/RNN_Final-03-0.882.model/assets


INFO:tensorflow:Assets written to: models/RNN_Final-03-0.882.model/assets


Epoch 4/4




INFO:tensorflow:Assets written to: models/RNN_Final-04-0.899.model/assets


INFO:tensorflow:Assets written to: models/RNN_Final-04-0.899.model/assets


In [35]:
# Score model
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# Save model
model.save("models/{}".format(NAME))

Test loss: 0.221627876162529
Test accuracy: 0.8987811207771301




INFO:tensorflow:Assets written to: models/BCH-USD-60-SEQ-3-PRED-1618497026/assets


INFO:tensorflow:Assets written to: models/BCH-USD-60-SEQ-3-PRED-1618497026/assets
