# Predicting cryptocurrencies using RNNs
The problem can be approached from two different perspectives:
* Classification (e.g. buy, hold, sell)
* Regression (future price)

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/Lorenzo-Giardi/tf-keras/blob/master/6_RNN/3_cryptocurrencies_rnn.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
</table>

## Imports and global parameters

In [1]:
try:
    %tensorflow_version 2.x
except:
    pass

import pandas as pd
import numpy as np
import tensorflow as tf
import random
import time

from sklearn import preprocessing
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, GRU, BatchNormalization, Flatten
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

In [2]:
SEQ_LEN = 60 # last minutes to use as feature
FUTURE_PERIOD_PREDICT = 3 # period over which to make the prediction
RATIO_TO_PREDICT = "LTC-USD"
EPOCHS = 10
BATCH_SIZE = 64
NAME = f'{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}'

## Import Data

In [3]:
col_names = ['time', 'low', 'high', 'open', 'close', 'volume']
df = pd.read_csv("crypto_data/LTC-USD.csv", names = col_names)

print(df.head())

         time        low       high       open      close      volume
0  1528968660  96.580002  96.589996  96.589996  96.580002    9.647200
1  1528968720  96.449997  96.669998  96.589996  96.660004  314.387024
2  1528968780  96.470001  96.570000  96.570000  96.570000   77.129799
3  1528968840  96.449997  96.570000  96.570000  96.500000    7.216067
4  1528968900  96.279999  96.540001  96.500000  96.389999  524.539978


In [4]:
main_df = pd.DataFrame()

ratios = ['BTC-USD', 'LTC-USD', 'ETH-USD', 'BCH-USD']
for ratio in ratios:
    # read CSV from path
    df_path = f'crypto_data/{ratio}.csv'
    df = pd.read_csv(df_path, names = col_names)
    
    # rename columns
    df.rename(columns = {'close': f'{ratio}_close', 'volume': f'{ratio}_volume'}, inplace = True)
    # set time as index
    df.set_index('time', inplace = True)
    # select only close price and volume columns
    df = df[[f'{ratio}_close', f'{ratio}_volume']]
    
    if len(main_df) == 0:
        main_df = df
    else:
        main_df = main_df.join(df)

print(main_df.head())

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968660    6489.549805        0.587100      96.580002        9.647200   
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   

            ETH-USD_close  ETH-USD_volume  BCH-USD_close  BCH-USD_volume  
time                                                                      
1528968660            NaN             NaN     871.719971        5.675361  
1528968720      486.01001       26.019083     870.859985       26.856577  
1528968780      486.00000        8.449400     870.099976        1.124300  
1528968840      485.75000       26.994646     870.789978        1.749862  
1528968900      4

In [5]:
# Define a binary classification: 1 (buy) vs 0 (hold/sell)
def classify(current, future):
    if float(future) > float (current):
        return 1
    else:
        return 0

In [6]:
# Define a new column with future price
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)
print(main_df[[f'{RATIO_TO_PREDICT}_close', 'future']].head())

            LTC-USD_close     future
time                                
1528968660      96.580002  96.500000
1528968720      96.660004  96.389999
1528968780      96.570000  96.519997
1528968840      96.500000  96.440002
1528968900      96.389999  96.470001


In [7]:
# Transform future price into a binary target
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))
print(main_df[[f'{RATIO_TO_PREDICT}_close', 'future', 'target']].head())

            LTC-USD_close     future  target
time                                        
1528968660      96.580002  96.500000       0
1528968720      96.660004  96.389999       0
1528968780      96.570000  96.519997       0
1528968840      96.500000  96.440002       0
1528968900      96.389999  96.470001       1


## Train-test split, normalization, sequence creation and data balancing

Notice that since sequences are very close together and are highly correlated, it would be a bad idea to take a random sample to use as a validation/test set, as it would be extremely similar to instances in the training set. Instead, we have to take a whole period (possibly the most recent one) and use it for testing.

In this case we'll take the last 5-10% of the data.

In [8]:
# ensure that time is sorted
times = sorted(main_df.index.values)
threshold = times[-int(0.05*len(times))]
print(f'Time threshold for train-validation split: {threshold}')

Time threshold for train-validation split: 1534922100


In [9]:
# tain-validation split
validation_main_df = main_df[(main_df.index >= threshold)]
train_main_df = main_df[(main_df.index < threshold)]

Now define a preprocessing function that will be applied to both, the training and validation sets.
* Transform absolute prices into percentage changes
* Normalize data to be in (0,1)
* Drop NAs
* Create sequences that will be used as features (X)

In [10]:
# define a preprocessing function
def preprocess_df(df):
    df = df.drop('future', axis = 1)
    
    for col in df.columns:
        if col != 'target':
            df[col] = df[col].pct_change()
            df.dropna(inplace = True)
            df[col] = preprocessing.scale(df[col].values)
            
    df.dropna(inplace = True)
    
    sequential_data = []
    prev_days = deque(maxlen = SEQ_LEN)
    for i in df.values:
        prev_days.append([n for n in i[:-1]])
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days), i[-1]])
            
    random.shuffle(sequential_data)
    
    # Balance dataframe
    buys = []
    sells = []
    
    for seq, target in sequential_data:
        if target == 0:
            sells.append([seq, target])
        elif target == 1:
            buys.append([seq, target])
    
    random.shuffle(buys)
    random.shuffle(sells)
    
    lower = min(len(buys), len(sells))
    buys = buys[:lower]
    sells = sells[:lower]
    
    sequential_data = buys + sells
    random.shuffle(sequential_data)
    
    # split into X and y
    X = []
    y = []
    
    for seq, target in sequential_data:
        X.append(seq)
        y.append(target)
    
    return np.array(X), np.array(y)

In [11]:
train_x, train_y = preprocess_df(train_main_df)
valid_x, valid_y = preprocess_df(validation_main_df)

## Recurrent Network


In [12]:
model = Sequential([
    LSTM(128, activation = 'relu', input_shape = (train_x.shape[1:]), return_sequences = True),
    Dropout(0.2),
    BatchNormalization(),
    
    LSTM(128, activation = 'relu', return_sequences = True),
    Dropout(0.2),
    BatchNormalization(),
    
    LSTM(128, activation = 'relu', return_sequences = False),
    Dropout(0.2),
    BatchNormalization(),
    
    Dense(32, activation = 'relu'),
    Dropout(0.2),
    
    Dense(1, activation = 'sigmoid'),
])

opt = tf.keras.optimizers.Adam(lr=0.001, decay = 1e-6)

model.compile(loss = 'binary_crossentropy',
             optimizer = opt,
             metrics = ['accuracy'],
             )

tensorboard = TensorBoard(log_dir = f'logs/{NAME}')
filepath = "RNN_Final-{epoch:02d}-{val_accuracy:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')) # saves only the best ones

In [13]:
history = model.fit(train_x, train_y,
                   batch_size = BATCH_SIZE,
                   epochs = EPOCHS,
                   validation_data = (valid_x, valid_y),
                   callbacks = [tensorboard, checkpoint])

Train on 69188 samples, validate on 3062 samples
Epoch 1/10
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: models/RNN_Final-01-0.488.model/assets
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
