# Some ideas to explore


* check performance with sliding window on/off
* check performance with binary vs float labels
* check performance with different ticker price windows and corresponding feature windows

# Check that GPU is listed for tensorflow

In [51]:
import keras

Using TensorFlow backend.


In [50]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9522870279201687960
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3831431168
locality {
  bus_id: 1
  links {
  }
}
incarnation: 631232787631189167
physical_device_desc: "device: 0, name: GeForce GTX 960M, pci bus id: 0000:02:00.0, compute capability: 5.0"
]


# Load Ticker Data

In [1]:
import pandas as pd

In [2]:
eth_ticker_raw = pd.read_csv("data/ticker_data/USDT_ETH.csv",index_col=0)
btc_ticker_raw = pd.read_csv("data/ticker_data/USDT_BTC.csv",index_col=0)          

In [3]:
eth_ticker_raw[eth_ticker_raw.Timestamp == 1439014500]

Unnamed: 0,Close,Timestamp,High,Low,Open
0,1.75,1439014500,0.33,1.61,0.33


In [4]:
btc_ticker_raw[btc_ticker_raw.Timestamp == 1439014500]

Unnamed: 0,Close,Timestamp,High,Low,Open
48805,273.947811,1439014500,275.603572,273.947811,275.603572


In [5]:
btc_ticker_raw.head()

Unnamed: 0,Close,Timestamp,High,Low,Open
0,225.0,1424373000,0.33,225.0,0.33
1,225.0,1424373300,225.0,225.0,225.0
2,225.0,1424373600,225.0,225.0,225.0
3,225.0,1424373900,225.0,225.0,225.0
4,225.0,1424374200,225.0,225.0,225.0


In [6]:
# sync the times of the two dataframes

# Data Preparation

* align the btc and eth data
* write function that can create data point windows - 5 minutes, 20 minutes, 6 hours
* create features and outputs

## Align Data

In [None]:
eth_ticker.dtypes

In [None]:
btc_ticker.dtypes

In [11]:
ticker_data_merged = eth_ticker_raw.set_index("Timestamp")\
                .join(
                        btc_ticker_raw.set_index("Timestamp"),
                        on="Timestamp",
                        how="inner",
                        lsuffix="_eth",
                        rsuffix="_btc")

In [12]:
ticker_data_merged.head()

Unnamed: 0_level_0,Close_eth,High_eth,Low_eth,Open_eth,Close_btc,High_btc,Low_btc,Open_btc
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1439014500,1.75,0.33,1.61,0.33,273.947811,275.603572,273.947811,275.603572
1439014800,1.85,1.85,1.85,1.85,273.905543,273.905543,273.626238,273.901814
1439015100,1.85,1.85,1.85,1.85,273.905543,273.905543,273.905543,273.905543
1439015400,1.85,1.85,1.85,1.85,273.917572,273.917572,273.917572,273.917572
1439015700,1.85,1.85,1.85,1.85,273.917572,273.917572,273.917572,273.917572


## Modify Time Spans

In [13]:
ticker_data_merged.dtypes

Close_eth    float64
High_eth     float64
Low_eth      float64
Open_eth     float64
Close_btc    float64
High_btc     float64
Low_btc      float64
Open_btc     float64
dtype: object

In [14]:
import numpy as np

# in minutes 
minutes = 10
data_point_bucket_size = str(minutes) + "T"

datetime = pd.to_datetime(ticker_data_merged.index,unit='s') 


agg_method = {'Close_eth': "last",
                "High_eth": np.max, 
                "Low_eth": np.min,
                "Open_eth": "first",
                "Close_btc": "last",
                "High_btc": np.max, 
                "Low_btc": np.min,
                "Open_btc": "first", 
                 }

ticker_data = ticker_data_merged.set_index(datetime)\
                                    .resample(data_point_bucket_size)\
                                    .agg(agg_method)

print("Shape of reshaped data: " + str(ticker_data.shape))
print("Shape of original data: " + str(ticker_data_merged.shape))

Shape of reshaped data: (150216, 8)
Shape of original data: (300430, 8)


In [15]:
ticker_data.head()

Unnamed: 0_level_0,Close_eth,High_eth,Low_eth,Open_eth,Close_btc,High_btc,Low_btc,Open_btc
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-08-08 06:10:00,1.75,0.33,1.61,0.33,273.947811,275.603572,273.947811,275.603572
2015-08-08 06:20:00,1.85,1.85,1.85,1.85,273.905543,273.905543,273.626238,273.901814
2015-08-08 06:30:00,1.85,1.85,1.85,1.85,273.917572,273.917572,273.917572,273.917572
2015-08-08 06:40:00,1.85,1.85,1.85,1.85,273.917572,273.917572,273.917572,273.917572
2015-08-08 06:50:00,1.71,1.71,1.71,1.71,274.15505,274.15505,274.15505,274.15505


## Adding Sentiment information

From the research it looked like sentiments from 4-2 days ago yielded the best results.
* I need to consider different time intervals and how i will slide the data?

## Construct Binary label to capture up or down movement beween days

# Construction of Features & Labels

The ratio of features to labels will be 16. And 6 days worth of data needs to be read at a time. This is in line with the research on sentiment analysis. 

For example:
* If the 5 minute intervals are used then the number of features need to be +- 1728 (8640 minutes) and the vector size of the label will be 108 (540 minutes or 9 hours)

**Temporal Golden Rule 1:**
* Temporal order must be preserved. Your features can not be further in time then your labels. 

**NOTE** the above should be doubled as the btc and eth values will be in the input layer

In [16]:
data_point_window = 5
days = 6
feature_vector_size = 6*24*60/data_point_window
output_vector_size = feature_vector_size/16

output_vector_minutes_span = output_vector_size*5
output_vector_hour_span = output_vector_minutes_span/60

print("Number of days feature vector will cover: " + str(days))
print("Data Point Window Size: " + str(data_point_window) + " minutes")
print("Size of feature vector: " + str(feature_vector_size))
print()
print("Number of minutes output vector will cover: " + str(output_vector_minutes_span))
print("Number of hours output vector will cover: " + str(output_vector_hour_span))
print("Size of output vector: " + str(output_vector_size))


Number of days feature vector will cover: 6
Data Point Window Size: 5 minutes
Size of feature vector: 1728.0

Number of minutes output vector will cover: 540.0
Number of hours output vector will cover: 9.0
Size of output vector: 108.0


The following class was obtained from [the following blog](https://nicholastsmith.wordpress.com/2017/11/13/cryptocurrency-price-prediction-using-deep-learning-in-tensorflow/)

In [17]:
##QUESTION!!!!???? bias introduced in the label if there is overlap with the next training row?

import numpy as np
import pandas as pd
 
class PastSampler:
    '''
    Forms training samples for predicting future values from past value
    '''
     
    def __init__(self, N, K, sliding_window = True):
        '''
        Predict K future sample using N previous samples
        '''
        self.K = K
        self.N = N
        self.sliding_window = sliding_window
 
    def transform(self, A):
        M = self.N + self.K     #Number of samples per row (sample + target)
        #indexes
        if self.sliding_window:
            I = np.arange(M) + np.arange(A.shape[0] - M + 1).reshape(-1, 1)
        else:
            if A.shape[0]%M == 0:
                I = np.arange(M)+np.arange(0,A.shape[0],M).reshape(-1,1)
                
            else:
                I = np.arange(M)+np.arange(0,A.shape[0] -M,M).reshape(-1,1)
            
        B = A[I].reshape(-1, M * A.shape[1], A.shape[2])
        ci = self.N * A.shape[1]    #Number of features per sample
        return B[:, :ci], B[:, ci:] #Sample matrix, Target matrix



In [47]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# normalization

df = ticker_data.copy()
time_stamps = df.index

original_df = ticker_data.copy()

columns = ["Close_eth","Close_btc"]

for c in columns:
    df[c] = scaler.fit_transform(df[c].values.reshape(-1,1))

 


In [48]:
#Features are input sample dimensions(channels)
A = np.array(df)[:,None,:]
original_A = np.array(original_df)[:,None,:]
time_stamps = np.array(time_stamps)[:,None,None]

##Make samples of temporal sequences of pricing data (channel)
#Number of past and future samples
NPS, NFS = 256, 16         
ps = PastSampler(NPS, NFS, sliding_window=True)

X, Y = ps.transform(A)
original_X, original_Y = ps.transform(original_A)

input_times, output_times = ps.transform(time_stamps)

In [49]:
print("Shape of original_A" + str(original_A.shape))
print("Shape of time_stamps" + str(time_stamps.shape))
print("Shape of original_X" + str(original_X.shape))
print("Shape of original_Y" + str(original_Y.shape))
print("Shape of X" + str(X.shape))
print("Shape of Y" + str(Y.shape))

Shape of original_A(150216, 1, 8)
Shape of time_stamps(150216, 1, 1)
Shape of original_X(149945, 256, 8)
Shape of original_Y(149945, 16, 8)
Shape of X(149945, 256, 8)
Shape of Y(149945, 16, 8)


# Build CNN

In [59]:
# set sizes
training_size = int(0.7* X.shape[0])
remaining_size = X.shape[0] - training_size
test_size = int(remaining_size/2)
validation_size = int(remaining_size/2) 


#split training validation
training_features = X[:training_size,:]
training_labels = Y[:training_size,:]

# test set
#test_features = X[training_size:,:]
#test_labels = Y[training_size:,:]

# validation set
#validation_features = X[training_size:,:]
#validation_labels = Y[training_size:,:]


In [None]:
#build model
from keras import Sequential
from keras.layers import Conv1D, Dropout

epochs = 100
step_size = X.shape[1]
batch_size= 8
nb_features = X.shape[2]

# 2 layers
model = Sequential()

model.add(Conv1D(activation='relu', input_shape=(step_size, nb_features), strides=3, filters=8, kernel_size=20))
model.add(Dropout(0.5))
model.add(Conv1D( strides=4, filters=nb_features, kernel_size=16))

'''
# 3 Layers
model.add(Conv1D(activation='relu', input_shape=(step_size, nb_features), strides=3, filters=8, kernel_size=8))
#model.add(LeakyReLU())
model.add(Dropout(0.5))
model.add(Conv1D(activation='relu', strides=2, filters=8, kernel_size=8))
#model.add(LeakyReLU())
model.add(Dropout(0.5))
model.add(Conv1D( strides=2, filters=nb_features, kernel_size=8))
# 4 layers
model.add(Conv1D(activation='relu', input_shape=(step_size, nb_features), strides=2, filters=8, kernel_size=2))
#model.add(LeakyReLU())
model.add(Dropout(0.5))
model.add(Conv1D(activation='relu', strides=2, filters=8, kernel_size=2))
#model.add(LeakyReLU())
model.add(Dropout(0.5))
model.add(Conv1D(activation='relu', strides=2, filters=8, kernel_size=2))
#model.add(LeakyReLU())
model.add(Dropout(0.5))
model.add(Conv1D( strides=2, filters=nb_features, kernel_size=2))
'''
model.compile(loss='mse', optimizer='adam')

**Temporal Golden Rule 2:**
* Temporal Training Order: It can not train and predict on future data and then train and predict on past data.

In [62]:
trained_model = model.fit(training_datas, 
          training_labels,
          verbose=1, 
          batch_size=batch_size,
          validation_data=(validation_datas,
                           validation_labels), 
          epochs = epochs
                    
         )

Train on 119956 samples, validate on 29989 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100

KeyboardInterrupt: 