In [1]:
import pandas as pd
import numpy as np

In [2]:
# import some utility functions
from utilities import *

In [3]:
# Bitcoin daily series: read the raw CSV and hand it straight to clean_data.
# NOTE(review): clean_data is presumably provided by `from utilities import *`;
# the 'btc' argument looks like a column-name prefix (later cells use `btc_close`) — confirm.
btc_daily = clean_data(
    pd.read_csv('./time_series_data/btc_daily_all.csv'),
    'btc',
)

In [4]:
# Ethereum daily series, loaded and cleaned the same way as the other coins.
eth_daily = clean_data(
    pd.read_csv('./time_series_data/eth_daily_all.csv'),
    'eth',
)

In [5]:
# Ripple (XRP) daily series, loaded and cleaned the same way as the other coins.
xrp_daily = clean_data(
    pd.read_csv('./time_series_data/xrp_daily_all.csv'),
    'xrp',
)

In [6]:
# Litecoin daily series, loaded and cleaned the same way as the other coins.
ltc_daily = clean_data(
    pd.read_csv('./time_series_data/ltc_daily_all.csv'),
    'ltc',
)

In [7]:
# Monero daily series, loaded and cleaned the same way as the other coins.
xmr_daily = clean_data(
    pd.read_csv('./time_series_data/xmr_daily_all.csv'),
    'xmr',
)

In [8]:
# Dash daily series, loaded and cleaned the same way as the other coins.
dash_daily = clean_data(
    pd.read_csv('./time_series_data/dash_daily_all.csv'),
    'dash',
)

In [9]:
# NEM (XEM) daily series, loaded and cleaned the same way as the other coins.
xem_daily = clean_data(
    pd.read_csv('./time_series_data/xem_daily_all.csv'),
    'xem',
)

In [10]:
# Bytecoin (BCN) daily series, loaded and cleaned the same way as the other coins.
bcn_daily = clean_data(
    pd.read_csv('./time_series_data/bcn_daily_all.csv'),
    'bcn',
)

In [11]:
# Merge the seven other coins onto the bitcoin frame.
# A left join keeps every bitcoin row; rows the other frames do not cover become NaN.
other_coin_frames = [
    eth_daily,
    xrp_daily,
    ltc_daily,
    xmr_daily,
    dash_daily,
    xem_daily,
    bcn_daily,
]
coin_total = btc_daily.join(other_coin_frames, how='left')

In [12]:
# Keep only the rows in which every one of the 8 cryptocurrencies has data
# (complete daily prices and volumes).
# Could do some time series analysis on this part; add other indices later.
coin_complete = coin_total.dropna(axis='index', how='any')

In [13]:
# S&P 500 index (^GSPC) daily series, read and passed through clean_data.
sp500_daily = clean_data(
    pd.read_csv('./time_series_data/^GSPC.csv'),
    'sp500',
)

In [14]:
# Nikkei 225 index (^N225) daily series, read and passed through clean_data.
n225_daily = clean_data(
    pd.read_csv('./time_series_data/^N225.csv'),
    'n225',
)

In [15]:
# ^SXXP index daily series, read and passed through clean_data.
# NOTE(review): the two extra positional flags (True, False) are not explained in
# this notebook — confirm their meaning in utilities.clean_data.
sxxp_daily = clean_data(
    pd.read_csv('./time_series_data/^SXXP.csv'),
    'sxxp',
    True,
    False,
)

In [16]:
# ^VIX daily series, read and passed through clean_data.
vix_daily = clean_data(
    pd.read_csv('./time_series_data/^VIX.csv'),
    'vix',
)

In [17]:
# DXSQ.F daily series, read and passed through clean_data.
dxsq_daily = clean_data(
    pd.read_csv('./time_series_data/DXSQ.F.csv'),
    'dxsq',
)

In [18]:
# Metal prices: keep columns 0, 1 and 3 of the CSV, parse 'Date' and use it as index.
# The steps are chained with .assign() so the date column is never written into
# a slice of the raw frame (the original assigned into the result of .iloc,
# which can raise pandas' SettingWithCopyWarning).
# NOTE(review): which metals columns 1 and 3 hold is not visible here — confirm against metal.csv.
metal_raw = pd.read_csv('./time_series_data/metal.csv')
metal_daily = (
    metal_raw.iloc[:, [0, 1, 3]]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format="%Y-%m-%d"))
    .set_index('Date')
)

In [19]:
# 1-month US treasury rate series, read and passed through clean_data.
# NOTE(review): called with an empty name and flags (False, False) — confirm
# what these select in utilities.clean_data.
int1mo_daily = clean_data(
    pd.read_csv('./time_series_data/ustreasuryrates_1mo.csv'),
    '',
    False,
    False,
)

In [20]:
# 10-year US treasury rate series, read and passed through clean_data
# with the same arguments as the 1-month series.
int10y_daily = clean_data(
    pd.read_csv('./time_series_data/ustreasuryrates_10y.csv'),
    '',
    False,
    False,
)

In [21]:
# EUR/USD exchange-rate series; a later cell uses `eur_usd_close` to convert to USD.
eur_usd_daily = clean_data(
    pd.read_csv('./time_series_data/EUR_USD.csv'),
    'eur_usd',
    True,
    False,
)

In [22]:
# USD/JPY exchange-rate series; a later cell uses `usd_jpy_close` to convert to USD.
usd_jpy_daily = clean_data(
    pd.read_csv('./time_series_data/USD_JPY.csv'),
    'usd_jpy',
    True,
    False,
)

In [23]:
# Put all economic indices together with the bitcoin data.
# A left join keeps every bitcoin row; dates an index does not cover become NaN.
economic_frames = [
    sp500_daily,
    n225_daily,
    sxxp_daily,
    vix_daily,
    dxsq_daily,
    metal_daily,
    int1mo_daily,
    int10y_daily,
    eur_usd_daily,
    usd_jpy_daily,
]
features_total = btc_daily.join(economic_frames, how='left')

In [24]:
# Forward-fill so that holiday values are inferred from the previous available row.
# Leading rows that still contain NaN (no earlier value to copy from) are dropped.
# DataFrame.ffill() is the direct equivalent of fillna(method='ffill'); the
# `method` argument of fillna is deprecated in recent pandas releases.
features_complete = features_total.ffill().dropna(axis=0)

In [25]:
# Use the exchange rates to express everything in USD:
# n225 is divided by USD/JPY, sxxp and dxsq are multiplied by EUR/USD.
# The two exchange-rate columns are dropped once they have been applied.
eur_usd_rate = features_complete['eur_usd_close']
usd_jpy_rate = features_complete['usd_jpy_close']
features_complete = features_complete.assign(
    n225_close=features_complete['n225_close'] / usd_jpy_rate,
    sxxp_close=features_complete['sxxp_close'] * eur_usd_rate,
    dxsq_close=features_complete['dxsq_close'] * eur_usd_rate,
).drop(columns=['eur_usd_close', 'usd_jpy_close'])

# Row-to-row difference of log(value + 1) for every column (the log return);
# rows containing NaN afterwards (at least the first row, which has no predecessor) are removed.
features_complete = np.log(features_complete + 1).diff().dropna(axis=0)

# Target 'y': 1 when the bitcoin log return of the row is positive, otherwise 0.
features_complete = features_complete.assign(
    y=(features_complete['btc_close'] > 0).astype('int64')
)

In [26]:
# First news source: read the CSV and run clean_texts over 'Summary' and 'Title'.
# NOTE(review): clean_texts is presumably provided by `from utilities import *` — confirm.
bitcoin_news1 = clean_texts(
    pd.read_csv('./texts_data/bitcoin_news.csv'),
    ['Summary', 'Title'],
)

In [27]:
# Second news source (bitcoinist), cleaned the same way as the first one.
bitcoin_news2 = clean_texts(
    pd.read_csv('./texts_data/bitcoinist_news_cleaned.csv'),
    ['Summary', 'Title'],
)

In [28]:
# Stack both news sources into one frame with a fresh 0..n-1 index.
# Both sides select the columns in the SAME order: with differing orders
# pd.concat has to align (and implicitly sort) the column axis, which is what
# triggered the "sort" FutureWarning. The order used here matches the
# alphabetically sorted order the original call produced.
news_columns = ['Date', 'Summary', 'Title']
news_complete = pd.concat(
    [bitcoin_news1[news_columns], bitcoin_news2[news_columns]]
).reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [29]:
# One text field per article: the title, a single space, then the summary.
news_complete = news_complete.assign(
    Text=news_complete['Title'] + ' ' + news_complete['Summary']
)

In [30]:
# Group the articles by their 'Date' value so each date can be merged into one text.
news_groupby = news_complete.groupby(by='Date')

In [31]:
# Collapse all the news of one date into a single (potentially very long) string:
# the texts are joined with single spaces and the result is stripped of
# leading/trailing whitespace.

new_dict = {'Date': [], 'Text': []}
for news_date, day_articles in news_groupby:
    day_text = ' '.join(day_articles['Text']).strip()
    new_dict['Date'].append(news_date)
    new_dict['Text'].append(day_text)

In [32]:
# One row per date, ordered by 'Date', with a fresh 0..n-1 index.
news_by_day = pd.DataFrame(new_dict)
news_complete2 = news_by_day.sort_values(by='Date').reset_index(drop=True)

In [33]:
# Index the per-date news by 'Date' so it can be joined onto the feature frame.
news_complete2 = news_complete2.set_index(keys='Date')

In [34]:
# Preview the first rows of the per-date news frame.
news_complete2.head()   # the news data starts on 2014-01-06

Unnamed: 0_level_0,Text
Date,Unnamed: 1_level_1
2014-01-06,china bites into bitcoin bitcoins were worth n...
2014-01-07,chilean nightclub to pioneer bitcoin in latin ...
2014-01-08,singapore government to tax some bitcoin trans...
2014-01-09,bitcoin conference new york city april 78 2014...
2014-01-15,silicon valley vc thinks a single bitcoin will...


In [35]:
# Join the per-date news text onto the features (left join keeps every feature row).
# Dates without news get an empty string. Only the 'Text' column is filled, so a
# stray NaN in a numeric column can never be silently replaced by '' (which would
# turn that column into object dtype).
features_complete = features_complete.join(news_complete2, how='left').fillna({'Text': ''})

In [36]:
# I tried several ideas for using the hourly bitcoin price/volume data, but all of them failed:
# 1. build artificial 2D data and train a CNN on it ... the network failed to converge on the training set
# 2. use the 1D data with a 1D convolution ... the network still failed to converge on the training set
# 3. use a regular NN ... this time it easily overfit the training set

#btc_hourly = pd.read_csv('./time_series_data/btc_hourly.txt')

#btc_hourly = btc_hourly[['Date','Volume_Currency','Weighted_Price']]
#btc_hourly.columns = ['Date','Volume','Price']

#btc_hourly['Date'] = pd.to_datetime(btc_hourly['Date'],format="%Y-%m-%d")

#price_picture=[]
#volume_picture = []
#timelist = []
#price_1d = []
#volume_1d = []
#feature_1d = []
#btc_h_groupbydate = btc_hourly.groupby('Date')


#for date,df in btc_h_groupbydate:
#    timelist.append(date)
#    vec_price = np.log(df['Price']).diff().iloc[1:]
#    vec_vol = np.log(df['Volume']).diff().iloc[1:]
#    vec_total = vec_price.append(vec_vol)
#    feature_1d.append(vec_total)
#    price_1d.append(vec_price)
#    volume_1d.append(vec_vol)
#    price_picture.append(create2d(vec_price,23,12,12))
#    volume_picture.append(create2d(vec_vol,23,12,12))

#timelist = timelist[:-1]
#price_picture = price_picture[:-1]
#volume_picture = volume_picture[:-1]
#price_1d = price_1d[:-1]
#volume_1d = volume_1d[:-1]
#feature_1d = feature_1d[:-1]

#price_picture = np.concatenate([np.zeros((1774,12,12),dtype=np.float32),np.array(price_picture,dtype=np.float32)],axis = 0)
#volume_picture = np.concatenate([np.zeros((1774,12,12),dtype=np.float32),np.array(volume_picture,dtype=np.float32)],axis = 0)
#price_1d = np.concatenate([np.zeros((1774,23),dtype=np.float32),np.array(price_1d,dtype=np.float32)],axis = 0)
#volume_1d = np.concatenate([np.zeros((1774,23),dtype=np.float32),np.array(volume_1d,dtype=np.float32)],axis = 0)
#feature_1d = np.concatenate([np.zeros((1774,46),dtype=np.float32),np.array(feature_1d,dtype=np.float32)],axis = 0)

#total_picture = np.stack([price_picture,volume_picture],axis=1)  # need NCWH for better training performance
#total_1d = np.stack([price_1d,volume_1d],axis=1)

In [37]:
# Build the train/test arrays used by the different models.

# These names roughly correspond to the various inputs of the functions in the
# scripts hourly_feature.py, lstm_train.py and text_feature.py.

TRAIN_START, TRAIN_END = 1869, 2869    # positional row window used for training
N_FEATURE_COLS = 15                    # leading columns used as numeric inputs
# Labels are read one row later than the features (feature row i is paired with
# the 'y' of row i+1); this is also why the last feature row is left out of the test set.
LABEL_START, LABEL_END = TRAIN_START + 1, TRAIN_END + 1

numeric_part = features_complete.iloc[:, 0:N_FEATURE_COLS]
label_values = np.array(features_complete['y'], dtype=np.float32)
text_part = features_complete['Text']

train = np.array(numeric_part.iloc[TRAIN_START:TRAIN_END], dtype=np.float32)
print(train.shape)

train_label = label_values[LABEL_START:LABEL_END].reshape(-1, 1)
print(train_label.shape)

test = np.array(numeric_part.iloc[TRAIN_END:-1], dtype=np.float32)
print(test.shape)

test_label = label_values[LABEL_END:].reshape(-1, 1)
print(test_label.shape)

train_text = np.array(text_part.iloc[TRAIN_START:TRAIN_END])
print(train_text.shape)

test_text = np.array(text_part.iloc[TRAIN_END:-1])
print(test_text.shape)

#train_image = total_picture[1869:2869]
#train_image.shape

#train_1d = total_1d[1869:2869]
#train_1d.shape

#test_image = total_picture[2869:-1]
#test_image.shape

#test_1d = total_1d[2869:-1]
#test_1d.shape

#train_price_1d = price_1d[1869:2869]
#train_price_1d.shape

#test_price_1d = price_1d[2869:-1]
#test_price_1d.shape

#train_volume_1d = volume_1d[1869:2869]
#train_volume_1d.shape

#test_volume_1d = volume_1d[2869:-1]
#test_volume_1d.shape

#train_feature_1d = feature_1d[1869:2869]
#train_feature_1d.shape

#test_feature_1d = feature_1d[2869:-1]
#test_feature_1d.shape

#total_text = np.array(features_complete['Text'][1869:-1])
#total_text.shape

(1000, 15)
(1000, 1)
(100, 15)
(100, 1)
(1000,)
(100,)


In [38]:
# we trained a model using word embedding
# and we extract the last layer before the final dense layer and add this to the features

# first, run the following codes to create a model and save its parameters

#from text_feature import *
#train_text_feature(100,train_text,train_label,test_text,test_label)

# then, run the following codes to extract the last hidden layer in the model

#total_text_feature = get_text_feature(total_text)
#total_text_feature = total_text_feature.astype(np.float32)

import pickle

# then, save the numpy array as pickle file

#with open('text_feature.pkl', 'wb') as f:  
#    pickle.dump(total_text_feature, f)

# Once the array has been saved by the (commented-out) code above, it does not
# have to be recomputed: it is read back directly from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code — only load a
# text_feature.pkl that was produced by this project.
with open('text_feature.pkl', mode='rb') as pkl_file:
    total_text_feature = pickle.load(pkl_file)

In [39]:
# Check the dimensions of the loaded text-feature array.
total_text_feature.shape

(1100, 16)

In [40]:
# Training inputs: the numeric features side by side with the first 1000 rows
# of the text features (column-wise concatenation).
train_text_part = total_text_feature[:1000]
lstm_train_features = np.hstack((train, train_text_part))
lstm_train_features.shape

(1000, 31)

In [41]:
# Test inputs: the numeric features side by side with the text-feature rows
# from position 1000 onwards (column-wise concatenation).
test_text_part = total_text_feature[1000:]
lstm_test_features = np.hstack((test, test_text_part))
lstm_test_features.shape

(100, 31)

In [42]:
# Stack train and test inputs row-wise, then discard the first row.
# NOTE(review): the reason for dropping row 0 is not stated in this notebook — confirm
# against what train_arc4 expects.
x = np.vstack((lstm_train_features, lstm_test_features))[1:]
x.shape

(1099, 31)

In [43]:
# Stack train and test labels row-wise and discard the first row, mirroring
# what is done for x so both arrays keep the same number of rows.
y = np.vstack((train_label, test_label))[1:]
y.shape

(1099, 1)

In [44]:
import tensorflow as tf

# enable eager execution (TensorFlow 1.x style call)
# NOTE(review): presumably this has to run before any other TensorFlow call in the
# kernel, i.e. before eager_train is used — confirm against the TensorFlow docs.
tf.enable_eager_execution()

In [45]:
from eager_train import *

In [46]:
# Author's observation: the test accuracy is low (roughly 50%, no better than a
# random guess), while the train accuracy approaches 94% with merely 50 iterations.
# NOTE(review): the first positional argument (50) is presumably the number of
# epochs, and no random seed is set in this notebook — confirm in eager_train.train_arc4.
accuracy_train, accuracy_test = train_arc4(
    50,
    x,
    y,
    rnn_drop_prob=0.5,
    batch_size=100,
    time_window=100,
    num_units1=5,
    num_units2=5,
    lr=0.01,
)

Time taken for epoch 0: 1.3448596000671387 sec

Time taken for epoch 1: 0.3195221424102783 sec

Time taken for epoch 2: 0.3121342658996582 sec

Time taken for epoch 3: 0.31519079208374023 sec

Time taken for epoch 4: 0.3335709571838379 sec

Time taken for epoch 5: 0.32268548011779785 sec

Time taken for epoch 6: 0.30518245697021484 sec

Time taken for epoch 7: 0.3002312183380127 sec

Time taken for epoch 8: 0.3291194438934326 sec

Time taken for epoch 9: 0.29244303703308105 sec

Time taken for epoch 10: 0.32365870475769043 sec

Time taken for epoch 11: 0.3024559020996094 sec

Time taken for epoch 12: 0.29841089248657227 sec

Time taken for epoch 13: 0.28620362281799316 sec

Time taken for epoch 14: 0.31017112731933594 sec

Time taken for epoch 15: 0.309173583984375 sec

Time taken for epoch 16: 0.3000519275665283 sec

Time taken for epoch 17: 0.31612539291381836 sec

Time taken for epoch 18: 0.3049156665802002 sec

Time taken for epoch 19: 0.3666234016418457 sec

Time taken for epoch 2

In [47]:
# Show the train-accuracy values returned by train_arc4.
accuracy_train

[0.4099,
 0.4224,
 0.5339,
 0.5757,
 0.5793,
 0.5731,
 0.5846,
 0.5827,
 0.5824,
 0.587,
 0.5721,
 0.5798,
 0.5733,
 0.5729,
 0.5861,
 0.5845,
 0.5771,
 0.5807,
 0.5862,
 0.5755,
 0.5775,
 0.5725,
 0.5793,
 0.5811,
 0.5867,
 0.5872,
 0.6113,
 0.6172,
 0.6121,
 0.6201,
 0.6266,
 0.6463,
 0.6615,
 0.6584,
 0.6875,
 0.6819,
 0.7015,
 0.6971,
 0.7107,
 0.7125,
 0.741,
 0.7576,
 0.7616,
 0.7713,
 0.7784,
 0.793,
 0.7887,
 0.7995,
 0.7888,
 0.7955,
 0.7983,
 0.8102,
 0.8029,
 0.815,
 0.8001,
 0.7891,
 0.7882,
 0.804,
 0.7988,
 0.8101,
 0.8163,
 0.8169,
 0.8249,
 0.8101,
 0.8359,
 0.8144,
 0.8343,
 0.8022,
 0.8293,
 0.842,
 0.8243,
 0.8245,
 0.8422,
 0.8457,
 0.8468,
 0.8278,
 0.84,
 0.8321,
 0.8462,
 0.8514,
 0.8564,
 0.8476,
 0.8699,
 0.845,
 0.8417,
 0.8725,
 0.86,
 0.8682,
 0.8711,
 0.8634,
 0.8774,
 0.8642,
 0.8658,
 0.856,
 0.8659,
 0.8727,
 0.8706,
 0.8949,
 0.8656,
 0.8902,
 0.8581,
 0.8613,
 0.8733,
 0.8907,
 0.8813,
 0.8718,
 0.8777,
 0.8963,
 0.8891,
 0.8745,
 0.9003,
 0.8702,
 0.8

In [48]:
# Show the test-accuracy values returned by train_arc4.
accuracy_test

[0.51,
 0.51,
 0.51,
 0.51,
 0.51,
 0.46,
 0.43,
 0.46,
 0.46,
 0.44,
 0.46,
 0.47,
 0.46,
 0.49,
 0.48,
 0.48,
 0.49,
 0.47,
 0.48,
 0.48,
 0.5,
 0.49,
 0.48,
 0.52,
 0.5,
 0.48,
 0.5,
 0.53,
 0.45,
 0.53,
 0.55,
 0.55,
 0.48,
 0.52,
 0.52,
 0.52,
 0.51,
 0.51,
 0.5,
 0.46,
 0.46,
 0.44,
 0.44,
 0.45,
 0.42,
 0.45,
 0.47,
 0.42,
 0.45,
 0.45]