## Importing libraries

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

from numpy.random import seed

# from tensorflow import set_random_seed

In [4]:
seed(2019)

In [5]:
def split_train_test_chronological(df, ratio = 0.9):
    
    '''
    Input is a dataframe, and a ratio. Splits dataframe into 2 dataframes chronologically.
    Returns first dataframe up to the index of the length of the input dataframe times the input ratio, 
    and returns second dataframe of remaining elements.
    '''
    
    size = len(df) * ratio
    size_round = round(size)
    
    df_train = df[0:(size_round)]
    df_test = df[size_round:]
    
    return df_train, df_test

## Importing data

In [6]:
df_bitcoin_price = pd.read_pickle('../crypto_currency_and_market_value_history/data/bitcoin_4_year_hourly_history_df.pickle')

df_bitcoin_price.head(3)

Unnamed: 0,timestamp,open,high,low,close,volume (btc),volume (currency),weighted price
0,2015-05-31 00:00:00,232.11,232.11,231.92,231.92,223.37,51822.18,232.0
1,2015-05-31 01:00:00,231.88,232.32,231.87,232.32,217.49,50452.68,231.98
2,2015-05-31 02:00:00,231.97,232.51,231.88,232.51,46.45,10782.23,232.14


In [7]:
df_vix_daily = pd.read_pickle('../crypto_currency_and_market_value_history/data/vix_volatility_daily_change_history_df.pickle')


df_vix_daily.head(3)

Unnamed: 0,trade date,open,high,low,close,settle,change,total volume
0,2019-06-07,-0.014837,-0.019264,-0.006116,0.006591,0.015175,-2.666667,-0.0666
1,2019-06-06,-0.023754,-0.021143,-0.01506,0.002402,-0.009023,-0.75,-0.038795
2,2019-06-05,-0.054247,-0.04736,-0.034884,-0.032539,-0.034833,-0.428571,-0.149433


In [8]:
df_vix_hourly = pd.read_pickle('../crypto_currency_and_market_value_history/data/vix_volatility_hourly_history_df.pickle')


df_vix_hourly.head(3)

Unnamed: 0,Local time,Open,High,Low,Close,Volume,local_time
0,26.01.2017 00:00:00.000 GMT-0800,19.14,19.14,19.14,19.14,0.0,2017-01-25 16:00:00+00:00
1,26.01.2017 01:00:00.000 GMT-0800,19.14,19.14,19.14,19.14,0.0,2017-01-25 17:00:00+00:00
2,26.01.2017 02:00:00.000 GMT-0800,19.14,19.14,19.14,19.14,0.0,2017-01-25 18:00:00+00:00


In [9]:
df_gold = pd.read_csv('../crypto_currency_and_market_value_history/data/gold_hourly_data.csv', sep = ';')

df_gold.tail(3)

Unnamed: 0,Date,Compound_Score,n,Count_Negatives,Count_Positives,Count_Neutrals,Sent_Negatives,Sent_Positives,Open,High,Low,Close,Volume (BTC),Volume (Currency)
12933,21/01/2019 21:00,0.131914,917.0,186.0,421.0,310.0,-0.470776,0.49532,3580.26,3582.38,3559.74,3560.7,117607867.0,267000000000000000
12934,21/01/2019 22:00,0.082912,929.0,247.0,377.0,305.0,-0.409633,0.47269,3561.58,3569.04,3535.19,3567.92,18324.54,275000000000000000
12935,21/01/2019 23:00,0.12216,899.0,168.0,355.0,376.0,-0.372583,0.485678,3569.34,3579.33,3569.34,3573.67,10105.2,276000000000000000


## Preprocessing

In [10]:
df_bitcoin_price_FBP = df_bitcoin_price.copy()

df_bitcoin_price_FBP.rename(columns={'timestamp': 'ds', 'weighted price': 'y'}, inplace=True)

df_bitcoin_price_FBP = df_bitcoin_price_FBP[['ds', 'y']]

df_bitcoin_price_FBP.ds = pd.to_datetime(df_bitcoin_price_FBP.ds, utc=True)

df_bitcoin_price_FBP.tail()

Unnamed: 0,ds,y
35232,2019-06-09 16:00:00+00:00,7664.22
35233,2019-06-09 17:00:00+00:00,7645.34
35234,2019-06-09 18:00:00+00:00,7678.13
35235,2019-06-09 19:00:00+00:00,7711.1
35236,2019-06-09 20:00:00+00:00,7728.32


In [11]:
df_bitcoin_price_FBP[df_bitcoin_price_FBP.duplicated()]

df_bitcoin_price_FBP.drop_duplicates(inplace=True)

#### Preprocessing df_vix_hourly

In [13]:
df_vix_FBP = df_vix_hourly.copy()

df_vix_FBP.rename(columns={'local_time': 'ds', 'Open': 'y'}, inplace=True)

df_vix_FBP = df_vix_FBP[['ds', 'y']]

df_vix_FBP.ds = pd.to_datetime(df_vix_FBP.ds, utc = True)

df_vix_FBP.sort_values(by=['ds'], inplace = True)

df_vix_FBP.head()

Unnamed: 0,ds,y
0,2017-01-25 16:00:00+00:00,19.14
1,2017-01-25 17:00:00+00:00,19.14
2,2017-01-25 18:00:00+00:00,19.14
3,2017-01-25 19:00:00+00:00,19.14
4,2017-01-25 20:00:00+00:00,19.14


In [14]:
df_gold_FBP = df_gold.copy()

df_gold_FBP.rename(columns={'Date': 'ds', 'Open': 'y'}, inplace=True)

df_gold_FBP = df_gold_FBP[['ds', 'y']]

df_gold_FBP.ds = pd.to_datetime(df_gold_FBP.ds, utc = True)

df_gold_FBP.head()

Unnamed: 0,ds,y
0,2017-01-08 00:00:00+00:00,2855.81
1,2017-01-08 01:00:00+00:00,2823.01
2,2017-01-08 02:00:00+00:00,2846.27
3,2017-01-08 03:00:00+00:00,2841.84
4,2017-01-08 04:00:00+00:00,2862.92


## Getting the intersecting dates of the dataframes


In [15]:
df_bitcoin_price_FBP.head()

dates_gold = set(df_gold_FBP.ds)
dates_vix = set(df_vix_FBP.ds)
dates_bitcoin = set(df_bitcoin_price_FBP.ds)

dates_intersection = dates_gold & dates_vix & dates_bitcoin

df_bitcoin_inter = df_bitcoin_price_FBP[df_bitcoin_price_FBP.ds.isin(dates_intersection)]
df_vix_inter = df_vix_FBP[df_vix_FBP.ds.isin(dates_intersection)]
df_gold_inter = df_gold_FBP[df_gold_FBP.ds.isin(dates_intersection)]

first_timestamp = min(dates_intersection)
last_timestamp = max(dates_intersection)
all(first_timestamp == min(df.ds) for df in [df_bitcoin_inter, df_vix_inter, df_gold_inter])
all(last_timestamp == max(df.ds) for df in [df_bitcoin_inter, df_vix_inter, df_gold_inter])

True

In [16]:
df_bitcoin_inter['ds'] = df_bitcoin_inter.ds.dt.tz_localize(None)
df_vix_inter['ds'] = df_vix_inter.ds.dt.tz_localize(None)
df_gold_inter['ds'] = df_gold_inter.ds.dt.tz_localize(None)

df_bitcoin_inter.reset_index(inplace = True)
df_vix_inter.reset_index(inplace = True)
df_gold_inter.reset_index(inplace = True)

df_bitcoin_inter.drop('index', axis = 1, inplace = True)
df_vix_inter.drop('index', axis = 1, inplace = True)
df_gold_inter.drop('index', axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.o

In [17]:
df_bitcoin_inter.head()

Unnamed: 0,ds,y
0,2017-02-08 00:00:00,1054.03
1,2017-02-08 01:00:00,1060.48
2,2017-02-08 02:00:00,1062.94
3,2017-02-08 03:00:00,1065.82
4,2017-02-08 04:00:00,1068.15


In [19]:
df_bitcoin_inter.to_pickle('./processed_data/df_bitcoin_inter.pickle')
df_vix_inter.to_pickle('./processed_data/df_vix_inter.pickle')
df_gold_inter.to_pickle('./processed_data/df_gold_inter.pickle')

In [20]:
df_combined = df_bitcoin_inter.copy()
df_combined['vix'] = df_vix_inter['y']
df_combined['gold'] = df_gold_inter['y']

In [22]:
df_combined.head(2)

Unnamed: 0,ds,y,vix,gold
0,2017-02-08 00:00:00,1054.03,18.879,2732.0
1,2017-02-08 01:00:00,1060.48,18.915,2693.0


In [23]:
df_combined.to_pickle('./processed_data/df_combined.pickle')