# Bitcoin Movement prediction

## Building Hourly Dataset

In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
from pytrends.request import TrendReq
from pytrends import dailydata
import os

import warnings
warnings.filterwarnings("ignore")

### Getting Crypto Data

In [2]:
# Hourly candles for BTC and the four companion coins. Every download uses the
# same two-year look-back and one-hour interval; the generator is consumed in
# order, so the requests are issued in the order the names are listed.
btc, eth, bnb, ada, lite = (
    yf.download(tickers=symbol, period='2y', interval='1h')
    for symbol in ('BTC-USD', 'ETH-USD', 'BNB-USD', 'ADA-USD', 'LTC-USD')
)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [3]:
# Peek at the first hourly BTC candles to sanity-check the download.
btc.head(3)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
2020-06-22 16:00:00+00:00,9444.038086,9558.391602,9438.841797,9558.391602,9558.391602,0
2020-06-22 17:00:00+00:00,9555.646484,9555.646484,9506.036133,9548.454102,9548.454102,471283712
2020-06-22 18:00:00+00:00,9553.514648,9556.916992,9507.625,9513.469727,9513.469727,108763136


### Feature Engineering

In [4]:
def prep(df):
  """Build the feature/target frame from raw candles.

  The input frame is left untouched: all helper columns are added to a copy.

  Parameters
  ----------
  df : pandas.DataFrame
      Needs 'Open' and 'Close' columns and a DatetimeIndex (the index's
      ``month`` and ``dayofweek`` are read).

  Returns
  -------
  pandas.DataFrame
      Columns 'Close', 'Return', 'Return-7', 'Return-30', 'Return+1',
      'target_multi', 'target_bin', 'Month', 'DayOfW', with every row that
      contains a NaN removed (the first 30 rows and the last row at least).

  Notes
  -----
  Lags are counted in rows, so on hourly candles 'Return-7' / 'Return-30'
  are 7-hour / 30-hour returns. All returns are percentages rounded to
  4 decimals.
  """
  df = df.copy()  # never write helper columns into the caller's raw frame
  close = df['Close']

  df['Close+1'] = close.shift(periods=-1)   # next row's close (the future)
  df['Close-7'] = close.shift(periods=7)
  df['Close-30'] = close.shift(periods=30)

  df['Return+1'] = ((df['Close+1'] / close - 1) * 100).round(4)
  df['Return-7'] = ((close / df['Close-7'] - 1) * 100).round(4)
  df['Return-30'] = ((close / df['Close-30'] - 1) * 100).round(4)
  df['Return'] = ((close / df['Open'] - 1) * 100).round(4)

  # Mean absolute open-to-close return over the last 30 rows; half of it is
  # the threshold a next-row move must reach to count as a buy/sell signal.
  df['SMA30'] = df['Return'].abs().rolling(30).mean()
  half_band = df['SMA30'] / 2
  next_return = df['Return+1']

  # Rows matching none of the masks (NaN threshold or NaN next return) stay
  # NaN and are removed by the final dropna.
  df['target_multi'] = float('nan')
  df.loc[next_return >= half_band, 'target_multi'] = 1     # Buy
  df.loc[next_return <= -half_band, 'target_multi'] = -1   # Sell
  # Strictly inside the band. Written as explicit comparisons instead of
  # Series.between(..., inclusive=False): the boolean form of `inclusive`
  # is deprecated and rejected by newer pandas releases.
  df.loc[(next_return > -half_band) & (next_return < half_band), 'target_multi'] = 0    # Keep

  df['target_bin'] = float('nan')
  df.loc[next_return <= 0, 'target_bin'] = 0
  df.loc[next_return > 0, 'target_bin'] = 1

  df['Month'] = df.index.month
  df['DayOfW'] = df.index.dayofweek

  kept = ['Close', 'Return', 'Return-7', 'Return-30', 'Return+1', 'target_multi', 'target_bin', 'Month', 'DayOfW']
  return df[kept].dropna()

In [5]:
# Build the BTC feature/target frame; `data` is the frame the following cells keep extending.
data = prep(btc)
data.head(5)

Unnamed: 0,Close,Return,Return-7,Return-30,Return+1,target_multi,target_bin,Month,DayOfW
2020-06-23 23:00:00+00:00,9629.658203,0.042,-0.1545,0.7456,0.0995,0.0,1.0,6,1
2020-06-24 00:00:00+00:00,9639.237305,0.0587,-0.0962,0.9508,0.1696,1.0,1.0,6,2
2020-06-24 01:00:00+00:00,9655.582031,0.1658,0.0351,1.4938,0.1243,1.0,1.0,6,2
2020-06-24 02:00:00+00:00,9667.585938,0.119,0.2707,1.5044,-0.0786,0.0,0.0,6,2
2020-06-24 03:00:00+00:00,9659.987305,-0.1128,0.2496,0.6682,0.0788,0.0,1.0,6,2


### Computing the return for all the other coins

In [6]:
def return_calc(df, name):
  """Add the open-to-close percentage return of each row as 'Return_<name>'.

  The column (rounded to 4 decimals) is written into `df` itself, and the
  same frame object is returned.
  """
  column = 'Return_' + name
  pct_move = (df['Close'].div(df['Open']) - 1) * 100
  df[column] = pct_move.round(4)
  return df

In [7]:
list_df = [eth, bnb, ada, lite]
name_df = ['eth', 'bnb', 'ada', 'lite']

# One two-column frame per companion coin: its open-to-close percentage return
# (rounded to 4 decimals) and its close price, both suffixed with the coin name.
# The frames are built from scratch rather than by adding columns to the raw
# download and dropping a fixed list of raw columns: the raw frames stay
# untouched, and the result does not depend on exactly which raw columns the
# download happens to contain (DataFrame.drop raises on a missing label and
# would let any extra column through).
l = []
for name, df in zip(name_df, list_df):
  l.append(pd.DataFrame({
      'Return_' + name: ((df['Close'] / df['Open'] - 1) * 100).round(4),
      'Close_' + name: df['Close'],
  }))

### Joining the other coins' returns

In [8]:
# Attach each coin's Return_/Close_ columns to the BTC frame, aligned on the
# timestamp index (DataFrame.join keeps the rows of `data`).
# NOTE(review): not idempotent — `data` is reassigned, so re-running this cell
# alone attempts to join the same column names a second time; re-run from the
# cell that builds `data` instead.
for df in l:
  data = data.join(df)

In [9]:
# Preview the frame after the per-coin columns were joined.
data.head(5)

Unnamed: 0,Close,Return,Return-7,Return-30,Return+1,target_multi,target_bin,Month,DayOfW,Return_eth,Close_eth,Return_bnb,Close_bnb,Return_ada,Close_ada,Return_lite,Close_lite
2020-06-23 23:00:00+00:00,9629.658203,0.042,-0.1545,0.7456,0.0995,0.0,1.0,6,1,0.1681,244.142151,0.157,16.422567,-0.1337,0.08295,0.1331,44.179279
2020-06-24 00:00:00+00:00,9639.237305,0.0587,-0.0962,0.9508,0.1696,1.0,1.0,6,2,0.0933,244.354797,0.1299,16.45602,0.2838,0.083228,0.0948,44.232101
2020-06-24 01:00:00+00:00,9655.582031,0.1658,0.0351,1.4938,0.1243,1.0,1.0,6,2,0.0344,244.490173,0.1369,16.480837,1.5309,0.084565,0.2404,44.34211
2020-06-24 02:00:00+00:00,9667.585938,0.119,0.2707,1.5044,-0.0786,0.0,0.0,6,2,0.6328,246.071228,0.1496,16.507893,1.1516,0.085524,-0.0607,44.324131
2020-06-24 03:00:00+00:00,9659.987305,-0.1128,0.2496,0.6682,0.0788,0.0,1.0,6,2,-0.2402,245.479126,-0.1448,16.486835,0.0988,0.085553,-0.0111,44.322159


### Adding Google Trends Hourly

In [10]:
from datetime import date

# Take one snapshot of today's date and read the parts from it directly.
# Calling date.today() once per field could mix two different dates around
# midnight, and formatting to text only to parse it back to int is unnecessary.
today = date.today()
day = today.day
mo = today.month
# One-off Google Trends fetch, kept for reference only (`day` and `mo` are its
# end date). NOTE(review): `pytrends` and `kw_list` are not defined anywhere in
# this notebook, so the line cannot be re-enabled as-is.
#hourly_trend = pytrends.get_historical_interest(kw_list, year_start=2020, month_start=1, day_start=1, hour_start=0, year_end=2022, month_end=mo, day_end=day, hour_end=0, cat=0, geo='', gprop='', sleep=0)

In [11]:
# The trends were fetched once and cached on disk; the export line is kept
# commented out for reference.
#hourly_trend.to_csv("HourlyTrends.csv")
hourly_trend = pd.read_csv("HourlyTrends.csv", index_col="date")
hourly_trend = hourly_trend.drop(columns=["bitcoin", "isPartial"])
# The stored timestamps carry no timezone; label them as UTC using the
# canonical zone name rather than the legacy 'Etc/UCT' alias.
# NOTE(review): assumes the price index is UTC as well (its timestamps are
# displayed with a +00:00 offset) — confirm.
hourly_trend.index = pd.to_datetime(hourly_trend.index).tz_localize("UTC")
hourly_trend.tail(5)

Unnamed: 0_level_0,btc
date,Unnamed: 1_level_1
2022-06-08 20:00:00+00:00,23
2022-06-08 21:00:00+00:00,23
2022-06-08 22:00:00+00:00,20
2022-06-08 23:00:00+00:00,19
2022-06-09 00:00:00+00:00,17


In [12]:
# Attach the hourly Google Trends column, aligned on the timestamp index.
# Rows of `data` without a matching trend timestamp get NaN (visible in the
# tail below, which lies after the last cached trend value).
data = data.join(hourly_trend)
data.tail(5)

Unnamed: 0,Close,Return,Return-7,Return-30,Return+1,target_multi,target_bin,Month,DayOfW,Return_eth,Close_eth,Return_bnb,Close_bnb,Return_ada,Close_ada,Return_lite,Close_lite,btc
2022-06-22 12:00:00+00:00,20258.265625,-1.1921,-0.8194,-3.9716,2.1854,1.0,1.0,6,2,-1.509,1079.978516,-0.9403,213.862045,-1.0069,0.467941,-1.6415,51.30257,
2022-06-22 13:00:00+00:00,20700.996094,1.6799,2.8979,-1.9365,-0.0202,0.0,0.0,6,2,2.1987,1107.334351,1.2527,217.608047,0.9186,0.474835,2.187,52.656708,
2022-06-22 14:00:00+00:00,20696.804688,0.0314,2.9226,-2.8652,-2.7441,-1.0,0.0,6,2,0.1256,1111.081543,0.27,219.542709,-0.595,0.474554,0.0254,52.719299,
2022-06-22 15:00:00+00:00,20128.861328,-2.5511,-0.3072,-5.0135,0.1893,0.0,1.0,6,2,-2.0699,1086.688232,-1.5175,216.141449,-1.1164,0.469006,-1.3988,51.915073,
2022-06-22 16:00:00+00:00,20166.960938,0.2156,-1.2117,-5.1741,-0.0399,0.0,0.0,6,2,-0.8427,1071.468994,0.1311,215.441895,0.293,0.468989,-0.1735,51.769821,


#### Computing the rolling correlation between the coins

In [13]:
def pearson(df, name, window_size=5):
  """Add rolling correlations between 'Close' and each 'Close_<x>' column.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain 'Close' and, for every entry x of `name`, 'Close_<x>'.
  name : iterable
      Column suffixes; each is converted with ``str`` before use.
  window_size : int, default 5
      Number of trailing rows in each correlation window. The first
      ``window_size - 1`` rows of every result column are NaN.

  Returns
  -------
  pandas.DataFrame
      The same object as `df`: the 'R_<x>' columns are written in place.
  """
  for coin in name:
    suffix = str(coin)
    # Trailing (not centered) window, so only current and past rows are used.
    df['R_' + suffix] = df['Close'].rolling(window=window_size, center=False).corr(df['Close_' + suffix])
  return df

In [14]:
# Add the rolling-correlation columns. pearson writes into the frame it is
# given and returns it, so `df_corr` and `data` are the same object here.
df_corr = pearson(data, name_df)
df_corr.tail(5)

Unnamed: 0,Close,Return,Return-7,Return-30,Return+1,target_multi,target_bin,Month,DayOfW,Return_eth,...,Close_bnb,Return_ada,Close_ada,Return_lite,Close_lite,btc,R_eth,R_bnb,R_ada,R_lite
2022-06-22 12:00:00+00:00,20258.265625,-1.1921,-0.8194,-3.9716,2.1854,1.0,1.0,6,2,-1.509,...,213.862045,-1.0069,0.467941,-1.6415,51.30257,,0.986678,0.969862,0.967034,0.984701
2022-06-22 13:00:00+00:00,20700.996094,1.6799,2.8979,-1.9365,-0.0202,0.0,0.0,6,2,2.1987,...,217.608047,0.9186,0.474835,2.187,52.656708,,0.997779,0.969249,0.966288,0.989308
2022-06-22 14:00:00+00:00,20696.804688,0.0314,2.9226,-2.8652,-2.7441,-1.0,0.0,6,2,0.1256,...,219.542709,-0.595,0.474554,0.0254,52.719299,,0.991783,0.918138,0.979078,0.991807
2022-06-22 15:00:00+00:00,20128.861328,-2.5511,-0.3072,-5.0135,0.1893,0.0,1.0,6,2,-2.0699,...,216.141449,-1.1164,0.469006,-1.3988,51.915073,,0.922444,0.715572,0.956229,0.844078
2022-06-22 16:00:00+00:00,20166.960938,0.2156,-1.2117,-5.1741,-0.0399,0.0,0.0,6,2,-0.8427,...,215.441895,0.293,0.468989,-0.1735,51.769821,,0.922108,0.791913,0.957388,0.852687


Drop the rows that contain NaN values.

In [15]:
data_droped = df_corr.dropna()
print("Data before drop", data.shape)
print("Data after drop", data_droped.shape)
# Per-column NaN counts before the drop. Printed explicitly: a bare expression
# in the middle of a cell is evaluated but never displayed.
print(data.isna().sum())

# From here on `data` is the NaN-free frame.
data = data_droped

Data before drop (17216, 22)
Data after drop (16849, 22)


#### Instantaneous phase synchrony.

Measures moment-to-moment synchrony between two signals.
Both signals must be free of NaN values.

In [16]:
from scipy.signal import hilbert, butter, filtfilt

In [17]:
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Return the (b, a) coefficients of a Butterworth band-pass filter.

    `lowcut` and `highcut` are given in the same units as the sampling
    frequency `fs`; they are expressed as fractions of the Nyquist
    frequency (fs / 2) before being handed to scipy's `butter`.
    """
    nyquist = fs / 2
    band = [lowcut / nyquist, highcut / nyquist]
    return butter(order, band, btype='band')

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass `data` between `lowcut` and `highcut`.

    The coefficients come from `butter_bandpass`; the signal is filtered
    with scipy's forward-backward `filtfilt` and the filtered array is
    returned.
    """
    coeffs = butter_bandpass(lowcut, highcut, fs, order=order)
    return filtfilt(*coeffs, data)

def synchrony(lowcut, highcut, fs, order, col1, col2):
  """Instantaneous phase synchrony between two signals.

  Both signals are band-pass filtered with the same settings, their
  instantaneous phase angle (radians) is taken from the Hilbert transform,
  and the score 1 - sin(|phase1 - phase2| / 2) is returned element-wise.
  """
  phases = [
      np.angle(
          hilbert(butter_bandpass_filter(signal, lowcut=lowcut, highcut=highcut, fs=fs, order=order)),
          deg=False,
      )
      for signal in (col1, col2)
  ]
  phase_gap = np.abs(phases[0] - phases[1])
  return 1 - np.sin(phase_gap / 2)

In [18]:
# Phase synchrony of the BTC close against each companion coin's close, using
# the same filter settings for every pair. The column order matches the order
# of the tuple.
for coin in ('lite', 'eth', 'bnb', 'ada'):
  data['Synch_' + coin] = synchrony(.01, .5, 30., 1, data['Close'], data['Close_' + coin])

We don't need the Close columns anymore.

In [19]:
# Remove every column whose name contains 'Close' (the BTC close and the
# per-coin Close_* columns).
data = data.drop(columns=data.filter(regex='Close').columns)

### Adding Blockchain Data

In [20]:
import functools as ft

In [21]:
# Calendar-day key used to attach the blockchain metrics to every hourly row.
data['Timestamp'] = data.index.strftime('%Y-%m-%d')
# The merge below returns a fresh integer index, so keep the hourly timestamp
# as a regular column.
data['Date'] = data.index

blockchain_dir = 'BlockchainData'   # not named `dir`, which would shadow the builtin
# os.listdir returns entries in arbitrary order: sort them so the column order
# of the merged frame is reproducible. Hidden entries and sub-directories
# (e.g. notebook checkpoint folders) are skipped instead of being parsed as CSV.
csv_paths = sorted(
    os.path.join(blockchain_dir, entry)
    for entry in os.listdir(blockchain_dir)
    if not entry.startswith('.') and os.path.isfile(os.path.join(blockchain_dir, entry))
)
if not csv_paths:
  # Fail with a clear message rather than an opaque "reduce() of empty sequence".
  raise FileNotFoundError("No blockchain data files found in " + repr(blockchain_dir))

dfs = []
for filename in csv_paths:
  temp = pd.read_csv(filename, index_col=None, header=0)
  # Reduce each file's timestamps to a calendar day and keep the first row of each day.
  temp['Timestamp'] = pd.to_datetime(temp['Timestamp']).dt.strftime('%Y-%m-%d')
  dfs.append(temp.drop_duplicates(subset='Timestamp', keep='first'))

# Inner-merge all metric files on the day key: only days present in every file survive.
block_features = ft.reduce(lambda left, right: pd.merge(left, right, on='Timestamp'), dfs)
# Left-merge onto the hourly rows, then drop the helper key and any row left
# with a missing value (e.g. hours on days without blockchain metrics).
df_final = (
    data.merge(block_features, how='left', on='Timestamp')
    .drop(columns='Timestamp')
    .dropna()
)

In [22]:
# Display the final merged dataset (the notebook truncates the rendering).
df_final

Unnamed: 0,Return,Return-7,Return-30,Return+1,target_multi,target_bin,Month,DayOfW,Return_eth,Return_bnb,...,Date,avg-block-size,cost-per-transaction,hash-rate,miners-revenue,mvrv,my-wallet-n-users,n-transactions-per-block,nvt,transaction-fees-usd
0,0.1190,0.2707,1.5044,-0.0786,0.0,0.0,6,2,0.6328,0.1496,...,2020-06-24 02:00:00+00:00,1.288160,27.518410,1.090683e+08,8.594577e+06,1.612999,50472691.0,2246.913669,13.275366,398689.959360
1,-0.1128,0.2496,0.6682,0.0788,0.0,1.0,6,2,-0.2402,-0.1448,...,2020-06-24 03:00:00+00:00,1.288160,27.518410,1.090683e+08,8.594577e+06,1.612999,50472691.0,2246.913669,13.275366,398689.959360
2,0.0867,0.1787,0.1726,-0.0787,0.0,0.0,6,2,0.9910,0.4738,...,2020-06-24 04:00:00+00:00,1.288160,27.518410,1.090683e+08,8.594577e+06,1.612999,50472691.0,2246.913669,13.275366,398689.959360
3,-0.0669,0.3611,0.1231,0.0956,0.0,1.0,6,2,-0.0253,0.1905,...,2020-06-24 05:00:00+00:00,1.288160,27.518410,1.090683e+08,8.594577e+06,1.612999,50472691.0,2246.913669,13.275366,398689.959360
4,0.0993,0.4109,0.3860,-1.1273,-1.0,0.0,6,2,0.1344,0.1084,...,2020-06-24 06:00:00+00:00,1.288160,27.518410,1.090683e+08,8.594577e+06,1.612999,50472691.0,2246.913669,13.275366,398689.959360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16844,0.0613,-1.8106,1.0823,0.4103,1.0,1.0,6,2,0.3008,-0.0568,...,2022-06-08 20:00:00+00:00,0.963582,131.068277,2.664058e+08,3.428707e+07,1.308152,83042772.0,1469.646067,8.311092,330221.243787
16845,0.4263,-0.2371,1.5284,-0.1099,0.0,0.0,6,2,0.3069,0.2712,...,2022-06-08 21:00:00+00:00,0.963582,131.068277,2.664058e+08,3.428707e+07,1.308152,83042772.0,1469.646067,8.311092,330221.243787
16846,-0.1834,-0.4816,1.3086,-0.2987,-1.0,0.0,6,2,-0.2053,0.1285,...,2022-06-08 22:00:00+00:00,0.963582,131.068277,2.664058e+08,3.428707e+07,1.308152,83042772.0,1469.646067,8.311092,330221.243787
16847,-0.3047,-0.3359,0.9803,0.0136,0.0,1.0,6,2,-0.3457,-0.4071,...,2022-06-08 23:00:00+00:00,0.963582,131.068277,2.664058e+08,3.428707e+07,1.308152,83042772.0,1469.646067,8.311092,330221.243787


#### Export dataset to CSV

In [23]:
# Persist the final dataset for the modelling notebooks.
# NOTE(review): the integer index left over from the merge is written as an
# unnamed first column; pass index=False if downstream readers do not expect
# it — confirm against the consumers of this file.
df_final.to_csv("HourlyDataset.csv")