In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [1]:
import matplotlib.pyplot as plot
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import linregress
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.vector_ar.vecm import coint_johansen

from functions.data_analysis import test_stationarity
from functions.load_data import load_financial_data, load_sentiment_data
from functions.preprocessing import merge_sentiment_financials



In [None]:
# Load BTC prices and the 6h-aggregated BTC tweet sentiment for the same date window
analysis_start = '2017-07-28'
analysis_end = '2018-07-27'

df_financials = load_financial_data(
    filepath='data/top20_aggre_price_reduced.csv',
    start_date=analysis_start,
    end_date=analysis_end,
    tickers='BTC',
)
df_sentiment_aggr = load_sentiment_data(
    filepath='data/aggregated_tweets/BTC_sentiment_aggr_6h_shifted.csv',
    start_date=analysis_start,
    end_date=analysis_end,
)

# Keep every 6th financial row so the price series matches the sentiment frequency
# (presumably the price file is hourly, giving one row per 6h -- TODO confirm)
df_financials_aggr = df_financials.iloc[::6]

# For large aggregations the sentiment index can be shifted back by one day:
# df_sentiment_aggr.index = df_sentiment_aggr.index.shift(-1, freq='D')

# Combine prices and sentiment into a single frame
df_merged = merge_sentiment_financials(df_financials_aggr, df_sentiment_aggr)

In [None]:
# Derive the BTC price representations used later: diff, log, log-diff and a binary flag

# Change of the raw price versus the previous row
df_merged['BTC_diff'] = df_merged['BTC'].diff()

# Natural log of the price, and the row-to-row change of that log
df_merged['BTC_log'] = np.log(df_merged['BTC'])
df_merged['BTC_log_diff'] = df_merged['BTC_log'].diff()

# Differencing leaves NaN in the first row; remove every row that contains a NaN
df_merged.dropna(inplace=True)

# Binary direction (rise: 1 / not rise: 0): sign() yields -1/0/1 and clipping at 0
# maps the -1 of a falling price to 0
df_merged['BTC_bin'] = np.sign(df_merged['BTC_diff']).clip(lower=0)

In [None]:
# Treat zeros in the tweet/sentiment columns as missing values and carry the last
# non-zero value forward.
# Series.replace(to_replace=0, method='ffill') relies on the `method` keyword, which
# pandas has deprecated (2.1+); mask + ffill gives the same result without it.
for column in ['amount_of_tweets', 'sentiment', 'weighted_sentiment',
               'sentiwordnet', 'weighted_sentiwordnet']:
    original = df_merged[column]
    forward_filled = original.mask(original == 0).ffill()
    # Zeros at the very start have no earlier value to copy; keep them as 0 (as the
    # old replace call did) and restore the column's original dtype
    df_merged[column] = forward_filled.fillna(original).astype(original.dtype)

# Rolling means over 4 consecutive rows; min_periods=4 means a value is only
# produced once the window is completely filled
for base_column in ('sentiment', 'amount_of_tweets', 'weighted_sentiment',
                    'sentiwordnet', 'weighted_sentiwordnet'):
    window = df_merged[base_column].rolling(4, min_periods=4)
    df_merged[base_column + '_rm'] = window.mean()

# Rows without a full window hold NaN means; remove every row that contains a NaN
df_merged.dropna(inplace=True)

# Row-to-row change (first difference) of the raw and rolling-mean sentiment series;
# each result is stored as '<column>_diff'
for source_column in ('sentiment',
                      'sentiwordnet',
                      'weighted_sentiment_rm',
                      'weighted_sentiment',
                      'sentiment_rm',
                      'amount_of_tweets_rm',
                      'sentiwordnet_rm',
                      'weighted_sentiwordnet',
                      'weighted_sentiwordnet_rm'):
    df_merged[source_column + '_diff'] = df_merged[source_column].diff()

# The first row has no predecessor, so its diffs are NaN; drop rows containing NaN
df_merged.dropna(inplace=True)

In [None]:
# The final row does not cover a full 6h interval, so remove it.
# NOTE: re-running this cell removes one more row each time.
last_row_label = df_merged.index[-1:]
df_merged.drop(index=last_row_label, inplace=True)

In [None]:
# 6h sentiment aggregation (Method 1): unweighted vs. weighted score, one panel each
fig, ax = plot.subplots(nrows=2, ncols=1)
fig.suptitle("Sentiment Score (Method 1): Unweighted vs. Weighted ", fontsize=15)

panels = [('sentiment', 'Unweighted Score'),
          ('weighted_sentiment', 'Weighted Score')]
for axis, (column, y_label) in zip(ax, panels):
    axis.plot(df_merged[column])
    axis.set_ylabel(y_label)
    axis.set_ylim(-1, 1)  # identical fixed y-range so both panels are comparable

plot.show()

In [None]:
# Load the price series of BTC, ETH and XRP from the same file and date window
coin_load_args = dict(
    filepath='data/top20_aggre_price_reduced.csv',
    start_date='2017-07-28',
    end_date='2018-07-27',
)
financials_BTC = load_financial_data(tickers='BTC', **coin_load_args)
financials_ETH = load_financial_data(tickers='ETH', **coin_load_args)
financials_XRP = load_financial_data(tickers='XRP', **coin_load_args)


In [None]:
# Keep every 24th row of each coin (presumably hourly data -> one row per day,
# TODO confirm), then place the three coins side by side as columns
financials_BTC_aggr = financials_BTC.iloc[::24]
financials_ETH_aggr = financials_ETH.iloc[::24]
financials_XRP_aggr = financials_XRP.iloc[::24]

financials_df = pd.concat(
    [financials_BTC_aggr, financials_ETH_aggr, financials_XRP_aggr],
    axis=1,
)

In [None]:
# Row-to-row price change per coin, stored as '<ticker>_diff'
for ticker in ('BTC', 'ETH', 'XRP'):
    financials_df[ticker + '_diff'] = financials_df[ticker].diff()

In [None]:
# Plot the 6h price change of BTC and check that same series for stationarity
plot.plot(df_merged['BTC_diff'])
plot.xlabel('Date', fontsize=12)
plot.ylabel('Change in USD', fontsize=12)
plot.title("6h Financial Value Change of Bitcoin (BTC_diff)", fontsize=15)
plot.show()
# Test the plotted series; this call previously tested BTC_log, which is already
# covered by the dedicated log-price cell further down
test_stationarity(df_merged['BTC_diff'])

In [None]:
# Plot the 6h change of the weighted sentiment against the change of its rolling mean.
# Each line gets an explicit label: without labelled artists plot.legend() draws
# nothing and only emits a warning.
plot.plot(df_merged['weighted_sentiment_diff'], label='weighted_sentiment_diff')
plot.plot(df_merged['weighted_sentiment_rm_diff'], label='weighted_sentiment_rm_diff')
plot.xlabel('Date', fontsize=12)
plot.ylabel('Sentiment Score', fontsize=12)
plot.title("6h change in Weighted Sentiment (Method 1)", fontsize=15)
plot.legend()
plot.show()


In [None]:
# Plot the 6h change of the weighted SentiWordNet score against the change of its
# rolling mean.
# Each line gets an explicit label: without labelled artists plot.legend() draws
# nothing and only emits a warning.
plot.plot(df_merged['weighted_sentiwordnet_diff'], label='weighted_sentiwordnet_diff')
plot.plot(df_merged['weighted_sentiwordnet_rm_diff'], label='weighted_sentiwordnet_rm_diff')
plot.xlabel('Date', fontsize=12)
plot.ylabel('Sentiment Score', fontsize=12)
plot.title("6h change in Weighted Sentiment (Method 2)", fontsize=15)
plot.legend()
plot.show()


In [None]:
# Raw (untransformed) BTC price over time, followed by its stationarity test
btc_price = df_merged['BTC']

plot.plot(btc_price)
plot.title("Price distribution of bitcoin", fontsize=15)
plot.xlabel('Date', fontsize=12)
plot.ylabel('Price in USD', fontsize=12)
plot.show()

test_stationarity(btc_price)

In [None]:
# Does taking the log alone make the price series stationary? Plot it and test it
btc_log_price = df_merged['BTC_log']

plot.plot(btc_log_price)
plot.title("Price distribution of bitcoin logged", fontsize=15)
plot.xlabel('Date', fontsize=12)
plot.ylabel('Price in USD logged', fontsize=12)
plot.show()

test_stationarity(btc_log_price)

In [None]:
# Sanity check: the differenced log price is expected to look stationary;
# plot it and confirm with the stationarity test
btc_log_change = df_merged['BTC_log_diff']

plot.plot(btc_log_change)
plot.title("Price change distribution of logged bitcoin", fontsize=15)
plot.xlabel('Date', fontsize=12)
plot.ylabel('Price change in logged USD', fontsize=12)
plot.show()

test_stationarity(btc_log_change)


In [None]:
# Overlay the rolling-mean SentiWordNet score (left axis) and the BTC price (right axis)
fig, ax1 = plot.subplots()

# Left y-axis: sentiment, drawn and labelled in blue
color = 'tab:blue'
ax1.plot(df_merged['sentiwordnet_rm'], color=color)
ax1.set_xlabel('Date')
ax1.set_ylabel('Sentiwordnet RM 24h', color=color)
ax1.tick_params(axis='y', labelcolor=color)

# Right y-axis: a twin axes sharing the same x-axis, drawn and labelled in red
ax2 = ax1.twinx()
color = 'tab:red'
ax2.plot(df_merged['BTC'], color=color)
ax2.set_ylabel('Price', color=color)  # the x-label is already set on ax1
ax2.tick_params(axis='y', labelcolor=color)

# Without tight_layout the right-hand y-label gets slightly clipped
fig.tight_layout()
plot.show()


In [None]:
# Overlay the rolling-mean weighted sentiment (left axis) and the BTC price (right axis)
fig, ax1 = plot.subplots()

# Left y-axis: weighted sentiment, drawn and labelled in blue
color = 'tab:blue'
ax1.plot(df_merged['weighted_sentiment_rm'], color=color)
ax1.set_xlabel('Date')
ax1.set_ylabel('Weighted Sentiment RM 24h', color=color)
ax1.tick_params(axis='y', labelcolor=color)

# Right y-axis: a twin axes sharing the same x-axis, drawn and labelled in red
ax2 = ax1.twinx()
color = 'tab:red'
ax2.plot(df_merged['BTC'], color=color)
ax2.set_ylabel('Price', color=color)  # the x-label is already set on ax1
ax2.tick_params(axis='y', labelcolor=color)

# Without tight_layout the right-hand y-label gets slightly clipped
fig.tight_layout()
plot.show()


In [None]:
# Rolling-mean number of tweets over time
plot.plot(df_merged['amount_of_tweets_rm'])
plot.title("Amount of Tweets distribution of bitcoin", fontsize=15)
plot.xlabel('Date', fontsize=12)
plot.ylabel('Amount of Tweets RM 24h', fontsize=12)
plot.show()

# NOTE(review): the plot shows the rolling mean, but the stationarity test runs on
# the raw amount_of_tweets column -- confirm this is intended
test_stationarity(df_merged['amount_of_tweets'])

In [None]:
# Partial autocorrelation of the differenced rolling-mean weighted SentiWordNet score
# (10 lags, alpha=0.05 confidence bands).
# The title previously read 'log diff BTC', which did not match the plotted series.
series = df_merged['weighted_sentiwordnet_rm_diff']
plot_pacf(series, lags=10, alpha=0.05)
plot.title('Partial Autocorrelation of weighted_sentiwordnet_rm_diff')
plot.show()

In [None]:
# Partial autocorrelation of the rolling-mean weighted SentiWordNet score (50 lags).
# The series now matches the title; it previously used weighted_sentiment_rm, which
# is the series the following cell plots under its own title.
series = df_merged['weighted_sentiwordnet_rm']
plot_pacf(series, lags=50)
plot.title('Partial Autocorrelation of weighted_sentiwordnet_rm with 6h aggr ')
plot.show()

In [None]:
# Partial autocorrelation of the rolling-mean weighted sentiment, up to 50 lags
series = df_merged.loc[:, 'weighted_sentiment_rm']
plot_pacf(series, lags=50)
plot.title('Partial Autocorrelation of weighted Sentiment 24h RM')
plot.show()

In [None]:
# Partial autocorrelation of the rolling-mean number of tweets, up to 50 lags
series = df_merged.loc[:, 'amount_of_tweets_rm']
plot_pacf(series, lags=50)
plot.title('Partial Autocorrelation of the amount of tweets')
plot.show()

In [None]:
# Basic linear-regression statistics of a BTC target on a rolling-mean sentiment column.
# Each header is built from the column names actually passed to linregress, so the
# printed label cannot drift from the data (the third header used to say
# 'sentiment_rm' although weighted_sentiment_rm was regressed).
regression_pairs = [
    ('sentiment_rm', 'BTC_log_diff'),
    ('weighted_sentiment_rm', 'BTC_log_diff'),
    ('weighted_sentiment_rm', 'BTC_log'),
]
for position, (predictor, target) in enumerate(regression_pairs):
    if position > 0:
        print('')  # blank line between consecutive result blocks
    # linregress returns slope, intercept, r-value, p-value and standard error
    slope, intercept, rvalue, pvalue, stderr = linregress(df_merged[predictor], df_merged[target])
    print('Linear regression of ' + predictor + ' and ' + target)
    print('Slope = ' + str(slope))
    print('Intercept = ' + str(intercept))
    print('Rvalue = ' + str(rvalue))
    print('Pvalue = ' + str(pvalue))
    print('STDerr = ' + str(stderr))

In [None]:
# OLS for 6H aggregation: regress BTC_log_diff on sentiwordnet_rm plus an intercept.
# statsmodels.api (`sm`) is imported with the other imports at the top of the
# notebook instead of inside this cell.

# Column vectors of shape (n, 1) for the regressor and the target
x = np.array(df_merged['sentiwordnet_rm']).reshape(-1, 1)
y = np.array(df_merged['BTC_log_diff']).reshape(-1, 1)

# add_constant adds the column of ones that lets OLS estimate an intercept
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()

# Last expression of the cell: renders the regression summary table
model.summary()

In [None]:
# QQ plot of the residuals of the OLS fit (sentiwordnet_rm vs BTC_log_diff)
import matplotlib.pyplot as plt

# Residuals of the model fitted in the previous cell
res = model.resid
# line='s' draws the standardized reference line in the QQ plot
fig = sm.qqplot(data=res, line='s')


In [None]:
# Johansen cointegration check: each BTC representation is tested together with the
# tweet count and both sentiment scores, and the eigenvalues of each test are printed
sentiment_columns = ['amount_of_tweets', 'sentiment', 'weighted_sentiment']
for btc_column in ('BTC', 'BTC_log', 'BTC_diff', 'BTC_log_diff'):
    johansen_result = coint_johansen(
        df_merged[[btc_column] + sentiment_columns],
        det_order=-1,
        k_ar_diff=1,
    )
    print(johansen_result.eig)

