In [None]:
# Notebook setup: enable the autoreload extension so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2 of the autoreload extension (reload all modules before running code).
%autoreload 2
# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline

In [1]:
import matplotlib.pyplot as plot
# The plotting cells refer to pyplot as `plt`, so that alias must exist as well.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as ss
import seaborn as sns
from scipy.stats import linregress
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.vector_ar.vecm import coint_johansen

from functions.data_analysis import df_derived_by_shift
from functions.load_data import load_financial_data, load_sentiment_data
from functions.preprocessing import merge_sentiment_financials


In [None]:
# Load the full study period (2017-07-28 to 2018-07-27) with the 12h sentiment file.
# NOTE(review): both paths are absolute and machine-specific.
prices_csv = r'C:\Users\lukas\Dropbox\STUDIUM\Python\master_thesis\data\top20_aggre_price_reduced.csv'
tweets_csv = 'C:/Users/lukas/Dropbox/STUDIUM/Python/master_thesis/data/aggregated_tweets/BTC_sentiment_aggr_12h_shifted.csv'
period = {'start_date': '2017-07-28', 'end_date': '2018-07-27'}

df_financials = load_financial_data(filepath=prices_csv, tickers='BTC', **period)
df_sentiment_aggr = load_sentiment_data(filepath=tweets_csv, **period)

# Keep every 12th price row so the prices line up with the 12h sentiment file
# (presumably the price data is hourly -- TODO confirm).
every_12th_row = slice(None, None, 12)
df_financials_aggr = df_financials[every_12th_row]

# Combine prices and sentiment into a single frame.
df_merged = merge_sentiment_financials(df_financials_aggr, df_sentiment_aggr)

In [None]:
# Load the shorter window 2018-01-28 to 2018-04-27 with the 6h sentiment file.
# NOTE(review): both paths are absolute and machine-specific.
prices_csv = r'C:\Users\lukas\Dropbox\STUDIUM\Python\master_thesis\data\top20_aggre_price_reduced.csv'
tweets_csv = 'C:/Users/lukas/Dropbox/STUDIUM/Python/master_thesis/data/aggregated_tweets/BTC_sentiment_aggr_6h_shifted.csv'
period = {'start_date': '2018-01-28', 'end_date': '2018-04-27'}

df_financials = load_financial_data(filepath=prices_csv, tickers='BTC', **period)
df_sentiment_aggr = load_sentiment_data(filepath=tweets_csv, **period)

# Keep every 6th price row so the prices line up with the 6h sentiment file
# (presumably the price data is hourly -- TODO confirm).
every_6th_row = slice(None, None, 6)
df_financials_aggr = df_financials[every_6th_row]

# Combine prices and sentiment into a single frame.
df_merged = merge_sentiment_financials(df_financials_aggr, df_sentiment_aggr)

In [None]:
# Load the shorter window 2017-10-01 to 2018-01-01 (Oct-Dec) with the 6h sentiment file.
# NOTE(review): both paths are absolute and machine-specific.
prices_csv = r'C:\Users\lukas\Dropbox\STUDIUM\Python\master_thesis\data\top20_aggre_price_reduced.csv'
tweets_csv = 'C:/Users/lukas/Dropbox/STUDIUM/Python/master_thesis/data/aggregated_tweets/BTC_sentiment_aggr_6h_shifted.csv'
period = {'start_date': '2017-10-01', 'end_date': '2018-01-01'}

df_financials = load_financial_data(filepath=prices_csv, tickers='BTC', **period)
df_sentiment_aggr = load_sentiment_data(filepath=tweets_csv, **period)

# Keep every 6th price row so the prices line up with the 6h sentiment file
# (presumably the price data is hourly -- TODO confirm).
every_6th_row = slice(None, None, 6)
df_financials_aggr = df_financials[every_6th_row]

# Combine prices and sentiment into a single frame.
df_merged = merge_sentiment_financials(df_financials_aggr, df_sentiment_aggr)

In [None]:
# Derive the BTC series used further down: first difference, log level
# and first difference of the log level.
btc_price = df_merged['BTC']
df_merged['BTC_diff'] = btc_price.diff()
df_merged['BTC_log'] = np.log(btc_price)
df_merged['BTC_log_diff'] = df_merged['BTC_log'].diff()

# Differencing leaves NaN in the first row; remove every row that has a NaN.
df_merged.dropna(inplace=True)

# Binary direction of the price move: 1 for a rise, 0 otherwise
# (a fall has sign -1, which is mapped to 0; an unchanged price already has sign 0).
df_merged['BTC_bin'] = np.sign(df_merged['BTC_diff']).replace(to_replace=-1, value=0)


In [None]:
# Treat zeros in the sentiment columns as missing and forward-fill them.
# Series.replace(to_replace=0, method='ffill') relies on the `method` keyword,
# which is deprecated since pandas 2.1; mask() + ffill() gives the same result.
for column in ['amount_of_tweets', 'sentiment', 'weighted_sentiment',
               'sentiwordnet', 'weighted_sentiwordnet']:
    observed = df_merged[column]
    filled = observed.mask(observed == 0).ffill()
    # Leading zeros have no earlier value to copy from; keep them unchanged
    # (as replace() did) and restore the column's original dtype.
    df_merged[column] = filled.fillna(observed).astype(observed.dtype)

# Rolling means over 24 rows; a value is only produced once the window is full.
rolling_window = 24
for column in ['sentiment', 'amount_of_tweets', 'weighted_sentiment',
               'sentiwordnet', 'weighted_sentiwordnet']:
    df_merged[column + '_rm'] = (
        df_merged[column].rolling(rolling_window, min_periods=rolling_window).mean()
    )

# The first rolling_window - 1 rows have no rolling mean yet; drop them.
df_merged.dropna(inplace=True)

# First differences of the raw sentiment scores and of all rolling means.
for column in ['sentiment', 'sentiwordnet',
               'weighted_sentiment_rm', 'sentiment_rm', 'amount_of_tweets_rm',
               'sentiwordnet_rm', 'weighted_sentiwordnet_rm']:
    df_merged[column + '_diff'] = df_merged[column].diff()

# Differencing leaves NaN in the first row; drop it.
df_merged.dropna(inplace=True)

In [None]:
# Remove the final row, which does not cover a full 12h interval.
# NOTE(review): this cell is not idempotent -- every re-run removes one more row.
last_row_label = df_merged.index[-1:]
df_merged.drop(index=last_row_label, inplace=True)

In [None]:
# Pick the two time series whose lagged cross-correlation is analysed.
compared_series = ['weighted_sentiwordnet_rm_diff', 'BTC_diff']
df_tolag = df_merged[compared_series]

In [None]:
# Build the lagged versions of both series (second argument: 2) and discard
# every row that contains a NaN afterwards.
df_lagged = df_derived_by_shift(df_tolag, 2).dropna()

In [None]:
# Spearman rank correlation between all columns of df_lagged,
# together with the matching p-values.
spearman_result = ss.spearmanr(df_lagged)
corr, pval = spearman_result

In [None]:
# Heatmap of the Spearman p-values for every pair of columns in df_lagged.

# Tick labels are taken from df_lagged, the frame the matrix was computed
# from, so they always match its rows and columns.
column_names = list(df_lagged.columns)

# Hide the upper triangle including the diagonal: the matrix is symmetric,
# so those cells only repeat the lower triangle.
mask = np.zeros_like(pval, dtype=bool)
mask[np.triu_indices_from(mask)] = True

colormap = plt.cm.RdBu
plt.figure(figsize=(15,10))
# NOTE(review): the title is hard-coded; the series selected above are
# 'weighted_sentiwordnet_rm_diff' and 'BTC_diff' with 2 lags -- confirm the wording.
plt.title(u'Pvalues of Crosscorrelation of sentiwordnet_rm_diff and BTC_log_diff for up to 36 hours of lag (6h aggr)', y=1.05, size=16)

svm = sns.heatmap(pval, mask=mask, linewidths=0.1, vmax=1.0,
                  square=True, cmap=colormap, linecolor='white', annot=True,
                  xticklabels=column_names, yticklabels=column_names)

In [None]:
# Heatmap of the Spearman correlation coefficients for every pair of columns
# in df_lagged.

# Tick labels are taken from df_lagged, the frame the matrix was computed
# from, so they always match its rows and columns.
column_names = list(df_lagged.columns)

# Hide the upper triangle including the diagonal: the matrix is symmetric,
# so those cells only repeat the lower triangle.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

colormap = plt.cm.RdBu
plt.figure(figsize=(15,10))
# This figure shows coefficients, not p-values, so the title says so.
# NOTE(review): the series names and lag in the title are hard-coded; the series
# selected above are 'weighted_sentiwordnet_rm_diff' and 'BTC_diff' with 2 lags.
plt.title(u'Coefficients of Crosscorrelation of sentiwordnet_rm_diff and BTC_log_diff for up to 36 hours of lag (6h aggr)', y=1.05, size=16)

# Coefficients lie in [-1, 1]; fixing both ends of the colour scale puts the
# neutral colour of the diverging colormap at 0.
svm = sns.heatmap(corr, mask=mask, linewidths=0.1, vmin=-1.0, vmax=1.0,
                  square=True, cmap=colormap, linecolor='white', annot=True,
                  xticklabels=column_names, yticklabels=column_names)

In [None]:
# All-in-one approach: for a chosen start month and month span, create the heatmaps in a single cell

In [None]:
# First and last day of every month in the study period;
# index 0 is August 2017, index 11 is July 2018 (which ends on 2018-07-27).
start_date = ['2017-08-01','2017-09-01','2017-10-01','2017-11-01','2017-12-01','2018-01-01','2018-02-01','2018-03-01','2018-04-01','2018-05-01','2018-06-01','2018-07-01']
end_date = ['2017-08-31','2017-09-30','2017-10-31','2017-11-30','2017-12-31','2018-01-31','2018-02-28','2018-03-31','2018-04-30','2018-05-31','2018-06-30','2018-07-27']

# Parameters
# i: index of the first month (0 = August 2017 ... 11 = July 2018)
# n: length of the analysed span in months
i = 0
n = 12

# Fail early with a clear message instead of an IndexError (or a silent
# wrap-around for a negative index) when the span leaves the study period.
if i < 0 or n < 1 or i + n > len(end_date):
    raise ValueError(
        'i=' + str(i) + ' with n=' + str(n)
        + ' does not fit into the available months 0..' + str(len(end_date) - 1)
    )

# Load prices and 6h sentiment for the selected span
# (with i=0, n=12: 1 Aug 2017 to 27 Jul 2018).
# NOTE(review): both paths are absolute and machine-specific.
df_financials = load_financial_data(filepath = r'C:\Users\lukas\Dropbox\STUDIUM\Python\master_thesis\data\top20_aggre_price_reduced.csv', start_date = start_date[i], end_date = end_date[i+n-1], tickers = 'BTC')
df_sentiment_aggr = load_sentiment_data(filepath = 'C:/Users/lukas/Dropbox/STUDIUM/Python/master_thesis/data/aggregated_tweets/BTC_sentiment_aggr_6h_shifted.csv', start_date = start_date[i], end_date = end_date[i+n-1])

# Keep every 6th price row so the prices line up with the 6h sentiment file
# (presumably the price data is hourly -- TODO confirm).
df_financials_aggr = df_financials[::6]

# Merge the data
df_merged = merge_sentiment_financials(df_financials_aggr, df_sentiment_aggr)

# BTC transforms: first difference, log level, first difference of the log level.
df_merged['BTC_diff'] = df_merged['BTC'].diff()
df_merged['BTC_log'] = np.log(df_merged['BTC'])
df_merged['BTC_log_diff'] = df_merged['BTC_log'].diff()

# Differencing leaves NaN in the first row; remove every row that has a NaN.
df_merged.dropna(inplace = True)

# Binary direction of the price move: 1 for a rise, 0 otherwise.
df_merged['BTC_bin'] = np.sign(df_merged['BTC_diff'])
df_merged['BTC_bin'] = df_merged['BTC_bin'].replace(to_replace = -1, value = 0)

# Treat zeros in the sentiment columns as missing and forward-fill them.
# Series.replace(to_replace=0, method='ffill') relies on the `method` keyword,
# which is deprecated since pandas 2.1; mask() + ffill() gives the same result.
for column in ['amount_of_tweets', 'sentiment', 'weighted_sentiment',
               'sentiwordnet', 'weighted_sentiwordnet']:
    observed = df_merged[column]
    filled = observed.mask(observed == 0).ffill()
    # Leading zeros have no earlier value to copy from; keep them unchanged
    # (as replace() did) and restore the column's original dtype.
    df_merged[column] = filled.fillna(observed).astype(observed.dtype)

# Rolling means over 24 rows; a value is only produced once the window is full.
rolling_window = 24
for column in ['sentiment', 'amount_of_tweets', 'weighted_sentiment',
               'sentiwordnet', 'weighted_sentiwordnet']:
    df_merged[column + '_rm'] = (
        df_merged[column].rolling(rolling_window, min_periods=rolling_window).mean()
    )

# The first rolling_window - 1 rows have no rolling mean yet; drop them.
df_merged.dropna(inplace = True)

# First differences of the raw sentiment scores and of all rolling means.
for column in ['sentiment', 'sentiwordnet',
               'weighted_sentiment_rm', 'sentiment_rm', 'amount_of_tweets_rm',
               'sentiwordnet_rm', 'weighted_sentiwordnet_rm']:
    df_merged[column + '_diff'] = df_merged[column].diff()

# Differencing leaves NaN in the first row; drop it.
df_merged.dropna(inplace = True)

# Delete the last row because it does not cover a full aggregation interval.
df_merged.drop(df_merged.tail(1).index, inplace = True)

# The two series to compare, their lagged versions (second argument: 2),
# and the Spearman coefficients / p-values between all resulting columns.
df_tolag = df_merged[['weighted_sentiwordnet_rm_diff', 'BTC_diff']]

df_lagged = df_derived_by_shift(df_tolag, 2)
df_lagged = df_lagged.dropna()

corr, pval = ss.spearmanr(df_lagged)

# Tick labels are taken from df_lagged, the frame the matrix was computed
# from, so they always match its rows and columns.
column_names = list(df_lagged.columns)

colormap = plt.cm.RdBu
plt.figure(figsize=(15,10))
title = u'Pvalues of Crosscorrelation for startmonth i=' + str(i) + ' and endmonth i=' + str(i+n-1)
plt.title(title, y=1.05, size=16)

# Hide the upper triangle including the diagonal: the matrix is symmetric,
# so those cells only repeat the lower triangle.
mask = np.zeros_like(pval, dtype=bool)
mask[np.triu_indices_from(mask)] = True

svm = sns.heatmap(pval, mask=mask, linewidths=0.1, vmax=1.0,
                  square=True, cmap=colormap, linecolor='white', annot=True,
                  xticklabels=column_names, yticklabels=column_names)

