In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell runs, so edits to the local helper modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from functions.load_data import load_financial_data, load_sentiment_data
from functions.preprocessing import merge_sentiment_financials
from thermal_optimal_path.lattice import partition_function, iter_lattice 
from thermal_optimal_path.statistics import average_path

# Thermal Optimal Path (TOP) method adapted from https://github.com/amwatt/thermal_optimal_path/tree/master/thermal_optimal_path

In [None]:
# Load data
# Price and sentiment data are both read for the same date window, so the
# window is spelled out once and passed to both loaders.
date_window = {'start_date': '2017-07-28', 'end_date': '2018-07-27'}

df_financials = load_financial_data(
    filepath=r'C:\Users\lukas\Dropbox\STUDIUM\Python\master_thesis\data\top20_aggre_price_reduced.csv',
    tickers='BTC',
    **date_window
)
df_sentiment_aggr = load_sentiment_data(
    filepath='C:/Users/lukas/Dropbox/STUDIUM/Python/master_thesis/data/aggregated_tweets/BTC_sentiment_aggr_5d_shifted.csv',
    **date_window
)

# Aggregate financials to fit the sentiment frequency: keep every 120th row.
# NOTE(review): presumably 120 hourly rows = 5 days, matching the "5d" in the
# sentiment file name -- confirm against the sampling rate of the price file.
df_financials_aggr = df_financials[::120]

# Optional date shift for large aggregation windows (currently disabled):
#df_sentiment_aggr.index = df_sentiment_aggr.index.shift(-2,freq='d')

# Merge the price and sentiment data into one frame
df_merged = merge_sentiment_financials(df_financials_aggr, df_sentiment_aggr)



In [None]:
# Derive the BTC series used later: first difference, log price, log return
# and a binary rise / no-rise indicator.
btc_price = df_merged['BTC']

# First difference of the price (the first row has no predecessor -> NaN)
df_merged['BTC_diff'] = btc_price.diff()

# Natural log of the price
df_merged['BTC_log'] = np.log(btc_price)

# First difference of the log price
df_merged['BTC_log_diff'] = df_merged['BTC_log'].diff()

# Remove the rows that contain NaN (at least the first row, because of the diffs)
df_merged.dropna(inplace = True)

# Binary direction of the price change: 1 for a rise, 0 otherwise.
# np.sign yields -1 / 0 / 1; falls (-1) are mapped onto 0 as well.
df_merged['BTC_bin'] = np.sign(df_merged['BTC_diff']).replace(to_replace = -1, value = 0)


In [None]:
# First difference for the other coins (currently only XRP).
# NOTE(review): the financial data is loaded with tickers='BTC' in the load
# cell -- confirm that df_merged really contains an 'XRP' column at this point.
xrp_price = df_merged['XRP']
df_merged['XRP_diff'] = xrp_price.diff()

# The first row has no predecessor, so its diff is NaN; remove such rows.
df_merged.dropna(inplace = True)

In [None]:
# Fill missing values with the ffill method, then smooth and difference the
# sentiment series.
#
# A value of 0 in a sentiment column is treated as "missing" and replaced by
# the last preceding non-zero value.  This used to be written as
# Series.replace(to_replace=0, method='ffill'); the `method` keyword of
# replace() is deprecated in recent pandas releases, so the same result is
# produced with mask() + ffill() instead.
def _ffill_zeros(series):
    """Return `series` with every 0 replaced by the closest preceding non-zero value.

    Zeros at the very start of the series have no predecessor and stay 0, which
    is what replace(..., method='ffill') did as well.  The original dtype is
    restored afterwards because mask() turns integer columns into floats.

    Assumes `series` holds no NaN (the frame went through dropna() in the
    earlier cells); pre-existing NaN would be forward-filled too.
    """
    filled = series.mask(series == 0).ffill()
    return filled.fillna(series).astype(series.dtype)


# Columns that get the same fill / rolling-mean / diff treatment.
sentiment_cols = [
    'sentiment',
    'amount_of_tweets',
    'weighted_sentiment',
    'sentiwordnet',
    'weighted_sentiwordnet',
]
# Window length (in rows) of the rolling mean; also used as min_periods so a
# mean is only produced once a full window is available.
rolling_window = 24

for col in sentiment_cols:
    df_merged[col] = _ffill_zeros(df_merged[col])

# Create rolling means ("<col>_rm")
for col in sentiment_cols:
    df_merged[col + '_rm'] = (
        df_merged[col].rolling(rolling_window, min_periods=rolling_window).mean()
    )

# The first rolling_window - 1 rows have no full window -> NaN -> removed.
df_merged.dropna(inplace = True)

# First difference of each rolling mean ("<col>_rm_diff")
for col in sentiment_cols:
    df_merged[col + '_rm_diff'] = df_merged[col + '_rm'].diff()

# The first row has no predecessor for the diff -> NaN -> removed.
df_merged.dropna(inplace = True)

In [None]:
#Standardise the time-series
def standardise(ts):
    """Return a standardised (zero mean, unit standard deviation) version of `ts`.

    The input is left untouched.  The previous implementation centred the data
    with an in-place `ts -= mean`, which
      * silently modified the caller's array (and anything sharing its memory,
        e.g. a DataFrame column obtained through `.values`),
      * raised for integer arrays, because a float cannot be subtracted in
        place from an integer buffer, and
      * raised for read-only arrays.

    Parameters
    ----------
    ts : array-like supporting arithmetic (e.g. numpy.ndarray)
        The time series to standardise.

    Returns
    -------
    Same kind of object as `ts - float` yields (an ndarray for ndarray input),
    holding `(ts - mean(ts)) / std(ts)` with numpy's default population
    standard deviation (ddof=0).  A constant series has std 0, so the division
    yields nan/inf, exactly as before.
    """
    # Out-of-place subtraction: allocates a new array instead of writing into ts.
    centred = ts - np.mean(ts)
    return centred / np.std(centred)

# The two series compared below: differenced BTC price (x) and the differenced
# rolling mean of the weighted SentiWordNet score (y), both standardised.
x = standardise(df_merged['BTC_diff'].values)
y = standardise(df_merged['weighted_sentiwordnet_rm_diff'].values)



In [None]:
# Plot the lead-lag structure
#
# One averaged path is computed and drawn per temperature.  After the loop
# `temperature`, `g` and `avg` hold the values of the last temperature (8).
for temperature in (1, 2, 4, 8):
    g = partition_function(x, y, temperature)
    avg = average_path(g)
    # Only every second element of the averaged path is plotted.
    plt.plot(avg[::2], label='T = {}'.format(temperature))

# Dashed zero line as visual reference for the sign of the lag
plt.axhline(linewidth=1, linestyle='dashed', color='black')

# Hard-coded month labels.
# NOTE(review): the positions 0..48 assume roughly 48 plotted points covering
# Aug 17 - Aug 18 -- confirm against len(avg[::2]) if the data window changes.
my_xticks = np.array(['Aug 17', 'Oct 17', 'Dec 17', 'Feb 18', 'Apr 18', 'Jun 18', 'Aug 18'])
my_x_axis = np.array([0,8,16,24,32,40,48])
plt.xticks(my_x_axis, my_xticks)

plt.xlabel('t', fontsize=12)
plt.ylabel(r'$\tau(t)$', fontsize=12)

# The plotted series are the BTC price difference and the BTC SentiWordNet
# score (see x and y), so the title names BTC (it previously said XRP).
plt.title("Lead-lag structure BTC Value and SentiWordNet Score", fontsize=15)
plt.legend()
# TODO(review): confirm the sign convention -- does a positive tau mean that
# sentiment (y) lags behind BTC (x)?