## USEFULL LINKS

Time Series Analysis:

http://www.blackarbs.com/blog/time-series-analysis-in-python-linear-models-to-garch/11/1/2016

https://quant.stackexchange.com/questions/16481/why-do-we-usually-model-returns-and-not-prices/16484

https://towardsdatascience.com/selecting-the-best-machine-learning-algorithm-for-your-regression-problem-20c330bad4ef

## LIBRARIES

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline

from jupyterthemes import jtplot
jtplot.style(theme="onedork", context="notebook", ticks=True, grid=False)

## IMPORT DATA

### BTC

In [23]:
btc = pd.read_csv("BTC-hourlyTS.csv")
print(btc.shape)
btc.head()

(20851, 37)


Unnamed: 0,time,high,low,open,volumefrom,volumeto,close,analysis_page_views,charts_page_views,code_repo_closed_issues,...,reddit_comments_per_hour,reddit_posts_per_hour,reddit_subscribers,total_page_views,trades_page_views,twitter_favourites,twitter_followers,twitter_following,twitter_lists,twitter_statuses
0,2019-12-16 12:00:00,6368.64,6345.56,6359.56,296.25,1883833.11,6359.62,1052752,8777040,4368,...,90.61,3.45,1215575,47767846,767354,1000,844049,165,6631,20316
1,2019-12-16 11:00:00,6379.02,6335.51,6339.52,434.41,2760457.16,6359.56,1052743,8776923,4368,...,81.47,3.05,1215553,47767155,767351,1000,844049,165,6631,20316
2,2019-12-16 10:00:00,6351.25,6336.91,6341.45,404.08,2563742.31,6339.52,1052737,8776837,4368,...,83.74,3.05,1215529,47766522,767349,1000,844049,165,6631,20316
3,2019-12-16 09:00:00,6352.29,6336.02,6346.54,368.97,2340380.72,6341.45,1052733,8776769,4368,...,57.4,3.05,1215492,47765916,767344,1000,844049,165,6631,20316
4,2019-12-16 08:00:00,6354.45,6332.84,6342.16,349.59,2217356.03,6346.54,1052718,8776654,4368,...,71.39,2.97,1215465,47765308,767339,1000,844049,165,6631,20316


In [24]:
btc.dtypes

time                             object
high                            float64
low                             float64
open                            float64
volumefrom                      float64
volumeto                        float64
close                           float64
analysis_page_views               int64
charts_page_views                 int64
code_repo_closed_issues           int64
code_repo_closed_pull_issues      int64
code_repo_contributors            int64
code_repo_forks                   int64
code_repo_open_issues             int64
code_repo_open_pull_issues        int64
code_repo_subscribers             int64
comments                          int64
fb_likes                          int64
fb_talking_about                  int64
followers                         int64
forum_page_views                  int64
influence_page_views              int64
markets_page_views                int64
overview_page_views               int64
points                            int64


### SENTIMENT

In [25]:
sen = pd.read_csv("Sentiment-hourlyTS.csv")
print(sen.shape)
sen.head()

(106111, 12)


Unnamed: 0.1,Unnamed: 0,time,text,share_neg,share_neu,share_pos,comp,neg,neu,pos,blob_pol,blob_suj
0,0,2019-11-30 23:00:00,Big moves and trend changes are preceded by ab...,0.121,0.807,0.072,-0.4019,1,0,0,-0.072619,-0.072619
1,1,2019-11-30 22:00:00,That is a sign of a change of trend ... We can...,0.0,1.0,0.0,0.0,0,0,1,0.225,0.225
2,2,2019-11-30 21:00:00,"Bitcoin’s Price Drop, an Opportunity! - Future...",0.188,0.536,0.276,0.2481,0,0,1,0.0,0.0
3,3,2019-11-30 21:00:00,The Bitcoin market tends to repeat itself ofte...,0.0,1.0,0.0,0.0,0,0,1,0.05,0.05
4,4,2019-11-30 21:00:00,#BTCUSD - BITCOIN - VA 51 - La halfway ed il c...,0.0,1.0,0.0,0.0,0,1,0,0.0,0.0


In [26]:
sen.dtypes

Unnamed: 0      int64
time           object
text           object
share_neg     float64
share_neu     float64
share_pos     float64
comp          float64
neg             int64
neu             int64
pos             int64
blob_pol      float64
blob_suj      float64
dtype: object

In [27]:
#set time type to datetime
sen["time"] = pd.to_datetime(sen["time"], errors="coerce")

## MANIPULATION & MERGE

### SENTIMENT

In [28]:
#drop useless columns
sen.drop(["Unnamed: 0","text","blob_suj"],axis =1, inplace=True)
sen.head()

Unnamed: 0,time,share_neg,share_neu,share_pos,comp,neg,neu,pos,blob_pol
0,2019-11-30 23:00:00,0.121,0.807,0.072,-0.4019,1,0,0,-0.072619
1,2019-11-30 22:00:00,0.0,1.0,0.0,0.0,0,0,1,0.225
2,2019-11-30 21:00:00,0.188,0.536,0.276,0.2481,0,0,1,0.0
3,2019-11-30 21:00:00,0.0,1.0,0.0,0.0,0,0,1,0.05
4,2019-11-30 21:00:00,0.0,1.0,0.0,0.0,0,1,0,0.0


In [29]:
# Group by hour computing summing SHARES of NEGATIVE, NEUTRAL AND POSITIVE, computing MEAN
# for rest except counter
sen_hour = sen.groupby(by="time").agg({
    "share_neg":"sum","share_neu":"sum","share_pos":"sum","comp":"mean","neg":"mean","neu":"mean",
    "pos":"mean","blob_pol":"mean"})

sen_hour.sort_values(by="time",ascending=False,inplace=True)
sen_hour.reset_index(inplace=True)
#sen_hour.reset_index
print(sen_hour.shape)
sen_hour


(19436, 9)


Unnamed: 0,time,share_neg,share_neu,share_pos,comp,neg,neu,pos,blob_pol
0,2019-11-30 23:00:00,0.121,0.807,0.072,-0.401900,1.00,0.000000,0.000000,-0.072619
1,2019-11-30 22:00:00,0.000,1.000,0.000,0.000000,0.00,0.000000,1.000000,0.225000
2,2019-11-30 21:00:00,0.188,2.536,0.276,0.082700,0.00,0.333333,0.666667,0.016667
3,2019-11-30 20:00:00,0.236,3.534,0.230,-0.018600,0.50,0.250000,0.250000,0.115938
4,2019-11-30 19:00:00,0.068,2.609,0.323,0.290667,0.00,0.000000,1.000000,0.100269
...,...,...,...,...,...,...,...,...,...
19431,2017-07-02 04:00:00,0.000,2.000,0.000,0.000000,1.00,0.000000,0.000000,-0.155556
19432,2017-07-02 03:00:00,0.000,1.000,0.000,0.000000,0.00,1.000000,0.000000,0.000000
19433,2017-07-02 02:00:00,0.000,4.000,0.000,0.000000,0.75,0.000000,0.250000,-0.179167
19434,2017-07-02 01:00:00,0.000,1.000,0.000,0.000000,0.00,0.000000,1.000000,0.400000


### BTC

In [30]:
#compute close price of previous hour
#btc["close_lag1"] = btc["close"] - btc["close"].shift(-1)

#compute difference between high and low prices in ABSOLUTE TERMS
btc["delta_hl"] = btc["high"] - btc["low"]

#compute hourly return close on close price in %
btc["return"] = ((btc["close"] - btc["close"].shift(-1))/btc["close"].shift(-1))*100

In [31]:
#fill last row of price previous hour and return with 0 cause it is a Null
#print(btc["close_lag1"].isna().sum())
#btc["close_lag1"].fillna(0)

print(btc["return"].isna().sum())
btc["return"].fillna(0)

btc.head()

1


Unnamed: 0,time,high,low,open,volumefrom,volumeto,close,analysis_page_views,charts_page_views,code_repo_closed_issues,...,reddit_subscribers,total_page_views,trades_page_views,twitter_favourites,twitter_followers,twitter_following,twitter_lists,twitter_statuses,delta_hl,return
0,2019-12-16 12:00:00,6368.64,6345.56,6359.56,296.25,1883833.11,6359.62,1052752,8777040,4368,...,1215575,47767846,767354,1000,844049,165,6631,20316,23.08,0.000943
1,2019-12-16 11:00:00,6379.02,6335.51,6339.52,434.41,2760457.16,6359.56,1052743,8776923,4368,...,1215553,47767155,767351,1000,844049,165,6631,20316,43.51,0.316112
2,2019-12-16 10:00:00,6351.25,6336.91,6341.45,404.08,2563742.31,6339.52,1052737,8776837,4368,...,1215529,47766522,767349,1000,844049,165,6631,20316,14.34,-0.030435
3,2019-12-16 09:00:00,6352.29,6336.02,6346.54,368.97,2340380.72,6341.45,1052733,8776769,4368,...,1215492,47765916,767344,1000,844049,165,6631,20316,16.27,-0.080201
4,2019-12-16 08:00:00,6354.45,6332.84,6342.16,349.59,2217356.03,6346.54,1052718,8776654,4368,...,1215465,47765308,767339,1000,844049,165,6631,20316,21.61,0.069062


In [32]:
btc.dtypes

time                             object
high                            float64
low                             float64
open                            float64
volumefrom                      float64
volumeto                        float64
close                           float64
analysis_page_views               int64
charts_page_views                 int64
code_repo_closed_issues           int64
code_repo_closed_pull_issues      int64
code_repo_contributors            int64
code_repo_forks                   int64
code_repo_open_issues             int64
code_repo_open_pull_issues        int64
code_repo_subscribers             int64
comments                          int64
fb_likes                          int64
fb_talking_about                  int64
followers                         int64
forum_page_views                  int64
influence_page_views              int64
markets_page_views                int64
overview_page_views               int64
points                            int64


In [33]:
#set time type to datetime
btc["time"] = pd.to_datetime(btc["time"], errors="coerce")

In [34]:
#check columns and zeros
for col in btc.columns:
    print(col, "ZEROS:", len(btc[btc[col]==0]))

time ZEROS: 0
high ZEROS: 0
low ZEROS: 0
open ZEROS: 0
volumefrom ZEROS: 1
volumeto ZEROS: 1
close ZEROS: 0
analysis_page_views ZEROS: 12450
charts_page_views ZEROS: 12450
code_repo_closed_issues ZEROS: 12453
code_repo_closed_pull_issues ZEROS: 12450
code_repo_contributors ZEROS: 18910
code_repo_forks ZEROS: 12450
code_repo_open_issues ZEROS: 12450
code_repo_open_pull_issues ZEROS: 12451
code_repo_subscribers ZEROS: 12450
comments ZEROS: 12450
fb_likes ZEROS: 12450
fb_talking_about ZEROS: 12990
followers ZEROS: 12450
forum_page_views ZEROS: 12450
influence_page_views ZEROS: 12450
markets_page_views ZEROS: 12450
overview_page_views ZEROS: 12450
points ZEROS: 12450
posts ZEROS: 12450
reddit_active_users ZEROS: 12450
reddit_comments_per_hour ZEROS: 12450
reddit_posts_per_hour ZEROS: 12450
reddit_subscribers ZEROS: 12450
total_page_views ZEROS: 12450
trades_page_views ZEROS: 12450
twitter_favourites ZEROS: 12450
twitter_followers ZEROS: 12450
twitter_following ZEROS: 12450
twitter_lists ZE

In [35]:
#drop all non-market realted columns
btc_mkt = btc.drop([col for col in btc.columns[7:-3]],axis=1)
btc_mkt.head()

Unnamed: 0,time,high,low,open,volumefrom,volumeto,close,twitter_statuses,delta_hl,return
0,2019-12-16 12:00:00,6368.64,6345.56,6359.56,296.25,1883833.11,6359.62,20316,23.08,0.000943
1,2019-12-16 11:00:00,6379.02,6335.51,6339.52,434.41,2760457.16,6359.56,20316,43.51,0.316112
2,2019-12-16 10:00:00,6351.25,6336.91,6341.45,404.08,2563742.31,6339.52,20316,14.34,-0.030435
3,2019-12-16 09:00:00,6352.29,6336.02,6346.54,368.97,2340380.72,6341.45,20316,16.27,-0.080201
4,2019-12-16 08:00:00,6354.45,6332.84,6342.16,349.59,2217356.03,6346.54,20316,21.61,0.069062


### MERGE

In [36]:
#Merge Full Datasets, some rows lost because sen_hour must have some missing.
ds_full = sen_hour.merge(btc, on="time",how="inner")
print(ds_full.shape)
ds_full.head()

(18884, 47)


Unnamed: 0,time,share_neg,share_neu,share_pos,comp,neg,neu,pos,blob_pol,high,...,reddit_subscribers,total_page_views,trades_page_views,twitter_favourites,twitter_followers,twitter_following,twitter_lists,twitter_statuses,delta_hl,return
0,2019-11-30 23:00:00,0.121,0.807,0.072,-0.4019,1.0,0.0,0.0,-0.072619,6879.78,...,1200370,47531326,765363,1000,844049,165,6631,20316,20.32,0.17946
1,2019-11-30 22:00:00,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.225,6873.15,...,1200321,47530656,765359,1000,844049,165,6631,20316,18.08,-0.035122
2,2019-11-30 21:00:00,0.188,2.536,0.276,0.0827,0.0,0.333333,0.666667,0.016667,6861.99,...,1200304,47529902,765355,1000,844049,165,6631,20316,42.37,0.598884
3,2019-11-30 20:00:00,0.236,3.534,0.23,-0.0186,0.5,0.25,0.25,0.115938,6835.86,...,1200267,47529178,765348,1000,844049,165,6631,20316,33.16,-0.168459
4,2019-11-30 19:00:00,0.068,2.609,0.323,0.290667,0.0,0.0,1.0,0.100269,6851.14,...,1200215,47528393,765340,1000,844049,165,6631,20316,50.23,0.464051


In [42]:
#Merge BTC mkt only Datasets, some rows lost because sen_hour must have some missing.
ds_mkt = sen_hour.merge(btc_mkt, on="time",how="inner")
print(ds_mkt.shape)
ds_mkt.head()

(18884, 18)


Unnamed: 0,time,share_neg,share_neu,share_pos,comp,neg,neu,pos,blob_pol,high,low,open,volumefrom,volumeto,close,twitter_statuses,delta_hl,return
0,2019-11-30 23:00:00,0.121,0.807,0.072,-0.4019,1.0,0.0,0.0,-0.072619,6879.78,6859.46,6859.46,114.89,789037.39,6871.77,20316,20.32,0.17946
1,2019-11-30 22:00:00,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.225,6873.15,6855.07,6861.87,139.1,955102.29,6859.46,20316,18.08,-0.035122
2,2019-11-30 21:00:00,0.188,2.536,0.276,0.0827,0.0,0.333333,0.666667,0.016667,6861.99,6819.62,6821.02,120.21,822955.38,6861.87,20316,42.37,0.598884
3,2019-11-30 20:00:00,0.236,3.534,0.23,-0.0186,0.5,0.25,0.25,0.115938,6835.86,6802.7,6832.53,195.42,1331813.08,6821.02,20316,33.16,-0.168459
4,2019-11-30 19:00:00,0.068,2.609,0.323,0.290667,0.0,0.0,1.0,0.100269,6851.14,6800.91,6800.97,170.6,1165442.89,6832.53,20316,50.23,0.464051


### LAGGED DATAFRAMES

In [43]:
lag = 1

In [44]:
#compute lagged values:
ds_full_lag = pd.DataFrame()
#all columns except for time and return
for col in ds_full.columns[1:-1]:
    ds_full_lag[col] = ds_full[col] - ds_full[col].shift(-lag)

ds_full_lag.drop(ds_full_lag.tail(1).index,inplace=True)#drop last row
ds_full_lag


Unnamed: 0,share_neg,share_neu,share_pos,comp,neg,neu,pos,blob_pol,high,low,...,reddit_posts_per_hour,reddit_subscribers,total_page_views,trades_page_views,twitter_favourites,twitter_followers,twitter_following,twitter_lists,twitter_statuses,delta_hl
0,0.121,-0.193,0.072,-0.401900,1.000000,0.000000,-1.000000,-0.297619,6.63,4.39,...,-0.01,49.0,670.0,4.0,0.0,0.0,0.0,0.0,0.0,2.24
1,-0.188,-1.536,-0.276,-0.082700,0.000000,-0.333333,0.333333,0.208333,11.16,35.45,...,0.02,17.0,754.0,4.0,0.0,0.0,0.0,0.0,0.0,-24.29
2,-0.048,-0.998,0.046,0.101300,-0.500000,0.083333,0.416667,-0.099271,26.13,16.92,...,0.02,37.0,724.0,7.0,0.0,0.0,0.0,0.0,0.0,9.21
3,0.168,0.925,-0.093,-0.309267,0.500000,0.250000,-0.750000,0.015668,-15.28,1.79,...,-0.08,52.0,785.0,8.0,0.0,0.0,0.0,0.0,0.0,-17.07
4,0.027,-1.730,-0.297,-0.092653,0.000000,-0.200000,0.200000,0.032896,21.46,7.26,...,-0.08,18.0,827.0,7.0,0.0,0.0,0.0,0.0,0.0,14.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18878,0.000,1.849,0.151,0.164633,0.333333,0.333333,-0.666667,-0.182189,-7.44,-42.22,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.78
18879,0.000,0.091,-0.091,-0.051600,0.000000,0.000000,0.000000,0.214828,-1.59,-3.14,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.55
18880,0.000,0.016,-0.016,0.000000,0.000000,0.000000,0.000000,0.000000,19.09,26.06,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.97
18881,0.000,-0.592,-0.408,-0.562600,0.000000,0.000000,0.000000,-0.405556,20.30,1.02,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.28


In [46]:
#compute lagged values:
ds_mkt_lag = pd.DataFrame()
#all columns except for time and return
for col in ds_mkt.columns[1:-1]:
    ds_mkt_lag[col] = ds_mkt[col] - ds_mkt[col].shift(-lag)

ds_mkt_lag.drop(ds_mkt_lag.tail(1).index,inplace=True)#drop last row
ds_mkt_lag

Unnamed: 0,share_neg,share_neu,share_pos,comp,neg,neu,pos,blob_pol,high,low,open,volumefrom,volumeto,close,twitter_statuses,delta_hl
0,0.121,-0.193,0.072,-0.401900,1.000000,0.000000,-1.000000,-0.297619,6.63,4.39,-2.41,-24.21,-166064.90,12.31,0.0,2.24
1,-0.188,-1.536,-0.276,-0.082700,0.000000,-0.333333,0.333333,0.208333,11.16,35.45,40.85,18.89,132146.91,-2.41,0.0,-24.29
2,-0.048,-0.998,0.046,0.101300,-0.500000,0.083333,0.416667,-0.099271,26.13,16.92,-11.51,-75.21,-508857.70,40.85,0.0,9.21
3,0.168,0.925,-0.093,-0.309267,0.500000,0.250000,-0.750000,0.015668,-15.28,1.79,31.56,24.82,166370.19,-11.51,0.0,-17.07
4,0.027,-1.730,-0.297,-0.092653,0.000000,-0.200000,0.200000,0.032896,21.46,7.26,-1.43,-15.49,-101858.31,31.56,0.0,14.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18878,0.000,1.849,0.151,0.164633,0.333333,0.333333,-0.666667,-0.182189,-7.44,-42.22,-16.98,467.66,1135481.30,-38.16,0.0,34.78
18879,0.000,0.091,-0.091,-0.051600,0.000000,0.000000,0.000000,0.214828,-1.59,-3.14,15.54,-22.39,-60040.82,-16.98,0.0,1.55
18880,0.000,0.016,-0.016,0.000000,0.000000,0.000000,0.000000,0.000000,19.09,26.06,20.03,154.02,397636.48,15.54,0.0,-6.97
18881,0.000,-0.592,-0.408,-0.562600,0.000000,0.000000,0.000000,-0.405556,20.30,1.02,7.39,272.89,677399.31,22.28,0.0,19.28


# MACHINE LEARNING -  SUPERVISED REG

##  INPUTS =  ONLY SENTIMENT & MKT - NO LAG

In [None]:
#DEFINE X and y:

#Drop TIME, number,Volume high and low,close,returns and turn in np array and check dimensions
X = ds_full.drop(["time","close","delta_h","return",
                 "high","low"],axis=1)
#Sentiment drop: SHARES
#X.drop(["share_neg","share_neu","share_pos"],axis=1,inplace=True)
#Sentiment drop: NEUTRAL, VADER COMPUND AND TEXT_BLOB POLARITY
#X.drop(["neu","comp","blob_pol"],axis=1,inplace=True)
#Sentiment drop: NEUTRAL, OVERALL POS AND NEG
#X.drop(["neu","comp","blob_pol"],axis=1,inplace=True)

print("Dimensions of X:", X.ndim)
print("Features:", [col for col in X.columns])


#define TARGET and reshape to 2 dimensions (possible Targets: close, delta_h or return)
y = ds_mkt["return"]
print("Dimensions of y:", y.ndim)


In [None]:
#turn X and y into arrays:
X = np.array(X)
y = np.array(y)

In [None]:
#Train-Test split:
#from sklearn.model_selection import train_test_split
test_size = 0.2 #LAST 20% of data

#TRAIN
X_train = X[:round(len(X)*(1-test_size))]
y_train = y[:round(len(y)*(1-test_size))]

#TEST
X_test = X[-round(len(X)*test_size):]
y_test = y[-round(len(y)*test_size):]

### LINEAR REGRESSION

In [None]:
#create pipeline of Linear Regressions with Polynomial degrees from 1 to 4
#Standardized Values 
lin_models = {}
scores = []
for k in range(1,3):
    poly_model = make_pipeline (StandardScaler(), PolynomialFeatures(k),LinearRegression())
    
    model = poly_model.fit(X_train, y_train) #fit model
    lin_models.update({str(k) : model})#store models in dict to access the best performing one
    scores.append(model.score(X_test, y_test))#store scores for performance comparison
    print("Poly Deg:",k, "R2:", model.score(X_test, y_test)) #evaluate performance with r2


In [None]:
scores.index(max(scores))+1 

In [None]:
#Apply Features Reduction to best performing model: 
k = scores.index(max(scores))+1 #polynomial degree based on above
                 
best_model = lin_models[str(k)]["linearregression"]#selects best model based on performance
rfe_selector = RFECV(best_model,step=1,cv=7)
rfe_selector = rfe_selector.fit(X,y) #fit to training set and reduce the features

selection = rfe_selector.support_ # mask to apply to select features
ranking = rfe_selector.ranking_ # ranking of selected features based on explanatory power

print(selection)
print(ranking)

In [None]:
#Reduce inital X according to RFECV output 
X_reduced = X*selection
print(X_reduced)
#Train-Test split
X_reduced_train = X_reduced[:round(len(X_reduced)*(1-test_size))]
X_reduced_test = X_reduced[-round(len(X_reduced)*test_size):]


In [None]:
#apply best model to reduced features set
best_model_reduced = best_model.fit(X_reduced_train,y_train)
#predict y with reduced features
y_pred_reduced = best_model_reduced.predict(X_reduced_test)

#compute r2 score with reduced features
r2_score(y_test,y_pred_reduced)

### RANDOM FOREST REGRESSION