## Forecasting and trading cryptocurrencies with machine learning under changing market conditions
https://jfin-swufe.springeropen.com/articles/10.1186/s40854-020-00217-x

#### - coins
Bitcoin, ethereum, litecoin

#### - regressors
closing prices, high, low, daily trading volume, market capi (from coin market cap), 12 indicators of blockchain information (from https://coinmetrics.io/)

Dependent variable: daily log return (closing price)

In [412]:
import pandas as pd
import re
import numpy as np
import plotly.express as px
from helper_funcs import get_data, convert_unix_to_datetime, separate_symbols

In [413]:
tickers = ['BTC', 'ETH', 'LTC']
data = get_data(tickers)

df = pd.DataFrame(data, columns = ['id', 'symbol', 'date', 'high', 'low', 'open', 'close', 'volumeto', 'volumefor'])
df = df.drop('id', axis=1)
df['date'] = convert_unix_to_datetime(df['date'])

btc, eth, ltc = separate_symbols(df)
coinmetric_df = pd.read_csv('coin_metrics_btc_data.csv', encoding='utf-16')

Finding data for: 'BTC', 'ETH', 'LTC'


In [414]:
def get_coin_cols(coin):
    cols = []
    for col in coinmetric_df.columns:
        if re.match(coin, col):
            cols.append(col)
    time_df = pd.DataFrame(coinmetric_df['Time'])
    time_df.rename(columns={"Time": "date"}, inplace=True)
    return time_df.join(coinmetric_df[cols])

def take_diff(column_list, df):
    for col in column_list:
        df[col] = df[col].diff()
    return df

def preprocess(df, symbol):
    "symbol needs to be uppercase"
    df['close'] = np.log(df['close'])
    df['rel_price_change'] = 2 * (df['high'] - df['low']) / (df['high'] + df['low'])
    df['parkinson_vol'] = np.sqrt((np.log(df['high']/df['low'])**2)/4*np.log(2))
    df = df[['date', 'close', 'volumeto', 'volumefor', 'rel_price_change', 'parkinson_vol']]
    
    for i in range(8):
        lag_close_col = df['close'].shift(i)
        lag_park_col = df['parkinson_vol'].shift(i)
        df['close_lag'+str(i)] = lag_close_col
        df['parkinson_lag'+str(i)] = lag_park_col
    
    df = df.merge(get_coin_cols(symbol), on='date')
    df = df.set_index('date')
    df.columns =[re.sub(symbol+' / ', '', col) for col in df.columns]
    
    column_list = ['Market Cap (USD)', 'Tx Cnt', 'Active Addr Cnt', 
               'Mean Difficulty', 'Block Cnt', 'Xfer Cnt']
    btc_df = take_diff(column_list, df=df)
    return df


#### Dataframe processing list

- First 7 lags of the closing price and parkinson's volatility
- First diff of market cap, # transactions, active address, average difficulty, number of blocks, block size, number of payments


In [None]:
#run once
btc = preprocess(btc, 'BTC')
eth = preprocess(eth, 'ETH')
ltc = preprocess(ltc, 'LTC')

#### Model building
- Rolling window: 
    - First 50% used to train the model. Training sample
    - Next 25% each close is forecasted. Used to choose variables/hyperparameters. Validation sample
    - Use the models that showed the best performance in the validation sample. Test sample
    

In [423]:
def plot_graph_sets(column, title=None):
    p50 = int(len(column) * 0.5)
    p75 = int(len(column) * 0.75)
    
    fig = px.scatter(data_frame=column, range_color=(0,1000), title=title)
    
    fig.add_vrect(x0 = column.index[0], x1=column.index[p50:p50+1][0], annotation_text="Training_set",
                 annotation_position="top right", fillcolor="blue", opacity=0.25, line_width=0)

    fig.add_vrect(x0 = column.index[p50:p50+1][0], x1=column.index[p75:p75+1][0], annotation_text="Validation_set",
                 annotation_position="top right", fillcolor="green", opacity=0.25, line_width=0)

    fig.add_vrect(x0 = column.index[p75:p75+1][0], x1=column.index[-1], annotation_text="Test_set",
                 annotation_position="top right", fillcolor="orange", opacity=0.25, line_width=0)
    return fig

In [424]:
plot_graph_sets(btc['close'], title='Bitcoin')

In [425]:
plot_graph_sets(eth['close'], title='Ethereum')

In [427]:
plot_graph_sets(ltc['close'], title='Litecoin')

In [483]:
btc = btc.dropna()
train = btc[:int(len(btc) * 0.75)]
test = btc[int(len(btc) * 0.75):]

y = train['close']
X = train.drop('close', axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle=False)

### Linear Models

### Random Forest Regression

In [464]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

In [491]:
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train, y_train)
rf_y_pred = rf_reg.predict(X_valid)
MSE(y_valid, rf_y_pred)

0.00012293595110841285

### Random Forest classifier

In [513]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [510]:
clf_y_train = y_train.diff() > 0
clf_y_valid = y_valid.diff() > 0

In [512]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, clf_y_train)
rf_clf_pred = rf_clf.predict(X_valid)

In [515]:
print(classification_report(clf_y_valid, rf_clf_pred))

              precision    recall  f1-score   support

       False       0.93      0.95      0.94       177
        True       0.95      0.93      0.94       197

    accuracy                           0.94       374
   macro avg       0.94      0.94      0.94       374
weighted avg       0.94      0.94      0.94       374



### Support Vector Machines

In [473]:
from sklearn.svm import SVR

In [518]:
svr_reg = SVR()
svr_reg.fit(X_train, y_train)
svr_y_pred = svr_reg.predict(X_valid)
MSE(y_valid, svr_y_pred)

1.6010727760415508

### Support Vector Classifier

In [516]:
from sklearn.svm import SVC

In [521]:
svc_clf = SVC()
svc_clf.fit(X_train, clf_y_train)
svc_y_pred = svc_clf.predict(X_valid)

In [522]:
print(classification_report(svc_y_pred, rf_clf_pred))

              precision    recall  f1-score   support

       False       0.48      0.97      0.64        90
        True       0.98      0.67      0.80       284

    accuracy                           0.74       374
   macro avg       0.73      0.82      0.72       374
weighted avg       0.86      0.74      0.76       374

