In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
import itertools
import ccxt
import time
import datetime
import coinmetrics
from finta import TA

### Collect Data

In [2]:
def fetch_daily_data(symbol):
    binance = ccxt.binance({
    'apiKey': 'y',
    'secret': 'Y', })
    now = binance.milliseconds()
    since = now - 1000 * 60 * 60 * 24 * (4 * 365)
    time_price_dict = {}
    while since < now:
        ohlcvs = binance.fetch_ohlcv(symbol, '1d', since, 30)
        for ohlcv in ohlcvs:
            ohlcv_dict = {'Open': ohlcv[1], 'High': ohlcv[2], 'Low': ohlcv[3], 'Close': ohlcv[4]}
            time_price_dict[datetime.datetime.fromtimestamp(float(ohlcv[0]/1000))] = ohlcv_dict
        since += 1000 * 60 * 60 * 24 * 30
        # don't hit the rateLimit or you will be banned
        time.sleep(binance.rateLimit / 1000)
    return time_price_dict

def binance_add_date_column(binance_df):
    binance_df['datetime'] = pd.to_datetime(binance_df['Timestamp'])
    binance_df['Date'] = binance_df['datetime'].dt.date
    binance_df['Date'] = binance_df['Date'].astype('str')
    binance_df = binance_df.drop(['Timestamp', 'datetime'], 1)
    return binance_df
    

def create_dataframe(time_price_dict_btc):
    open_prices = []
    close_prices = []
    high_prices = []
    low_prices = []
    timestamps = []
    for key in sorted(time_price_dict_btc.keys()):
        timestamps.append(key)
        open_prices.append(time_price_dict_btc[key]['Open'])
        high_prices.append(time_price_dict_btc[key]['High'])
        low_prices.append(time_price_dict_btc[key]['Low'])
        close_prices.append(time_price_dict_btc[key]['Close'])
    d = {"Timestamp": timestamps, "Open": open_prices, "High": high_prices, "Low": low_prices, "Close": close_prices}
    df = pd.DataFrame(data=d, columns=["Timestamp", "Open", "High", "Low", "Close"])
    return df
    
def binance_to_dataframe(symbol, period):
    if (period == '4y'):
        binance_ohlcvs = fetch_daily_data(symbol)
    elif (period == '1d'):
        binance_ohlcvs = fetch_one_day(symbol)
    binance_df = create_dataframe(binance_ohlcvs)
    binance_df = binance_add_date_column(binance_df)
    return binance_df


In [3]:
def coinmetrics_to_dataframe(begin_timestamp, end_timestamp):
    cm = coinmetrics.Community()
    metric = "AdrActCnt,BlkCnt,BlkSizeByte,TxTfrCnt,FeeMeanUSD,HashRate,ROI30d"
    #asset_data = cm.get_asset_data_for_time_range('btc', metric, begin_timestamp, end_timestamp)
    asset_data = cm.get_asset_data_for_time_range('btc', 'all', begin_timestamp, end_timestamp)
    coin_features = coinmetrics.cm_to_pandas(asset_data)
    coin_features['Date'] = coin_features.index
    coin_features['Date'] = pd.to_datetime(coin_features['Date'])
    coin_features['Date'] = coin_features['Date'].dt.date
    coin_features['Date'] = coin_features['Date'].astype('str')
    coin_features.reset_index(drop=True, inplace=True)
    return coin_features

In [4]:
def get_target(prices, offset, price_change):
    target = []
    for i in range(len(prices) - offset):
        positive_return = False
        for j in range(i, i + offset):
            if prices[j] / prices[i] > 1 + price_change:
                positive_return = True
        if positive_return == True:
            target.append(1)
        else:
            target.append(0)
    for i in range(offset):
        target.append(0)
    return np.array(target)

In [5]:
def prepare_yahoo():
    btc = yf.Ticker("BTC-USD")
    btc_data = btc.history(period="5y")
    btc_data['Date'] = btc_data.index
    btc_data['Date'] = pd.to_datetime(btc_data['Date'])
    btc_data = btc_data[btc_data['Date'] >= '2016-01-01']
    btc_data = btc_data.drop(['Dividends', 'Stock Splits'], 1)
    btc_data.reset_index(drop=True, inplace=True)
    btc_data['weekday'] = btc_data['Date'].dt.dayofweek
    btc_data['Date'] = btc_data['Date'].astype('str')
    btc_data = btc_data.rename(columns={'Open': 'open', 'High': 'high', 'Low': 'low', 
                                        'Close': 'close', 'Volume':'volume'})
    gbtc = yf.Ticker("GBTC")
    gbtc_data = gbtc.history(period="6y")
    gbtc_data['Date'] = gbtc_data.index
    gbtc_data['Date'] = pd.to_datetime(gbtc_data['Date'])
    gbtc_data = gbtc_data[gbtc_data['Date'] >= '2015-01-01']
    gbtc_data = gbtc_data.drop(['Dividends', 'Stock Splits'], 1)
    gbtc_data.reset_index(drop=True, inplace=True)
    gbtc_data['Date'] = gbtc_data['Date'].astype('str')
    gbtc_data = gbtc_data.rename(columns={'Open': 'gopen', 'High': 'ghigh', 'Low': 'glow', 
                                        'Close': 'gclose', 'Volume':'gvolume'})
    gbtc_data['gopen_shifted'] = gbtc_data['gopen'].shift(180)
    gbtc_data['ghigh_shifted'] = gbtc_data['ghigh'].shift(180)
    gbtc_data['glow_shifted'] = gbtc_data['glow'].shift(180)
    gbtc_data['gclose_shifted'] = gbtc_data['gclose'].shift(180)
    gbtc_data['gvolume_shifted'] = gbtc_data['gopen'].shift(180)
    gbtc_data = gbtc_data[gbtc_data['Date'] >= '2016-01-01']
    data = pd.merge(btc_data, gbtc_data, on='Date', how='inner')
    return data
    
def add_technicals(data, symbol):
    if symbol == 'btc':
        ohlcv = data[['open', 'high', 'low', 'close', 'volume']]
    elif symbol == 'gbtc':
        ohlcv = data[['gopen', 'ghigh', 'glow', 'gclose', 'gvolume']]
        ohlcv = ohlcv.rename(columns={'gopen': 'open', 'ghigh': 'high', 'glow': 'low', 
                                        'gclose': 'close', 'gvolume':'volume'})
    bb = TA.BBANDS(ohlcv, period=21)
    atr = TA.ATR(ohlcv)
    adx = TA.ADX(ohlcv)
    mfi = TA.MFI(ohlcv)
    srsi = TA.STOCHRSI(ohlcv)
    ppo = TA.PPO(ohlcv)
    macd = TA.MACD(ohlcv)
    trima = TA.TRIMA(ohlcv)
    chaikin = TA.CHAIKIN(ohlcv)
    roc = TA.ROC(ohlcv)
    cmo = TA.CMO(ohlcv)
    baspn = TA.BASPN(ohlcv)
    data['volatility_21'] = (ohlcv['close'].rolling(21)).std(ddof=0)
    data['PCT'] = ohlcv['close'].pct_change()
    data['bb_up'] = bb['BB_UPPER']
    data['bb_mid'] = bb['BB_MIDDLE']
    data['bb_low'] = bb['BB_LOWER']
    data['atr'] = atr
    data['adx'] = adx
    data['mfi'] = mfi
    data['trima'] = trima
    data['macd'] = macd['MACD']
    data['srsi'] = srsi
    data['ppo'] = ppo['PPO']
    data['chaikin'] = chaikin
    data['cmo'] = cmo
    data['roc'] = roc
    data['buy'] = baspn['Buy.']
    data['sell'] = baspn['Sell.']
    data = data.fillna(0)
    return data
    

def collect_btc_data():
    begin_timestamp = "2016-01-01"
    end_timestamp = datetime.datetime.utcnow().strftime("%Y%m%d")
    
    # Yahoo Finance Data
    btc_yahoo = prepare_yahoo()
    btc_yahoo = add_technicals(btc_yahoo, 'btc')
    
    # CoinMetrics Data
    coin_features = coinmetrics_to_dataframe(begin_timestamp, end_timestamp)
    
    features = pd.merge(btc_yahoo, coin_features, on='Date', how='inner')

    #features['target'] = (features['open'] / features['close']).apply(lambda x: int(x > 1)).shift(1)
    features['gbtc_premium'] = features['gclose'] / features['close']
    features['target'] = get_target(np.array(features['close']), 7, 0.05)
    #features['target'] = (features['close'].shift(-14) / features['close']).apply(lambda x: int(x > 1))
    #features['target'] = features['target'].fillna(0)
    return features

In [6]:
def contains_btc_reference(text):
    keywords = ['crypto', 'bitcoin', 'Bitcoin', 'Crypto', 'BTC', 'Coin', 'coin']
    for keyword in keywords:
        if keyword in text:
            return 1
    return 0

In [7]:
news_df = pd.read_csv('/Users/alex/Documents/jupyter notebooks/csv/nyt_news_sentiment.csv')
news_df.drop('Unnamed: 0', 1, inplace=True)
news_df = news_df.rename(columns={'date': 'Date'})
news_df['Date'] = news_df['Date'].astype('str')
news_df['btc_reference'] = news_df['text'].apply(lambda x: contains_btc_reference(x))
ref_count = news_df.groupby('Date')['btc_reference'].sum()
news_df['btc_reference'] = news_df['Date'].map(ref_count)
news_df = news_df.drop_duplicates('Date', keep='first')
news_df['sentiment'] = news_df['mean_sentiment'].apply(lambda x: int(x > 0.5))
news_df = news_df.reset_index()
news_df.drop('index', 1, inplace=True)

In [8]:
trends_df = pd.read_csv('/Users/alex/Documents/jupyter notebooks/csv/btc_google_trends.csv')
trends_df.drop('Unnamed: 0', 1, inplace=True)
trends_df = trends_df.rename(columns={'date': 'Date'})
trends_df['Date'] = trends_df['Date'].astype('str')

In [9]:
features = collect_btc_data()

In [10]:
features = pd.merge(features, news_df[['Date', 'sentiment', 'mean_sentiment', 'btc_reference']], on='Date', how='inner')
features = pd.merge(features, trends_df, on='Date', how='inner')

In [11]:
features = features.loc[20:features.index[-1] - 7]

In [12]:
features.head(30)

Unnamed: 0,open,high,low,close,volume,Date,weekday,gopen,ghigh,glow,...,TxTfrValUSD,VtyDayRet180d,VtyDayRet30d,VtyDayRet60d,gbtc_premium,target,sentiment,mean_sentiment,btc_reference,search_idx
20,421.299011,422.342987,419.601013,421.444,50634300,2016-04-04,0,0.601648,0.626264,0.598901,...,697222500.0,0.032617,0.010596,0.015812,0.001486,0,1,0.583333,0,1775
21,421.016998,424.256989,420.614014,424.029999,60718000,2016-04-05,1,0.637363,0.637363,0.626374,...,710171700.0,0.032615,0.010307,0.015705,0.001499,0,0,0.269231,0,1832
22,424.283997,424.527008,422.729004,423.412994,59091000,2016-04-06,2,0.635714,0.635714,0.627473,...,731145600.0,0.032616,0.009457,0.015297,0.001489,0,0,0.342105,0,1837
23,423.619995,423.657013,420.518005,422.744995,57858600,2016-04-07,3,0.626374,0.626374,0.607363,...,836293400.0,0.032616,0.009415,0.015302,0.001437,0,0,0.285714,0,1839
24,422.907013,425.360992,419.63501,420.348999,63454700,2016-04-08,4,0.607253,0.640659,0.607253,...,715498000.0,0.03262,0.009506,0.015241,0.001524,0,0,0.304348,0,1800
25,421.872009,422.739014,420.53299,422.483002,50747500,2016-04-11,0,0.653846,0.653846,0.615385,...,672872500.0,0.032585,0.008355,0.015135,0.001457,0,0,0.407407,0,1840
26,422.842987,427.277008,422.842987,425.190002,70728800,2016-04-12,1,0.626374,0.632527,0.626264,...,913947100.0,0.032588,0.008493,0.01511,0.001473,0,0,0.241379,0,1832
27,425.631989,426.65799,422.915985,423.734009,69060400,2016-04-13,2,0.617802,0.617802,0.608022,...,955137500.0,0.032526,0.0085,0.014974,0.001435,1,0,0.4,0,1742
28,423.934998,425.371002,423.013,424.282013,45281000,2016-04-14,3,0.61044,0.637363,0.61044,...,812452400.0,0.032439,0.008504,0.014017,0.001452,1,0,0.25,0,1732
29,424.427002,429.928009,424.427002,429.713013,54801500,2016-04-15,4,0.623077,0.653846,0.623077,...,900102100.0,0.032344,0.0087,0.013882,0.001499,1,0,0.44,0,1065


In [12]:
features['target'].value_counts()

0    636
1    579
Name: target, dtype: int64

# Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,recall_score,confusion_matrix
from sklearn.metrics import f1_score

from sklearn.model_selection import RandomizedSearchCV

from pprint import pprint

In [10]:
rf = RandomForestClassifier()

In [48]:
pprint(rf.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [12]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 55, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000]}


In [49]:
cols = ['adx',  'cmo', 'roc',  'AdrActCnt', 'BlkSizeByte', 'CapMVRVCur', 
          'FeeTotNtv', 'ROI30d', 'TxCnt', 'gvolume', #'gclose_shifted', 
          'TxTfrCnt', 'TxTfrValAdjNtv', 'TxTfrValUSD', 'VtyDayRet30d']

In [50]:
X_train, X_test, y_train, y_test = train_test_split(features[cols], features['target'], test_size=0.2, shuffle=False)

In [51]:
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 4, 
                               verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   27.5s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  1.2min finished


RandomizedSearchCV(cv=4, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [5, 10, 15, 20, 25, 30, 35,
                                                      40, 45, 50, 55, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000]},
                   random_state=42, verbose=2)

In [52]:
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 5,
 'bootstrap': True}

In [53]:
best_rf = RandomForestClassifier(n_estimators=200, min_samples_split=10, min_samples_leaf=2,
                                 max_features='sqrt', max_depth=5, bootstrap=True)

#### Train Model

In [54]:
best_rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, max_features='sqrt', min_samples_leaf=2,
                       min_samples_split=10, n_estimators=200)

In [55]:
y_test.value_counts()

1    133
0    113
Name: target, dtype: int64

In [56]:
y_pred = best_rf.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.6788617886178862


In [57]:
print(roc_auc_score(y_test, y_pred))

0.6657462239669971


In [58]:
y_test.describe()

count    246.000000
mean       0.540650
std        0.499361
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: target, dtype: float64

In [59]:
print(confusion_matrix(y_test,y_pred))

[[ 57  56]
 [ 23 110]]


In [60]:
f1_score(y_test, y_pred)

0.7357859531772575

#### Cross-Validation

In [61]:
from sklearn.model_selection import cross_val_score, cross_val_predict

In [62]:
scores = cross_val_score(best_rf, features[cols], features['target'], cv=3)
print(scores)

[0.5599022  0.60635697 0.61369193]


In [63]:
# Make cross validated predictions
predictions = cross_val_predict(best_rf, features[cols], features['target'], cv=3)
accuracy = roc_auc_score(features['target'], predictions)
print('Cross-Predicted Accuracy:', accuracy)

Cross-Predicted Accuracy: 0.5959392081055859


# Research

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,recall_score,confusion_matrix
from sklearn.metrics import f1_score

from sklearn.model_selection import RandomizedSearchCV

from pprint import pprint

In [33]:
rf = RandomForestClassifier()

In [34]:
pprint(rf.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [35]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 55, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000]}


In [36]:
cols = ['adx',  'cmo', 'roc',  'AdrActCnt', 'BlkSizeByte', 'CapMVRVCur', 
          'FeeTotNtv', 'ROI30d', 'TxCnt', 'gvolume',  'mean_sentiment', 'btc_reference',
          'TxTfrCnt', 'TxTfrValAdjNtv', 'TxTfrValUSD', 'VtyDayRet30d']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(features[cols], features['target'], test_size=0.2, shuffle=False)

In [38]:
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, 
                               verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   54.7s finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [5, 10, 15, 20, 25, 30, 35,
                                                      40, 45, 50, 55, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000]},
                   random_state=42, verbose=2)

In [39]:
rf_random.best_params_

{'n_estimators': 800,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 5,
 'bootstrap': False}

In [40]:
best_rf = RandomForestClassifier(n_estimators=800, min_samples_split=2, min_samples_leaf=4,
                                 max_features='auto', max_depth=5, bootstrap=True)

#### Train Model

In [41]:
best_rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, min_samples_leaf=4, n_estimators=800)

In [42]:
y_test.value_counts()

1    135
0    108
Name: target, dtype: int64

In [43]:
y_pred = best_rf.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.6625514403292181


In [44]:
print(roc_auc_score(y_test, y_pred))

0.637962962962963


In [45]:
y_test.describe()

count    243.000000
mean       0.555556
std        0.497930
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: target, dtype: float64

In [46]:
print(confusion_matrix(y_test,y_pred))

[[ 45  63]
 [ 19 116]]


In [47]:
f1_score(y_test, y_pred)

0.7388535031847134

#### Cross-Validation

In [48]:
from sklearn.model_selection import cross_val_score, cross_val_predict

In [49]:
scores = cross_val_score(best_rf, features[cols], features['target'], cv=3)
print(scores)

[0.53580247 0.61481481 0.58024691]


In [50]:
# Make cross validated predictions
predictions = cross_val_predict(best_rf, features[cols], features['target'], cv=3)
accuracy = roc_auc_score(features['target'], predictions)
print('Cross-Predicted Accuracy:', accuracy)

Cross-Predicted Accuracy: 0.5850604490500864
