# 7. Out-of-sample predictions

With the optimal hyperparameters found for each pipeline, we now make predictions on unseen data. Because Bitcoin prices arrive as a continuous stream, the models are retrained once a day so that they always reflect the latest data.

In [25]:
import pandas as pd
import joblib
from utils import *

In [26]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below;
# consider narrowing the filter to specific categories/modules.
warnings.filterwarnings("ignore")

In [27]:
import pandas as pd
from sklearn.linear_model import LassoLarsCV, ElasticNetCV, SGDRegressor, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import RobustScaler, PowerTransformer, KBinsDiscretizer, MaxAbsScaler, StandardScaler, Normalizer, MinMaxScaler, Binarizer, QuantileTransformer
from sklearn.feature_selection import SelectPercentile, f_regression, SelectFwe
from xgboost import XGBRegressor
from tpot.export_utils import set_param_recursive
from tpot.builtins import StackingEstimator, ZeroCount

In [28]:
# Artifacts from earlier notebooks: the selected feature names (indexed below by
# cluster id and by 'all') and the first out-of-sample date.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files you trust.
filtered_vars = joblib.load('models/filtered_vars.joblib')
cutoff_date = joblib.load('models/cutoff_date.joblib')

# Read the dataset (first CSV column parsed as a date index), drop incomplete rows
# and separate the predictors from the target column.
df = pd.read_csv('data/req_data.csv', index_col=0, parse_dates=True).dropna()
to_predict = df['target']
feats = df.drop(columns=['target'])
del df

In [29]:
# Pipelines trained on all rows, keyed by the target/feature transformation name
# ('cuberoot', 'arsinh', 'none' — the same keys used by `data_transformations`).
# Several of the constructor arguments below are overridden later through
# `set_params` with the tuned values stored in `all_params`.
# NOTE(review): loss="lad", loss="ls", loss="squared_loss" and normalize=True are
# spellings from older scikit-learn releases; newer releases renamed or removed
# them — confirm the pinned scikit-learn version before re-running.
# NOTE(review): no random_state is passed to any estimator here.
best_pipelines_all = {
    'cuberoot': make_pipeline(
        KBinsDiscretizer(encode="ordinal", n_bins=500, strategy="quantile"),
        ExtraTreesRegressor(bootstrap=True, max_features=0.5, min_samples_leaf=18, min_samples_split=8, n_estimators=100)
    ),
    'arsinh': make_pipeline(
        SelectFwe(score_func=f_regression, alpha=0.048),
        StandardScaler(),
        GradientBoostingRegressor(alpha=0.75, learning_rate=0.001, loss="lad", max_depth=9, max_features=0.2, min_samples_leaf=16, min_samples_split=18, n_estimators=100, subsample=0.4)
    ),
    'none': make_pipeline(
        SelectFwe(score_func=f_regression, alpha=0.029),
        GradientBoostingRegressor(alpha=0.99, learning_rate=0.001, loss="lad", max_depth=9, max_features=0.2, min_samples_leaf=13, min_samples_split=8, n_estimators=100, subsample=0.7500000000000001)
    )}

# One pipeline per (transformation, cluster id). Cluster ids are the strings
# '0'..'3'; they are converted with int() when the pipelines are fitted and
# compared against the `cluster_mode` feature column.
# When a pipeline contains two StackingEstimator steps, make_pipeline names them
# 'stackingestimator-1' and 'stackingestimator-2', which is how
# `bycluster_params` addresses them.
best_pipelines_clusters = {'arsinh':{
    '3': make_pipeline(
        StackingEstimator(estimator=LassoLarsCV(normalize=True)),
        StackingEstimator(estimator=SGDRegressor(alpha=0.01, eta0=0.1, fit_intercept=True, l1_ratio=0.5, learning_rate="constant", loss="epsilon_insensitive", penalty="elasticnet", power_t=50.0)),
        XGBRegressor(learning_rate=0.5, max_depth=3, min_child_weight=13, n_estimators=100, n_jobs=1, objective="reg:squarederror", subsample=0.9000000000000001, verbosity=0)
    ),
    '2': make_pipeline(
        KBinsDiscretizer(encode="ordinal", n_bins=50, strategy="quantile"),
        ExtraTreesRegressor(bootstrap=True, max_features=0.5, min_samples_leaf=4, min_samples_split=8, n_estimators=100)
    ),
    '1': make_pipeline(
        QuantileTransformer(),
        Normalizer(norm="l1"),
        StackingEstimator(estimator=GradientBoostingRegressor(alpha=0.85, learning_rate=0.5, loss="quantile", max_depth=10, max_features=0.9000000000000001, min_samples_leaf=2, min_samples_split=18, n_estimators=100, subsample=0.05)),
        GradientBoostingRegressor(alpha=0.99, learning_rate=0.001, loss="lad", max_depth=9, max_features=0.05, min_samples_leaf=15, min_samples_split=18, n_estimators=100, subsample=0.4)
    ),
    '0': make_pipeline(XGBRegressor(learning_rate=0.1, max_depth=1, min_child_weight=13, n_estimators=100, n_jobs=1, objective="reg:squarederror", subsample=0.05, verbosity=0))    
}, 'cuberoot': {
    '3': make_pipeline(
        Normalizer(norm="l1"),
        RandomForestRegressor(bootstrap=True, max_features=0.45, min_samples_leaf=8, min_samples_split=7, n_estimators=100)
    ),
    '2': make_pipeline(
        Normalizer(norm="l1"),
        ExtraTreesRegressor(bootstrap=True, max_features=0.5, min_samples_leaf=4, min_samples_split=8, n_estimators=100)
    ),
    '1': make_pipeline(
        StackingEstimator(estimator=GradientBoostingRegressor(alpha=0.85, learning_rate=0.001, loss="quantile", max_depth=8, max_features=0.9500000000000001, min_samples_leaf=19, min_samples_split=6, n_estimators=100, subsample=0.7000000000000001)),
        StackingEstimator(estimator=SGDRegressor(alpha=0.01, eta0=0.01, fit_intercept=True, l1_ratio=0.0, learning_rate="constant", loss="squared_loss", penalty="elasticnet", power_t=100.0)),
        ExtraTreesRegressor(bootstrap=False, max_features=0.1, min_samples_leaf=9, min_samples_split=20, n_estimators=100)
    ),
    '0': make_pipeline(XGBRegressor(learning_rate=0.1, max_depth=5, min_child_weight=12, n_estimators=100, n_jobs=1, objective="reg:squarederror", subsample=0.8500000000000001, verbosity=0)),    
}, 'none': {
    '3': make_pipeline(
        StackingEstimator(estimator=GradientBoostingRegressor(alpha=0.95, learning_rate=0.5, loss="ls", max_depth=2, max_features=0.7000000000000001, min_samples_leaf=8, min_samples_split=14, n_estimators=100, subsample=0.55)),
        Normalizer(norm="l1"),
        RandomForestRegressor(bootstrap=True, max_features=1.0, min_samples_leaf=5, min_samples_split=7, n_estimators=100)
    ),
    '2': make_pipeline(
        KBinsDiscretizer(encode="ordinal", n_bins=50, strategy="quantile"),
        ExtraTreesRegressor(bootstrap=True, max_features=0.5, min_samples_leaf=12, min_samples_split=8, n_estimators=100)
    ),
    '1': make_pipeline(ExtraTreesRegressor(bootstrap=True, max_features=0.05, min_samples_leaf=15, min_samples_split=15, n_estimators=100)),
    '0': make_pipeline(
        KBinsDiscretizer(encode="ordinal", n_bins=500, strategy="uniform"),
        QuantileTransformer(),
        GradientBoostingRegressor(alpha=0.9, learning_rate=0.01, loss="huber", max_depth=9, max_features=0.05, min_samples_leaf=9, min_samples_split=8, n_estimators=100, subsample=0.3)
    )    
    }
}

The following are the optimal hyperparameter values found during the previous phase (see the `pipeline_optimization` notebook).

In [30]:
# Tuned hyperparameters for the pipelines in `best_pipelines_all`, keyed by
# transformation name. Each inner key follows the `<step>__<param>` convention
# accepted by Pipeline.set_params.
# The 'roll_mean' entries are not pipeline parameters: they are filtered out
# with `slicedict(..., 'roll_')` before set_params is called and are not read
# anywhere else in this notebook.
all_params = {'cuberoot': {'extratreesregressor__bootstrap': True, 'extratreesregressor__max_depth': 28, 
            'extratreesregressor__max_features': 0.5700023695679756, 'extratreesregressor__min_samples_leaf': 54, 
            'extratreesregressor__min_samples_split': 51, 'extratreesregressor__n_estimators': 91, 
            'kbinsdiscretizer__n_bins': 720, 'kbinsdiscretizer__strategy': 'quantile', 'roll_mean': 4}, 
            'arsinh': {'gradientboostingregressor__alpha': 0.9370232212650587, 
            'gradientboostingregressor__learning_rate': 0.04189021009373599, 
            'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__max_depth': 6, 
            'gradientboostingregressor__max_features': 0.3112224638324981, 
            'gradientboostingregressor__min_samples_leaf': 35, 'gradientboostingregressor__min_samples_split': 7, 
            'gradientboostingregressor__n_estimators': 26, 'gradientboostingregressor__subsample': 0.21195913484335138, 
            'roll_mean': 5, 'selectfwe__alpha': 0.07260318354539178}, 
            'none': {'gradientboostingregressor__alpha': 0.7553060976421809, 
            'gradientboostingregressor__learning_rate': 0.2839323749428636, 
            'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__max_depth': 10, 
            'gradientboostingregressor__max_features': 0.26080007177634584, 
            'gradientboostingregressor__min_samples_leaf': 60, 'gradientboostingregressor__min_samples_split': 17, 
            'gradientboostingregressor__n_estimators': 381, 
            'gradientboostingregressor__subsample': 0.8338267581776144, 'roll_mean': 2, 
            'selectfwe__alpha': 0.015479514163984265}
}

In [31]:
# Tuned hyperparameters for the pipelines in `best_pipelines_clusters`:
# transformation name -> cluster id ('0'..'3') -> `<step>__<param>` values for
# Pipeline.set_params.
# Each transformation also carries a 'roll_mean' entry next to the cluster ids;
# the set_params loop iterates over the pipeline keys only, so that entry is
# never passed to a pipeline.
# NOTE(review): 'arsinh' / '0' sets xgbregressor__subsample to 0, which lies
# outside the (0, 1] range documented by XGBoost — confirm this value was not
# truncated when it was copied from the optimisation notebook.
bycluster_params = {
    'cuberoot': {'0': {'xgbregressor__learning_rate': 0.9802569498830725, 'xgbregressor__max_depth': 7, 
                       'xgbregressor__min_child_weight': 5, 'xgbregressor__n_estimators': 37, 
                       'xgbregressor__reg_alpha': 4.3998350343693864e-06, 'xgbregressor__reg_lambda': 18565878.962410465, 
                       'xgbregressor__subsample': 0.9036179379050534}, 
                 '1': {'extratreesregressor__bootstrap': True, 'extratreesregressor__max_depth': 22, 
                       'extratreesregressor__max_features': 0.7001038397966569, 'extratreesregressor__min_samples_leaf': 57, 
                       'extratreesregressor__min_samples_split': 30, 'extratreesregressor__n_estimators': 193, 
                       'stackingestimator-1__estimator__alpha': 0.5146486765865822, 
                       'stackingestimator-1__estimator__learning_rate': 0.4246918739681736, 
                       'stackingestimator-1__estimator__loss': 'quantile', 'stackingestimator-1__estimator__max_depth': 7, 
                       'stackingestimator-1__estimator__max_features': 0.8754081806543295, 
                       'stackingestimator-1__estimator__min_samples_leaf': 16, 'stackingestimator-1__estimator__min_samples_split': 49, 
                       'stackingestimator-1__estimator__n_estimators': 197, 'stackingestimator-1__estimator__subsample': 0.6573745835019558, 
                       'stackingestimator-2__estimator__alpha': 0.00800706857244989, 'stackingestimator-2__estimator__eta0': 0.45928191378028616, 
                       'stackingestimator-2__estimator__l1_ratio': 0.3280129701853963, 'stackingestimator-2__estimator__loss': 'epsilon_insensitive', 
                       'stackingestimator-2__estimator__power_t': 76}, 
                 '2': {'extratreesregressor__bootstrap': True, 'extratreesregressor__max_depth': 13, 
                       'extratreesregressor__max_features': 0.23598692146039585, 'extratreesregressor__min_samples_leaf': 58, 
                       'extratreesregressor__min_samples_split': 27, 'extratreesregressor__n_estimators': 285}, 
                 '3': {'randomforestregressor__bootstrap': True, 'randomforestregressor__max_depth': 27, 
                       'randomforestregressor__max_features': 0.741541341268798, 'randomforestregressor__min_samples_leaf': 43, 
                       'randomforestregressor__min_samples_split': 46, 'randomforestregressor__n_estimators': 345}, 
                 'roll_mean': 1},
    'arsinh': {'0': {'xgbregressor__learning_rate': 0.16058781257257088, 'xgbregressor__max_depth': 6, 
                     'xgbregressor__min_child_weight': 2, 'xgbregressor__n_estimators': 214, 
                     'xgbregressor__reg_alpha': 0.014105255689679444, 'xgbregressor__reg_lambda': 3.6505989722328457e-07, 
                     'xgbregressor__subsample': 0}, 
               '1': {'gradientboostingregressor__alpha': 0.845978637036461, 
                     'gradientboostingregressor__learning_rate': 0.6815380281604567, 'gradientboostingregressor__loss': 'ls', 
                     'gradientboostingregressor__max_depth': 9, 'gradientboostingregressor__max_features': 0.74371794312691, 
                     'gradientboostingregressor__min_samples_leaf': 62, 'gradientboostingregressor__min_samples_split': 67, 
                     'gradientboostingregressor__n_estimators': 92, 
                     'gradientboostingregressor__subsample': 0.20883109367799846, 'stackingestimator__estimator__alpha': 0.551247090515097, 
                     'stackingestimator__estimator__learning_rate': 0.8773161945057965, 'stackingestimator__estimator__loss': 'huber', 
                     'stackingestimator__estimator__max_depth': 2, 'stackingestimator__estimator__max_features': 0.7001653012092475, 
                     'stackingestimator__estimator__min_samples_leaf': 69, 'stackingestimator__estimator__min_samples_split': 5, 
                     'stackingestimator__estimator__n_estimators': 68, 'stackingestimator__estimator__subsample': 0.5612304035809031}, 
               '2': {'extratreesregressor__bootstrap': True, 'extratreesregressor__max_depth': 25, 
                     'extratreesregressor__max_features': 0.5219908546257384, 'extratreesregressor__min_samples_leaf': 27, 
                     'extratreesregressor__min_samples_split': 66, 'extratreesregressor__n_estimators': 253, 
                     'kbinsdiscretizer__n_bins': 390, 'kbinsdiscretizer__strategy': 'quantile'}, 
               '3': {'stackingestimator-2__estimator__alpha': 0.00823134009261331, 'stackingestimator-2__estimator__eta0': 0.14003486046635275, 
                     'stackingestimator-2__estimator__l1_ratio': 0.9938660260698831, 'stackingestimator-2__estimator__loss': 'huber', 
                     'stackingestimator-2__estimator__power_t': 53, 'xgbregressor__learning_rate': 0.061756015510473494, 
                     'xgbregressor__max_depth': 8, 'xgbregressor__min_child_weight': 11, 'xgbregressor__n_estimators': 203, 
                     'xgbregressor__reg_alpha': 1.1313558221038988e-09, 'xgbregressor__reg_lambda': 1.0840535491250761e-07, 
                     'xgbregressor__subsample': 0.4291352880026349}, 
               'roll_mean': 1},
    'none': {'0': {'gradientboostingregressor__alpha': 0.5817632567531272, 
                   'gradientboostingregressor__learning_rate': 0.18688149351416955, 
                   'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__max_depth': 1, 
                   'gradientboostingregressor__max_features': 0.5869665881877083, 
                   'gradientboostingregressor__min_samples_leaf': 33, 'gradientboostingregressor__min_samples_split': 46, 
                   'gradientboostingregressor__n_estimators': 87, 'gradientboostingregressor__subsample': 0.4285299877887999, 
                   'kbinsdiscretizer__n_bins': 260, 'kbinsdiscretizer__strategy': 'uniform'}, 
             '1': {'extratreesregressor__bootstrap': False, 'extratreesregressor__max_depth': 22, 
                   'extratreesregressor__max_features': 0.3900812656678244, 'extratreesregressor__min_samples_leaf': 11, 
                   'extratreesregressor__min_samples_split': 27, 'extratreesregressor__n_estimators': 361}, 
             '2': {'extratreesregressor__bootstrap': True, 'extratreesregressor__max_depth': 14, 
                   'extratreesregressor__max_features': 0.6575456771952172, 'extratreesregressor__min_samples_leaf': 12, 
                   'extratreesregressor__min_samples_split': 40, 'extratreesregressor__n_estimators': 389, 
                   'kbinsdiscretizer__n_bins': 790, 'kbinsdiscretizer__strategy': 'quantile'}, 
             '3': {'randomforestregressor__bootstrap': False, 'randomforestregressor__max_depth': 10, 
                   'randomforestregressor__max_features': 0.10620987698601198, 'randomforestregressor__min_samples_leaf': 49, 
                   'randomforestregressor__min_samples_split': 15, 'randomforestregressor__n_estimators': 261, 
                   'stackingestimator__estimator__alpha': 0.7426861881178207, 'stackingestimator__estimator__learning_rate': 0.36543828911890164, 
                   'stackingestimator__estimator__loss': 'quantile', 'stackingestimator__estimator__max_depth': 4, 
                   'stackingestimator__estimator__max_features': 0.7165788758137186, 'stackingestimator__estimator__min_samples_leaf': 37, 
                   'stackingestimator__estimator__min_samples_split': 36, 'stackingestimator__estimator__n_estimators': 346, 
                   'stackingestimator__estimator__subsample': 0.43857423516757926}, 
            'roll_mean': 1}
}

In [32]:
def slicedict(d, s):
    """
    Return a copy of ``d`` without the entries whose key starts with ``s``.

    Keys must be strings (``str.startswith`` is called on each of them); the
    input dictionary is left untouched and insertion order is preserved.
    """
    kept = {}
    for name, value in d.items():
        if name.startswith(s):
            continue
        kept[name] = value
    return kept

In [33]:
# Set the optimal hyperparameters of each pipeline and pin every random seed.
# Without a fixed random_state the tree ensembles, SGD and quantile transformers
# give different predictions on every run, so the notebook is not reproducible.
# `set_param_recursive` (already imported from tpot.export_utils) sets the
# attribute on every step that exposes it, including estimators nested inside
# StackingEstimator.
RANDOM_STATE = 42

for key, pipeline in best_pipelines_all.items():
    # 'roll_*' entries are not pipeline parameters, so they are filtered out first.
    pipeline.set_params(**slicedict(all_params[key], 'roll_'))
    set_param_recursive(pipeline.steps, 'random_state', RANDOM_STATE)

for k1, cluster_pipelines in best_pipelines_clusters.items():
    for k2, pipeline in cluster_pipelines.items():
        pipeline.set_params(**bycluster_params[k1][k2])
        set_param_recursive(pipeline.steps, 'random_state', RANDOM_STATE)

In [34]:
# Columns passed to `shift_dataset` as `var_lags`.
vars_to_lag = [
    'h_high_close',
    'h_low_close',
    'h_candle_body',
    'h_rsi_13h',
    'h_ema_50',
    'h_ema_200',
    'h_obv10_obv50',
    'h_obv50_obv200',
    'h_close_ma',
]

In [35]:
# Transformation name -> [forward function, inverse function].
# NOTE(review): `np` is not imported explicitly in the visible cells; presumably it
# arrives through `from utils import *` — confirm, or import numpy explicitly.
data_transformations = {
    'none': [lambda v: v, lambda v: v],
    'arsinh': [lambda v: np.arcsinh(v), lambda v: np.sinh(v)],
    'cuberoot': [lambda v: np.cbrt(v), lambda v: v ** 3],
}

In [36]:
# Filled by the prediction loop: '<transformation>_cluster' / '<transformation>_all' -> predictions.
oos_predictions = dict()

# Feature columns left untouched when a transformation is applied to the features.
do_not_transform = [
    'h_weekday',
    'hour_sin',
    'hour_cos',
    'month_sin',
    'month_cos',
    'cluster_mode',
    'd_obv10_obv50',
    'd_obv50_obv200',
    'd_hc_15davg',
    'd_lc_15davg',
    'd_cb_15davg',
    'd_rsi_13',
    'd_ret60d',
]

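As stated previously, because Bitcoin's price fluctuates constantly, it is a good idea to refit the models as new data arrives. We define two functions for this: `rolling_fit_predict_clusters`, which refits one pipeline per cluster, and `rolling_fit_predict_all`, which refits a single pipeline on all the data.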

In [37]:
def rolling_fit_predict_clusters(x, y, pipelines, cutoff_date, refit_freq, gap=7, export_model=True, model_name=None):
    """
    Walk-forward fit/predict with one pipeline per cluster.

    For every refit date between ``cutoff_date`` and the last index of ``x``
    (spaced by ``refit_freq``), each pipeline is refitted on the rows located
    before the refit date whose ``cluster_mode`` equals the pipeline's cluster
    id, and then predicts a window of later rows. Every pipeline predicts the
    whole window, whatever cluster those rows belong to; selecting the relevant
    prediction is left to the caller.

    Parameters
    ----------
    x : pandas.DataFrame
        Features. Must contain a ``cluster_mode`` column, and every refit date
        must be a label of its index.
    y : pandas.Series
        Target, looked up with the index labels of ``x``.
    pipelines : dict
        Maps a cluster id written as a string (``'0'``, ``'1'``, ...) to an
        estimator exposing ``fit`` and ``predict``. The feature columns of each
        cluster come from the module-level ``filtered_vars[int(key)]``.
    cutoff_date : date-like
        First refit date.
    refit_freq : {'D', 'W', 'M'}
        Refit frequency; also fixes the length of the predicted window.
    gap : int, default 7
        Number of rows skipped between the end of the training data and the
        first predicted row.
    export_model : bool, default True
        When true, the pipelines as fitted at the last refit date are dumped to
        ``models/cluster_<key>_<model_name>.joblib``.
    model_name : str, optional
        Label used in the exported file names and in the output column names.

    Returns
    -------
    pandas.DataFrame
        One column per pipeline, named ``<key>_<label>_cluster``.

    Raises
    ------
    ValueError
        If ``export_model`` is true and ``model_name`` is missing (raised
        before any fitting takes place).
    KeyError
        If ``refit_freq`` is unsupported or a refit date is not in ``x.index``.
    """
    if export_model and model_name is None:
        raise ValueError("model_name must be provided when export_model=True")

    # Rows per refit period; assumes one row per hour — TODO confirm.
    n_freq = {'D': 24, 'W': 24*7, 'M': 24*30}
    window = n_freq[refit_freq]
    n_rows = len(x)

    # Suffix for the output columns. Callers that do not pass model_name keep the
    # legacy behaviour of reading the notebook-level `transformation` variable.
    label = model_name if model_name is not None else transformation

    # Resolve every refit date up front so a missing date fails before any fitting.
    refit_dates = pd.date_range(cutoff_date, x.index[-1], freq=refit_freq)
    split_points = [x.index.get_loc(date) for date in refit_dates]

    preds = {key: [] for key in pipelines.keys()}
    for split in split_points:
        train_x = x.iloc[:split]
        # Slices clip at the end of the data, so a window that reaches (or passes)
        # the last row is shortened instead of raising an out-of-bounds IndexError.
        test_x = x.iloc[split + gap:min(split + window + gap, n_rows)]

        for key, pipeline in pipelines.items():
            cluster_id = int(key)
            in_cluster = (train_x['cluster_mode'] == cluster_id).to_numpy()
            cluster_rows = train_x.index[in_cluster]
            pipeline.fit(train_x.loc[cluster_rows, filtered_vars[cluster_id]], y.loc[cluster_rows])
            if test_x.empty:
                # Refit date too close to the end of the data: the model is still
                # refreshed (so the exported model is the latest one) but there is
                # nothing to predict.
                continue
            x_to_predict = test_x.loc[:, filtered_vars[cluster_id]]
            preds[key].append(pd.Series(data=pipeline.predict(x_to_predict), index=x_to_predict.index))

    if export_model:
        for key, pipeline in pipelines.items():
            joblib.dump(pipeline, 'models/cluster_' + key + '_' + model_name + '.joblib')

    preds = pd.concat({k: pd.concat(v, axis=0) for k, v in preds.items()}, axis=1)
    # When windows overlap, keep the prediction made by the most recent refit.
    preds = preds[~preds.index.duplicated(keep='last')]
    preds.columns = [str(col) + '_' + label + '_cluster' for col in preds.columns]

    return preds

In [38]:
def rolling_fit_predict_all(x, y, pipeline, cutoff_date, refit_freq, gap=7, export_model=True, model_name=None):
    """
    Walk-forward fit/predict with a single pipeline trained on all rows.

    For every refit date between ``cutoff_date`` and the last index of ``x``
    (spaced by ``refit_freq``), the pipeline is refitted on every row located
    before the refit date and then predicts a window of later rows.

    Parameters
    ----------
    x : pandas.DataFrame
        Features. Every refit date must be a label of its index. The columns
        used come from the module-level ``filtered_vars['all']``.
    y : pandas.Series
        Target, aligned positionally with ``x``.
    pipeline : estimator
        Object exposing ``fit`` and ``predict``.
    cutoff_date : date-like
        First refit date.
    refit_freq : {'D', 'W', 'M'}
        Refit frequency; also fixes the length of the predicted window.
    gap : int, default 7
        Number of rows skipped between the end of the training data and the
        first predicted row.
    export_model : bool, default True
        When true, the pipeline as fitted at the last refit date is dumped to
        ``models/all_<model_name>.joblib``.
    model_name : str, optional
        Label used in the exported file name and in the name of the result.

    Returns
    -------
    pandas.Series
        Predictions named ``<label>_all``.

    Raises
    ------
    ValueError
        If ``export_model`` is true and ``model_name`` is missing (raised
        before any fitting takes place).
    KeyError
        If ``refit_freq`` is unsupported or a refit date is not in ``x.index``.
    """
    if export_model and model_name is None:
        raise ValueError("model_name must be provided when export_model=True")

    # Rows per refit period; assumes one row per hour — TODO confirm.
    n_freq = {'D': 24, 'W': 24*7, 'M': 24*30}
    window = n_freq[refit_freq]
    n_rows = len(x)

    # Name suffix source. Callers that do not pass model_name keep the legacy
    # behaviour of reading the notebook-level `transformation` variable.
    label = model_name if model_name is not None else transformation

    # Resolve every refit date up front so a missing date fails before any fitting.
    refit_dates = pd.date_range(cutoff_date, x.index[-1], freq=refit_freq)
    split_points = [x.index.get_loc(date) for date in refit_dates]

    columns = filtered_vars['all']
    preds = []
    for split in split_points:
        pipeline.fit(x.iloc[:split].loc[:, columns], y.iloc[:split])
        # Slices clip at the end of the data, so a window that reaches (or passes)
        # the last row is shortened instead of raising an out-of-bounds IndexError.
        x_to_predict = x.iloc[split + gap:min(split + window + gap, n_rows)].loc[:, columns]
        if x_to_predict.empty:
            # Refit date too close to the end of the data: the model is still
            # refreshed but there is nothing to predict.
            continue
        preds.append(pd.Series(data=pipeline.predict(x_to_predict), index=x_to_predict.index))

    if export_model:
        joblib.dump(pipeline, 'models/all_' + model_name + '.joblib')

    preds = pd.concat(preds, axis=0)
    # When windows overlap, keep the prediction made by the most recent refit.
    preds = preds[~preds.index.duplicated(keep='last')]
    preds.name = label + '_all'

    return preds

Now we will predict unseen data to see how the models perform

In [39]:
# For each transformation: transform features and target, build the lagged
# design matrix, then run both rolling fit/predict routines.
# NOTE(review): the models are fitted on the transformed target and the inverse
# function (function[1]) is never applied in this notebook, so the stored
# predictions stay in transformed units — confirm they are inverted downstream.
for transformation, function in data_transformations.items():
    print(transformation)
    forward = function[0]

    temp_features = feats.copy()
    # The column mask does not change, so compute it once.
    transform_mask = ~temp_features.columns.isin(do_not_transform)
    # The transformations are element-wise, so apply them to the whole block at
    # once instead of calling the function row by row with apply(axis=1).
    temp_features.loc[:, transform_mask] = forward(temp_features.loc[:, transform_mask])

    lagged_feature = shift_dataset(temp_features.copy(), lag=True, forecast=False, nlag=50, dropna=True,
                                  var_lags=vars_to_lag)
    target = to_predict.loc[lagged_feature.index].apply(forward)

    oos_predictions[transformation + '_cluster'] = rolling_fit_predict_clusters(
        lagged_feature.copy(), target, best_pipelines_clusters[transformation], cutoff_date, 'D',
        export_model=True, model_name=transformation)
    oos_predictions[transformation + '_all'] = rolling_fit_predict_all(
        lagged_feature.copy(), target, best_pipelines_all[transformation], cutoff_date, 'D',
        export_model=True, model_name=transformation)

    # Checkpoint after every transformation so a long run can be inspected or resumed.
    pd.concat(oos_predictions.values(), axis=1).to_csv('temp_oos_preds.csv')

none
arsinh
cuberoot


In [41]:
# Put every prediction side by side, then separate the columns of the
# whole-sample models (last '_' token is 'all') from the per-cluster ones.
predictions = pd.concat(oos_predictions.values(), axis=1)
is_all_column = [name.split('_')[-1] == 'all' for name in predictions.columns]
is_cluster_column = [not flag for flag in is_all_column]
preds_by_cluster = predictions.loc[:, is_cluster_column]
preds_all = predictions.loc[:, is_all_column]

Given that we have a prediction for each cluster and timestamp, we have to filter out the ones that do not correspond to the observed cluster in a given timestamp

In [42]:
# For each transformation keep, at every timestamp, only the prediction of the
# pipeline whose cluster id matches the observed `cluster_mode`.
preds_cluster_dict = {}
for transformation in data_transformations.keys():
    # Column names look like '<cluster>_<transformation>_cluster'.
    belongs = [name.split('_')[1] == transformation for name in preds_by_cluster.columns]
    preds_by_cluster_temp = preds_by_cluster.loc[:, belongs]
    preds_by_cluster_temp.columns = [int(name.split('_')[0]) for name in preds_by_cluster_temp.columns]
    preds_by_cluster_temp = pd.concat([preds_by_cluster_temp, feats.loc[:, 'cluster_mode']], axis=1).dropna()
    # Long format: one row per (timestamp, cluster id), then keep the matching pairs.
    melted = preds_by_cluster_temp.melt(ignore_index=False, id_vars='cluster_mode')
    matches = melted['cluster_mode'] == melted['variable']
    melted = melted[matches]
    preds_cluster_dict['bycluster_' + transformation] = melted['value']

In [43]:
# One column per transformation for the cluster-based predictions, then join
# them with the whole-sample predictions and keep the fully populated rows.
cluster_columns = [series.rename(name) for name, series in preds_cluster_dict.items()]
preds_cluster_df = pd.concat(cluster_columns, axis=1)
combined_preds = pd.concat([preds_all, preds_cluster_df], axis=1)
preds_output = combined_preds.dropna()

In [44]:
# Persist the final out-of-sample predictions; the index is written as the first CSV column.
preds_output.to_csv('oos_preds.csv')