In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression, ElasticNetCV, ElasticNet
from xgboost import XGBRegressor
from math import sqrt
import os

### Load data:

In [2]:
cleaned = pd.read_csv('data/cleaned.csv', parse_dates = ['time'])
cleaned.drop(columns = ['Unnamed: 0'], axis=1, inplace=True)

### fit LR model:

In [3]:
start_test_date = '2018-03-01'

In [4]:
numeric_columns = [col for col in cleaned.columns if col not in ['time', 'target', 'refID_coin']]

In [36]:
numeric_columns = cleaned.filter(regex = 'MA_VAR').columns

In [37]:
numeric_columns

Index(['price_MA_VAR_2', 'price_MA_VAR_4', 'price_MA_VAR_8', 'price_MA_VAR_12',
       'price_MA_VAR_16', 'price_MA_VAR_24', 'price_MA_VAR_36',
       'price_MA_VAR_48', 'price_MA_VAR_72', 'price_MA_VAR_144',
       'price_MA_VAR_288', 'price_MA_VAR_1440', 'CirculatingSupply_MA_VAR_2',
       'CirculatingSupply_MA_VAR_4', 'CirculatingSupply_MA_VAR_8',
       'CirculatingSupply_MA_VAR_12', 'CirculatingSupply_MA_VAR_16',
       'CirculatingSupply_MA_VAR_24', 'CirculatingSupply_MA_VAR_36',
       'CirculatingSupply_MA_VAR_48', 'CirculatingSupply_MA_VAR_72',
       'CirculatingSupply_MA_VAR_144', 'CirculatingSupply_MA_VAR_288',
       'CirculatingSupply_MA_VAR_1440', 'percent_market_cap_MA_VAR_2',
       'percent_market_cap_MA_VAR_4', 'percent_market_cap_MA_VAR_8',
       'percent_market_cap_MA_VAR_12', 'percent_market_cap_MA_VAR_16',
       'percent_market_cap_MA_VAR_24', 'percent_market_cap_MA_VAR_36',
       'percent_market_cap_MA_VAR_48', 'percent_market_cap_MA_VAR_72',
       'percent

In [5]:
def mape_error(actual, forecasted):
    actual = pd.Series(actual)
    forecasted = pd.Series(forecasted)
    return 100 * sum(abs((actual - forecasted) / actual)) / actual.size

def ds_error(actual, forecasted):
    actual = pd.Series(actual)
    forecasted = pd.Series(forecasted)
    signs = (actual - actual.shift(1)) * (forecasted - forecasted.shift(1)) > 0
    return 100 * sum((actual - actual.shift(1)) * (forecasted - forecasted.shift(1)) > 0) / (actual.size - 1)

def mape_var_coeff(mape_errors):
    mape_errors = pd.Series(mape_errors)
    mean_mape_error = mape_errors.mean()
    return sqrt(sum((mape_errors - mean_mape_error)**2) / (mape_errors.size - 1)) / mean_mape_error

In [40]:
for coin in cleaned.refID_coin.unique():
    print(coin)
    train = cleaned[(cleaned.time <= start_test_date) & 
                             (cleaned.refID_coin == coin)].tail(-3)
    test  = cleaned[(cleaned.time > start_test_date) & 
                             (cleaned.refID_coin == coin)].head(-1)
    X_train = train[numeric_columns].values
    X_test = test[numeric_columns].values
    y_train = train.target.values - train.price.values
    y_test = test.target.values - test.price.values
    
    lr = LinearRegression(normalize=True)
    lr.fit(X_train, y_train)
    y_pred = np.exp(lr.predict(X_test) + test.price.values)-1
    y_pred_random_walk = np.exp(test.price.values) - 1
    y_test = np.exp(y_test + test.price.values) - 1
    
#     xgb = XGBRegressor(n_estimators=20, max_depth=5)
#     xgb.fit(X_train, y_train)
#     y_pred = xgb.predict(X_test) + test.price.values
#     y_pred_random_walk = test.price.values
#     y_test = test.target.values
    
    print(mape_error(y_test, y_pred), mape_error(y_test, y_pred_random_walk))
    print(ds_error(y_pred, y_test), ds_error(y_pred_random_walk, y_test))

Bitcoin
0.12053230475389493 0.11922249359209483
39.480159950784376 29.990772070132266
Ethereum
0.1258331294626563 0.12301136305073541
38.91110427560751 29.40633651184251
Ripple
0.17672608044148905 0.172998960369813
37.00399876960935 27.191633343586588


In [43]:
lr.fit?

In [42]:
list(zip(lr.coef_, numeric_columns))

[(0.0014543237090469052, 'price_MA_VAR_2'),
 (-11044525668.204918, 'price_MA_VAR_4'),
 (4577987391.070144, 'price_MA_VAR_8'),
 (718504253.0169837, 'price_MA_VAR_12'),
 (718504253.0167243, 'price_MA_VAR_16'),
 (718504253.0167785, 'price_MA_VAR_24'),
 (718504253.016727, 'price_MA_VAR_36'),
 (718504253.016727, 'price_MA_VAR_48'),
 (718504253.016727, 'price_MA_VAR_72'),
 (718504253.016727, 'price_MA_VAR_144'),
 (718504253.016727, 'price_MA_VAR_288'),
 (718504253.016727, 'price_MA_VAR_1440'),
 (282382266235.8168, 'CirculatingSupply_MA_VAR_2'),
 (2178754056017.8093, 'CirculatingSupply_MA_VAR_4'),
 (2129371659446.215, 'CirculatingSupply_MA_VAR_8'),
 (2121296935182.6086, 'CirculatingSupply_MA_VAR_12'),
 (2144365007316.8203, 'CirculatingSupply_MA_VAR_16'),
 (-5839523374708.743, 'CirculatingSupply_MA_VAR_24'),
 (-5839539723668.098, 'CirculatingSupply_MA_VAR_36'),
 (2196411987285.6023, 'CirculatingSupply_MA_VAR_48'),
 (2190871734863.8508, 'CirculatingSupply_MA_VAR_72'),
 (2154821847654.554, 'Circ

### Solve sample test files:

In [8]:
files = os.listdir('data/test')
files = ['/data/test/' + file for file in files]

In [10]:
for file in files:
    data = pd.read_csv(file)
    min_date = data.date.min()
    

FileNotFoundError: File b'/data/test/test_02.csv' does not exist