In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
%matplotlib inline

In [None]:
path = './data/m1/USD000UTSTOM/'

In [None]:
np.load(path + 'time.npy', encoding='bytes')

In [None]:
def get_pd_full_ticker_data(ticker, freq, lags=1, y_forward_shift=0, data_dir='./data/'):
    """Load one ticker's ``.npy`` arrays and build a lagged-feature dataset.

    The arrays ``close``, ``open``, ``high``, ``low``, ``time``, ``count`` and
    ``volume`` are read from ``<data_dir>/<freq>/<ticker>/``.

    Feature engineering, per row:

    * ``open`` / ``high`` / ``low`` become ``(close - value) / close`` using
      the raw close of the same row;
    * ``close`` becomes its one-step percentage change;
    * ``count`` and ``volume`` are used as loaded.

    Only lagged copies of these six series are kept as features
    (``<name>_lag1`` ... ``<name>_lag<lags>``), so the feature row at time
    ``t`` contains values from ``t - 1`` and earlier only.

    Parameters
    ----------
    ticker : str
        Name of the ticker sub-folder.
    freq : str
        Name of the frequency sub-folder (the notebook uses ``'m1'``).
    lags : int, default 1
        Number of lags generated for every feature. Values below 1 produce
        no feature columns.
    y_forward_shift : int, default 0
        How many rows ahead the target is taken from: the target at row ``t``
        is the close percentage change observed at row ``t + y_forward_shift``.
    data_dir : str, default ``'./data/'``
        Root folder that contains the ``<freq>/<ticker>/`` sub-folders.

    Returns
    -------
    y : pandas.Series
        Target series named ``'close'``, indexed by ``time``.
    X : pandas.DataFrame
        Lagged features with the same index as ``y``. Rows containing any NaN
        (from lagging, from shifting the target, or from the data itself) are
        dropped from both ``y`` and ``X``.
    """
    # Accept the root folder with or without a trailing slash.
    root = data_dir if (not data_dir or data_dir.endswith('/')) else data_dir + '/'
    folder = root + freq + '/' + ticker + '/'

    def load(name, **kwargs):
        return np.load(folder + name + '.npy', **kwargs)

    bars = pd.DataFrame({
        'close': load('close'),
        'open': load('open'),
        'high': load('high'),
        'low': load('low'),
        # encoding='bytes' is kept from the original call.
        # NOTE(review): presumably time.npy was written by Python 2 - confirm.
        'time': load('time', encoding='bytes'),
        'count': load('count'),
        'volume': load('volume'),
    })

    # Express open/high/low relative to the raw close of the same row, and only
    # afterwards replace close by its percentage change.
    raw_close = bars['close']
    returns = raw_close.pct_change()
    for column in ('open', 'high', 'low'):
        bars[column] = (raw_close - bars[column]) / raw_close
    bars['close'] = returns

    bars = bars.set_index('time')

    # Column order matches the historical output: all features of lag 1, then
    # all features of lag 2, and so on.
    feature_columns = ['open', 'high', 'low', 'count', 'volume', 'close']
    lagged = [
        bars[column].shift(lag).rename(column + '_lag' + str(lag))
        for lag in range(1, lags + 1)
        for column in feature_columns
    ]
    target = bars['close'].shift(-y_forward_shift)

    # A single concat avoids inserting columns one by one into a growing frame.
    dataset = pd.concat([target] + lagged, axis=1).dropna(axis=0)

    y = dataset['close']
    X = dataset.drop('close', axis=1)
    return y, X

In [None]:
y, X = get_pd_full_ticker_data('USD000UTSTOM', 'm1')

In [None]:
X.head()

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Autocorrelation of the lag-1 volume feature over 100 lags. The trailing ';'
# stops the returned Figure from being rendered a second time as cell output.
plot_acf(X.volume_lag1, lags=100);

In [None]:
# Partial autocorrelation of the lag-1 volume feature over 100 lags. The
# trailing ';' stops the returned Figure from being rendered a second time.
plot_pacf(X.volume_lag1, lags=100);

In [None]:
import xgboost as xgb

In [None]:
# Booster settings passed to both xgb.cv and xgb.train further down.
# NOTE(review): 'silent' and 'reg:linear' may be deprecated in newer XGBoost
# releases - check against the installed version before changing them.
param = {
    'max_depth': 5,
    'min_child_weight': 6,
    'eta': 0.05,
    'silent': 1,
    'objective': 'reg:linear',
    'subsample': 0.5,
    'colsample_bytree': 1.0,
    'colsample_bylevel': 1.0,
}

In [None]:
y, X = get_pd_full_ticker_data('USD000UTSTOM', 'm1', lags=10, y_forward_shift=5)
dmatrix = xgb.DMatrix(X, y)

In [None]:
X.shape

In [None]:
cv = xgb.cv(param, dmatrix, num_boost_round=300, nfold=5, metrics={'rmse'})

In [None]:
cv.loc[150:, ['test-rmse-mean', 'train-rmse-mean']].plot()

In [None]:
print(cv.loc[199, 'test-rmse-mean'])

In [None]:
print(cv.loc[199, 'test-rmse-mean'])

In [None]:
print(cv.loc[199, 'test-rmse-mean'])

In [None]:
model = xgb.train(param, dmatrix, num_boost_round=200)

In [None]:
pd.Series(model.get_fscore()).sort_values(ascending=False)

In [None]:
# Score the same DMatrix the booster was trained on (in-sample predictions).
predicted = model.predict(dmatrix)
# The 1e-8 offset is presumably there so that np.sign never sees an exact zero
# in the cells that follow - TODO confirm.
X['predicted'] = predicted + 1e-8
X['error'] = y.sub(X['predicted'])
X['actual'] = y.add(1e-8)
# X.plot(kind='scatter', x='predicted', y='error', figsize=(12,12), xlim=[-0.03, 0.03], ylim=[-0.03, 0.03])

In [None]:
X['right'] = np.sign(X['actual']) * np.sign(X['predicted'])

In [None]:
X['pnl'] = np.abs(X['actual'])*X['right']

In [None]:
X['pnl']

In [None]:
np.abs(X['predicted']).describe()

In [None]:
np.abs(X['predicted']).quantile(0.25)

In [None]:
# Compound the per-row P&L: final value of 1 unit of capital after applying
# every row's return in sequence. Vectorized product instead of a Python loop
# over every row; NaN values propagate exactly as they did in the loop.
start = np.prod(1 + X['pnl'].values)
print(start)

In [None]:
X['pnl'].sum()

In [None]:
X.loc[np.abs(X.predicted)>np.abs(X['predicted']).quantile(0.25),'pnl'].sum()