In [1]:
import numpy as np

from data_loaders import *

# Instructions on how to load data

In [10]:
# How to read one file: careful, this might take a long time
# data is by defult QUANTILE scaled so between [0, 1]
a = read_scale(SAMPLE_PATH, no_obs=400000, slice_size=10)
a.shape

(399990, 200)

In [5]:
# How to read best bid and ask prices quickly
buy, sell = read_bid_ask(SAMPLE_PATH, no_obs=400000)
buy

0         3171.8
1         3172.2
2         3169.8
3         3173.2
4         3172.8
           ...  
399995    2985.0
399996    2985.0
399997    2985.0
399998    2983.8
399999    2983.8
Name: B1, Length: 400000, dtype: float64

In [3]:
# How to iterate over all datafiles available
#  the generator loops over all csv files in data/order_books
for a in generate_data(read_scale, no_obs=2000, slice_size=10):
    print(a)
    # Do anything you want with a, like train the model

[[0.85835836 0.85835836 0.86036036 ... 0.21471471 0.32832833 0.33533534]
 [0.88338338 0.88638639 0.88788789 ... 0.         0.         0.91841842]
 [0.91741742 0.90690691 0.90840841 ... 0.         0.         0.        ]
 ...
 [0.50650651 0.50800801 0.50800801 ... 0.75425425 0.9964965  0.        ]
 [0.50650651 0.50800801 0.50800801 ... 0.79129129 0.75975976 0.996997  ]
 [0.50650651 0.50800801 0.50800801 ... 0.79129129 0.78078078 0.996997  ]]
[[0.84684685 0.91321403 0.98307532 ... 0.         0.         0.8973974 ]
 [0.98748749 0.99899544 0.99099099 ... 0.         0.         0.8973974 ]
 [0.98748749 0.99899544 0.99099099 ... 0.         0.         0.        ]
 ...
 [0.5955956  0.48248248 0.58408408 ... 0.         0.         0.        ]
 [0.5955956  0.48248248 0.58408408 ... 0.         0.         0.        ]
 [0.5955956  0.48248248 0.58408408 ... 0.82432432 0.         0.        ]]


# Sample model

In [13]:
# Let's fit a basic NN that tries to predict the best bid price
#  10 minutes in the future. Train it on first half of the SAMPLE_PATH file.
#  Report test performance on a different file.

offset = 120 # 10 minutes roughly
data_size = 200000 # size of train/test set
slice_size = 10

X = read_scale(SAMPLE_PATH, no_obs=2*data_size, slice_size=slice_size)
y = read_bid_ask(SAMPLE_PATH, no_obs=2*data_size+offset-slice_size)[1][offset:]

X_tr, X_ts = X[:data_size], X[data_size:]
y_tr, y_ts = y[:data_size], y[data_size:]

In [20]:
# Training model. Takes 20s on my laptop. Ignore the ConvergenceWarning (need mroe data)

from sklearn.neural_network import MLPRegressor

nn = MLPRegressor(hidden_layer_sizes=(100, 100, 100,), max_iter=10)
nn.fit(X_tr, y_tr)



MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100, 100, 100), learning_rate='constant',
             learning_rate_init=0.001, max_iter=10, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)

In [21]:
nn.score(X_tr, y_tr) # This is the R^2
# As expected, performs very well on training data (massive overfitting)

0.8800552784956912

In [23]:
nn.score(X_ts, y_ts)
# And terribly on test data

-0.8562876471564211