In [7]:
import numpy as np
import pandas as pd
import pickle

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import f1_score, precision_recall_fscore_support, confusion_matrix

In [2]:
def labelizer(true, pred):
    true_trinary, pred_trinary = true.copy(), pred.copy()
    true_trinary[true_trinary>0], pred_trinary[pred_trinary>0] = 1, 1
    true_trinary[true_trinary<=0],  pred_trinary[pred_trinary<=0] = -1, -1
    return true_trinary, pred_trinary

In [3]:
class MetaLoader(Dataset):
    '''
    Class for loading data in the meta-labelling model.
    '''
    def __init__(self, df, forecast_history, forecast_length, start_stamp=0, end_stamp=None, LAG=0):
        super().__init__()
        self.forecast_history = forecast_history
        self.forecast_length = forecast_length
        self.LAG = LAG
        self.df = df.copy()
        if start_stamp != 0 and end_stamp is not None:
            if self.LAG == 0:
                self.df = self.df[start_stamp:end_stamp]
            else:
                self.df = self.df[start_stamp:end_stamp+self.LAG]
        elif start_stamp != 0:
            self.df = self.df[start_stamp:]
        elif end_stamp is not None:
            if self.LAG == 0:
                self.df = self.df[:end_stamp]
            else:
                self.df = self.df[:end_stamp+self.LAG]
        if (len(self.df) - self.df.count()).max() != 0:
            print('Missing values in data.')
            print(len(self.df) - self.df.count())
        self.counter = 0

    def __getitem__(self, idx):
        rows = self.df.iloc[idx: self.forecast_history + idx].copy().to_numpy()
        epsilon = 0.0001
        targs_idx_start = self.forecast_history + idx
        future_return = 0
        if self.LAG == 0:
            targ_rows = self.df.iloc[targs_idx_start: self.forecast_length + targs_idx_start].copy().to_numpy()
        else:
            future_prices = self.df.iloc[targs_idx_start + self.LAG - 1: targs_idx_start + self.LAG].copy().to_numpy()
            future_return = np.log(future_prices[0,1]/rows[-1:,1])*10_000
        src_data = rows
        src_std = np.std(src_data, axis = 0)+epsilon
        src_median = np.mean(src_data, axis = 0)
        src_std, src_median = src_std.flatten(), src_median.flatten()
        src_data_medianized = torch.from_numpy((src_data-src_median)/src_std).float()
        future_return_trinary = np.array([0,0,1]) if future_return > 0 else np.array([1,0,0]) if future_return < 0 else np.array([0,1,0])
        return src_data_medianized, future_return_trinary, src_data


    def __len__(self):
        if self.LAG == 0:
            return (len(self.df) - self.forecast_history - self.forecast_length - 1)
        else:
            return (len(self.df) - self.forecast_history - self.LAG - 1)

In [4]:
class Meta(torch.nn.Module):
    '''
    Meta-labelling model
    '''
    def __init__(self, input_size, d_model, output_size):
        super(Meta, self).__init__()
        self.linear1 = nn.Linear(input_size, d_model)
        self.activation1 = nn.ReLU()
        self.linear2 = nn.Linear(d_model, output_size)
        self.activation2 = nn.Softmax()

    def forward(self, x):
        out = self.linear1(x)
        out = self.activation1(out)
        out = out[:,0,:]
        out = self.linear2(out)
        out = self.activation2(out)
        return out

In [5]:
orderbook = pd.read_csv('./input_data/all/orderbook.csv')
orderbook['price'] = orderbook['w_midprice']

In [8]:
with open('./results_HFformer/1658424401_list_results.pkl', 'rb') as f:
    predictions_hfformer = pickle.load(f)

In [15]:
pred_returns, true_returns = predictions_hfformer[19][0], predictions_hfformer[19][1]
true_trinary, pred_trinary = labelizer(true_returns, pred_returns)

In [17]:
precision_recall_fscore_support(true_trinary, pred_trinary)

(array([0.64033568, 0.62742576]),
 array([0.64495063, 0.62271627]),
 array([0.64263487, 0.62506215]),
 array([101983,  97921]))

In [18]:
orderbook = orderbook[2_000_000:2_000_000+len(pred_returns)]
orderbook['predicted_return'] = pred_trinary

In [20]:
def augment_trade_data(df, lag, forecast_window=None):
    if forecast_window:
        df['lag_return'] = np.log(df['price'].shift(forecast_window)/df['price'].shift(forecast_window+1))
        return df.iloc[forecast_window+1:,:]
    if lag == 0:
        return df
    else:
        col_name = 'log_lag'+str(lag)+'_price'
        df[col_name] = np.log(df.price) - np.log(df.price).shift(lag)
        return df.iloc[lag:,:]

## Training

In [38]:
forecast_history = 400
forecast_window = 30
epochs = 20
batch_size = 64
shuffle = False

In [35]:
trade = augment_trade_data(orderbook, lag=0, forecast_window=forecast_window)

features = ['predicted_return',
            'price', 'lag_return',
            'bid1', 'bidqty1', 'bid2', 'bidqty2', 'bid3', 'bidqty3', 'bid4', 'bidqty4', 'bid5', 'bidqty5',
            'bid6', 'bidqty6', 'bid7', 'bidqty7', 'bid8', 'bidqty8', 'bid9', 'bidqty9',
            'ask1', 'askqty1', 'ask2', 'askqty2', 'ask3', 'askqty3', 'ask4', 'askqty4', 'ask5', 'askqty5',
            'ask6', 'askqty6', 'ask7', 'askqty7', 'ask8', 'askqty8', 'ask9', 'askqty9']

train_set = MetaLoader(df=trade[features], forecast_history=forecast_history, forecast_length=1,
                          start_stamp=0, end_stamp=10_000, LAG=2)

val_set = MetaLoader(df=trade[features], forecast_history=forecast_history, forecast_length=1,
                          start_stamp=0, end_stamp=10_000, LAG=2)

In [36]:
model = Meta(39, 16, 3)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adamax(model.parameters(), lr=0.01)

In [39]:
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=shuffle, sampler=None, batch_sampler=None, num_workers=0)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=shuffle, sampler=None, batch_sampler=None, num_workers=0)

for epoch in range(1, epochs+1):
    
    output_all = torch.Tensor(0)
    trg_all = torch.Tensor(0)
    model1_pred = torch.Tensor(0)
    
    train_loss = 0
    model.train()
    for src, trg, src_data in train_loader:
        optimizer.zero_grad()
        output = model(src)
        loss = criterion(output, trg.float())
        train_loss =+ loss.item()
        loss.backward()
        optimizer.step()
    
    val_loss = 0
    model.eval()
    for src, trg, src_data in val_loader:
        output = model(src)
        
        output_all = torch.cat((output_all, output))
        trg_all = torch.cat((trg_all, trg))
        
        loss = criterion(output, trg.float())
        val_loss += loss.item()
    
    output_all = output_all.cpu().detach().numpy()
    trg_all = trg_all.cpu().detach().numpy()
    
    output_all_trinary = np.zeros_like(output_all)
    output_all_trinary[np.arange(len(output_all)), output_all.argmax(1)] = 1
    
    diff = np.abs(trg_all-output_all_trinary)
    diff_sum = np.sum(diff, axis=1)
    correct_actions = len(diff_sum[np.where(diff_sum==0)])
             
    f1 = f1_score(trg_all, output_all_trinary, average='macro')
    print(confusion_matrix(trg_all.argmax(axis=1), output_all_trinary.argmax(axis=1)))
    
    #Determine same matches between model1 and model2
    
    
    print(f'| epoch: {epoch} | train loss: {train_loss} | val loss: {val_loss} | f1 score: {f1} |')  

  out = self.activation2(out)


[[4594    0  223]
 [   2    0    0]
 [4522    0  258]]
| epoch: 1 | train loss: 0.9850111603736877 | val loss: 144.4152673482895 | f1 score: 0.2524757269844349 |


  out = self.activation2(out)


[[4588    0  229]
 [   2    0    0]
 [4509    0  271]]
| epoch: 2 | train loss: 0.983620822429657 | val loss: 144.27565652132034 | f1 score: 0.2540121319547879 |


  out = self.activation2(out)


[[4577    0  240]
 [   2    0    0]
 [4472    0  308]]
| epoch: 3 | train loss: 0.9817782640457153 | val loss: 144.17260360717773 | f1 score: 0.2585654590269531 |


  out = self.activation2(out)


[[4529    0  288]
 [   2    0    0]
 [4435    0  345]]
| epoch: 4 | train loss: 0.9808297157287598 | val loss: 144.11379289627075 | f1 score: 0.26155243080357155 |


  out = self.activation2(out)


[[4489    0  328]
 [   2    0    0]
 [4358    0  422]]
| epoch: 5 | train loss: 0.9792852997779846 | val loss: 144.02448111772537 | f1 score: 0.2698603125085182 |


  out = self.activation2(out)


[[4452    0  365]
 [   1    0    1]
 [4318    0  462]]
| epoch: 6 | train loss: 0.9778462052345276 | val loss: 143.96687304973602 | f1 score: 0.27334956538390637 |


  out = self.activation2(out)


[[4399    0  418]
 [   1    0    1]
 [4238    0  542]]
| epoch: 7 | train loss: 0.9763882160186768 | val loss: 143.89722138643265 | f1 score: 0.2809001979908427 |


  out = self.activation2(out)


[[4285    0  532]
 [   1    0    1]
 [4059    0  721]]
| epoch: 8 | train loss: 0.9755994081497192 | val loss: 143.76732003688812 | f1 score: 0.2966986566239284 |


  out = self.activation2(out)


[[4252    0  565]
 [   1    0    1]
 [4024    0  756]]
| epoch: 9 | train loss: 0.9756187200546265 | val loss: 143.7134144306183 | f1 score: 0.2990817925118713 |


  out = self.activation2(out)


[[4171    0  646]
 [   1    0    1]
 [3942    0  838]]
| epoch: 10 | train loss: 0.9740628004074097 | val loss: 143.62005281448364 | f1 score: 0.3042114479601546 |


  out = self.activation2(out)


[[4175    0  642]
 [   1    0    1]
 [3923    0  857]]
| epoch: 11 | train loss: 0.9741186499595642 | val loss: 143.57824009656906 | f1 score: 0.30647163872522043 |


  out = self.activation2(out)


[[3983    0  834]
 [   1    0    1]
 [3669    0 1111]]
| epoch: 12 | train loss: 0.9752256870269775 | val loss: 143.4572075009346 | f1 score: 0.3230576497877789 |


  out = self.activation2(out)


[[3968    0  849]
 [   1    0    1]
 [3631    0 1149]]
| epoch: 13 | train loss: 0.9732086062431335 | val loss: 143.35918962955475 | f1 score: 0.32603727774896674 |


  out = self.activation2(out)


[[3837    0  980]
 [   1    0    1]
 [3487    0 1293]]
| epoch: 14 | train loss: 0.9715458154678345 | val loss: 143.2275237441063 | f1 score: 0.3328738647299836 |


  out = self.activation2(out)


[[3816    0 1001]
 [   1    0    1]
 [3447    0 1333]]
| epoch: 15 | train loss: 0.9723817110061646 | val loss: 143.18185472488403 | f1 score: 0.33547903955631414 |


  out = self.activation2(out)


[[3778    0 1039]
 [   1    0    1]
 [3391    0 1389]]
| epoch: 16 | train loss: 0.9709256291389465 | val loss: 143.0759425163269 | f1 score: 0.3385670630397986 |


  out = self.activation2(out)


[[3773    0 1044]
 [   1    0    1]
 [3383    0 1397]]
| epoch: 17 | train loss: 0.9709489941596985 | val loss: 143.02171349525452 | f1 score: 0.33902406874946794 |


  out = self.activation2(out)


[[3726    0 1091]
 [   1    0    1]
 [3318    0 1462]]
| epoch: 18 | train loss: 0.9704731702804565 | val loss: 142.9431659579277 | f1 score: 0.34230520359647265 |


  out = self.activation2(out)


[[3725    0 1092]
 [   2    0    0]
 [3324    0 1456]]
| epoch: 19 | train loss: 0.9726157784461975 | val loss: 142.91895043849945 | f1 score: 0.3417061230056581 |


  out = self.activation2(out)


[[3690    0 1127]
 [   2    0    0]
 [3258    0 1522]]
| epoch: 20 | train loss: 0.9731727242469788 | val loss: 142.74199426174164 | f1 score: 0.34564109731630555 |


In [40]:
confusion_matrix(trg_all.argmax(axis=1), output_all_trinary.argmax(axis=1))

array([[3690,    0, 1127],
       [   2,    0,    0],
       [3258,    0, 1522]])