In [None]:
! pip install torchinfo

In [42]:
import os
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report

import torch
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import torch.nn.functional as F
from torch.utils import data
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
from multiprocessing import Pool

In [None]:
# Pick the best available accelerator: CUDA first, then Apple-silicon MPS,
# and fall back to the CPU. (The previous check selected "mps" only when CUDA
# was available, which is never the right pairing.)
_mps_backend = getattr(torch.backends, "mps", None)  # absent on old PyTorch builds
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Read the raw CSV, turn every -121 entry into NaN (it looks like a
# missing-value sentinel -- TODO confirm), list the trading days present,
# and keep only the columns used further down.
path = './data/CU0.csv'

data = pd.read_csv(path).replace(-121, np.nan)
print(set(data['TradingDay'].values))

selected_columns = [
    'TradingDay', 'LastPrice', 'PreSettlementPrice', 'PreClosePrice',
    'PreOpenInterest', 'OpenPrice', 'HighestPrice', 'LowestPrice', 'OpenInterest',
    'BidPrice1', 'BidVolume1', 'AskPrice1', 'AskVolume1',
    'BidPrice2', 'BidVolume2', 'AskPrice2', 'AskVolume2',
    'BidPrice3', 'BidVolume3', 'AskPrice3', 'AskVolume3',
    'BidPrice4', 'BidVolume4', 'AskPrice4', 'AskVolume4',
    'BidPrice5', 'BidVolume5', 'AskPrice5', 'AskVolume5',
    'delta_Volume', 'delta_Turnover',
]
data = data[selected_columns]
data

In [45]:
# Column groups for the five visible order-book levels.
# An earlier (disabled) variant of feature_cols additionally ended with
# 'delta_Volume' and 'delta_Turnover'.
book_levels = range(1, 6)

# Per level: BidPrice, BidVolume, AskPrice, AskVolume -- prices therefore sit
# at even positions and volumes at odd positions.
feature_cols = [
    f'{side}{field}{level}'
    for level in book_levels
    for side in ('Bid', 'Ask')
    for field in ('Price', 'Volume')
]

# Per level: BidPrice, AskPrice.
price_cols = [f'{side}Price{level}' for level in book_levels for side in ('Bid', 'Ask')]

# Per level: BidVolume, AskVolume.
vol_cols = [f'{side}Volume{level}' for level in book_levels for side in ('Bid', 'Ask')]

In [46]:
def data_preprocess(raw_data, ret_window = 120, feature_columns=None, warmup=10):
    """Build the feature matrix and forward-return labels from raw tick data.

    The label of a row is the percentage change of the level-1 mid price,
    ``(BidPrice1 + AskPrice1) / 2``, between that row and the row
    ``ret_window`` positions later. The last ``ret_window`` rows (which have no
    future price) are discarded, then the first ``warmup`` rows of what is
    left. Unlike the previous version, ``raw_data`` itself is left untouched,
    so the call can be repeated on the same frame.

    NaN quotes propagate: a row whose current or future mid price is NaN gets
    a NaN label.

    Args:
        raw_data: DataFrame containing 'BidPrice1', 'AskPrice1' and every
            feature column.
        ret_window: Number of rows to look ahead for the label (>= 0).
        feature_columns: Columns returned as features, in order. Defaults to
            the module-level ``feature_cols``.
        warmup: Number of leading rows to discard (>= 0).

    Returns:
        ``(features, label)``: ``features`` is a 2-D array with one column per
        feature column, ``label`` is a 2-D array with a single column; both
        have the same number of rows.

    Raises:
        ValueError: If ``ret_window`` or ``warmup`` is negative.
    """
    if ret_window < 0 or warmup < 0:
        raise ValueError("ret_window and warmup must be non-negative")

    columns = feature_cols if feature_columns is None else list(feature_columns)

    midprice = (raw_data['BidPrice1'] + raw_data['AskPrice1']) / 2
    fut_midprice = midprice.shift(-ret_window)
    label = 100 * (fut_midprice - midprice) / midprice

    # Keep rows [warmup, len - ret_window): the tail has no future mid price.
    stop = max(len(raw_data) - ret_window, 0)
    features = raw_data[columns].iloc[warmup:stop].values
    label = label.iloc[warmup:stop].to_frame('label').values

    return features, label

In [None]:
# Build the (features, labels) pair returned by data_preprocess.
# `data` is deleted afterwards (presumably to free memory), so re-running this
# cell requires re-running the loading cell first.
dataset = data_preprocess(data)
del data
dataset

In [None]:
# Histogram (20 bins) of the label array, i.e. the second element of `dataset`.
plt.hist(dataset[1], bins=20)

In [49]:
class MyDataset(Dataset):
    """Sliding-window dataset over a ``(features, labels)`` pair.

    Sample ``k`` is the block of ``period`` consecutive feature rows ending at
    row ``end - 1`` together with the label of that last row, where ``end``
    runs over ``period, period + small_period, ...`` up to and including the
    number of rows (so the window ending on the final row is produced too).

    Args:
        data: Indexable pair ``(features, labels)`` of 2-D arrays with the same
            number of rows.
        period: Window length in rows.
        small_period: Stride, in rows, between consecutive windows.

    Raises:
        ValueError: If ``period`` or ``small_period`` is not positive, if the
            two arrays differ in length, or if there are fewer than ``period``
            rows (no window can be formed).
    """

    def __init__(self, data, period=600, small_period=1):
        features, labels = data[0], data[1]
        if period <= 0 or small_period <= 0:
            raise ValueError("period and small_period must be positive")
        if len(features) != len(labels):
            raise ValueError(
                "features and labels differ in length: {} vs {}".format(len(features), len(labels)))
        if len(features) < period:
            raise ValueError(
                "need at least period={} rows to build a window, got {}".format(period, len(features)))

        self.data = data
        self.period = period

        # `end` is exclusive; `+ 1` keeps the window that ends on the last row.
        window_ends = range(period, len(features) + 1, small_period)
        self.processed_data_x = np.stack(
            [features[end - period:end, :] for end in window_ends], axis=0)
        self.processed_data_y = np.stack(
            [labels[end - 1] for end in window_ends], axis=0)

    def __len__(self) :
        """Number of windows."""
        return len(self.processed_data_y)

    def __getitem__(self, idx):
        """Return ``(window, label)`` for window ``idx``."""
        return self.processed_data_x[idx, :,:], self.processed_data_y[idx, :]

In [None]:
# Chronological split: first 80% of rows for training, next 10% for
# validation, final 10% for testing. Windows are 600 rows long, taken every
# 20 rows.
n_rows = len(dataset[0])
train_end = int(0.8*n_rows)
val_end = int(0.9*n_rows)

train_data = MyDataset((dataset[0][:train_end, :], dataset[1][:train_end, :]), 600, 20)
val_data = MyDataset((dataset[0][train_end:val_end, :], dataset[1][train_end:val_end, :]), 600, 20)
test_data = MyDataset((dataset[0][val_end:, :], dataset[1][val_end:, :]), 600, 20)

for split in (train_data, val_data, test_data):
    print(len(split))

In [51]:
# Wrap each split in a DataLoader; only the training split is shuffled.
# NOTE: the names are rebound from Dataset to DataLoader, so this cell must
# not be run twice without re-running the split cell.
loader_options = dict(batch_size=32, num_workers=4, pin_memory=True)

train_data = DataLoader(train_data, shuffle=True, **loader_options)
val_data = DataLoader(val_data, shuffle=False, **loader_options)
test_data = DataLoader(test_data, shuffle=False, **loader_options)

In [52]:
class MyLSTM(nn.Module):
    """Convolution + inception + LSTM network over order-book windows.

    ``forward`` takes a float tensor of shape ``(batch, time, features)``. Three
    convolution blocks shrink the feature axis (halved twice, then reduced by a
    width-5 kernel) and shorten the time axis by 23 steps per ``(24, 1)``
    convolution (138 in total, so at least 139 time steps are needed). With the
    20 feature columns used in this notebook the feature axis collapses to 1.
    Three inception branches of 64 channels each are concatenated into 192
    channels, fed to a single-layer LSTM, and the last LSTM step is projected
    to ``y_len`` outputs.

    Output:
        * ``y_len == 1``: the raw linear output (regression). A softmax over a
          single value is always 1.0, which made the previous output constant
          and untrainable.
        * ``y_len > 1``: softmax probabilities over the ``y_len`` outputs.

    Args:
        y_len: Number of outputs.
    """

    def __init__(self, y_len):
        super().__init__()
        self.y_len = y_len

        # convolution blocks
        self.conv1 = self._conv_block(1, (1, 2), (1, 2), self._leaky_relu)
        self.conv2 = self._conv_block(32, (1, 2), (1, 2), nn.Tanh)
        self.conv3 = self._conv_block(32, (1, 5), (1, 1), self._leaky_relu)

        # inception modules
        self.inp1 = self._inception_branch(18)
        self.inp2 = self._inception_branch(15)
        self.inp3 = nn.Sequential(
            nn.MaxPool2d((3, 1), stride=(1, 1), padding=(1, 0)),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
        )

        # lstm layers
        self.lstm = nn.LSTM(input_size=192, hidden_size=64, num_layers=1, batch_first=True)
        self.fc1 = nn.Linear(64, self.y_len)

    @staticmethod
    def _leaky_relu():
        """Create the LeakyReLU activation used throughout the network."""
        return nn.LeakyReLU(negative_slope=0.01)

    @staticmethod
    def _conv_block(in_channels, kernel_size, stride, activation):
        """Build one feature-axis conv followed by two (24, 1) time-axis convs, each with activation and batch norm."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=32, kernel_size=kernel_size, stride=stride),
            activation(),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(24,1)),
            activation(),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(24,1)),
            activation(),
            nn.BatchNorm2d(32),
        )

    @staticmethod
    def _inception_branch(kernel_height):
        """Build a 1x1 conv followed by a (kernel_height, 1) 'same'-padded conv, 32 -> 64 channels."""
        return nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(kernel_height,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
        )

    def forward(self, x):
        """Run the network on ``x`` of shape (batch, time, features).

        Returns a tensor of shape (batch, y_len).

        Raises:
            ValueError: If the feature axis does not collapse to width 1 after
                the convolution blocks (the LSTM input size would not match).
        """
        # add the channel axis: (batch, 1, time, features)
        x = torch.unsqueeze(x, 1)

        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)

        if x.shape[3] != 1:
            raise ValueError(
                "feature axis has width {} after the convolution blocks, expected 1".format(x.shape[3]))

        x_inp1 = self.inp1(x)
        x_inp2 = self.inp2(x)
        x_inp3 = self.inp3(x)

        x = torch.cat((x_inp1, x_inp2, x_inp3), dim=1)

        # (batch, 192, time, 1) -> (batch, time, 192)
        x = x.permute(0, 2, 1, 3)
        x = torch.reshape(x, (-1, x.shape[1], x.shape[2]))

        # h0/c0: (number of layers, batch size, hidden size), created on the
        # same device/dtype as the input rather than on a global device.
        h0 = torch.zeros(1, x.size(0), 64, device=x.device, dtype=x.dtype)
        c0 = torch.zeros_like(h0)

        x, _ = self.lstm(x, (h0, c0))
        x = x[:, -1, :]
        x = self.fc1(x)

        if self.y_len > 1:
            return torch.softmax(x, dim=1)
        return x

In [None]:
# Single-output model, moved to the selected device; the bare name displays
# the module structure.
model = MyLSTM(1).to(device)
model

In [None]:
def xavier_init(m):
    """Re-initialise a Linear or LSTM module in place; ignore anything else.

    Parameters whose name contains 'weight' get Xavier-uniform values;
    otherwise, parameters whose name contains 'bias' are set to zero.
    Intended to be passed to ``Module.apply``.
    """
    if not isinstance(m, (nn.Linear, nn.LSTM)):
        return

    for param_name, tensor in m.named_parameters():
        if 'weight' in param_name:
            nn.init.xavier_uniform_(tensor)
        elif 'bias' in param_name:
            nn.init.constant_(tensor, 0.0)

# Run xavier_init on every submodule of the model; only Linear and LSTM
# modules are re-initialised by it.
model.apply(xavier_init)

In [None]:
# torchinfo summary for an input of size (16, 600, 20) -- presumably
# (batch, window length, feature columns).
summary(model, (16, 600, 20))

In [56]:
class HuberLoss(nn.Module):
    """Mean Huber loss.

    Each residual contributes ``0.5 * r**2`` while ``|r| <= delta`` and grows
    linearly with slope ``delta`` beyond that point.
    """

    def __init__(self, delta=1.0):
        super().__init__()
        self.delta = delta

    def forward(self, prediction, target):
        """Return the mean Huber loss between ``prediction`` and ``target``."""
        magnitude = (prediction - target).abs()
        # Portion of |r| inside the quadratic zone, and whatever exceeds it.
        capped = magnitude.clamp(max=self.delta)
        excess = magnitude - capped
        per_element = 0.5 * capped ** 2 + self.delta * excess
        return per_element.mean()

In [57]:
# Mean-squared-error objective optimised with Adam.
# NOTE(review): the HuberLoss class defined in this notebook is not used here.
# NOTE(review): lr=0.1 is far above Adam's default of 1e-3 -- confirm it is intended.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [None]:
# Timestamp (YYYYMMDDHHMMSS) used as the checkpoint file name for this run.
current_time = datetime.now().strftime("%Y%m%d%H%M%S")
current_time

In [59]:
cnt = 0  # kept for backward compatibility; batch_gd tracks its own counter
patience = 10  # epochs without validation improvement tolerated before stopping


def _normalize_batch(inputs):
    """Scale one batch of raw windows of shape (batch, time, features).

    Even-indexed columns (prices in this notebook's column order) become
    relative deviations from the mean of columns 0 and 2 at the first time step
    of each window. Odd-indexed columns (volumes) are divided by their
    per-window standard deviation; a zero deviation is replaced by 1 so the
    division cannot produce inf/NaN. The input tensor is not modified.
    """
    batch = inputs.detach().cpu().numpy().astype(np.float64)  # astype copies
    mean_price_value = np.mean(batch[:, 0, [0, 2]], axis=1).reshape(-1, 1, 1)
    batch[:, :, 0::2] = (batch[:, :, 0::2] - mean_price_value) / mean_price_value
    std_dev = np.std(batch[:, :, 1::2], axis=(1, 2)).reshape(-1, 1, 1)
    std_dev[std_dev == 0] = 1.0
    batch[:, :, 1::2] = batch[:, :, 1::2] / std_dev
    return torch.from_numpy(batch)


# A function to encapsulate the training Loop
def batch_gd(model, criterion, optimizer, train_loader, test_loader, epochs):
    """Train ``model`` with early stopping and return the per-epoch losses.

    Every batch, for training and validation alike, is normalised with
    ``_normalize_batch`` (validation batches used to be fed raw, so the
    validation loss was computed on inputs the model had never seen in that
    scale). Training targets are clamped to [-10, 10]; validation targets are
    left unclamped, as before. Whenever the mean validation loss improves, the
    whole model is saved to ``./models/<current_time>.pth``. Training stops
    once ``patience`` consecutive epochs bring no improvement.

    Relies on the module-level ``device``, ``current_time`` and ``patience``.

    Args:
        model: Network to train (already on ``device``).
        criterion: Loss callable ``(outputs, targets) -> scalar tensor``.
        optimizer: Optimizer over ``model``'s parameters.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        test_loader: Iterable of ``(inputs, targets)`` validation batches.
        epochs: Maximum number of epochs.

    Returns:
        ``(train_losses, test_losses)``: lists with one mean loss per completed
        epoch.
    """
    train_losses = []
    test_losses = []
    best_test_loss = np.inf
    best_test_epoch = 0
    # Local counter: the previous code incremented an unbound local `cnt` and
    # raised UnboundLocalError whenever the first epoch did not improve.
    epochs_since_best = 0

    # torch.save does not create missing directories.
    os.makedirs('./models', exist_ok=True)

    for it in range(epochs):
        model.train()
        t0 = datetime.now()
        train_loss = []

        with tqdm(train_loader, desc='Epoch {}'.format(it+1)) as loop:
            for inputs, targets in loop:
                # normalise on the CPU, cast to float32, then move to the device
                inputs = _normalize_batch(inputs).to(torch.float).to(device)
                targets = targets.to(device, dtype=torch.float)
                targets = torch.clamp(targets, min=-10.0, max=10.0)

                # zero the parameter gradients
                optimizer.zero_grad()
                # Forward pass
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                # Backward and optimize
                loss.backward()
                optimizer.step()
                train_loss.append(loss.item())

                loop.set_postfix(loss=np.mean(train_loss))

        # Running mean over the epoch, i.e. averaged while the weights change.
        train_loss = np.mean(train_loss)

        model.eval()
        test_loss = []

        with torch.no_grad(), tqdm(test_loader, desc='Epoch {}'.format(it+1)) as loop:
            for inputs, targets in loop:
                inputs = _normalize_batch(inputs).to(torch.float).to(device)
                targets = targets.to(device, dtype=torch.float)
                outputs = model(inputs)

                loss = criterion(outputs, targets)
                test_loss.append(loss.item())

        test_loss = np.mean(test_loss)

        train_losses.append(train_loss)
        test_losses.append(test_loss)

        if test_loss < best_test_loss:
            torch.save(model, './models/{}.pth'.format(current_time))
            best_test_loss = test_loss
            best_test_epoch = it
            print('model saved')
            epochs_since_best = 0
        else:
            epochs_since_best += 1
            if epochs_since_best >= patience:
                break

        dt = datetime.now() - t0
        print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, \
            Validation Loss: {test_loss:.4f}, Duration: {dt}, Best Val Epoch: {best_test_epoch}')

    return train_losses, test_losses

In [None]:
# Train for at most 10 epochs, validating on the validation loader.
train_losses, val_losses = batch_gd(model, criterion, optimizer, train_data, val_data, epochs=10)

In [None]:
# Per-epoch training and validation loss curves returned by batch_gd.
plt.figure(figsize=(15,6))
plt.plot(train_losses, label='train loss')
plt.plot(val_losses, label='validation loss')
plt.legend()

In [None]:
# Reload the best checkpoint of this run and score it on the test split.
# NOTE(review): the checkpoint is a fully pickled model; recent PyTorch
# releases default torch.load to weights_only=True and then need
# weights_only=False here -- confirm against the installed version.
model = torch.load('./models/{}.pth'.format(current_time), map_location=device).to(device, dtype=torch.float)
target_list = []
output_list = []

model.eval()
with torch.no_grad():
    for inputs, targets in test_data:
        # Apply the same per-window scaling the training loop applies; the
        # model was trained on normalised inputs, so raw windows would be
        # out of distribution.
        batch = inputs.detach().cpu().numpy().astype(np.float64)
        mean_price_value = np.mean(batch[:, 0, [0, 2]], axis=1).reshape(-1, 1, 1)
        batch[:, :, 0::2] = (batch[:, :, 0::2] - mean_price_value) / mean_price_value
        std_dev = np.std(batch[:, :, 1::2], axis=(1, 2)).reshape(-1, 1, 1)
        std_dev[std_dev == 0] = 1.0  # avoid division by zero on constant volumes
        batch[:, :, 1::2] = batch[:, :, 1::2] / std_dev

        # Cast to float32 on the CPU, then move to the device
        inputs = torch.from_numpy(batch).to(torch.float).to(device)
        targets = targets.to(device, dtype=torch.float)
        # Forward pass
        outputs = model(inputs)

        target_list.append(targets)
        output_list.append(outputs)

# Pearson correlation between true and predicted labels.
np.corrcoef(torch.cat(target_list, dim=0).reshape(-1).detach().to('cpu'), torch.cat(output_list, dim=0).reshape(-1).detach().to('cpu'))

In [None]:
# Scatter of true labels (x axis) against model outputs (y axis) on the test split.
plt.scatter(torch.cat(target_list, dim=0).detach().to('cpu'), torch.cat(output_list, dim=0).reshape(-1).detach().to('cpu'))