In [1]:
import torch
import torchmetrics
torch.manual_seed(0)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
import numpy as np
import torch.nn as nn
from torch.utils.data.dataset import Dataset

import pandas as pd

import tqdm, sys, os, gc, argparse, warnings

warnings.filterwarnings("ignore")

import gc
import time

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
cols=['n_close' ,'amount_delta', 'n_midprice', 'n_bid1', 'n_bsize1'] #, 'n_ask1', 'n_asize1']
seq_length=25

In [2]:
class SeqDataset(Dataset):
    def __init__(self, data_features, data_targets,target_index):
        self.len = data_features.size()[0]
        self.features=data_features
        self.targets=data_targets[:,target_index]
    def __getitem__(self, index):
        return self.features[index], self.targets[index]
    def __len__(self):
        return self.len

In [3]:
# LSTM

class GRU(nn.Module):
    def __init__(self):
        super(GRU, self).__init__()
        self.gru = nn.GRU(
            input_size=len(cols),
            hidden_size=128,
            num_layers=2,
            batch_first=True,
        )
        self.mlp = nn.Sequential(
            nn.Linear(128, 3),
            nn.Softmax()
        )
    
    def forward(self, input):
        output, _ = self.gru(input, None)
        output = output[:, -1, :]
        output = self.mlp(output)
        return output
    
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_size=len(cols),
            hidden_size=128,
            num_layers=2,
            batch_first=True,
        )
        self.mlp = nn.Sequential(
            nn.Linear(128, 3),
            nn.Softmax()
        )
    
    def forward(self, input):
        output, _ = self.lstm(input, None)
        output = output[:, -1, :]
        output = self.mlp(output)
        return output

model=LSTM().to(device)

criterion = nn.CrossEntropyLoss().cuda()

In [4]:
def train(train_loader, model, criterion, optimizer):
    start = time.time()
    start_batch = [start, 0]
    model.train()
    train_loss = 0.0
    preds = torch.tensor([]).to(device)
    target_all = torch.tensor([]).to(device)
    for i, (input, target) in enumerate(train_loader):
        input = input.to(device)
        target = target.to(torch.int64).to(device)
        output = model(input)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (i + 1) % 2500 == 0:
            start_batch[((i + 1) // 2500) % 2] = time.time()
            print(
                "Train loss",
                loss.item(),
                "t={}s".format(
                    start_batch[((i + 1) // 2500) % 2]
                    - start_batch[(1 + (i + 1) // 2500) % 2]
                ),
            )

        preds = torch.cat((preds, output.argmax(1)))
        target_all = torch.cat((target_all, target))

        train_loss += loss.item()

    val_acc = torchmetrics.functional.classification.multiclass_f1_score(
        preds, target_all, num_classes=3, average="macro"
    )
    print(i)
    print("t={}s".format(time.time() - start))
    print("F1 score", val_acc)
    return train_loss / len(train_loader)

def validate(val_loader, model, criterion):
    model.eval()
    val_acc = 0.0
    preds = torch.tensor([]).to(device)
    target_all = torch.tensor([]).to(device)
    val_loss=0.0
    with torch.no_grad():
        for i, (input, target) in enumerate(val_loader):
            input = input.to(device)
            target = target.to(torch.int64).to(device)
            output = model(input)
            loss = criterion(output, target)
            # val_acc += (output.argmax(1) == target).sum().item()
            preds = torch.cat((preds, output.argmax(1)))
            target_all = torch.cat((target_all, target))

        val_acc = torchmetrics.functional.classification.multiclass_f1_score(
            preds, target_all, num_classes=3, average="macro"
        )
        val_loss += loss.item()
    # return val_acc / len(val_loader.dataset)
    return val_acc,val_loss/len(val_loader)

def predict(test_loader, model):
    model.eval()

    test_pred = []
    with torch.no_grad():
        for i, (input, _) in enumerate(test_loader):
            input = input.to(device)
            output = model(input)
            test_pred.append(output.data.cpu().numpy())

    return np.vstack(test_pred)

In [5]:
target_index=2

# FOR LOOPS

In [None]:

# def get_DataSet(csvindex):
#     path = "../data/"
#     features = torch.tensor([])
#     targets = torch.tensor([])

#     train_files = os.listdir(path + "train")
#     train_file_list = [train_files[i:i+100] for i in range(0,len(train_files),100)]
#     train_file_list[11] = train_file_list[11]+train_file_list[12]
#     train_file_list.pop()
#     for filename in tqdm.tqdm(train_file_list[csvindex]):
#         train_df = pd.read_csv(path + "train/" + filename)
#         target_df = train_df[["label_5", "label_10", "label_20", "label_40", "label_60"]]
#         train_df = train_df[cols]
#         train_df = train_df.astype("float32")

#         tmp = train_df.iloc[[0]]
#         for i in range(seq_length - 2):
#             tmp = tmp.append(train_df.iloc[[0]])
#         train_df = tmp.append(train_df)

#         # tmp_features=torch.tensor([])
#         # tmp_targets=torch.tensor([])
#         # for i in range(1999):
#         #     f=torch.from_numpy(np.array(train_df.iloc[i:i+seq_length]))
#         #     t=torch.from_numpy(np.array(target_df.iloc[[i]]))
#         #     tmp_features=torch.cat((tmp_features,torch.unsqueeze(f,0)),0)
#         #     tmp_targets=torch.cat((tmp_targets,t),0)
#         tmp_features = torch.from_numpy(np.array(train_df))
#         tmp_features = tmp_features.unfold(0, seq_length, 1)
#         tmp_features = tmp_features.permute(0, 2, 1)
#         tmp_targets = torch.from_numpy(np.array(target_df))
#         # print(tmp_features.size())
#         # print(tmp_targets.size())
#         features = torch.cat((features, tmp_features), 0)
#         targets = torch.cat((targets, tmp_targets), 0)
        

#     print(features.size())
#     print(targets.size())
#     gc.collect()
#     Data_set = SeqDataset(
#         data_features=features, data_targets=targets, target_index=target_index
#     )
#     train_dataset, val_dataset = torch.utils.data.random_split(
#         dataset=Data_set, lengths=[0.9, 0.1], generator=torch.Generator().manual_seed(42)
#     )
#     trainLoader = torch.utils.data.DataLoader(
#         train_dataset, batch_size=32, shuffle=True, num_workers=0, pin_memory=False, drop_last=True
#     )
#     valLoader = torch.utils.data.DataLoader(
#         val_dataset, batch_size=32, shuffle=True, num_workers=0, pin_memory=False, drop_last=True
#     )
#     return trainLoader,valLoader

In [None]:
# epochs=400
# for epoch in range(epochs):
#     for index in range(6):
#         trainLoader,valLoader = get_DataSet(index)
#         train_loss = train(trainLoader, model, criterion, optimizer)
#         val_acc,val_loss = validate(valLoader, model, criterion)
#         print(
#             "epoch {} :".format(epoch + 1), "file group {} :".format(index+1), "train_loss:", train_loss, 'val_loss:', val_loss,"val_f1:", val_acc
#         )
#     # epochs += 1
#     if epoch % 2 ==0:
#         torch.save(model.state_dict(),"./model/baseline_test.pth")

In [7]:
def get_DataSet(csvindex):
    path = "../data/"
    features = torch.tensor([])
    targets = torch.tensor([])

    train_files = os.listdir(path + "train")
    train_file_list = [train_files[i:i+100] for i in range(0,len(train_files),100)]
    train_file_list[11] = train_file_list[11]+train_file_list[12]
    train_file_list.pop()
    i=0
    for filename in tqdm.tqdm(train_files):
        train_df = pd.read_csv(path + "train/" + filename)
        target_df = train_df[["label_5", "label_10", "label_20", "label_40", "label_60"]]
        train_df = train_df[cols]
        train_df = train_df.astype("float32")

        tmp = train_df.iloc[[0]]
        for i in range(seq_length - 2):
            tmp = tmp.append(train_df.iloc[[0]])
        train_df = tmp.append(train_df)

        # tmp_features=torch.tensor([])
        # tmp_targets=torch.tensor([])
        # for i in range(1999):
        #     f=torch.from_numpy(np.array(train_df.iloc[i:i+seq_length]))
        #     t=torch.from_numpy(np.array(target_df.iloc[[i]]))
        #     tmp_features=torch.cat((tmp_features,torch.unsqueeze(f,0)),0)
        #     tmp_targets=torch.cat((tmp_targets,t),0)
        tmp_features = torch.from_numpy(np.array(train_df))
        tmp_features = tmp_features.unfold(0, seq_length, 1)
        tmp_features = tmp_features.permute(0, 2, 1)
        tmp_targets = torch.from_numpy(np.array(target_df))
        # print(tmp_features.size())
        # print(tmp_targets.size())
        features = torch.cat((features, tmp_features), 0)
        targets = torch.cat((targets, tmp_targets), 0)
        i += 1
        if i % 100 ==0:
            time.sleep(1.5)
        

    print(features.size())
    print(targets.size())
    gc.collect()
    return features,targets


features,targets = get_DataSet(0)


  0%|          | 0/1225 [00:00<?, ?it/s]


AttributeError: 'DataFrame' object has no attribute 'append'

In [None]:


Data_set = SeqDataset(
    data_features=features, data_targets=targets, target_index=target_index
)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset=Data_set, lengths=[0.95, 0.05], generator=torch.Generator().manual_seed(42)
)
trainLoader = torch.utils.data.DataLoader(
    train_dataset, batch_size=64, shuffle=True, num_workers=0, pin_memory=False, drop_last=True
)
valLoader = torch.utils.data.DataLoader(
    val_dataset, batch_size=64, shuffle=True, num_workers=0, pin_memory=False, drop_last=True
)


In [None]:
len(trainLoader)*64

In [None]:
optimizer = torch.optim.Adam(model.parameters(), 0.0005)
epochs=400
for epoch in range(epochs):
        
    train_loss = train(trainLoader, model, criterion, optimizer)
    val_acc,val_loss = validate(valLoader, model, criterion)
    print(
        "epoch {} :".format(epoch + 1), "train_loss:", train_loss, 'val_loss:', val_loss,"val_f1:", val_acc
    )
    # epochs += 1
    if epoch % 2 ==0:
        torch.save(model.state_dict(),"./model/baseline_test.pth")

In [None]:
path = "../data/"
test_files = os.listdir(path + "test")

tfeatures = torch.tensor([])

for filename in tqdm.tqdm(test_files):
    test_df = pd.read_csv(path + "test/" + filename)
    test_df = test_df[cols]
    test_df = test_df.astype("float32")

    tmp = test_df.iloc[[0]]
    for i in range(seq_length - 2):
        tmp = tmp.append(test_df.iloc[[0]])
    test_df = tmp.append(test_df)

    tmp_tfeatures = torch.from_numpy(np.array(test_df))
    tmp_tfeatures = tmp_tfeatures.unfold(0, seq_length, 1)
    tmp_tfeatures = tmp_tfeatures.permute(0, 2, 1)

    tfeatures = torch.cat((tfeatures, tmp_tfeatures), 0)

test_set=SeqDataset(data_features=tfeatures,data_targets=torch.zeros([tfeatures.size()[0],1]),target_index=target_index)
testLoader=torch.utils.data.DataLoader(test_set,batch_size=32,shuffle=True,num_workers=0, pin_memory=False)

# for test