In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.optim import Adam

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# from preprocessing.preprocessing import ecb_pipeline_en, fast_detect

import time

import gc

from tqdm import tqdm


torch.set_default_dtype(torch.float32)

In [2]:
FILENAME = "data/train_series.csv"
FILENAME_ECB = "data/ecb_data_preprocessed.csv"
FILENAME_FED = "data/fed_data_preprocessed.csv"

In [3]:
returns = pd.read_csv(FILENAME, index_col=0)
ecb = pd.read_csv(FILENAME_ECB, index_col=0)
fed = pd.read_csv(FILENAME_FED, index_col=0)

In [4]:
returns = pd.get_dummies(returns, columns=["Index Name"])

In [5]:
returns["Sign"] = (returns["Index + 1"] > 0).astype(int)

In [6]:
y = returns["Sign"]

In [7]:
y.value_counts()

0    4930
1    4016
Name: Sign, dtype: int64

In [8]:
returns = returns.drop(["Sign", "Index + 1"], axis=1)

In [9]:
nontextual_cols = ['Index - 9',
 'Index - 8',
 'Index - 7',
 'Index - 6',
 'Index - 5',
 'Index - 4',
 'Index - 3',
 'Index - 2',
 'Index - 1',
 'Index - 0',
 'Index Name_CVIX Index',
 'Index Name_EURUSD Curncy',
 'Index Name_EURUSDV1M Curncy',
 'Index Name_MOVE Index',
 'Index Name_SPX Index',
 'Index Name_SRVIX Index',
 'Index Name_SX5E Index',
 'Index Name_V2X Index',
 'Index Name_VIX Index']
nb_nontextfeatures = len(nontextual_cols)

In [10]:
# 60% train, 20% val, 20% test

returns_, returns_test, y_, y_test = train_test_split(
    returns, y, test_size=0.2, train_size=0.8,
    random_state=0, stratify=y
    )

returns_train, returns_val, y_train, y_val = train_test_split(
    returns_, y_, test_size=0.25, train_size=0.75,
    random_state=42, stratify=y_
    )

In [11]:
del returns, y
gc.collect()

0

# The textual data

# Translation

# HuggingFace Transformers

# Loading data

In [12]:
from model.framework_dataset import get_data_loader
from model.framework_model import MyModel

# Loading model

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Other example

In [14]:
config = {

    "method": None,

    "learning_rate": 0.001,

    "weight_decay": 0.001,

    "batch_size": 64,

    "layers": 5,

    "dropout": 0.5,

    "separate": False,
    
    "max_corpus_len": 2


}

In [15]:
print(config["method"])

None


In [16]:
train_set, train_loader, tokenizer, steps = get_data_loader(
    returns_train, ecb, fed, y_train, method=config["method"],
    separate=config["separate"], max_corpus_len=config["max_corpus_len"],
    batch_size=config["batch_size"]
)

val_set, val_loader, tokenizer, steps = get_data_loader(
    returns_val, ecb, fed, y_val, method=config["method"],
    separate=config["separate"], max_corpus_len=config["max_corpus_len"],
    batch_size=config["batch_size"]
)

test_set, test_loader, tokenizer, steps = get_data_loader(
    returns_test, ecb, fed, y_test, method=config["method"],
    separate=config["separate"], max_corpus_len=config["max_corpus_len"],
    batch_size=config["batch_size"]
)

In [17]:
my_model = MyModel(
    nontext_dim=nb_nontextfeatures, method=config["method"],
    separate=config["separate"], dropout=config["dropout"]
).to(device)

In [18]:
my_model

MyModel(
  (nontext_network): NontextualNetwork()
  (corpus_encoder): CorpusEncoder()
  (classifier): ClassificationHead(
    (mlp): GPTMLP(
      (layers): Sequential(
        (0): Linear(in_features=19, out_features=128, bias=True)
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): Dropout(p=0.5, inplace=False)
        (4): Linear(in_features=128, out_features=128, bias=True)
        (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (6): ReLU()
        (7): Dropout(p=0.5, inplace=False)
        (8): Linear(in_features=128, out_features=128, bias=True)
        (9): Linear(in_features=128, out_features=1, bias=True)
      )
    )
  )
)

In [19]:
next(iter(train_loader))

[tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 2.3337e-02,  0.0000e+00, -1.3661e-02,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-1.9520e-01, -1.0922e-03,  5.8503e-02,  ...,  0.0000e+00,
           1.0000e+00,  0.0000e+00],
         ...,
         [-4.5488e-03,  6.0890e-03, -3.0827e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 4.9453e-02,  3.0743e-02, -9.1785e-04,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-2.8000e-02,  4.5174e-03, -6.7419e-03,  ...,  0.0000e+00,
           1.0000e+00,  0.0000e+00]], dtype=torch.float64),
 tensor([0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
         0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
         1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1])]

In [21]:
# Test output
batch = next(iter(train_loader))
print(batch)

with torch.no_grad():
    X_ind, y = batch
    my_model_output = my_model(None, None, X_ind.float().to(device))

print(my_model_output.size(0)/64)

[tensor([[ 3.6228e-03,  6.5330e-03,  6.0443e-03,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-5.9292e-03, -4.0927e-02, -2.0905e-02,  ...,  1.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-3.3140e-03, -1.9744e-02,  2.2270e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [ 1.3176e-03,  7.1796e-04,  1.0471e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 7.8910e-03, -1.3175e-02, -8.4360e-03,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-2.7449e-02, -1.6232e-02,  9.9189e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], dtype=torch.float64), tensor([0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
        0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
        1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0])]
128.0


# Testing train code

In [None]:
from train import train, evaluate

In [None]:
my_model

MyModel(
  (nontext_network): NontextualNetwork()
  (corpus_encoder): CorpusEncoder()
  (classifier): ClassificationHead(
    (mlp): GPTMLP(
      (layers): Sequential(
        (0): Linear(in_features=19, out_features=128, bias=True)
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): Dropout(p=0.5, inplace=False)
        (4): Linear(in_features=128, out_features=128, bias=True)
        (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (6): ReLU()
        (7): Dropout(p=0.5, inplace=False)
        (8): Linear(in_features=128, out_features=128, bias=True)
        (9): Linear(in_features=128, out_features=1, bias=True)
      )
    )
  )
)

In [None]:
train(my_model, train_loader=train_loader, val_loader=val_loader,config=config,
            device=device, max_epochs=5, eval_every=2, name="no_nlp")

Epoch 1:   0%|          | 0/84 [00:00<?, ?batch/s]


torch.Size([8192])


ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([8192]))

In [23]:
import optuna

In [24]:
def objective(trial):
      config ={

                "method": None,

                "learning_rate": 10**trial.suggest_float("lr_exp", -6, -2),

                "weight_decay": 10**trial.suggest_float("weight_decay_exp", -6, -3),

                "batch_size": 2**trial.suggest_int("batch_size_exp", 4, 5),

                "layers": trial.suggest_int("layers", 1, 8),

                "mlp_hidden_dim": 64,

                "separate": False,

                "max_corpus_len": 1,

                "dropout": trial.suggest_float("dropout", 0.2, 0.7),

            }
      model = MyModel(
            nontext_dim=nb_nontextfeatures, method=config["method"],
            separate=False, dropout=config["dropout"]
            ).to(device)

      _, train_loader, _, _ = get_data_loader(
      returns_train, ecb, fed, y_train, method=config["method"],
      separate=config["separate"], max_corpus_len=config["max_corpus_len"],
      batch_size=config["batch_size"]
      )

      _, val_loader, _, _ = get_data_loader(
      returns_val, ecb, fed, y_val, method=config["method"],
      separate=config["separate"], max_corpus_len=config["max_corpus_len"],
      batch_size=config["batch_size"]
      )

      _, _, _, _ = get_data_loader(
      returns_test, ecb, fed, y_test, method=config["method"],
      separate=config["separate"], max_corpus_len=config["max_corpus_len"],
      batch_size=config["batch_size"]
      )

      _, _, eval_f1s = train(model, train_loader=train_loader, val_loader=val_loader,config=config,
            device=device, max_epochs=1, eval_every=1, name=f"no_nlp_{config['learning_rate']}")
      return eval_f1s[-1]

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, n_jobs=3)

[32m[I 2023-03-08 22:50:20,964][0m A new study created in memory with name: no-name-7219f6bd-89ad-4c1b-9c4c-27298a09bd60[0m
  0%|          | 0/336 [00:00<?, ?batch/s]
Epoch 1:   0%|          | 0/336 [00:00<?, ?batch/s]
Epoch 1:   0%|          | 1/336 [00:00<02:34,  2.17batch/s, accuracy=56.2, loss=0.694]
[A
[A
Epoch 1:   1%|          | 2/336 [00:01<02:52,  1.93batch/s, accuracy=56.2, loss=0.691]
[A
[A
Epoch 1:   1%|          | 3/336 [00:01<03:09,  1.76batch/s, accuracy=56.2, loss=0.693]
[A
[A
Epoch 1:   1%|          | 4/336 [00:02<02:49,  1.96batch/s, accuracy=59.4, loss=0.692]
[A
[A
Epoch 1:   2%|▏         | 6/336 [00:02<02:33,  2.14batch/s, accuracy=62.5, loss=0.692]
[A
[A
[A
Epoch 1:   2%|▏         | 6/336 [00:03<02:33,  2.14batch/s, accuracy=62.5, loss=0.691]
Epoch 1:   2%|▏         | 7/336 [00:03<03:04,  1.79batch/s, accuracy=62.5, loss=0.691]
Epoch 1:   2%|▏         | 8/336 [00:04<02:29,  2.19batch/s, accuracy=60.9, loss=0.692]
[A
[A
Epoch 1:   3%|▎         | 9/33

TypeError: Object of type float32 is not JSON serializable