In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.optim import Adam

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# from preprocessing.preprocessing import ecb_pipeline_en, fast_detect
from preprocessing.outlier_detection import remove_outlier

import time

import gc

from tqdm import tqdm


# Make float32 the default floating-point dtype for tensors created after this point.
torch.set_default_dtype(torch.float32)

In [2]:
# Locations of the input CSV files, relative to the notebook's working directory.
FILENAME = "data/train_series.csv"  # return series, read into `returns`
FILENAME_ECB = "data/ecb_data_preprocessed.csv"  # read into `ecb`
FILENAME_FED = "data/fed_data_preprocessed.csv"  # read into `fed`

In [3]:
# Load the three input tables; the first CSV column becomes the row index of each frame.
returns, ecb, fed = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (FILENAME, FILENAME_ECB, FILENAME_FED)
)

In [4]:
# Apply the project's outlier helper to the return series
# (presumably drops outlier rows — see preprocessing.outlier_detection to confirm).
# NOTE(review): this rebinds `returns`, so re-running the cell processes the already-processed frame.
returns = remove_outlier(returns)

In [5]:
# One-hot encode the categorical "Index Name" column into "Index Name_<value>" indicator columns.
returns = pd.get_dummies(returns, columns=["Index Name"])

In [6]:
# Binary target: 1 when the "Index + 1" return is strictly positive, otherwise 0
# (a return of exactly zero is therefore labelled 0).
returns["Sign"] = (returns["Index + 1"] > 0).astype(int)

In [7]:
# Preview the first rows after one-hot encoding and labelling.
returns.head()

Unnamed: 0,Index - 9,Index - 8,Index - 7,Index - 6,Index - 5,Index - 4,Index - 3,Index - 2,Index - 1,Index - 0,...,Index Name_CVIX Index,Index Name_EURUSD Curncy,Index Name_EURUSDV1M Curncy,Index Name_MOVE Index,Index Name_SPX Index,Index Name_SRVIX Index,Index Name_SX5E Index,Index Name_V2X Index,Index Name_VIX Index,Sign
0,0.001045,0.005841,0.003832,-0.027519,-0.103565,-0.045086,-0.011265,0.005164,0.05405,0.015779,...,0,0,0,0,0,0,0,1,0,1
1,-0.021497,0.007891,-0.013175,-0.008436,0.0,0.026303,0.000556,0.001455,0.007422,0.0,...,0,0,0,1,0,0,0,0,0,1
2,-0.001872,-0.008154,0.023588,0.004086,0.003493,0.0033,0.000885,-0.011304,0.00504,0.000156,...,0,0,0,0,1,0,0,0,0,1
3,0.00498,-0.000864,0.001677,0.0,0.00603,-0.001083,0.000419,0.001492,0.001018,-0.002582,...,0,0,0,0,1,0,0,0,0,1
4,0.00036,-0.001893,0.005579,-0.003056,-0.001171,-0.001623,-0.00235,-0.006444,-0.000729,-0.000365,...,0,1,0,0,0,0,0,0,0,1


In [8]:
# Names of the non-text model inputs: the ten lagged returns ("Index - 9" ... "Index - 0")
# followed by the one-hot "Index Name_*" indicator columns.
lagged_return_cols = [f"Index - {lag}" for lag in range(9, -1, -1)]
index_name_cols = [
    f"Index Name_{ticker}"
    for ticker in (
        "CVIX Index",
        "EURUSD Curncy",
        "EURUSDV1M Curncy",
        "MOVE Index",
        "SPX Index",
        "SRVIX Index",
        "SX5E Index",
        "V2X Index",
        "VIX Index",
    )
]
nontextual_cols = lagged_return_cols + index_name_cols
nb_nontextfeatures = len(nontextual_cols)

In [9]:
# Pull out the target column and check the class balance.
y = returns["Sign"]
y.value_counts()

0    4519
1    4007
Name: Sign, dtype: int64

In [10]:
# Remove the label and the "Index + 1" return it was derived from, so that
# neither of them stays among the feature columns.
returns = returns.drop(columns=["Sign", "Index + 1"])

In [11]:
# 60% train, 20% val, 20% test (both splits are stratified on the label)

# First hold out 20% of all rows as the test set.
returns_, returns_test, y_, y_test = train_test_split(
    returns, y, test_size=0.2, train_size=0.8,
    random_state=0, stratify=y
    )

# Then split the remaining 80% into 75% / 25%, i.e. 60% / 20% of the full data.
returns_train, returns_val, y_train, y_val = train_test_split(
    returns_, y_, test_size=0.25, train_size=0.75,
    random_state=42, stratify=y_
    )

In [12]:
# Reattach the labels to the training features so that any row duplication or
# filtering applied to `returns_train` afterwards carries each row's label along.
returns_train = pd.concat([returns_train, y_train], axis=1)

In [13]:
# The "index ecb" / "index fed" cells hold comma-separated strings; turn each into a list of its parts.
for announcement_col in ("index ecb", "index fed"):
    returns_train[announcement_col] = returns_train[announcement_col].str.split(",")

In [14]:
# Repeat samples that reference several ECB and/or FED announcements so that every
# resulting row carries exactly one ECB and one FED entry (all ECB x FED pairs of
# the original sample), then discard rows that still contain a missing value.
returns_train = (
    returns_train
    .explode("index ecb")
    .explode("index fed")
    .dropna()
)


In [15]:
# Inspect the exploded training frame; repeated index labels are rows duplicated by `explode`.
returns_train

Unnamed: 0,Index - 9,Index - 8,Index - 7,Index - 6,Index - 5,Index - 4,Index - 3,Index - 2,Index - 1,Index - 0,...,Index Name_CVIX Index,Index Name_EURUSD Curncy,Index Name_EURUSDV1M Curncy,Index Name_MOVE Index,Index Name_SPX Index,Index Name_SRVIX Index,Index Name_SX5E Index,Index Name_V2X Index,Index Name_VIX Index,Sign
4645,-0.004264,0.008511,0.016807,0.008299,0.000000,-0.016667,-0.042925,-0.017700,-0.008969,0.039740,...,0,0,1,0,0,0,0,0,0,0
3230,-0.008518,0.003416,-0.040893,-0.008025,0.069172,0.026382,-0.036458,0.010076,0.000000,0.000000,...,0,0,0,1,0,0,0,0,0,0
1301,0.005284,-0.000998,0.010213,-0.004101,-0.011616,0.001504,-0.003657,-0.024653,-0.006352,0.012444,...,0,1,0,0,0,0,0,0,0,1
1088,-0.039405,-0.016078,0.018308,0.061411,-0.008718,-0.004843,-0.030184,0.039245,0.028455,0.016807,...,0,0,1,0,0,0,0,0,0,0
6586,-0.002471,0.000000,0.000000,-0.030129,0.008506,0.004636,-0.026157,0.004215,0.028218,-0.016704,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2623,0.044951,0.004568,-0.056247,0.059887,-0.064720,-0.079639,0.000000,0.029976,0.027124,0.038878,...,0,0,0,1,0,0,0,0,0,1
4855,0.002891,0.012038,-0.000974,0.000905,0.008521,-0.009217,-0.003837,0.006340,-0.011104,-0.002672,...,0,1,0,0,0,0,0,0,0,1
1076,-0.068951,-0.049201,-0.025539,0.000000,0.070942,-0.009557,-0.017437,-0.048725,0.008853,0.080720,...,0,0,0,0,0,0,0,0,1,1
1076,-0.068951,-0.049201,-0.025539,0.000000,0.070942,-0.009557,-0.017437,-0.048725,0.008853,0.080720,...,0,0,0,0,0,0,0,0,1,1


In [16]:
# Separate the exploded training frame back into labels and features,
# then look at the class balance after the row duplication.
y_train = returns_train["Sign"]
returns_train = returns_train.drop(columns=["Sign"])
y_train.value_counts()

0    3810
1    3400
Name: Sign, dtype: int64

In [17]:
# Class balance of the validation split (one row per original sample).
y_val.value_counts()

0    904
1    801
Name: Sign, dtype: int64

In [18]:
# Second version of the validation set in which every row holds exactly one ECB and
# one FED announcement: same split -> explode -> dropna treatment as the training set.
returns_val_t = (
    pd.concat([returns_val, y_val], axis=1)
    .assign(**{
        "index ecb": lambda frame: frame["index ecb"].str.split(","),
        "index fed": lambda frame: frame["index fed"].str.split(","),
    })
    .explode("index ecb")
    .explode("index fed")
    .dropna()
)

# Split labels and features apart again and check the class balance.
y_val_t = returns_val_t["Sign"]
returns_val_t = returns_val_t.drop(columns=["Sign"])
y_val_t.value_counts()


0    1273
1    1113
Name: Sign, dtype: int64

In [19]:
# Class balance of the held-out test split.
y_test.value_counts()

0    904
1    802
Name: Sign, dtype: int64

# The textual data

# Translation

# HuggingFace Transformers

# Loading data

In [20]:
from model.framework_dataset import get_data_loader
from model.framework_model import CorpusEncoder, ClassificationHead, MyModel

In [21]:
# Experiment settings. "method", "separate", "max_corpus_len" and "batch_size"
# are forwarded to get_data_loader in the next cell; the remaining entries are
# kept alongside them for the model / training code.
config = dict(
    method="model_03",
    learning_rate=0.001,
    weight_decay=0.,
    batch_size=32,
    layers=3,
    dropout=0.3,
    separate=True,
    max_corpus_len=1,
)

In [25]:
# Build one dataset/loader per split with identical loader settings.
# Note: `tokenizer` and `steps` are rebound by every call, so after this cell
# they hold the values returned for the last split (the exploded validation set).
loader_kwargs = dict(
    method=config["method"],
    separate=config["separate"],
    max_corpus_len=config["max_corpus_len"],
    batch_size=config["batch_size"],
)

train_set, train_loader, tokenizer, steps = get_data_loader(
    returns_train, ecb, fed, y_train, **loader_kwargs
)

val_set, val_loader, tokenizer, steps = get_data_loader(
    returns_val, ecb, fed, y_val, **loader_kwargs
)

test_set, test_loader, tokenizer, steps = get_data_loader(
    returns_test, ecb, fed, y_test, **loader_kwargs
)

val_set_t, val_loader_t, tokenizer, steps = get_data_loader(
    returns_val_t, ecb, fed, y_val_t, **loader_kwargs
)

In [23]:
# import distilbert-base-uncased

from transformers import DistilBertTokenizer, DistilBertModel

In [24]:
# Load the pretrained 'distilbert-base-uncased' encoder; the following cells use it
# in eval mode under torch.no_grad() to extract embeddings.
bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Loading model

In [28]:
from model.framework_model import MyModel

In [29]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# (To force CPU execution, use `device = torch.device("cpu")` instead.)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [30]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import optuna

In [31]:
# Move the encoder to the selected device and switch it to evaluation mode
# (its dropout layers become inactive) before extracting embeddings.
bert.to(device)
bert.eval()

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [32]:
# Use distilbert to turn every training batch into fixed feature vectors laid out as
# [non-text features | ECB first-token embedding | FED first-token embedding].
train_samples, train_labels = [], []
with torch.no_grad():  # inference only: no gradients are tracked
    for batch in tqdm(train_loader):
        # squeeze(1) removes dimension 1 when it has size 1 — assumes the loader yields
        # one document per sample (max_corpus_len == 1); TODO confirm in get_data_loader
        ecb_ids = batch["X_ecb"].squeeze(1).to(device)
        ecb_mask = batch["X_ecb_mask"].squeeze(1).to(device)
        cls_ecb = bert(ecb_ids, ecb_mask).last_hidden_state[:, 0, :]

        fed_ids = batch["X_fed"].squeeze(1).to(device)
        fed_mask = batch["X_fed_mask"].squeeze(1).to(device)
        cls_fed = bert(fed_ids, fed_mask).last_hidden_state[:, 0, :]

        # concatenate X_ind and cls_ecb and cls_fed along the feature axis
        X_ind = batch["X_ind"].to(device)
        X = torch.cat((X_ind, cls_ecb, cls_fed), dim=1)

        # keep a NumPy copy of the features and labels of this batch
        train_samples.append(X.detach().cpu().numpy())
        train_labels.append(batch["label"].detach().cpu().numpy())

# stack the per-batch arrays into one array per split
train_samples = np.concatenate(train_samples)
train_labels = np.concatenate(train_labels)

100%|██████████| 226/226 [09:51<00:00,  2.62s/it]


In [33]:
# Same feature extraction on the validation set:
# [non-text features | ECB first-token embedding | FED first-token embedding].
val_samples, val_labels = [], []
with torch.no_grad():  # inference only: no gradients are tracked
    for batch in tqdm(val_loader):
        # squeeze(1) removes dimension 1 when it has size 1 — assumes the loader yields
        # one document per sample (max_corpus_len == 1); TODO confirm in get_data_loader
        ecb_ids = batch["X_ecb"].squeeze(1).to(device)
        ecb_mask = batch["X_ecb_mask"].squeeze(1).to(device)
        cls_ecb = bert(ecb_ids, ecb_mask).last_hidden_state[:, 0, :]

        fed_ids = batch["X_fed"].squeeze(1).to(device)
        fed_mask = batch["X_fed_mask"].squeeze(1).to(device)
        cls_fed = bert(fed_ids, fed_mask).last_hidden_state[:, 0, :]

        # concatenate X_ind and cls_ecb and cls_fed along the feature axis
        X_ind = batch["X_ind"].to(device)
        X = torch.cat((X_ind, cls_ecb, cls_fed), dim=1)

        # keep a NumPy copy of the features and labels of this validation batch
        val_samples.append(X.detach().cpu().numpy())
        val_labels.append(batch["label"].detach().cpu().numpy())

# stack the per-batch arrays into one array per split
val_samples = np.concatenate(val_samples)
val_labels = np.concatenate(val_labels)

100%|██████████| 54/54 [02:22<00:00,  2.64s/it]


In [34]:
# Same feature extraction for the validation set with only one ECB and one FED
# announcement per sample (the exploded version served by `val_loader_t`).
val_samples_t, val_labels_t = [], []
with torch.no_grad():  # inference only: no gradients are tracked
    for batch in tqdm(val_loader_t):
        # squeeze(1) removes dimension 1 when it has size 1 — assumes the loader yields
        # one document per sample (max_corpus_len == 1); TODO confirm in get_data_loader
        ecb_ids = batch["X_ecb"].squeeze(1).to(device)
        ecb_mask = batch["X_ecb_mask"].squeeze(1).to(device)
        cls_ecb = bert(ecb_ids, ecb_mask).last_hidden_state[:, 0, :]

        fed_ids = batch["X_fed"].squeeze(1).to(device)
        fed_mask = batch["X_fed_mask"].squeeze(1).to(device)
        cls_fed = bert(fed_ids, fed_mask).last_hidden_state[:, 0, :]

        # concatenate X_ind and cls_ecb and cls_fed along the feature axis
        X_ind = batch["X_ind"].to(device)
        X = torch.cat((X_ind, cls_ecb, cls_fed), dim=1)

        # keep a NumPy copy of the features and labels of this validation batch
        val_samples_t.append(X.detach().cpu().numpy())
        val_labels_t.append(batch["label"].detach().cpu().numpy())

# stack the per-batch arrays into one array per split
val_samples_t = np.concatenate(val_samples_t)
val_labels_t = np.concatenate(val_labels_t)

100%|██████████| 75/75 [03:23<00:00,  2.72s/it]


In [35]:
# Same feature extraction for the test set:
# [non-text features | ECB first-token embedding | FED first-token embedding].
test_samples, test_labels = [], []
with torch.no_grad():  # inference only: no gradients are tracked
    for batch in tqdm(test_loader):
        # squeeze(1) removes dimension 1 when it has size 1 — assumes the loader yields
        # one document per sample (max_corpus_len == 1); TODO confirm in get_data_loader
        ecb_ids = batch["X_ecb"].squeeze(1).to(device)
        ecb_mask = batch["X_ecb_mask"].squeeze(1).to(device)
        cls_ecb = bert(ecb_ids, ecb_mask).last_hidden_state[:, 0, :]

        fed_ids = batch["X_fed"].squeeze(1).to(device)
        fed_mask = batch["X_fed_mask"].squeeze(1).to(device)
        cls_fed = bert(fed_ids, fed_mask).last_hidden_state[:, 0, :]

        # concatenate X_ind and cls_ecb and cls_fed along the feature axis
        X_ind = batch["X_ind"].to(device)
        X = torch.cat((X_ind, cls_ecb, cls_fed), dim=1)

        # keep a NumPy copy of the features and labels of this test batch
        test_samples.append(X.detach().cpu().numpy())
        test_labels.append(batch["label"].detach().cpu().numpy())

# stack the per-batch arrays into one array per split
test_samples = np.concatenate(test_samples)
test_labels = np.concatenate(test_labels)

100%|██████████| 54/54 [02:28<00:00,  2.76s/it]


In [36]:
# Baseline: LGBMClassifier with hand-picked hyper-parameters, fitted on the
# training features and scored on the test features.
lgbm_baseline_params = {
    "n_estimators": 100,
    "max_depth": 15,
    "learning_rate": 0.1,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "n_jobs": -1,
}
lgbm = LGBMClassifier(**lgbm_baseline_params)

lgbm.fit(train_samples, train_labels)

lgbm.score(test_samples, test_labels)

0.5662368112543963

In [38]:
# Use optuna to find the best parameters for LGBMClassifier

def objective(trial):
    """Fit an LGBMClassifier with trial-sampled hyper-parameters and score it.

    Args:
        trial: optuna trial used to sample ``max_depth`` (int in [5, 20]),
            ``learning_rate`` (log-uniform in [1e-3, 1e-1]) and
            ``colsample_bytree`` (uniform in [0.5, 1.0]).

    Returns:
        The value of ``lgbm.score`` on ``val_samples`` / ``val_labels`` after
        fitting on ``train_samples`` / ``train_labels`` (to be maximized).
    """
    lgbm = LGBMClassifier(
        n_estimators=100,
        max_depth=trial.suggest_int("max_depth", 5, 20),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        # suggest_float replaces the deprecated suggest_uniform
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
        # reg_lambda=trial.suggest_float("reg_lambda", 1e-3, 1e3, log=True),
        random_state=42,
        n_jobs=-1
    )

    lgbm.fit(train_samples, train_labels)

    return lgbm.score(val_samples, val_labels)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[32m[I 2023-04-05 09:42:16,768][0m A new study created in memory with name: no-name-90e7da6c-8c4e-4d14-88f0-3e51ef468e9f[0m
[32m[I 2023-04-05 09:42:25,941][0m Trial 0 finished with value: 0.5302052785923753 and parameters: {'max_depth': 10, 'learning_rate': 0.001023277511457471, 'colsample_bytree': 0.637353750597351}. Best is trial 0 with value: 0.5302052785923753.[0m
[32m[I 2023-04-05 09:42:36,062][0m Trial 1 finished with value: 0.5513196480938416 and parameters: {'max_depth': 13, 'learning_rate': 0.016408079007132387, 'colsample_bytree': 0.8209747283017248}. Best is trial 1 with value: 0.5513196480938416.[0m
[32m[I 2023-04-05 09:42:45,820][0m Trial 2 finished with value: 0.555425219941349 and parameters: {'max_depth': 9, 'learning_rate': 0.09996910268840384, 'colsample_bytree': 0.8615577999008968}. Best is trial 2 with value: 0.555425219941349.[0m
[32m[I 2023-04-05 09:42:55,465][0m Trial 3 finished with value: 0.5595307917888563 and parameters: {'max_depth': 13, 'learn

In [39]:
# Rebuild the LightGBM classifier with the best hyper-parameters found by the search,
# fit it on the training features and score it on the test features.
# NOTE(review): `study` is whichever optuna study was created most recently in the
# kernel; make sure the LightGBM search cell ran last before executing this one.
lgbm = LGBMClassifier(
    n_estimators=100,
    max_depth=study.best_params["max_depth"],
    learning_rate=study.best_params["learning_rate"],
    colsample_bytree=study.best_params["colsample_bytree"],
    random_state=42,
    n_jobs=-1
)

lgbm.fit(train_samples, train_labels)

lgbm.score(test_samples, test_labels)

0.5609613130128956

In [42]:
# Concatenate the training set with the exploded validation set (one ECB and one
# FED announcement per row) to obtain a larger fitting set.
train_samples_ = np.concatenate((train_samples, val_samples_t))
train_labels_ = np.concatenate((train_labels, val_labels_t))

In [43]:
# Refit the current `lgbm` estimator on train + validation and score it on the test set.
lgbm.fit(train_samples_, train_labels_)
lgbm.score(test_samples, test_labels)

0.5515826494724502

In [44]:
# LightGBM with fixed, hand-picked hyper-parameters, this time fitted on the
# combined train + validation features and scored on the test features.
lgbm = LGBMClassifier(
    n_estimators=100,
    max_depth=15,
    learning_rate=0.1,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

lgbm.fit(train_samples_, train_labels_)
lgbm.score(test_samples, test_labels)

0.5762016412661196

In [41]:
# Share of training labels equal to 1 (reference point for the accuracies above).
(train_labels == 1.0).sum() / len(train_labels)

0.47156726768377255

In [45]:
# Baseline: XGBClassifier with hand-picked hyper-parameters, fitted on the
# training features and scored on the test features.
xgb_baseline_params = {
    "n_estimators": 100,
    "max_depth": 15,
    "learning_rate": 0.1,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "n_jobs": -1,
}
xgb = XGBClassifier(**xgb_baseline_params)

xgb.fit(train_samples, train_labels)

xgb.score(test_samples, test_labels)

0.5773739742086753

In [46]:
# Refit the current `xgb` estimator on train + validation and score it on the test set.
xgb.fit(train_samples_, train_labels_)
xgb.score(test_samples, test_labels)

0.5844079718640094

In [47]:
# Use optuna to find the best parameters for the XGBClassifier

def objective(trial):
    """Fit an XGBClassifier with trial-sampled hyper-parameters and score it.

    Samples ``max_depth`` (int in [3, 30]), ``learning_rate`` (float in
    [0.01, 0.5]) and ``colsample_bytree`` (float in [0.5, 1.0]) from ``trial``,
    fits on ``train_samples`` / ``train_labels`` and returns the value of
    ``score`` on ``val_samples`` / ``val_labels`` (to be maximized).
    """
    sampled_params = {
        "max_depth": trial.suggest_int("max_depth", 3, 30),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        # "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 1.0),
    }
    candidate = XGBClassifier(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
        **sampled_params,
    )
    candidate.fit(train_samples, train_labels)
    val_score = candidate.score(val_samples, val_labels)
    return val_score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[32m[I 2023-04-05 10:05:53,406][0m A new study created in memory with name: no-name-11609199-be8e-4902-852a-d656a4cfa7b0[0m
[32m[I 2023-04-05 10:06:35,183][0m Trial 0 finished with value: 0.5648093841642229 and parameters: {'max_depth': 15, 'learning_rate': 0.4239917349291033, 'colsample_bytree': 0.8504270294941401}. Best is trial 0 with value: 0.5648093841642229.[0m


In [37]:
# Rebuild the XGBoost classifier with the best hyper-parameters found by the search,
# fit it on the training features and score it on the test features.
# NOTE(review): `study` is whichever optuna study was created most recently in the
# kernel, and this cell's execution count (37) predates the search cells above —
# the recorded score may come from an earlier run; re-run top to bottom to confirm.
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=study.best_params["max_depth"],
    learning_rate=study.best_params["learning_rate"],
    colsample_bytree=study.best_params["colsample_bytree"],
    random_state=42,
    n_jobs=-1
)

xgb.fit(train_samples, train_labels)

xgb.score(test_samples, test_labels)

0.5916201117318436

# Another example

In [30]:
# Settings for the second experiment. This rebinds `config`, so every later
# cell that reads `config` sees these values instead of the earlier ones.
config = dict(
    method="model_01",
    learning_rate=0.01,
    weight_decay=0,
    batch_size=2,
    layers=3,
    dropout=0.5,
    separate=True,
    max_corpus_len=2,
)

In [31]:
# Rebuild the train / validation / test loaders with the current `config`.
# Note: `tokenizer` and `steps` are rebound by every call, so after this cell
# they hold the values returned for the test split.
loader_kwargs = dict(
    method=config["method"],
    separate=config["separate"],
    max_corpus_len=config["max_corpus_len"],
    batch_size=config["batch_size"],
)

train_set, train_loader, tokenizer, steps = get_data_loader(
    returns_train, ecb, fed, y_train, **loader_kwargs
)

val_set, val_loader, tokenizer, steps = get_data_loader(
    returns_val, ecb, fed, y_val, **loader_kwargs
)

test_set, test_loader, tokenizer, steps = get_data_loader(
    returns_test, ecb, fed, y_test, **loader_kwargs
)

In [46]:
# Grab one batch from the training loader to inspect its tensors.
first_sample = next(iter(train_loader))
# Uncomment to display the ECB mask tensor of that batch:
# first_sample['X_ecb_mask']

tensor([[[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 0,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 0,  ..., 0, 0, 0]]])

In [45]:
# Per sample of the batch: does at least one of its ECB documents have a mask
# that sums to more than 2 along the last (token) axis?
(first_sample['X_ecb_mask'].sum(dim=-1) > 2).any(dim=-1)

tensor([True, True])

In [44]:
# Scan the whole training loader and print every batch in which some sample has
# no ECB document whose mask sums to more than 2 along the last (token) axis.
for idx, batch in enumerate(train_loader):
    X_att = batch["X_ecb_mask"]
    # Check the mask of the *current* batch; the earlier version re-tested
    # `first_sample` on every iteration, so no other batch was ever inspected.
    condition = (X_att.sum(dim=-1) > 2).any(dim=-1)
    if not condition.all():
        print(idx, batch)

In [32]:
# Build the model for the current `config` and move it to `device`.
# NOTE(review): the variable is called `model3`, while the nearest preceding config
# cell sets config["method"] to "model_01" — confirm which one is intended.
model3 = MyModel(
    nontext_dim=nb_nontextfeatures, method=config["method"],
    separate=config["separate"], dropout=config["dropout"]
).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- T

In [33]:
from train import train, evaluate, train_with_accumulation

In [34]:
# Train `model3` for at most 2 epochs (eval_every=1 presumably evaluates on
# `val_loader` after each epoch — confirm in train.py).
# NOTE(review): the recorded run finished epoch 0 and then failed during evaluation
# with a cuda:0 / cpu device-mismatch RuntimeError. The cause is presumably inside
# the evaluation path of `train` (tensors not moved to `device`) — fix it in
# train.py before re-running this hour-long cell.
train(model=model3, train_loader=train_loader, val_loader=val_loader, config=config, device=device, 
            max_epochs=2, eval_every=1, name="model_03_test")

Epoch 0: 100%|██████████| 2684/2684 [1:03:07<00:00,  1.41s/batch, accuracy=54, loss=1.11]  
Evaluation:   0%|          | 0/895 [00:01<?, ?batch/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!