In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.optim import Adam

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# from preprocessing.preprocessing import ecb_pipeline_en, fast_detect

import time

import gc

from tqdm import tqdm


torch.set_default_dtype(torch.float32)

In [2]:
FILENAME = "data/train_series.csv"
FILENAME_ECB = "data/ecb_data_preprocessed.csv"
FILENAME_FED = "data/fed_data_preprocessed.csv"

In [3]:
returns = pd.read_csv(FILENAME, index_col=0)
ecb = pd.read_csv(FILENAME_ECB, index_col=0)
fed = pd.read_csv(FILENAME_FED, index_col=0)

In [4]:
returns = pd.get_dummies(returns, columns=["Index Name"])

In [5]:
returns["Sign"] = (returns["Index + 1"] > 0).astype(int)

In [6]:
y = returns["Sign"]

In [7]:
y.value_counts()

0    4930
1    4016
Name: Sign, dtype: int64

In [8]:
small_dataset_size = 100 # len(y)
y.iloc[:small_dataset_size].value_counts()

0    53
1    47
Name: Sign, dtype: int64

In [9]:
returns = returns.drop(["Sign", "Index + 1"], axis=1)

In [10]:
nontextual_cols = ['Index - 9',
 'Index - 8',
 'Index - 7',
 'Index - 6',
 'Index - 5',
 'Index - 4',
 'Index - 3',
 'Index - 2',
 'Index - 1',
 'Index - 0',
 'Index Name_CVIX Index',
 'Index Name_EURUSD Curncy',
 'Index Name_EURUSDV1M Curncy',
 'Index Name_MOVE Index',
 'Index Name_SPX Index',
 'Index Name_SRVIX Index',
 'Index Name_SX5E Index',
 'Index Name_V2X Index',
 'Index Name_VIX Index']
nb_nontextfeatures = len(nontextual_cols)

In [11]:
# 60% train, 20% val, 20% test

returns_, returns_test, y_, y_test = train_test_split(
    returns.iloc[:small_dataset_size], y.iloc[:small_dataset_size], test_size=0.1, train_size=0.9,
    random_state=0, stratify=y.iloc[:small_dataset_size]
    )

returns_train, returns_val, y_train, y_val = train_test_split(
    returns_, y_, test_size=0.1, train_size=0.9,
    random_state=42, stratify=y_
    )

In [12]:
y.iloc[:100].value_counts()

0    53
1    47
Name: Sign, dtype: int64

In [13]:
# del returns, y
# gc.collect()

In [14]:
returns.describe()

Unnamed: 0,Index - 9,Index - 8,Index - 7,Index - 6,Index - 5,Index - 4,Index - 3,Index - 2,Index - 1,Index - 0,Index Name_CVIX Index,Index Name_EURUSD Curncy,Index Name_EURUSDV1M Curncy,Index Name_MOVE Index,Index Name_SPX Index,Index Name_SRVIX Index,Index Name_SX5E Index,Index Name_V2X Index,Index Name_VIX Index
count,8946.0,8946.0,8946.0,8946.0,8946.0,8946.0,8946.0,8946.0,8946.0,8946.0,8946.0,8946.0,8946.0,8946.0,8946.0,8946.0,8946.0,8946.0,8946.0
mean,-8e-06,0.0002,0.000255,0.000339,9e-05,0.000407,0.000644,0.000988,0.000847,0.00095,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
std,0.040715,0.040788,0.039987,0.040587,0.03923,0.039386,0.040104,0.0399,0.040365,0.040699,0.314287,0.314287,0.314287,0.314287,0.314287,0.314287,0.314287,0.314287,0.314287
min,-0.355095,-0.355095,-0.355095,-0.355095,-0.355095,-0.355095,-0.350588,-0.350588,-0.345301,-0.345301,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.011516,-0.01116,-0.011122,-0.010843,-0.010698,-0.010363,-0.010435,-0.010155,-0.01036,-0.010515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.008365,0.008304,0.008276,0.008358,0.008156,0.008573,0.008795,0.008732,0.008855,0.00902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.382167,0.382167,0.382167,0.496008,0.496008,0.496008,0.768245,0.768245,0.768245,0.768245,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
ct = ColumnTransformer([
    ('Standard Scaler', StandardScaler(), [
                                    'Index - 9',
                                    'Index - 8',
                                    'Index - 7',
                                    'Index - 6',
                                    'Index - 5',
                                    'Index - 4',
                                    'Index - 3',
                                    'Index - 2',
                                    'Index - 1',
                                    'Index - 0'])
], remainder='passthrough')

In [16]:
returns_train = pd.DataFrame(ct.fit_transform(returns_train), columns=returns_train.columns)
returns_val = pd.DataFrame(ct.transform(returns_val), columns=returns_train.columns)
returns_test = pd.DataFrame(ct.transform(returns_test), columns=returns_train.columns)

In [17]:
returns_train.describe()

Unnamed: 0,Index - 9,Index - 8,Index - 7,Index - 6,Index - 5,Index - 4,Index - 3,Index - 2,Index - 1,Index - 0,...,index fed,Index Name_CVIX Index,Index Name_EURUSD Curncy,Index Name_EURUSDV1M Curncy,Index Name_MOVE Index,Index Name_SPX Index,Index Name_SRVIX Index,Index Name_SX5E Index,Index Name_V2X Index,Index Name_VIX Index
count,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,...,81,81,81,81,81,81,81,81,81,81
unique,76.0,76.0,75.0,73.0,75.0,73.0,75.0,73.0,72.0,74.0,...,64,2,2,2,2,2,2,2,2,2
top,0.012477,-0.157326,-0.166659,0.101471,0.028317,-0.105236,-0.022662,0.055586,-0.142349,0.080932,...,12,0,0,0,0,0,0,0,0,0
freq,6.0,6.0,7.0,9.0,7.0,9.0,7.0,9.0,10.0,8.0,...,4,72,72,69,74,72,74,73,68,74


# The textual data

# Translation

# HuggingFace Transformers

# Loading data

In [18]:
from model.framework_dataset import get_data_loader
from model.framework_model import MyModel

# Loading model

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Other example

In [20]:
config = {

    "method": "model_03",

    "learning_rate": 1e-3,

    "weight_decay": 0,

    "batch_size": 2,

    "layers": 4,

    "mlp_hidden_dim": 128,

    "dropout": 0,

    "separate": False,
    
    "max_corpus_len": 2,

    "max_epochs": 20,

    "scheduler_step": -1,

    "scheduler_ratio": 0.1,

    "scheduler_last_epoch": 20,

    "early_stopping": False,

    "preload": False

}

In [21]:
print(config["method"])

model_03


In [22]:
train_set, train_loader, tokenizer, steps = get_data_loader(
    returns_train, ecb, fed, y_train, method=config["method"],
    separate=config["separate"], max_corpus_len=config["max_corpus_len"],
    batch_size=config["batch_size"]
)

val_set, val_loader, tokenizer, steps = get_data_loader(
    returns_val, ecb, fed, y_val, method=config["method"],
    separate=config["separate"], max_corpus_len=config["max_corpus_len"],
    batch_size=config["batch_size"]
)

test_set, test_loader, tokenizer, steps = get_data_loader(
    returns_test, ecb, fed, y_test, method=config["method"],
    separate=config["separate"], max_corpus_len=config["max_corpus_len"],
    batch_size=config["batch_size"]
)

In [23]:
train_set[0]

((['In this context, the outcome of the UK’s EU referendum is triggering a debate not just on the future relationship between the EU and the UK, but also – and perhaps more importantly – on how to improve the functioning of the EU and of Economic and Monetary Union (EMU) as one of its key elements.\r\nSo today I will be considering how we could boost support and strengthen the institutional arrangements for European integration.\r\nIn material terms, the EU has facilitated a level of prosperity unprecedented in European history, if only because by ensuring peace it prevented the strife and destruction of the past.\r\nPeople believe in Europe and European public goods when they are tangible and deliver results.\r\nEconomically, the pace of recovery in the euro area remains unsatisfactory, with unemployment levels still too high.\r\nRisk sharing involves both the public and private sectors, for example by ensuring the same level of deposit protection through a European deposit insurance 

In [24]:
returns_train.iloc[0]

Index - 9                      1.079978
Index - 8                     -1.197337
Index - 7                     -0.608429
Index - 6                      1.727739
Index - 5                       0.19114
Index - 4                      0.161762
Index - 3                     -0.941655
Index - 2                     -1.288233
Index - 1                      0.053673
Index - 0                      0.267276
index ecb                           309
index fed                           492
Index Name_CVIX Index                 0
Index Name_EURUSD Curncy              0
Index Name_EURUSDV1M Curncy           1
Index Name_MOVE Index                 0
Index Name_SPX Index                  0
Index Name_SRVIX Index                0
Index Name_SX5E Index                 0
Index Name_V2X Index                  0
Index Name_VIX Index                  0
Name: 0, dtype: object

In [25]:
def text_print(text, line_char_lim=150):
    text_ = text.split('\n')
    for subtext in text_:
        n = len(subtext)
        k = 0
        while k <= n:
            print(subtext[k:min(n, k + line_char_lim)])
            k += line_char_lim
        print('\n')

In [26]:
model = MyModel(method=config["method"],
                layers=config["layers"],
                mlp_hidden_dim=config["mlp_hidden_dim"],
                separate=config["separate"],
                dropout=config["dropout"]).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
model

MyModel(
  (nontext_network): NontextualNetwork()
  (corpus_encoder): CorpusEncoder(
    (encoder): CorpusEncoder(
      (doc_encoder): DocumentEncoder(
        (text_encoder): DistilBertModel(
          (embeddings): Embeddings(
            (word_embeddings): Embedding(30522, 768, padding_idx=0)
            (position_embeddings): Embedding(512, 768)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (transformer): Transformer(
            (layer): ModuleList(
              (0): TransformerBlock(
                (attention): MultiHeadSelfAttention(
                  (dropout): Dropout(p=0.1, inplace=False)
                  (q_lin): Linear(in_features=768, out_features=768, bias=True)
                  (k_lin): Linear(in_features=768, out_features=768, bias=True)
                  (v_lin): Linear(in_features=768, out_features=768, bias=True)
                  (out_lin): Linear(in_feat

In [28]:
batch = next(iter(train_loader))
batch

{'X_text': tensor([[[ 101, 1999, 2023,  ..., 2011, 2216,  102],
          [ 101,  102,    0,  ...,    0,    0,    0],
          [ 101, 1996, 3795,  ..., 1010, 3105,  102],
          [ 101,  102,    0,  ...,    0,    0,    0]],
 
         [[ 101, 2256, 2490,  ..., 3296, 3930,  102],
          [ 101,  102,    0,  ...,    0,    0,    0],
          [ 101, 3361, 9211,  ..., 8485, 2015,  102],
          [ 101,  102,    0,  ...,    0,    0,    0]]]),
 'X_mask': tensor([[[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 0,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 0,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 0,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 0,  ..., 0, 0, 0]]]),
 'X_ind': tensor([[ 1.0800, -1.1973, -0.6084,  1.7277,  0.1911,  0.1618, -0.9417, -1.2882,
           0.0537,  0.2673,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000],
         [-0.8345,  0.7644, -0.1862, 

In [29]:
# First text of first ecb corpus of batch
# batch['X_text'][0][0]

In [30]:
# ECB texts
# tokenizer(train_set[0][0][0], padding='max_length', max_length=512, truncation=True)["input_ids"][0]

In [31]:
# # Test output
# with torch.no_grad():
#     my_model.eval()
#     batch = next(iter(train_loader))
#     print(batch)

#     with torch.no_grad():
#         X_ind, y = batch
#         my_model_output = my_model(None, None, X_ind.float().to(device))

#     print(my_model_output.size(0)/64)

# Testing train code

In [32]:
from train import train, evaluate

In [33]:
model.classifier

ClassificationHead(
  (mlp): SimpleMLP(
    (layers): Sequential(
      (0): Linear(in_features=787, out_features=128, bias=True)
      (1): ReLU()
      (2): Dropout(p=0, inplace=False)
      (3): Linear(in_features=128, out_features=128, bias=True)
      (4): ReLU()
      (5): Dropout(p=0, inplace=False)
      (6): Linear(in_features=128, out_features=128, bias=True)
      (7): ReLU()
      (8): Dropout(p=0, inplace=False)
      (9): Linear(in_features=128, out_features=1, bias=True)
    )
  )
)

In [34]:
import optuna
from torch.optim import Adam

In [35]:
epochs = config["max_epochs"]
lr = config["learning_rate"]
method = config["method"]
optimizer = Adam(model.parameters(),
                    lr=config["learning_rate"],
                    weight_decay=config["weight_decay"])
criterion = nn.BCEWithLogitsLoss()
sigmoid = nn.Sigmoid()

for epoch in range(1, epochs+1):
    total_loss = 0
    total_entries = 0
    correct = 0
    model.train()

    with tqdm(train_loader, unit="batch") as tepoch:
        for batch in tepoch:
            tepoch.set_description(f"Epoch {epoch}")
            optimizer.zero_grad()
            
            # Get inputs
            if method is None:
                X_ind, y_ = batch
                X_ind = torch.Tensor(X_ind).float().to(device)
                y_ = torch.Tensor(y_).float().to(device)
                
                X_text = None
                X_mask = None
            else:
                X_ind = batch["X_ind"].to(device)
                y_ = batch["label"].to(device)

                if config["separate"]:
                    X_ecb = batch["X_ecb"].to(device)
                    X_ecb_att = batch["X_ecb_mask"].to(device)
                    X_fed = batch["X_fed"].to(device)
                    X_fed_att = batch["X_fed_mask"].to(device)

                    X_text = (X_ecb, X_fed)
                    X_mask = (X_ecb_att, X_fed_att)
                else:
                    X_text = (batch["X_text"].to(device),)
                    X_mask = (batch["X_mask"].to(device),)
            
            # Compute output
            output = model(X_text, X_mask, X_ind)
            # print(output)

            # Compute loss
            loss = criterion(output, y_)
            
            # Update model
            loss.backward()
            optimizer.step()
            # Computing predictions

            ## Batch loss
            batch_loss = loss.item()

            # Accuracy computation
            output_proba = sigmoid(output)
            batch_size_ = y_.size(0)
            preds = output_proba.round()
            correct += (preds == y_).sum().item()
            ## Total loss with no reduction
            total_loss += loss.item() * batch_size_
            total_entries += batch_size_
            tepoch.set_postfix(loss=total_loss/total_entries,
                                accuracy=100. * correct/total_entries,
                                batch_loss=batch_loss)



Epoch 1: 100%|██████████| 41/41 [00:42<00:00,  1.04s/batch, accuracy=45.7, batch_loss=0.728, loss=0.708]
Epoch 2: 100%|██████████| 41/41 [00:32<00:00,  1.27batch/s, accuracy=42, batch_loss=0.704, loss=0.7]    
Epoch 3: 100%|██████████| 41/41 [00:30<00:00,  1.34batch/s, accuracy=53.1, batch_loss=0.73, loss=0.695] 
Epoch 4: 100%|██████████| 41/41 [00:30<00:00,  1.36batch/s, accuracy=53.1, batch_loss=0.724, loss=0.694]
Epoch 5: 100%|██████████| 41/41 [00:31<00:00,  1.31batch/s, accuracy=53.1, batch_loss=0.734, loss=0.693]
Epoch 6: 100%|██████████| 41/41 [00:30<00:00,  1.34batch/s, accuracy=53.1, batch_loss=0.722, loss=0.694]
Epoch 7: 100%|██████████| 41/41 [00:37<00:00,  1.11batch/s, accuracy=53.1, batch_loss=0.723, loss=0.692]
Epoch 8: 100%|██████████| 41/41 [00:36<00:00,  1.11batch/s, accuracy=53.1, batch_loss=0.722, loss=0.692]
Epoch 9: 100%|██████████| 41/41 [00:35<00:00,  1.15batch/s, accuracy=53.1, batch_loss=0.716, loss=0.693]
Epoch 10: 100%|██████████| 41/41 [00:37<00:00,  1.08bat

8

In [27]:
def objective(trial):
      config ={

                "method": None,

                "learning_rate": 10**trial.suggest_float("lr_exp", -6, -2),

                "weight_decay": 10**trial.suggest_float("weight_decay_exp", -6, -2),

                "batch_size": 64,

                "layers": trial.suggest_int("layers", 2, 6),

                "mlp_hidden_dim": 64,

                "separate": False,

                "max_corpus_len": 1,

                "dropout": trial.suggest_float("dropout", 0.2, 0.7),

            }
      model = MyModel(
            nontext_dim=nb_nontextfeatures, method=config["method"],
            separate=False, dropout=config["dropout"], mlp_hidden_dim=config["mlp_hidden_dim"]
            ).to(device)

      _, train_loader, tokenizer, _ = get_data_loader(
      returns_train, ecb, fed, y_train, method=config["method"],
      separate=config["separate"], max_corpus_len=config["max_corpus_len"],
      batch_size=config["batch_size"]
      )

      _, val_loader, _, _ = get_data_loader(
      returns_val, ecb, fed, y_val, method=config["method"],
      separate=config["separate"], max_corpus_len=config["max_corpus_len"],
      batch_size=config["batch_size"]
      )

      _, _, _, _ = get_data_loader(
      returns_test, ecb, fed, y_test, method=config["method"],
      separate=config["separate"], max_corpus_len=config["max_corpus_len"],
      batch_size=config["batch_size"]
      )

      _, _, eval_f1s = train(model, train_loader=train_loader, val_loader=val_loader,config=config,
            device=device, max_epochs=20, eval_every=5, name=f"no_nlp_{config['learning_rate']}")
      return eval_f1s[-1]

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30, n_jobs=1)

[32m[I 2023-03-09 01:12:10,663][0m A new study created in memory with name: no-name-06989159-1152-408d-b2ac-beadafbca427[0m
Epoch 1: 100%|██████████| 84/84 [00:19<00:00,  4.25batch/s, accuracy=52.2, loss=0.694]
Epoch 2: 100%|██████████| 84/84 [00:19<00:00,  4.37batch/s, accuracy=55.1, loss=0.683]
Epoch 3: 100%|██████████| 84/84 [00:19<00:00,  4.39batch/s, accuracy=54.8, loss=0.683]
Epoch 4: 100%|██████████| 84/84 [00:19<00:00,  4.40batch/s, accuracy=54.8, loss=0.681]
Epoch 5: 100%|██████████| 84/84 [00:19<00:00,  4.34batch/s, accuracy=55.5, loss=0.68] 
Evaluation: 100%|██████████| 28/28 [00:06<00:00,  4.48batch/s, accuracy=56.4, loss=0.681]
Epoch 6: 100%|██████████| 84/84 [00:19<00:00,  4.25batch/s, accuracy=56.2, loss=0.678]
Epoch 7: 100%|██████████| 84/84 [00:19<00:00,  4.34batch/s, accuracy=56.4, loss=0.677]
Epoch 8: 100%|██████████| 84/84 [00:19<00:00,  4.38batch/s, accuracy=56.2, loss=0.678]
Epoch 9: 100%|██████████| 84/84 [00:19<00:00,  4.36batch/s, accuracy=55.7, loss=0.677]


In [29]:
study.best_params

{'lr_exp': -5.961894715777749,
 'weight_decay_exp': -2.427291780099218,
 'layers': 4,
 'dropout': 0.4329101757872429}

In [30]:
study.best_value

0.5872945357618835