In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from model_02.dataset import ReturnsDataset
from torch import optim

from sklearn.model_selection import train_test_split

from preprocessing.preprocessing import ecb_pipeline_en, fast_detect

import time

import gc

from tqdm import tqdm


torch.set_default_dtype(torch.float32)

In [2]:
FILENAME = "data/train_series.csv"
FILENAME_ECB = "data/ecb_data.csv"
FILENAME_FED = "data/fed_data.csv"

In [3]:
returns = pd.read_csv(FILENAME, index_col=0)
ecb = pd.read_csv(FILENAME_ECB, index_col=0)
fed = pd.read_csv(FILENAME_FED, index_col=0)

In [4]:
returns = pd.get_dummies(returns, columns=["Index Name"])

In [5]:
returns["Sign"] = (returns["Index + 1"] > 0).astype(int)

In [6]:
returns.head()

Unnamed: 0,Index - 9,Index - 8,Index - 7,Index - 6,Index - 5,Index - 4,Index - 3,Index - 2,Index - 1,Index - 0,...,Index Name_CVIX Index,Index Name_EURUSD Curncy,Index Name_EURUSDV1M Curncy,Index Name_MOVE Index,Index Name_SPX Index,Index Name_SRVIX Index,Index Name_SX5E Index,Index Name_V2X Index,Index Name_VIX Index,Sign
0,0.001045,0.005841,0.003832,-0.027519,-0.103565,-0.045086,-0.011265,0.005164,0.05405,0.015779,...,0,0,0,0,0,0,0,1,0,1
1,-0.021497,0.007891,-0.013175,-0.008436,0.0,0.026303,0.000556,0.001455,0.007422,0.0,...,0,0,0,1,0,0,0,0,0,1
2,-0.001872,-0.008154,0.023588,0.004086,0.003493,0.0033,0.000885,-0.011304,0.00504,0.000156,...,0,0,0,0,1,0,0,0,0,1
3,0.00498,-0.000864,0.001677,0.0,0.00603,-0.001083,0.000419,0.001492,0.001018,-0.002582,...,0,0,0,0,1,0,0,0,0,1
4,0.00036,-0.001893,0.005579,-0.003056,-0.001171,-0.001623,-0.00235,-0.006444,-0.000729,-0.000365,...,0,1,0,0,0,0,0,0,0,1


In [7]:
y = returns["Sign"]

In [8]:
y.value_counts()

0    4930
1    4016
Name: Sign, dtype: int64

In [9]:
returns = returns.drop(["Sign", "Index + 1"], axis=1)

In [10]:
returns.columns

Index(['Index - 9', 'Index - 8', 'Index - 7', 'Index - 6', 'Index - 5',
       'Index - 4', 'Index - 3', 'Index - 2', 'Index - 1', 'Index - 0',
       'index ecb', 'index fed', 'Index Name_CVIX Index',
       'Index Name_EURUSD Curncy', 'Index Name_EURUSDV1M Curncy',
       'Index Name_MOVE Index', 'Index Name_SPX Index',
       'Index Name_SRVIX Index', 'Index Name_SX5E Index',
       'Index Name_V2X Index', 'Index Name_VIX Index'],
      dtype='object')

In [11]:
nontextual_cols = ['Index - 9',
 'Index - 8',
 'Index - 7',
 'Index - 6',
 'Index - 5',
 'Index - 4',
 'Index - 3',
 'Index - 2',
 'Index - 1',
 'Index - 0',
 'Index Name_CVIX Index',
 'Index Name_EURUSD Curncy',
 'Index Name_EURUSDV1M Curncy',
 'Index Name_MOVE Index',
 'Index Name_SPX Index',
 'Index Name_SRVIX Index',
 'Index Name_SX5E Index',
 'Index Name_V2X Index',
 'Index Name_VIX Index']
nb_nontextfeatures = len(nontextual_cols)

In [12]:
# 60% train, 20% val, 20% test

returns_, returns_test, y_, y_test = train_test_split(
    returns, y, test_size=0.2, train_size=0.8,
    random_state=0, stratify=y
    )

returns_train, returns_val, y_train, y_val = train_test_split(
    returns_, y_, test_size=0.25, train_size=0.75,
    random_state=42, stratify=y_
    )

In [13]:
del returns, y
gc.collect()

0

# The textual data

In [14]:
ecb.head()

Unnamed: 0,title,speaker,text
0,Comments by Yves Mersch at Financial Services ...,Yves Mersch,Comments by Yves Mersch at Financial Service...
1,Securing sustained economic growth in the euro...,Vítor Constâncio,Securing sustained economic growth in the eu...
2,The role of monetary policy in addressing the ...,Mario Draghi,The role of monetary policy in addressing th...
3,The pandemic emergency: the three challenges f...,Philip R. Lane,SPEECH The pandemic emergency: the three c...
4,Transmission channels of monetary policy in th...,Peter Praet,Transmission channels of monetary policy in ...


In [15]:
fed.head()

Unnamed: 0,title,speaker,text
0,The Importance of Economic Education and Finan...,Governor Frederic S. Mishkin,As ...
1,Financial Innovation and Consumer Protection,Chairman Ben S. Bernanke,"The concept of financial innovation, it seems..."
2,Implementing Basel II in the United States,Governor Randall S. Kroszner,Good afternoon. I would like to thank Standar...
3,An Assessment of the U.S. Economy,Vice Chair for Supervision Randal K. Quarles,Thank you for the opportunity to take part in...
4,Monetary Policy since the Onset of the Crisis,Chairman Ben S. Bernanke,When we convened in Jackson Hole in August 20...


In [16]:
ecb["text_"] = ecb.apply(ecb_pipeline_en, axis=1)

In [17]:
ecb["text"].fillna("", inplace=True)
ecb["speaker"].fillna("Unknown", inplace=True)
fed["speaker"].fillna("Unknown", inplace=True)

In [18]:
# Text in french
ecb.loc[138]
# Text in german
ecb.loc[151]

title                         Auf neuen Wegen zum alten Ziel
speaker                                          Yves Mersch
text         Auf neuen Wegen zum alten Ziel   Rede von Yv...
text_      Rede von Yves Mersch, Mitglied des Direktorium...
Name: 151, dtype: object

In [19]:
ecb["lang"] = ecb["text_"].apply(fast_detect)

In [20]:
ecb.head()

Unnamed: 0,title,speaker,text,text_,lang
0,Comments by Yves Mersch at Financial Services ...,Yves Mersch,Comments by Yves Mersch at Financial Service...,Sustainable economic growth in the real econom...,en
1,Securing sustained economic growth in the euro...,Vítor Constâncio,Securing sustained economic growth in the eu...,"Ladies and Gentlemen, Thank you for inviting m...",en
2,The role of monetary policy in addressing the ...,Mario Draghi,The role of monetary policy in addressing th...,"There was a time, not too long ago, when centr...",en
3,The pandemic emergency: the three challenges f...,Philip R. Lane,SPEECH The pandemic emergency: the three c...,"Today, I will discuss the monetary policy meas...",en
4,Transmission channels of monetary policy in th...,Peter Praet,Transmission channels of monetary policy in ...,"Ladies and Gentlemen, Since the onset of the f...",en


In [21]:
fed["lang"] = fed["text"].apply(fast_detect)

In [22]:
fed.head()

Unnamed: 0,title,speaker,text,lang
0,The Importance of Economic Education and Finan...,Governor Frederic S. Mishkin,As ...,en
1,Financial Innovation and Consumer Protection,Chairman Ben S. Bernanke,"The concept of financial innovation, it seems...",en
2,Implementing Basel II in the United States,Governor Randall S. Kroszner,Good afternoon. I would like to thank Standar...,en
3,An Assessment of the U.S. Economy,Vice Chair for Supervision Randal K. Quarles,Thank you for the opportunity to take part in...,en
4,Monetary Policy since the Onset of the Crisis,Chairman Ben S. Bernanke,When we convened in Jackson Hole in August 20...,en


In [23]:
all_langs = ecb["lang"].unique()

In [24]:
ecb["lang"].value_counts()

en    1646
de      75
fr      31
es      16
it       4
Name: lang, dtype: int64

# Translation

# HuggingFace Transformers

# Loading data

In [25]:
batch_size = 4

In [26]:
fed.head()

Unnamed: 0,title,speaker,text,lang
0,The Importance of Economic Education and Finan...,Governor Frederic S. Mishkin,As ...,en
1,Financial Innovation and Consumer Protection,Chairman Ben S. Bernanke,"The concept of financial innovation, it seems...",en
2,Implementing Basel II in the United States,Governor Randall S. Kroszner,Good afternoon. I would like to thank Standar...,en
3,An Assessment of the U.S. Economy,Vice Chair for Supervision Randal K. Quarles,Thank you for the opportunity to take part in...,en
4,Monetary Policy since the Onset of the Crisis,Chairman Ben S. Bernanke,When we convened in Jackson Hole in August 20...,en


In [27]:
from transformers import DistilBertTokenizer

In [28]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
max_corpus_len = 2

def collate_fn(batch):
    batch_size = len(batch)
    X_ecb = []
    X_fed = []
    X_ind = []
    y = []
    
    for data in batch:
        X_ecb.extend(data[0][0])
        X_fed.extend(data[0][1])
        X_ind.append(data[0][2])
        y.append(data[1])

    # tic = time.perf_counter()
    X_ecb_tokens = tokenizer(X_ecb, return_tensors="pt",
                      truncation=True, padding='max_length', max_length=512)
    X_ecb = X_ecb_tokens['input_ids'].view(batch_size, max_corpus_len, 512)
    X_ecb_att = X_ecb_tokens['attention_mask'].view(batch_size, max_corpus_len, 512)

    X_fed_tokens = tokenizer(X_fed, return_tensors="pt",
                      truncation=True, padding='max_length', max_length=512)
    X_fed = X_fed_tokens['input_ids'].view(batch_size, max_corpus_len, 512)
    X_fed_att = X_fed_tokens['attention_mask'].view(batch_size, max_corpus_len, 512)
    # toc = time.perf_counter()
    # print(f"1 batch tokenized in {toc - tic:0.4f} seconds")
    X_ind = torch.stack(X_ind, dim=0)
    
    return {
        "X_ecb": X_ecb,
        "X_ecb mask": X_ecb_att,
        "X_fed": X_fed,
        "X_fed_mask": X_fed_att,
        "X_ind": X_ind,
        "label": torch.Tensor(y)
    }

In [29]:
train_loader = DataLoader(
    dataset=ReturnsDataset(returns_train, ecb, fed, y_train),
    collate_fn=collate_fn,
    batch_size=2,
    shuffle=False
)

val_loader = DataLoader(
    dataset=ReturnsDataset(returns_val, ecb, fed, y_val),
    collate_fn=collate_fn,
    batch_size=32,
    shuffle=False
)

test_loader = DataLoader(
    dataset=ReturnsDataset(returns_test, ecb, fed, y_test),
    collate_fn=collate_fn,
    batch_size=32,
    shuffle=False
)

# Loading model

In [30]:
from model_02.model import MyModel

In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [32]:
model = MyModel(dropout=1/2).to(device)
model

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- T

MyModel(
  (corpus_enc_ecb): CorpusEncoder(
    (doc_encoder): DocumentEncoder(
      (text_encoder): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0): TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): Linear(in_features=768, out_features=768, bias=True)
                (k_lin): Linear(in_features=768, out_features=768, bias=True)
                (v_lin): Linear(in_features=768, out_features=768, bias=True)
                (out_lin): Linear(in_features=768, out_features=768, bias=True)
              )
              (sa_layer_norm): LayerNorm((768,), e

In [33]:
# Test output
batch = next(iter(val_loader))

with torch.no_grad():
    model.eval()
    X_ecb = batch["X_ecb"].to(device)
    X_ecb_att = batch["X_ecb mask"].to(device)
    X_fed = batch["X_fed"].to(device)
    X_fed_att = batch["X_fed_mask"].to(device)
    X_ind =  batch["X_ind"].to(device)
    y = batch["label"]
    output = model(X_ecb, X_ecb_att, X_fed, X_fed_att, X_ind)
print(X_ecb)
print(X_ecb_att)
print(output)
print(y)

tensor([[[  101,  6203, 11141,  ...,  1996,  3072,   102],
         [  101,   102,     0,  ...,     0,     0,     0]],

        [[  101,  6203,  6456,  ...,  1996,  4132,   102],
         [  101,  6203,  6456,  ...,  2013,  3032,   102]],

        [[  101,  2048,  2350,  ...,  1012,  1999,   102],
         [  101,   102,     0,  ...,     0,     0,     0]],

        ...,

        [[  101,  1006,  1529,  ...,  2197,  2416,   102],
         [  101,   102,     0,  ...,     0,     0,     0]],

        [[  101,  2009,  2003,  ...,  2000,  7868,   102],
         [  101,   102,     0,  ...,     0,     0,     0]],

        [[  101, 12194,  3343,  ...,  2008,  2057,   102],
         [  101,   102,     0,  ...,     0,     0,     0]]], device='cuda:0')
tensor([[[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 0,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 0,  ..., 0, 0, 0]],

        ...,

        [[1, 1, 1, 

In [34]:
lr=1e-3
decay = 1e-5

optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=decay)
criterion = nn.BCELoss()

In [35]:
# Load model
epoch = 1
model.load_state_dict(torch.load(f"model_02/weights/model_bis_{epoch}_epoch.pt"))

In [36]:
epochs = 1
eval_every = 5

model.train()
for epoch_ in range(1, epochs+1):
    model.train()
    print("Epoch ", epoch+epoch_)
    total_loss = 0
    total_entries = 0
    correct = 0
    with tqdm(train_loader, unit="batch") as tepoch:
        for batch in tepoch:
            tepoch.set_description(f"Epoch {epoch+epoch_}")
            optimizer.zero_grad()
            X_ecb = batch["X_ecb"].to(device)
            X_ecb_att = batch["X_ecb mask"].to(device)
            X_fed = batch["X_fed"].to(device)
            X_fed_att = batch["X_fed_mask"].to(device)
            X_ind =  batch["X_ind"].to(device)
            y = batch["label"].to(device)
            batch_size_ = y.size(0)
            output = model(X_ecb, X_ecb_att, X_fed, X_fed_att, X_ind).view(-1)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()
            output = (output > 1/2).float()
            correct += (output == y).sum().item()
            total_loss += loss.item() * batch_size_
            total_entries += batch_size_

            # del X_ecb, X_ecb_att, X_fed, X_fed_att, X_ind, y, batch
            gc.collect()
            torch.cuda.empty_cache()
            tepoch.set_postfix(loss=total_loss/total_entries, accuracy=100. * correct/total_entries)

    print(f"Mean loss over the last epoch: {total_loss / total_entries:.4f}")
    print(f"Accuracy over the last epoch: {correct / total_entries:.4f}")
    torch.save(model.state_dict(), f"model_01/weights/model_bis_{epoch+epoch_}_epoch.pt")
    if not eval_every is None and epoch+epoch_ > 1 and epoch+epoch_%eval_every == 0:
        model.eval()
        print(f"Evaluation at epoch {epoch+epoch_}")
        total_loss = 0
        total_entries = 0
        correct = 0
        with torch.no_grad():
            for batch in tqdm(val_loader):

                X_ecb = batch["X_ecb"].to(device)
                X_ecb_att = batch["X_ecb mask"].to(device)
                X_fed = batch["X_fed"].to(device)
                X_fed_att = batch["X_fed_mask"].to(device)
                X_ind =  batch["X_ind"].to(device)
                y = batch["label"].to(device)
                batch_size_ = y.size(0)
                output = model(X_ecb, X_ecb_att, X_fed, X_fed_att, X_ind).view(-1)
                loss = criterion(output, y)

                total_loss += loss.item() * batch_size_
                output = (output > 1/2).float()
                correct += (output == y).sum().item()
                total_entries += batch_size_

                del X_ecb, X_ecb_att, X_fed, X_fed_att, X_ind, batch
                gc.collect()
                torch.cuda.empty_cache()

            print(f"Mean loss over validation set: {total_loss / total_entries:.4f}")
            print(f"Accuracy over validation set: {correct / total_entries:.4f}")
epoch += epoch_

Epoch  1


Epoch 1: 100%|██████████| 2684/2684 [57:26<00:00,  1.28s/batch, accuracy=53, loss=0.69]    


Mean loss over the last epoch: 0.6904
Accuracy over the last epoch: 0.5303


In [40]:
model.eval()
print(f"Evaluation at epoch {epoch}")
total_loss = 0
total_entries = 0
correct = 0
with torch.no_grad():
    for batch in tqdm(val_loader):

        X_ecb = batch["X_ecb"].to(device)
        X_ecb_att = batch["X_ecb mask"].to(device)
        X_fed = batch["X_fed"].to(device)
        X_fed_att = batch["X_fed_mask"].to(device)
        X_ind =  batch["X_ind"].to(device)
        y = batch["label"].to(device)
        batch_size_ = y.size(0)
        output = model(X_ecb, X_ecb_att, X_fed, X_fed_att, X_ind).view(-1)
        loss = criterion(output, y)

        total_loss += loss.item() * batch_size_
        output = (output > 1/2).float()
        correct += (output == y).sum().item()
        total_entries += batch_size_

        del X_ecb, X_ecb_att, X_fed, X_fed_att, X_ind, batch
        gc.collect()
        torch.cuda.empty_cache()
print(f"Mean loss over validation set: {total_loss / total_entries:.4f}")
print(f"Accuracy over validation set: {correct / total_entries:.4f}")

Evaluation at epoch 5


100%|██████████| 56/56 [10:39<00:00, 11.41s/it]

Mean loss over validation set: 0.6872
Accuracy over validation set: 0.5511





In [41]:
model.eval()
print(f"Test at epoch {epoch}")
total_loss = 0
total_entries = 0
correct = 0
with torch.no_grad():
    for batch in tqdm(test_loader):

        X_ecb = batch["X_ecb"].to(device)
        X_ecb_att = batch["X_ecb mask"].to(device)
        X_fed = batch["X_fed"].to(device)
        X_fed_att = batch["X_fed_mask"].to(device)
        X_ind =  batch["X_ind"].to(device)
        y = batch["label"].to(device)
        batch_size_ = y.size(0)
        output = model(X_ecb, X_ecb_att, X_fed, X_fed_att, X_ind).view(-1)
        loss = criterion(output, y)

        total_loss += loss.item() * batch_size_
        output = (output > 1/2).float()
        correct += (output == y).sum().item()
        total_entries += batch_size_

        del X_ecb, X_ecb_att, X_fed, X_fed_att, X_ind, batch
        gc.collect()
        torch.cuda.empty_cache()
print(f"Mean loss over test set: {total_loss / total_entries:.4f}")
print(f"Accuracy over test set: {correct / total_entries:.4f}")

Test at epoch 5


100%|██████████| 56/56 [10:52<00:00, 11.66s/it]

Mean loss over test set: 0.6872
Accuracy over test set: 0.5508





In [68]:
torch.save(model.state_dict(), f"model_01/weights/model_{epoch}_epoch.pt")

In [39]:
del X_ecb, X_ecb_att, X_fed, X_fed_att, X_ind, batch
gc.collect()
torch.cuda.empty_cache()

NameError: name 'X_ecb' is not defined

In [None]:
del model
gc.collect()
torch.cuda.empty_cache()