In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.optim import Adam

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from preprocessing.preprocessing import ecb_pipeline_en, fast_detect

import time

import gc

from tqdm import tqdm


torch.set_default_dtype(torch.float32)

In [5]:
FILENAME = "data/train_series.csv"
FILENAME_ECB = "data/ecb_data.csv"
FILENAME_FED = "data/fed_data.csv"

In [6]:
returns = pd.read_csv(FILENAME, index_col=0)
ecb = pd.read_csv(FILENAME_ECB, index_col=0)
fed = pd.read_csv(FILENAME_FED, index_col=0)

In [7]:
returns = pd.get_dummies(returns, columns=["Index Name"])

In [8]:
returns["Sign"] = (returns["Index + 1"] > 0).astype(int)

In [9]:
returns.head()

Unnamed: 0,Index - 9,Index - 8,Index - 7,Index - 6,Index - 5,Index - 4,Index - 3,Index - 2,Index - 1,Index - 0,...,Index Name_CVIX Index,Index Name_EURUSD Curncy,Index Name_EURUSDV1M Curncy,Index Name_MOVE Index,Index Name_SPX Index,Index Name_SRVIX Index,Index Name_SX5E Index,Index Name_V2X Index,Index Name_VIX Index,Sign
0,0.001045,0.005841,0.003832,-0.027519,-0.103565,-0.045086,-0.011265,0.005164,0.05405,0.015779,...,0,0,0,0,0,0,0,1,0,1
1,-0.021497,0.007891,-0.013175,-0.008436,0.0,0.026303,0.000556,0.001455,0.007422,0.0,...,0,0,0,1,0,0,0,0,0,1
2,-0.001872,-0.008154,0.023588,0.004086,0.003493,0.0033,0.000885,-0.011304,0.00504,0.000156,...,0,0,0,0,1,0,0,0,0,1
3,0.00498,-0.000864,0.001677,0.0,0.00603,-0.001083,0.000419,0.001492,0.001018,-0.002582,...,0,0,0,0,1,0,0,0,0,1
4,0.00036,-0.001893,0.005579,-0.003056,-0.001171,-0.001623,-0.00235,-0.006444,-0.000729,-0.000365,...,0,1,0,0,0,0,0,0,0,1


In [10]:
y = returns["Sign"]

In [11]:
y.value_counts()

0    4930
1    4016
Name: Sign, dtype: int64

In [12]:
returns = returns.drop(["Sign", "Index + 1"], axis=1)

In [13]:
returns.columns

Index(['Index - 9', 'Index - 8', 'Index - 7', 'Index - 6', 'Index - 5',
       'Index - 4', 'Index - 3', 'Index - 2', 'Index - 1', 'Index - 0',
       'index ecb', 'index fed', 'Index Name_CVIX Index',
       'Index Name_EURUSD Curncy', 'Index Name_EURUSDV1M Curncy',
       'Index Name_MOVE Index', 'Index Name_SPX Index',
       'Index Name_SRVIX Index', 'Index Name_SX5E Index',
       'Index Name_V2X Index', 'Index Name_VIX Index'],
      dtype='object')

In [14]:
nontextual_cols = ['Index - 9',
 'Index - 8',
 'Index - 7',
 'Index - 6',
 'Index - 5',
 'Index - 4',
 'Index - 3',
 'Index - 2',
 'Index - 1',
 'Index - 0',
 'Index Name_CVIX Index',
 'Index Name_EURUSD Curncy',
 'Index Name_EURUSDV1M Curncy',
 'Index Name_MOVE Index',
 'Index Name_SPX Index',
 'Index Name_SRVIX Index',
 'Index Name_SX5E Index',
 'Index Name_V2X Index',
 'Index Name_VIX Index']
nb_nontextfeatures = len(nontextual_cols)

In [15]:
# 60% train, 20% val, 20% test

returns_, returns_test, y_, y_test = train_test_split(
    returns, y, test_size=0.2, train_size=0.8,
    random_state=0, stratify=y
    )

returns_train, returns_val, y_train, y_val = train_test_split(
    returns_, y_, test_size=0.25, train_size=0.75,
    random_state=42, stratify=y_
    )

In [16]:
del returns, y
gc.collect()

0

# The textual data

In [17]:
ecb.head()

Unnamed: 0,title,speaker,text
0,Comments by Yves Mersch at Financial Services ...,Yves Mersch,Comments by Yves Mersch at Financial Service...
1,Securing sustained economic growth in the euro...,Vítor Constâncio,Securing sustained economic growth in the eu...
2,The role of monetary policy in addressing the ...,Mario Draghi,The role of monetary policy in addressing th...
3,The pandemic emergency: the three challenges f...,Philip R. Lane,SPEECH The pandemic emergency: the three c...
4,Transmission channels of monetary policy in th...,Peter Praet,Transmission channels of monetary policy in ...


In [18]:
fed.head()

Unnamed: 0,title,speaker,text
0,The Importance of Economic Education and Finan...,Governor Frederic S. Mishkin,As ...
1,Financial Innovation and Consumer Protection,Chairman Ben S. Bernanke,"The concept of financial innovation, it seems..."
2,Implementing Basel II in the United States,Governor Randall S. Kroszner,Good afternoon. I would like to thank Standar...
3,An Assessment of the U.S. Economy,Vice Chair for Supervision Randal K. Quarles,Thank you for the opportunity to take part in...
4,Monetary Policy since the Onset of the Crisis,Chairman Ben S. Bernanke,When we convened in Jackson Hole in August 20...


In [19]:
ecb["text_"] = ecb.apply(ecb_pipeline_en, axis=1)

In [20]:
ecb["text"].fillna("", inplace=True)
ecb["speaker"].fillna("Unknown", inplace=True)
fed["speaker"].fillna("Unknown", inplace=True)

In [21]:
# Text in french
ecb.loc[138]
# Text in german
ecb.loc[151]

title                         Auf neuen Wegen zum alten Ziel
speaker                                          Yves Mersch
text         Auf neuen Wegen zum alten Ziel   Rede von Yv...
text_      Rede von Yves Mersch, Mitglied des Direktorium...
Name: 151, dtype: object

In [22]:
ecb["lang"] = ecb["text_"].apply(fast_detect)

In [23]:
ecb.head()

Unnamed: 0,title,speaker,text,text_,lang
0,Comments by Yves Mersch at Financial Services ...,Yves Mersch,Comments by Yves Mersch at Financial Service...,Sustainable economic growth in the real econom...,en
1,Securing sustained economic growth in the euro...,Vítor Constâncio,Securing sustained economic growth in the eu...,"Ladies and Gentlemen, Thank you for inviting m...",en
2,The role of monetary policy in addressing the ...,Mario Draghi,The role of monetary policy in addressing th...,"There was a time, not too long ago, when centr...",en
3,The pandemic emergency: the three challenges f...,Philip R. Lane,SPEECH The pandemic emergency: the three c...,"Today, I will discuss the monetary policy meas...",en
4,Transmission channels of monetary policy in th...,Peter Praet,Transmission channels of monetary policy in ...,"Ladies and Gentlemen, Since the onset of the f...",en


In [24]:
fed["lang"] = fed["text"].apply(fast_detect)

In [25]:
fed.head()

Unnamed: 0,title,speaker,text,lang
0,The Importance of Economic Education and Finan...,Governor Frederic S. Mishkin,As ...,en
1,Financial Innovation and Consumer Protection,Chairman Ben S. Bernanke,"The concept of financial innovation, it seems...",en
2,Implementing Basel II in the United States,Governor Randall S. Kroszner,Good afternoon. I would like to thank Standar...,en
3,An Assessment of the U.S. Economy,Vice Chair for Supervision Randal K. Quarles,Thank you for the opportunity to take part in...,en
4,Monetary Policy since the Onset of the Crisis,Chairman Ben S. Bernanke,When we convened in Jackson Hole in August 20...,en


In [26]:
all_langs = ecb["lang"].unique()

In [27]:
ecb["lang"].value_counts()

en    1646
de      75
fr      31
es      16
it       4
Name: lang, dtype: int64

# Translation

# HuggingFace Transformers

We test hierarchical BERT here.

In [38]:
from huggingface_hub import login

In [39]:
login(token='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')

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to C:\Users\huuta\.huggingface\token
Login successful


In [43]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [45]:
AutoTokenizer.from_pretrained?

[1;31mSignature:[0m
[0mAutoTokenizer[0m[1;33m.[0m[0mfrom_pretrained[0m[1;33m([0m[1;33m
[0m    [0mpretrained_model_name_or_path[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[0minputs[0m[1;33m,[0m[1;33m
[0m    [1;33m**[0m[0mkwargs[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
falling back to using pattern matching on `pretrained_model_name_or_path`:

    - **albert** -- [`AlbertTokenizerFast`] (ALBERT model)
    - **bart** -- [`BartTokenizer`] or [`BartTokenizerFast`] (BART model)
    - **barthez** -- [`BarthezTokenizerFast`] (BARThez model)
    - **bartpho** -- [`BartphoTokenizer`] (BARTpho model)
    - **bert** -- [`Be

In [44]:
tokenizer = AutoTokenizer.from_pretrained("kiddothe2b/hierarchical-transformer-base-4096", trust_remote_code=True)

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


AttributeError: type object 'HATTokenizer' has no attribute 'register_for_auto_class'

In [37]:
model = AutoModelForMaskedLM.from_pretrained("kiddothe2b/hierarchical-transformer-base-4096",
                                             trust_remote_code=True)


Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


Downloading:   0%|          | 0.00/105k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/766M [00:00<?, ?B/s]

# Loading data

In [26]:
from model.framework_dataset import get_data_loader
from model.framework_model import CorpusEncoder, ClassificationHead, MyModel

In [27]:
config = {

    "method": "model_01",

    "learning_rate": 0.001,

    "weight_decay": 0.,

    "batch_size": 2,

    "layers": 3,

    "dropout": 0.3,

    "separate": True,
    
    "max_corpus_len": 2

}

In [28]:
train_set, train_loader, tokenizer, steps = get_data_loader(
    returns_train, ecb, fed, y_train, method=config["method"],
    separate=config["separate"], max_corpus_len=config["max_corpus_len"],
    batch_size=config["batch_size"]
)

val_set, val_loader, tokenizer, steps = get_data_loader(
    returns_val, ecb, fed, y_val, method=config["method"],
    separate=config["separate"], max_corpus_len=config["max_corpus_len"],
    batch_size=config["batch_size"]
)

test_set, test_loader, tokenizer, steps = get_data_loader(
    returns_test, ecb, fed, y_test, method=config["method"],
    separate=config["separate"], max_corpus_len=config["max_corpus_len"],
    batch_size=config["batch_size"]
)

# Loading model

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [30]:
ce = CorpusEncoder(method=config["method"],
                   separate=config["separate"],
                   dropout=config["dropout"]).to(device)
clf = ClassificationHead(
    corpus_emb_dim=ce.corpus_emb_dim, nontext_dim=nb_nontextfeatures,
    layers=config["max_corpus_len"], dropout=config["dropout"]
).to(device)
my_model = MyModel(
    nontext_dim=nb_nontextfeatures, method=config["method"],
    separate=config["separate"], dropout=config["dropout"]
).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- T

In [31]:
# Test output
batch = next(iter(val_loader))

with torch.no_grad():
    ce.eval()
    X_ecb = batch["X_ecb"].to(device)
    X_ecb_att = batch["X_ecb_mask"].to(device)
    X_fed = batch["X_fed"].to(device)
    X_fed_att = batch["X_fed_mask"].to(device)
    X_ind =  batch["X_ind"].to(device)
    y = batch["label"]
    X_text = (X_ecb, X_fed)
    X_att = (X_ecb_att, X_fed_att)
    ce_output = ce(X_text, X_att)
    print("Computed output successfully. ce ouput = \n", ce_output)
    print("corpus encoder ouput shape = ", ce_output.size(), "\ncorpus embed dim = ", ce.corpus_emb_dim)
    print(X_ind.size())


    clf_output = clf(ce_output, X_ind)
    
    print("Classifier output =  \n", clf_output)
    my_model_output = my_model(X_text, X_att, X_ind)

print(X_ecb.size())
print(X_ecb_att.size())
print(y)

Computed output successfully. ce ouput = 
 tensor([[ 0.1655,  0.5805,  0.1516,  0.1980,  0.1799, -0.0704,  0.0039,  0.2648,
         -0.1464, -0.4822, -0.0637, -0.0381,  0.1207,  0.0595, -0.1349, -0.0389,
         -0.4814, -0.4484, -0.2856,  0.5924, -0.0681,  0.0799, -0.0182,  0.1669,
          0.0192, -0.2786, -0.0716, -0.0894,  0.2287, -0.2295, -0.1108,  0.1308,
          0.0775, -0.4293, -0.0793,  0.3174,  0.0321,  0.2328,  0.2982, -0.0495,
         -0.1027,  0.1390,  0.2921,  0.2360, -0.2802,  0.0855, -0.0006, -0.1196,
          0.3278, -0.4202,  0.2727,  0.1209, -0.0837,  0.3232,  0.0220, -0.0669,
          0.0854, -0.3017,  0.1084, -0.0993,  0.3060,  0.1382,  0.2276, -0.0841],
        [ 0.2190,  0.6202,  0.0863,  0.1179,  0.2088,  0.0798, -0.0274,  0.3590,
         -0.0182, -0.4043, -0.0105, -0.1185,  0.1783,  0.1159, -0.1525, -0.0539,
         -0.5589, -0.4120, -0.4387,  0.4396, -0.0862, -0.0060, -0.0561,  0.2117,
         -0.0965, -0.3359,  0.0290, -0.1410,  0.2277, -0.2775, -0

# Other example

In [32]:
config = {

    "method": "model_01",

    "learning_rate": 0.001,

    "weight_decay": 0.,

    "batch_size": 2,

    "layers": 3,

    "dropout": 0.5,

    "separate": False,
    
    "max_corpus_len": 2


}

In [33]:
train_set, train_loader, tokenizer, steps = get_data_loader(
    returns_train, ecb, fed, y_train, method=config["method"],
    separate=config["separate"], max_corpus_len=config["max_corpus_len"],
    batch_size=config["batch_size"]
)

val_set, val_loader, tokenizer, steps = get_data_loader(
    returns_val, ecb, fed, y_val, method=config["method"],
    separate=config["separate"], max_corpus_len=config["max_corpus_len"],
    batch_size=config["batch_size"]
)

test_set, test_loader, tokenizer, steps = get_data_loader(
    returns_test, ecb, fed, y_test, method=config["method"],
    separate=config["separate"], max_corpus_len=config["max_corpus_len"],
    batch_size=config["batch_size"]
)

In [34]:
ce = CorpusEncoder(method=config["method"],
                   separate=config["separate"],
                   dropout=config["dropout"]).to(device)
clf = ClassificationHead(
    corpus_emb_dim=ce.corpus_emb_dim, nontext_dim=nb_nontextfeatures,
    layers=config["max_corpus_len"], dropout=config["dropout"]
).to(device)
my_model = MyModel(
    nontext_dim=nb_nontextfeatures, method=config["method"],
    separate=config["separate"], dropout=config["dropout"]
).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- T

In [35]:
z = torch.range(0, 15)
print(z)
z.view(4, 4)

tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
        14., 15.])


  z = torch.range(0, 15)


tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.],
        [12., 13., 14., 15.]])

In [36]:
# Test output
batch = next(iter(val_loader))

with torch.no_grad():
    ce.eval()
    X_text = (batch["X_text"].to(device),)
    X_mask = (batch["X_mask"].to(device),)
    X_ind =  batch["X_ind"].to(device)
    y = batch["label"]
    ce_output = ce(X_text, X_mask)
    print("Computed output successfully. ce ouput = \n", ce_output)
    print("corpus encoder ouput shape = ", ce_output.size(), "\ncorpus embed dim = ", ce.corpus_emb_dim)
    print(X_ind.size())


    clf_output = clf(ce_output, X_ind)
    
    print("Classifier output =  \n", clf_output)
    my_model_output = my_model(X_text, X_mask, X_ind)

print(X_ecb)
print(X_ecb_att)
print(y)

Computed output successfully. ce ouput = 
 tensor([[ 0.0680, -0.4876, -0.3666, -0.1956,  0.1132,  0.0946,  0.1756, -0.0340,
         -0.0052,  0.3330,  0.1639, -0.0179, -0.0024,  0.3258, -0.1272,  0.1548,
          0.0966, -0.0413, -0.2932,  0.2296, -0.7422,  0.1561, -0.1577,  0.0229,
          0.6316, -0.1441,  0.0673, -0.0629, -0.0406,  0.0709, -0.5268, -0.0249],
        [ 0.0360, -0.5765, -0.3424, -0.1569,  0.1195,  0.1085,  0.2071, -0.0058,
         -0.0093,  0.2382,  0.0986,  0.0924,  0.0303,  0.3314, -0.1299,  0.1669,
          0.0670, -0.0543, -0.2332,  0.2621, -0.5938,  0.0526, -0.2652,  0.1079,
          0.6401, -0.0916,  0.1027,  0.0037,  0.0228,  0.1233, -0.5413,  0.0193]],
       device='cuda:0', dtype=torch.float64)
corpus encoder ouput shape =  torch.Size([2, 32]) 
corpus embed dim =  32
torch.Size([2, 19])
Classifier output =  
 tensor([0.5068, 0.5185], device='cuda:0')
tensor([[[  101,  1996, 23889,  ...,  4675,  7011,   102],
         [  101,   102,     0,  ...,     0,

# Testing train code

In [37]:
from train import train, evaluate

In [38]:
train(my_model, train_loader=train_loader, val_loader=val_loader,config=config,
      device=device, max_epochs=1, eval_every=1, name="dummy")

Epoch 0:   2%|▏         | 58/2684 [01:16<55:43,  1.27s/batch, accuracy=49.1, loss=0.703]  