In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.optim import Adam

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from preprocessing.preprocessing import ecb_pipeline_en, fast_detect

import time

import gc

from tqdm import tqdm


# Pin the default floating-point dtype used for newly created tensors to float32.
torch.set_default_dtype(torch.float32)

In [2]:
# Relative paths of the three CSV inputs loaded in the next cell.
# Series data that is later one-hot encoded, labelled and split.
FILENAME = "data/train_series.csv"
# ECB speeches (the preview further down shows title / speaker / text columns).
FILENAME_ECB = "data/ecb_data.csv"
# FED speeches (same visible columns as the ECB file).
FILENAME_FED = "data/fed_data.csv"

In [3]:
# Read the three CSV inputs, using the first column of each file as the row index.
returns, ecb, fed = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (FILENAME, FILENAME_ECB, FILENAME_FED)
)

In [4]:
# One-hot encode the "Index Name" column into "Index Name_<value>" indicator columns.
# The dtype is pinned to uint8 (the pre-2.0 pandas default, matching the 0/1 values in
# the recorded output): since pandas 2.0 the default is bool, which would silently turn
# these numeric features into booleans.
# NOTE: this cell is not idempotent; a second run fails because "Index Name" is gone.
returns = pd.get_dummies(returns, columns=["Index Name"], dtype=np.uint8)

In [5]:
# Binary target: 1 where "Index + 1" is strictly positive, 0 otherwise.
returns["Sign"] = returns["Index + 1"].gt(0).astype(int)

In [6]:
# Preview the first rows after one-hot encoding and adding the "Sign" label.
returns.head()

Unnamed: 0,Index - 9,Index - 8,Index - 7,Index - 6,Index - 5,Index - 4,Index - 3,Index - 2,Index - 1,Index - 0,...,Index Name_CVIX Index,Index Name_EURUSD Curncy,Index Name_EURUSDV1M Curncy,Index Name_MOVE Index,Index Name_SPX Index,Index Name_SRVIX Index,Index Name_SX5E Index,Index Name_V2X Index,Index Name_VIX Index,Sign
0,0.001045,0.005841,0.003832,-0.027519,-0.103565,-0.045086,-0.011265,0.005164,0.05405,0.015779,...,0,0,0,0,0,0,0,1,0,1
1,-0.021497,0.007891,-0.013175,-0.008436,0.0,0.026303,0.000556,0.001455,0.007422,0.0,...,0,0,0,1,0,0,0,0,0,1
2,-0.001872,-0.008154,0.023588,0.004086,0.003493,0.0033,0.000885,-0.011304,0.00504,0.000156,...,0,0,0,0,1,0,0,0,0,1
3,0.00498,-0.000864,0.001677,0.0,0.00603,-0.001083,0.000419,0.001492,0.001018,-0.002582,...,0,0,0,0,1,0,0,0,0,1
4,0.00036,-0.001893,0.005579,-0.003056,-0.001171,-0.001623,-0.00235,-0.006444,-0.000729,-0.000365,...,0,1,0,0,0,0,0,0,0,1


In [7]:
# Keep the binary label as the classification target; it is removed from `returns` further down.
y = returns["Sign"]

In [8]:
# Class balance of the target (count of rows per label value).
y.value_counts()

0    4930
1    4016
Name: Sign, dtype: int64

In [9]:
# Remove the label and the column it was derived from, so neither is left among the features.
returns = returns.drop(columns=["Sign", "Index + 1"])

In [10]:
# List the remaining feature columns.
returns.columns

Index(['Index - 9', 'Index - 8', 'Index - 7', 'Index - 6', 'Index - 5',
       'Index - 4', 'Index - 3', 'Index - 2', 'Index - 1', 'Index - 0',
       'index ecb', 'index fed', 'Index Name_CVIX Index',
       'Index Name_EURUSD Curncy', 'Index Name_EURUSDV1M Curncy',
       'Index Name_MOVE Index', 'Index Name_SPX Index',
       'Index Name_SRVIX Index', 'Index Name_SX5E Index',
       'Index Name_V2X Index', 'Index Name_VIX Index'],
      dtype='object')

In [11]:
# Names of the feature columns other than 'index ecb' / 'index fed':
# the ten "Index - k" columns (k = 9 down to 0) followed by the nine
# "Index Name_<ticker>" one-hot indicator columns.
nontextual_cols = [
    *(f"Index - {lag}" for lag in reversed(range(10))),
    *(
        f"Index Name_{ticker}"
        for ticker in (
            "CVIX Index",
            "EURUSD Curncy",
            "EURUSDV1M Curncy",
            "MOVE Index",
            "SPX Index",
            "SRVIX Index",
            "SX5E Index",
            "V2X Index",
            "VIX Index",
        )
    ),
]
# Number of such columns; passed later as `nontext_dim` when the model is built.
nb_nontextfeatures = len(nontextual_cols)

In [12]:
# 60% train, 20% val, 20% test

# Step 1: hold out 20% of the rows as the test split, stratified on the label.
# The remaining 80% is kept in `returns_` / `y_` for the second split.
returns_, returns_test, y_, y_test = train_test_split(
    returns, y, test_size=0.2, train_size=0.8,
    random_state=0, stratify=y
    )

# Step 2: split the remaining 80% into 75% / 25%, i.e. 60% / 20% of the full data,
# again stratified on the label. Note the seed differs from step 1 (42 vs 0).
returns_train, returns_val, y_train, y_val = train_test_split(
    returns_, y_, test_size=0.25, train_size=0.75,
    random_state=42, stratify=y_
    )

In [13]:
# The unsplit frame and target are no longer needed once the splits exist;
# drop the names and ask the garbage collector to run.
# NOTE: after this cell the earlier cells that use `returns` / `y` cannot be
# re-run without reloading the data first.
del returns
del y
gc.collect()

0

# The textual data

In [14]:
# Preview the raw ECB speeches.
ecb.head()

Unnamed: 0,title,speaker,text
0,Comments by Yves Mersch at Financial Services ...,Yves Mersch,Comments by Yves Mersch at Financial Service...
1,Securing sustained economic growth in the euro...,Vítor Constâncio,Securing sustained economic growth in the eu...
2,The role of monetary policy in addressing the ...,Mario Draghi,The role of monetary policy in addressing th...
3,The pandemic emergency: the three challenges f...,Philip R. Lane,SPEECH The pandemic emergency: the three c...
4,Transmission channels of monetary policy in th...,Peter Praet,Transmission channels of monetary policy in ...


In [15]:
# Preview the raw FED speeches.
fed.head()

Unnamed: 0,title,speaker,text
0,The Importance of Economic Education and Finan...,Governor Frederic S. Mishkin,As ...
1,Financial Innovation and Consumer Protection,Chairman Ben S. Bernanke,"The concept of financial innovation, it seems..."
2,Implementing Basel II in the United States,Governor Randall S. Kroszner,Good afternoon. I would like to thank Standar...
3,An Assessment of the U.S. Economy,Vice Chair for Supervision Randal K. Quarles,Thank you for the opportunity to take part in...
4,Monetary Policy since the Onset of the Crisis,Chairman Ben S. Bernanke,When we convened in Jackson Hole in August 20...


In [16]:
# Run the project's `ecb_pipeline_en` on every row (axis=1) and store the result
# in a new `text_` column; the original `text` column is left untouched.
# NOTE(review): this runs before the missing values in `text` / `speaker` are
# filled in the next cell. Confirm the pipeline tolerates NaN inputs.
ecb["text_"] = ecb.apply(ecb_pipeline_en, axis=1)

In [17]:
# Replace missing values: an empty string for ECB texts and "Unknown" for
# speakers in both corpora.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on a selected column. That form is chained
# assignment: it raises a FutureWarning in pandas 2.2 and leaves the frame
# unmodified under Copy-on-Write.
# NOTE(review): `text_` was derived from `text` in the previous cell, before
# this fill. Confirm that ordering is intended.
ecb["text"] = ecb["text"].fillna("")
ecb["speaker"] = ecb["speaker"].fillna("Unknown")
fed["speaker"] = fed["speaker"].fillna("Unknown")

In [18]:
# Example of a speech in French (row label 138).
# NOTE(review): a cell only renders its last expression, so this row is
# evaluated but never displayed.
ecb.loc[138]
# Example of a speech in German (row label 151); this is the row shown in the output.
ecb.loc[151]

title                         Auf neuen Wegen zum alten Ziel
speaker                                          Yves Mersch
text         Auf neuen Wegen zum alten Ziel   Rede von Yv...
text_      Rede von Yves Mersch, Mitglied des Direktorium...
Name: 151, dtype: object

In [19]:
# Tag each ECB speech with a language code by applying the project's `fast_detect`
# to the `text_` column (the counts further down show codes such as "en", "de", "fr").
ecb["lang"] = ecb["text_"].apply(fast_detect)

In [20]:
# Preview the ECB frame with the new `text_` and `lang` columns.
ecb.head()

Unnamed: 0,title,speaker,text,text_,lang
0,Comments by Yves Mersch at Financial Services ...,Yves Mersch,Comments by Yves Mersch at Financial Service...,Sustainable economic growth in the real econom...,en
1,Securing sustained economic growth in the euro...,Vítor Constâncio,Securing sustained economic growth in the eu...,"Ladies and Gentlemen, Thank you for inviting m...",en
2,The role of monetary policy in addressing the ...,Mario Draghi,The role of monetary policy in addressing th...,"There was a time, not too long ago, when centr...",en
3,The pandemic emergency: the three challenges f...,Philip R. Lane,SPEECH The pandemic emergency: the three c...,"Today, I will discuss the monetary policy meas...",en
4,Transmission channels of monetary policy in th...,Peter Praet,Transmission channels of monetary policy in ...,"Ladies and Gentlemen, Since the onset of the f...",en


In [21]:
# Same language tagging for the FED speeches. Unlike the ECB frame, this runs on
# the raw `text` column (no `text_` column is created for `fed`).
fed["lang"] = fed["text"].apply(fast_detect)

In [22]:
# Preview the FED frame with the new `lang` column.
fed.head()

Unnamed: 0,title,speaker,text,lang
0,The Importance of Economic Education and Finan...,Governor Frederic S. Mishkin,As ...,en
1,Financial Innovation and Consumer Protection,Chairman Ben S. Bernanke,"The concept of financial innovation, it seems...",en
2,Implementing Basel II in the United States,Governor Randall S. Kroszner,Good afternoon. I would like to thank Standar...,en
3,An Assessment of the U.S. Economy,Vice Chair for Supervision Randal K. Quarles,Thank you for the opportunity to take part in...,en
4,Monetary Policy since the Onset of the Crisis,Chairman Ben S. Bernanke,When we convened in Jackson Hole in August 20...,en


In [23]:
# Distinct language codes detected in the ECB corpus.
all_langs = ecb["lang"].unique()

In [24]:
# Number of ECB speeches per detected language.
ecb["lang"].value_counts()

en    1645
de      75
fr      31
es      16
it       5
Name: lang, dtype: int64

# Translation

# HuggingFace Transformers

# Loading data

In [25]:
from model.framework_dataset import get_data_loader
from model.framework_model import CorpusEncoder, ClassificationHead, MyModel

In [26]:
# Settings for the first run. In the cells that follow, "method", "separate",
# "max_corpus_len" and "batch_size" are forwarded to `get_data_loader`.
config = dict(
    method="model_01",
    learning_rate=0.001,
    weight_decay=0.0,
    batch_size=2,
    layers=3,
    dropout=0.3,
    separate=True,
    max_corpus_len=2,
)

In [27]:
# Keyword arguments shared by the three splits; every value is read from `config`.
_loader_options = {
    option: config[option]
    for option in ("method", "separate", "max_corpus_len", "batch_size")
}

# Build one dataset/loader pair per split. `tokenizer` and `steps` are rebound by
# each call, so after this cell they hold the values returned for the test split.
train_set, train_loader, tokenizer, steps = get_data_loader(
    returns_train, ecb, fed, y_train, **_loader_options
)

val_set, val_loader, tokenizer, steps = get_data_loader(
    returns_val, ecb, fed, y_val, **_loader_options
)

test_set, test_loader, tokenizer, steps = get_data_loader(
    returns_test, ecb, fed, y_test, **_loader_options
)

# Loading model

In [28]:
from model.framework_model import MyModel

In [29]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# (To force CPU, set `device = torch.device("cpu")` instead.)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Second example: higher learning rate, weight decay and dropout

In [30]:
# Settings for the second run. Compared with the first configuration, only
# "learning_rate", "weight_decay" and "dropout" change. The dict is used below
# to build the data loaders and the model, and is passed to `train`.
config = dict(
    method="model_01",
    learning_rate=0.01,
    weight_decay=0.01,
    batch_size=2,
    layers=3,
    dropout=0.5,
    separate=True,
    max_corpus_len=2,
)

In [31]:
# Keyword arguments shared by the three splits; every value is read from the
# current `config`.
# NOTE(review): these four values are the same as in the first configuration,
# so this cell recomputes loaders built from identical arguments.
_loader_options = {
    option: config[option]
    for option in ("method", "separate", "max_corpus_len", "batch_size")
}

# `tokenizer` and `steps` are rebound by each call; after this cell they hold the
# values returned for the test split.
train_set, train_loader, tokenizer, steps = get_data_loader(
    returns_train, ecb, fed, y_train, **_loader_options
)

val_set, val_loader, tokenizer, steps = get_data_loader(
    returns_val, ecb, fed, y_val, **_loader_options
)

test_set, test_loader, tokenizer, steps = get_data_loader(
    returns_test, ecb, fed, y_test, **_loader_options
)

In [32]:
# Instantiate the project model from the current config, then move it to the
# selected device.
# NOTE(review): `config["layers"]` is not passed here. Confirm whether MyModel
# is meant to receive it.
model3 = MyModel(
    nontext_dim=nb_nontextfeatures,
    method=config["method"],
    separate=config["separate"],
    dropout=config["dropout"],
)
model3 = model3.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- T

In [33]:
from train import train, evaluate

In [34]:
# Short run of the project's `train` function: one epoch at most, with
# `eval_every=1`, under the run name "model_01_test".
# (The recorded output shows this run was interrupted by hand after a few batches.)
train(
    model=model3,
    train_loader=train_loader,
    val_loader=val_loader,
    config=config,
    device=device,
    max_epochs=1,
    eval_every=1,
    name="model_01_test",
)

Epoch 0:   0%|          | 12/2684 [00:40<2:28:40,  3.34s/batch, accuracy=66.7, loss=10.7]


KeyboardInterrupt: 