In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import T5Model
from transformers import BertTokenizer, Trainer, BertForSequenceClassification, TrainingArguments,  BertForPreTraining
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk

In [23]:
data = pd.read_csv("all_ECB_speeches.csv", sep="|", encoding="utf-8")
print("Shape DataFrame: ", data.shape)
print("Data size: ", data.shape[0])
data_drop_nan = data.dropna(subset=data.columns)
print("Dropando NaN")
print("Data size: ", data_drop_nan.shape[0] )
data.head()


Shape DataFrame:  (2772, 5)
Data size:  2772
Dropando NaN
Data size:  2683


Unnamed: 0,date,speakers,title,subtitle,contents
0,2024-02-26,Christine Lagarde,European Parliament plenary debate on the ECB ...,"Speech by Christine Lagarde, President of the ...",SPEECH European Parliament plenary debate ...
1,2024-02-23,Isabel Schnabel,Has the fight against inflation been won?,"Slides by Isabel Schnabel, Member of the Execu...",
2,2024-02-16,Isabel Schnabel,From laggard to leader? Closing the euro area’...,Inaugural lecture of the EMU Lab by Isabel Sch...,SPEECH From laggard to leader? Closing the ...
3,2024-02-15,Philip R. Lane,The banking channel of monetary policy,"Slides by Philip R. Lane, Member of the Execut...",
4,2024-02-15,Christine Lagarde,Hearing of the Committee on Economic and Monet...,"Speech by Christine Lagarde, President of the ...",SPEECH Hearing of the Committee on Economic...


In [4]:
model = BertForSequenceClassification.from_pretrained("ProsusAI/finbert", num_labels=3)
tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert")
labels = model.config.id2label
print("Maximum sequence length of 512 tokens")
print("Etiquetas do modelo:", labels)

Maximum sequence length of 512 tokens
Etiquetas do modelo: {0: 'positive', 1: 'negative', 2: 'neutral'}


In [166]:
exemplo_1 = " Pre-tax loss totaled euro 0.3 million , compared to a loss of euro 2.2 million in the first quarter of 2005 ."
exemplo_2 = " This implementation is very important to the operator , since it is about to launch its Fixed to Mobile convergence service in Brazil "
exemplo_3 = " The situation of coated magazine printing paper will continue to be weak ."
exemplo_list = [exemplo_1, exemplo_2, exemplo_3]
exemplo_labels = ["positive", "neutral", "negative"]

In [167]:
for exemplo, exemplo_label in zip(exemplo_list, exemplo_labels):
    inputs = tokenizer(exemplo, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    outputs = model(**inputs)
    last_hidden_states = outputs[0]
    prediction = labels[outputs.logits.argmax(dim=1).item()]
    print("Last hidden states: ", last_hidden_states)
    print("Prediction: ",prediction, end = "               ")
    print("True Value: ", exemplo_label)

Last hidden states:  tensor([[ 0.3164,  1.3494, -2.2071]], grad_fn=<AddmmBackward0>)
Prediction:  negative               True Value:  positive
Last hidden states:  tensor([[ 1.7332, -2.9581,  0.7627]], grad_fn=<AddmmBackward0>)
Prediction:  positive               True Value:  neutral
Last hidden states:  tensor([[-1.4422,  1.9802, -0.2990]], grad_fn=<AddmmBackward0>)
Prediction:  negative               True Value:  negative


In [183]:
for exemplo, exemplo_label in zip(exemplo_list, exemplo_labels):
    exemplo = exemplo.split(",")
    sum_states = []
    for sentence in exemplo:    
        inputs = tokenizer(sentence, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
        outputs = model(**inputs)
        sum_states.append(np.array(outputs[0].tolist())) 
    sum_states = np.sum(sum_states, axis=0)  
    prediction = labels[np.argmax(sum_states)]
    print("Last hidden states: ", sum_states)
    print("Prediction: ", prediction, end="               ")
    print("True Value: ", exemplo_label)

Last hidden states:  [[-0.29729581  1.20301848 -0.62639463]]
Prediction:  negative               True Value:  positive
Last hidden states:  [[ 1.57148963 -5.26005149  3.76658833]]
Prediction:  neutral               True Value:  neutral
Last hidden states:  [[-1.4422096   1.98019993 -0.2990008 ]]
Prediction:  negative               True Value:  negative


In [145]:
tokens = tokenizer.tokenize(exemplo_1)
print(tokens, type(tokens))
num_tokens = len(tokens)
print(num_tokens)

['pre', '-', 'tax', 'loss', 'totaled', 'euro', '0', '.', '3', 'million', ',', 'compared', 'to', 'a', 'loss', 'of', 'euro', '2', '.', '2', 'million', 'in', 'the', 'first', 'quarter', 'of', '2005', '.'] <class 'list'>
28


In [146]:
predictions_0 = []
for content in data["contents"]:
    if type(content) == float:
        predictions_0.append(np.nan) 
    else:
        inputs = tokenizer(content, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
        outputs = model(**inputs)
        predictions_0.append(labels[outputs.logits.argmax(dim=1).item()])

In [147]:
print(predictions_0)
print(len(predictions_0))

['negative', nan, 'neutral', nan, 'negative', nan, 'neutral', 'negative', 'neutral', 'neutral', 'neutral', 'negative', nan, 'neutral', nan, 'negative', nan, 'neutral', 'neutral', 'neutral', nan, 'negative', nan, 'neutral', nan, 'neutral', nan, nan, 'neutral', 'neutral', 'neutral', 'negative', 'neutral', nan, 'negative', nan, 'negative', 'neutral', 'positive', 'negative', 'negative', 'neutral', nan, 'neutral', nan, 'negative', 'neutral', 'negative', 'neutral', 'neutral', nan, 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'positive', 'neutral', 'neutral', 'negative', 'negative', 'negative', 'neutral', 'neutral', 'neutral', 'negative', nan, 'negative', 'positive', 'positive', 'negative', 'positive', 'positive', nan, 'positive', 'neutral', 'positive', 'positive', 'positive', nan, 'neutral', nan, nan, 'positive', nan, 'neutral', 'neutral', 'neutral', nan, nan, 'positive', 'neutral', 'positive', nan, 'negative', nan, 'negative', 'neutral', nan, nan, 'neutral',

In [150]:
predictions_1 = []
for content in data["contents"]:
    if type(content) == float:
        predictions_1.append(np.nan) 
    else:
        inputs = tokenizer(content, return_tensors="pt", max_length=256, truncation=True, padding="max_length")
        outputs = model(**inputs)
        predictions_1.append(labels[outputs.logits.argmax(dim=1).item()])

In [151]:
print(predictions_1)
print(len(predictions_1))

['positive', nan, 'positive', nan, 'neutral', nan, 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'negative', nan, 'neutral', nan, 'positive', nan, 'negative', 'neutral', 'neutral', nan, 'negative', nan, 'neutral', nan, 'negative', nan, nan, 'positive', 'neutral', 'neutral', 'negative', 'neutral', nan, 'negative', nan, 'negative', 'neutral', 'positive', 'positive', 'negative', 'neutral', nan, 'positive', nan, 'positive', 'neutral', 'positive', 'neutral', 'neutral', nan, 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'negative', 'negative', 'positive', 'positive', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'positive', 'neutral', 'positive', nan, 'negative', 'neutral', 'positive', 'positive', 'positive', 'neutral', nan, 'positive', 'neutral', 'positive', 'positive', 'negative', nan, 'neutral', nan, nan, 'neutral', nan, 'neutral', 'neutral', 'neutral', nan, nan, 'neutral', 'neutral', 'negative', nan, 'positive', nan, 'neutral', 'neutral', nan, nan, 'neutral', 

In [152]:
predictions_2 = []
for content in data["contents"]:
    if type(content) == float:
        predictions_2.append(np.nan) 
    else:
        inputs = tokenizer(content, return_tensors="pt", max_length=128, truncation=True, padding="max_length")
        outputs = model(**inputs)
        predictions_2.append(labels[outputs.logits.argmax(dim=1).item()])
print(predictions_2)
print(len(predictions_2))

['positive', nan, 'positive', nan, 'positive', nan, 'positive', 'positive', 'neutral', 'neutral', 'neutral', 'positive', nan, 'neutral', nan, 'neutral', nan, 'negative', 'positive', 'neutral', nan, 'negative', nan, 'neutral', nan, 'neutral', nan, nan, 'neutral', 'neutral', 'positive', 'neutral', 'neutral', nan, 'negative', nan, 'negative', 'neutral', 'positive', 'positive', 'neutral', 'neutral', nan, 'positive', nan, 'positive', 'positive', 'neutral', 'neutral', 'neutral', nan, 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'negative', 'negative', 'negative', 'positive', 'neutral', 'neutral', 'negative', 'neutral', 'positive', 'positive', 'positive', 'positive', nan, 'negative', 'neutral', 'positive', 'positive', 'positive', 'neutral', nan, 'neutral', 'neutral', 'negative', 'positive', 'negative', nan, 'neutral', nan, nan, 'positive', nan, 'neutral', 'neutral', 'neutral', nan, nan, 'neutral', 'neutral', 'positive', nan, 'positive', nan, 'neutral', 'neutral', nan, nan, 'negativ

In [153]:
predictions_3 = []
for content in data["contents"]:
    if type(content) == float:
        predictions_3.append(np.nan) 
    else:
        inputs = tokenizer(content, return_tensors="pt", max_length=64, truncation=True, padding="max_length")
        outputs = model(**inputs)
        predictions_3.append(labels[outputs.logits.argmax(dim=1).item()])
print(predictions_3)
print(len(predictions_3))

['positive', nan, 'neutral', nan, 'neutral', nan, 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', nan, 'neutral', nan, 'neutral', nan, 'neutral', 'neutral', 'positive', nan, 'negative', nan, 'neutral', nan, 'positive', nan, nan, 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', nan, 'neutral', nan, 'negative', 'neutral', 'positive', 'positive', 'neutral', 'neutral', nan, 'positive', nan, 'positive', 'neutral', 'neutral', 'neutral', 'neutral', nan, 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'neutral', 'neutral', 'negative', 'positive', 'neutral', 'positive', 'negative', 'neutral', 'positive', 'neutral', 'neutral', 'positive', nan, 'negative', 'positive', 'positive', 'neutral', 'neutral', 'neutral', nan, 'neutral', 'neutral', 'positive', 'neutral', 'positive', nan, 'neutral', nan, nan, 'neutral', nan, 'neutral', 'neutral', 'neutral', nan, nan, 'positive', 'neutral', 'positive', nan, 'neutral', nan, 'neutral', 'positive', nan, nan, 'neutral', nan, 'nega

In [178]:
for exemplo, exemplo_label in zip(exemplo_list, exemplo_labels):
    exemplo = exemplo.split(",")
    sum_states = []
    for sentence in exemplo:    
        inputs = tokenizer(sentence, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
        outputs = model(**inputs)
        sum_states.append(np.array(outputs[0].tolist())) 
    sum_states = np.sum(sum_states, axis=0)  
    prediction = labels[np.argmax(sum_states)]
    print("Last hidden states: ", sum_states)
    print("Prediction: ", prediction, end="               ")
    print("True Value: ", exemplo_label)

Last hidden states:  [[-1.6210910081863403, 1.2126089334487915, 1.554442048072815], [1.323795199394226, -0.009590454399585724, -2.1808366775512695]]
Prediction:  negative               True Value:  positive
Last hidden states:  [[0.8837599754333496, -2.6314079761505127, 1.7413891553878784], [0.6877296566963196, -2.62864351272583, 2.0251991748809814]]
Prediction:  positive               True Value:  neutral
Last hidden states:  [[-1.4422096014022827, 1.980199933052063, -0.2990007996559143]]
Prediction:  positive               True Value:  negative


In [29]:
predictions_teste = []
for content in data["contents"][:1]:
    if type(content) == float:
        predictions_teste.append(np.nan) 
    else:
        content = content.split(".")
        sum_states = []
        for sentence in content:
            inputs = tokenizer(sentence, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
            outputs = model(**inputs)
            sum_states.append(np.array(outputs[0].tolist()))
    sum_states = np.sum(sum_states, axis=0)  
    prediction = labels[np.argmax(sum_states)]
    predictions_teste.append(prediction)

print(predictions_teste)
print(len(predictions_teste))


[]
0


In [30]:
print("Last hidden states: ", sum_states)
print("Prediction: ", prediction, end="               ")

Last hidden states:  [[  61.16585604 -115.49471026   34.67863581]]
Prediction:  positive               