In [1]:
# %pip install scikit-learn torch datasets transformers

In [1]:
# Dataset import: read the train and test CSV files with the `datasets` CSV loader

from torch.utils.data.dataset import random_split
from datasets import load_dataset

# One CSV file per split name.
csv_files_by_split = {'train': 'df_dataset.csv', 'test': 'df_dataset_test.csv'}
raw_datasets = load_dataset('csv', data_files=csv_files_by_split)

Using custom data configuration default-402dc41faabe14b5


Downloading and preparing dataset csv/default to /home/arthurn/.cache/huggingface/datasets/csv/default-402dc41faabe14b5/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/arthurn/.cache/huggingface/datasets/csv/default-402dc41faabe14b5/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Basic data preparation

def drop_missing_texts(texts, labels):
    """Return copies of ``texts`` and ``labels`` without the rows whose text is None.

    Args:
        texts: Sequence of raw text entries, possibly containing None.
        labels: Sequence of labels aligned index-by-index with ``texts``.

    Returns:
        A ``(texts, labels)`` tuple of new lists holding only the rows whose
        text is not None, in their original order.
    """
    kept_rows = [(text, label) for text, label in zip(texts, labels) if text is not None]
    return [text for text, _ in kept_rows], [label for _, label in kept_rows]


# list(...) yields plain Python lists whatever column type `datasets` returns.
train_texts = list(raw_datasets['train']['txt'])
train_labels = list(raw_datasets['train']['has_anger'])
test_texts = list(raw_datasets['test']['txt'])
test_labels = list(raw_datasets['test']['has_anger'])

print("TrainTexts Length: ", len(train_texts))
print("TrainLabels Length: ", len(train_labels))
print("TestTexts Length: ", len(test_texts))
print("TestLabels Length: ", len(test_labels))

# Drop rows with a missing (None) text, keeping texts and labels aligned.
# Only the text is checked; rows with a None label are not filtered here.
# The train split is cleaned too, because the tokenizer cannot encode None.
train_texts, train_labels = drop_missing_texts(train_texts, train_labels)
test_texts, test_labels = drop_missing_texts(test_texts, test_labels)

print("TestTexts Post Processing Length: ", len(test_texts))
print("TestLabels Post Processing Length: ", len(test_labels))

TrainTexts Length:  7251
TrainLabels Length:  7251
TestTexts Length:  421
TestLabels Length:  421
TestTexts Post Processing Length:  419
TestLabels Post Processing Length:  419


In [3]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split (and every metric computed on it) is reproducible across runs.
SPLIT_SEED = 42

# NOTE(review): test_size=.96 keeps only 4% of the rows for training and holds out
# the remaining 96% for validation, presumably to keep CPU fine-tuning fast; confirm.
# NOTE: this cell overwrites train_texts/train_labels, so running it twice without
# re-running the data preparation cell shrinks the training set again.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=.96, random_state=SPLIT_SEED
)

In [4]:
# Report how many texts and labels ended up in the train and validation splits.
for caption, values in (
    ("TrainTexts Length: ", train_texts),
    ("TrainLabels Length: ", train_labels),
    ("ValidationTexts Length: ", val_texts),
    ("ValidationLabels Length: ", val_labels),
):
    print(caption, len(values))

TrainTexts Length:  290
TrainLabels Length:  290
ValidationTexts Length:  6961
ValidationLabels Length:  6961


In [5]:
# Loading the tokenizer

from transformers import AutoTokenizer

# Identifier of the pretrained checkpoint whose tokenizer is loaded.
tokenizer_checkpoint = 'neuralmind/bert-base-portuguese-cased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

Downloading:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/205k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [6]:
# Tokenizing the datasets

# The same options are used for every split: truncation and padding enabled,
# with a maximum length of 512.
tokenize_options = dict(truncation=True, padding=True, max_length=512)

train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)
test_encodings = tokenizer(test_texts, **tokenize_options)

In [7]:
# Voltando os datasets tokenizados para instâncias da classe de Dataset

import torch

class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a sequence indexable by sample
            position (one entry per sample).
        labels: Sequence of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus a ``'labels'`` tensor.
        """
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = torch.tensor(field_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Build one dataset per split from that split's encodings and labels.
train_dataset, val_dataset, test_dataset = (
    IMDbDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [8]:
# print(val_dataset.__getitem__(0))

In [9]:
# Fine-tuning the model

import torch
from torch.optim import AdamW  # transformers.AdamW is deprecated; recent releases no longer export it
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification

# Seed torch so the classifier-head initialisation and batch shuffling are reproducible.
TRAINING_SEED = 42
torch.manual_seed(TRAINING_SEED)

# Kept on CPU: the evaluation cells further down feed CPU tensors to the model.
device = 'cpu'

model = AutoModelForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', num_labels=2)
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# eps and weight_decay are set to the defaults of the old transformers.AdamW so the
# optimisation behaves as before (torch's own defaults are eps=1e-8, weight_decay=1e-2).
optim = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

for epoch in range(1):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # backward() can only create the gradient implicitly for a scalar; .sum()
        # guarantees a scalar even if the loss comes back with more than one element.
        # https://discuss.pytorch.org/t/loss-backward-raises-error-grad-can-be-implicitly-created-only-for-scalar-outputs/12152
        loss.sum().backward()
        optim.step()

# Switch to inference mode (disables dropout) for the evaluation cells below.
model.eval()

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [10]:
# Saving the model

# Path handed to save_pretrained.
saved_model_dir = "pretrained-model-bert-base-portuguese-cased-fine-tuning-1"
model.save_pretrained(saved_model_dir)

In [11]:
# Measuring the model's accuracy on the validation split

from torch import nn

model.eval()  # make sure dropout is disabled while predicting

acertos = 0
# no_grad: inference needs no autograd graph, which saves memory and time
# across the thousands of forward passes below.
with torch.no_grad():
    for text, expected_label in zip(val_texts, val_labels):
        # One text at a time, so no padding is needed; return_tensors='pt' yields
        # batched tensors (including the attention mask) ready for the model.
        tokenized_text = tokenizer(text, truncation=True, max_length=512, return_tensors='pt')
        tokenized_text = tokenized_text.to(model.device)
        output = model(**tokenized_text)
        predicted_label = torch.argmax(output.logits, dim=-1).item()
        if predicted_label == expected_label:
            acertos += 1

total = len(val_texts)
# Guard against an empty validation set instead of dividing by zero.
accuracy = acertos / total if total else 0.0

print("Acertos: ", acertos)
print("Total: ", total)
print(f'Acurácia: {accuracy * 100:.4f}%')

Acertos:  6496
Total:  6961
Acurácia: 93.3199%


In [12]:
# Predicting the class of a specific text

text = 'Texto aqui'

# return_tensors='pt' yields batched tensors (input ids plus attention mask);
# they are moved to whatever device the model currently lives on.
tokenized_text = tokenizer(text, truncation=True, max_length=512, return_tensors='pt').to(model.device)

# no_grad: a prediction does not need gradients.
with torch.no_grad():
    output = model(**tokenized_text)

# Index of the highest logit, i.e. the predicted class id.
print(torch.argmax(output.logits, dim=-1))

tensor([0])
