[Tutorial de referência - Kaggle](https://www.kaggle.com/code/maostack/clrp-how-to-get-text-embedding-from-roberta/notebook#Model)

In [None]:
%pip install -r requirements.txt

In [None]:
%cd /embeddings_dinamicas

In [5]:
import os
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import RobertaModel, RobertaTokenizer

In [6]:
class Settings:
    batch_size=16
    max_len=350
    device = "cuda" if torch.cuda.is_available() else "cpu"
    seed = 318

In [7]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.backends.cudnn.deterministic = True

set_seed(Settings.seed)

In [8]:
class TrainValidDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.text = df["textos"].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        texts = self.text[idx]
        tokenized = self.tokenizer.encode_plus(texts, truncation=True, add_special_tokens=True, max_length=self.max_len, padding="max_length")
        ids = tokenized["input_ids"]
        mask = tokenized["attention_mask"]
        return {
            "ids": torch.LongTensor(ids),
            "mask": torch.LongTensor(mask),
        }

In [9]:
class CommonLitRoBERTa(nn.Module):
    def __init__(self, pretrained_path):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(pretrained_path)

    def forward(self, ids, mask):
        output = self.roberta(ids, attention_mask=mask)
        return output

In [13]:
#!git clone https://huggingface.co/roberta-base

Cloning into 'roberta-base'...
remote: Enumerating objects: 81, done.[K
remote: Total 81 (delta 0), reused 0 (delta 0), pack-reused 81[K
Unpacking objects: 100% (81/81), 1.63 MiB | 5.81 MiB/s, done.


In [17]:
model = RobertaModel.from_pretrained("./roberta-base")

SafetensorError: Error while deserializing header: HeaderTooLarge

In [10]:
model = CommonLitRoBERTa("./roberta-base")
model.to(Settings.device)

SafetensorError: Error while deserializing header: HeaderTooLarge

In [31]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
tokenizer

RobertaTokenizer(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)

In [23]:
%cd "/../resumos-anotados"

[Errno 2] No such file or directory: '/../resumos-anotados'
/home/vira-tempo/Desktop/PIE-EMBRAPA/embeddings_dinamicas


In [20]:
import pandas as pd
import numpy as np

files = []
resumos = os.listdir('./')

files_dic = {"textos": resumos}
df_train = pd.DataFrame(files_dic)

In [29]:
def train_tokenizer(df_train, tokenizer):
  # load dataset and train data
  train_dataset = TrainValidDataset(df_train, tokenizer, Settings.max_len)
  train_loader = DataLoader(train_dataset, batch_size=Settings.batch_size, shuffle=True, num_workers=2, pin_memory=True)

  # inicialize batch
  batch = next(iter(train_loader))

  # set ids and masks
  ids = batch["ids"].to(Settings.device)
  mask = batch["mask"].to(Settings.device)
  print(ids.shape)
  print(mask.shape)

  # create model with ids and masks
  output = model(ids, mask)
  print(output)

  last_hidden_state = output[0]
  print("last_hidden_state shape:", last_hidden_state.shape)


  pooler_output = output[1]
  if(pooler_output.shape):
    print("pooler_output shape:", pooler_output.shape)

  cls_embeddings = last_hidden_state[:, 0, :].detach()
  print("cls_embeddings shape:", cls_embeddings.shape)
  print(cls_embeddings)
  pd.DataFrame(cls_embeddings.numpy()).head()

  print(last_hidden_state.shape)
  pooled_embeddings = last_hidden_state.detach().mean(dim=1)
  print("shape:", pooled_embeddings.shape)
  print("")
  print(pooled_embeddings)
  pd.DataFrame(pooled_embeddings.numpy()).head()

In [30]:
train_tokenizer(df_train, tokenizer)

torch.Size([3, 350])
torch.Size([3, 350])


NameError: name 'model' is not defined

In [10]:
from transformers import RobertaTokenizerFast

tokenizer_fast = RobertaTokenizerFast.from_pretrained("roberta-base")
tokenizer_fast

RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)

In [17]:
train_tokenizer(df_train, tokenizer_fast)

torch.Size([16, 350])
torch.Size([16, 350])
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-5.9299e-02,  3.7249e-02,  1.2489e-02,  ..., -1.1966e-01,
          -3.7666e-02,  8.4265e-03],
         [-4.0645e-03, -1.0995e-01, -3.2749e-02,  ..., -8.1397e-01,
          -1.4021e-01, -1.8434e-01],
         [-1.1272e-01,  1.9893e-01,  6.1200e-02,  ...,  1.8740e-01,
          -2.0491e-01, -3.6131e-03],
         ...,
         [ 4.9667e-02, -1.9686e-01,  3.2727e-02,  ..., -2.4733e-01,
          -7.6935e-02,  8.1675e-02],
         [ 4.9667e-02, -1.9686e-01,  3.2727e-02,  ..., -2.4733e-01,
          -7.6934e-02,  8.1675e-02],
         [ 4.9667e-02, -1.9686e-01,  3.2727e-02,  ..., -2.4733e-01,
          -7.6934e-02,  8.1675e-02]],

        [[-6.1533e-02,  3.8445e-02,  1.2164e-02,  ..., -1.1675e-01,
          -3.9379e-02,  1.1573e-02],
         [-3.6284e-03, -1.3299e-01, -2.5280e-02,  ..., -7.7106e-01,
          -1.2827e-01, -1.7892e-01],
         [-1.0336e-01,  1.7424e-01,  

In [18]:
from transformers import AutoTokenizer, RobertaForCausalLM, AutoConfig
import torch

tokenizer_causallm = AutoTokenizer.from_pretrained("roberta-base")
config = AutoConfig.from_pretrained("roberta-base")
config.is_decoder = True
model = RobertaForCausalLM.from_pretrained("roberta-base", config=config)
model.to(Settings.device)

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

RobertaForCausalLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [1]:
train_tokenizer(df_train, tokenizer_causallm)

NameError: ignored