In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; later cells read the dataset from a
# path under /content/drive/MyDrive.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Install / upgrade the third-party packages via shell commands.
# NOTE(review): no versions are pinned, so a re-run may resolve different
# releases than the ones that produced the saved outputs — consider pinning.
!pip install datasets
!pip install accelerate -U
!pip install transformers -U

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1

# Training GPT-2 Model with InfiniAttention Module

In [None]:
import torch
import torch.nn as nn
from transformers import GPT2Config, GPT2LMHeadModel
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention
from typing import Optional, Tuple, Union

# GPT-2 configuration with the library's default hyperparameters; later cells
# read hidden_size, num_attention_heads and n_positions from this object.
config = GPT2Config()

### Infini-attention module

In [None]:
class InfiniAttentionGPT2(GPT2Attention):
    """GPT-2 attention replacement that adds a per-head compressive memory.

    The input sequence is processed in consecutive chunks of at most
    ``segment_len`` tokens. For every chunk the module computes

    * causal scaled dot-product attention restricted to the chunk, and
    * a read from an associative memory that accumulates the keys/values of
      all previous chunks of the same ``forward`` call,

    and mixes both through a learned sigmoid gate (``betas``), one gate value
    per head and value channel. The memory is cleared at the start and at the
    end of every ``forward`` call, so nothing is carried across calls.

    The parent constructor still runs, so any parameters it creates stay part
    of the module even though this ``forward`` does not use them.

    Args:
        config: GPT-2 configuration forwarded to ``GPT2Attention``.
        num_heads: Number of attention heads.
        dim_k: Per-head key/query size.
        dim_v: Per-head value size.
        dim_input: Size of the incoming hidden states.
        segment_len: Maximum number of tokens per chunk.
        dropout: Dropout probability applied to the attention weights and to
            the projected output.
        is_cross_attention: Forwarded to ``GPT2Attention``.
        layer_idx: Forwarded to ``GPT2Attention``.
        eps: Constant added to the memory normaliser to avoid division by zero.
    """

    def __init__(
        self,
        config,
        num_heads,
        dim_k,
        dim_v,
        dim_input,
        segment_len,
        dropout=0.1,
        is_cross_attention=False,
        layer_idx=None,
        eps=1e-6,
    ):
        super().__init__(config, is_cross_attention, layer_idx)
        self.dim_k = dim_k
        self.dim_v = dim_v
        self.dim_input = dim_input
        self.num_heads = num_heads
        self.segment_len = segment_len
        self.dropout = dropout
        self.eps = eps

        self.proj_k = nn.Linear(dim_input, num_heads * dim_k, bias=False)
        self.proj_v = nn.Linear(dim_input, num_heads * dim_v, bias=False)
        self.proj_q = nn.Linear(dim_input, num_heads * dim_k, bias=False)
        self.proj_out = nn.Linear(num_heads * dim_v, dim_input, bias=False)

        # Legacy attribute kept for code that reads it. The module no longer
        # uses it to place tensors: the mask is a buffer and the memory is
        # created on the device of the input / parameters.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Upper-triangular (future-position) mask. Registered as a
        # non-persistent buffer so it follows .to()/.cuda() together with the
        # parameters without being written to the state dict.
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(segment_len, segment_len), diagonal=1),
            persistent=False,
        )

        self.betas = nn.Parameter(torch.randn(1, num_heads, 1, dim_v))
        self.dropout_layer = nn.Dropout(self.dropout)

        # self.memory: [1 or batch_size, num_heads, dim_k, dim_v]
        # self.z:      [1 or batch_size, num_heads, 1, dim_k]
        self._reset_memory()

    def forward(self, x, *args, **kwargs):
        """Run segment-wise attention over ``x``.

        Args:
            x: Hidden states of shape ``[batch_size, seq_len, dim_input]``.
            *args: Ignored.
            **kwargs: Only ``attention_mask`` is read. It is added to the raw
                attention scores, so it is expected to be a 4-D additive mask
                whose last axis indexes key positions of the full sequence.
                All other keyword arguments are ignored.

        Returns:
            A 1-tuple holding a tensor of shape
            ``[batch_size, seq_len, dim_input]``.
        """
        attention_mask = kwargs.get("attention_mask", None)
        # x:[batch_size, seq_len, dim_input]
        batch_size, seq_len, _ = x.shape

        # Start from a clean memory on the input's device/dtype, even if a
        # previous call raised before reaching its final reset.
        self._reset_memory(device=x.device, dtype=x.dtype)

        out = []
        # Stepping by segment_len also yields a final, shorter chunk, so no
        # trailing tokens are dropped when seq_len is not a multiple of it.
        for start in range(0, seq_len, self.segment_len):
            stop = min(start + self.segment_len, seq_len)
            cur_len = stop - start
            # x_segment:[batch_size, cur_len, dim_input]
            x_segment = x[:, start:stop, :]

            # q,k,v:[batch_size, num_heads, cur_len, dim_k/dim_v]
            q = self._to_heads(self.proj_q(x_segment), self.dim_k)
            k = self._to_heads(self.proj_k(x_segment), self.dim_k)
            v = self._to_heads(self.proj_v(x_segment), self.dim_v)

            attention_mask_segment = self._segment_attention_mask(
                attention_mask, start, stop
            )

            # att_dot:[batch_size, num_heads, cur_len, dim_v]
            att_dot = self._att_dot(q, k, v, attention_mask=attention_mask_segment)

            # Read the memory built from *previous* chunks, then fold the
            # current chunk into it.
            # att_mem:[batch_size, num_heads, cur_len, dim_v]
            att_mem = self._memory_retrival(batch_size, q)
            self._memory_update(k, v)

            gate = torch.sigmoid(self.betas)
            att = gate * att_mem + (1 - gate) * att_dot

            # Merge heads: [B, H, L, dim_v] -> [B, L, H * dim_v]. The transpose
            # is required; a bare view would interleave heads and positions.
            att = att.transpose(1, 2).reshape(
                batch_size, cur_len, self.num_heads * self.dim_v
            )
            att = self.proj_out(att)
            att = self.dropout_layer(att)
            out.append(att)

        # Drop references to this call's tensors (and their autograd graph).
        self._reset_memory(device=x.device, dtype=x.dtype)

        if not out:
            # Empty sequence: nothing to concatenate.
            return (x.new_zeros(batch_size, 0, self.dim_input),)
        # out:[batch_size, seq_len, dim_input]
        return (torch.cat(out, dim=1),)

    def _to_heads(self, projected, head_dim):
        """Split ``[B, L, num_heads * head_dim]`` into ``[B, num_heads, L, head_dim]``."""
        batch_size, cur_len, _ = projected.shape
        return projected.reshape(
            batch_size, cur_len, self.num_heads, head_dim
        ).transpose(1, 2)

    def _segment_attention_mask(self, attention_mask, start, stop):
        """Slice the additive attention mask down to the chunk ``[start, stop)``.

        The key axis (last) is always sliced. If the mask also carries one row
        per query position, that axis is sliced as well. Broadcasting in the
        addition handles singleton axes. Returns ``None`` when no mask is given.
        """
        if attention_mask is None:
            return None
        segment_mask = attention_mask[:, :, :, start:stop]
        if segment_mask.size(-2) > 1:
            segment_mask = segment_mask[:, :, start:stop, :]
        return segment_mask

    def _att_dot(self, q, k, v, attention_mask):
        """Causal scaled dot-product attention inside one chunk.

        Args:
            q, k: ``[batch_size, num_heads, L, dim_k]``.
            v: ``[batch_size, num_heads, L, dim_v]``.
            attention_mask: Optional additive mask broadcastable to the score
                matrix, or ``None``.

        Returns:
            Tensor of shape ``[batch_size, num_heads, L, dim_v]``.
        """
        # scores:[batch_size, num_heads, L, L]
        scores = q @ k.transpose(-2, -1) / (self.dim_k**0.5)
        q_len, k_len = scores.shape[-2:]
        # Slice the precomputed mask so a shorter final chunk still works.
        causal = self.mask[:q_len, :k_len].bool()
        scores = scores.masked_fill(causal, float("-inf"))
        if attention_mask is not None:
            scores = scores + attention_mask

        scores = nn.functional.softmax(scores, dim=-1)
        scores = self.dropout_layer(scores)
        return scores @ v

    def _memory_retrival(self, batch_size, q):
        """Read from the compressive memory with the current queries.

        Args:
            batch_size: Unused; kept for interface compatibility.
            q: ``[batch_size, num_heads, L, dim_k]``.

        Returns:
            Tensor of shape ``[batch_size, num_heads, L, dim_v]``. The result
            is detached, so no gradient flows through the memory read.
        """
        # sigma_q:[batch_size, num_heads, L, dim_k], strictly positive
        sigma_q = nn.functional.elu(q) + 1.0
        numerator = sigma_q @ self.memory
        normaliser = (sigma_q @ self.z.transpose(-2, -1)) + self.eps
        return (numerator / normaliser).detach()

    def _memory_update(self, k, v):
        """Accumulate the chunk's keys/values into ``self.memory`` and ``self.z``.

        Args:
            k: ``[batch_size, num_heads, L, dim_k]``.
            v: ``[batch_size, num_heads, L, dim_v]``.
        """
        sigma_k = nn.functional.elu(k) + 1.0
        # self.memory:[batch_size, num_heads, dim_k, dim_v]
        kv = sigma_k.transpose(-2, -1) @ v
        self.memory = kv if self.memory is None else self.memory + kv

        # self.z:[batch_size, num_heads, 1, dim_k]
        k_sum = sigma_k.sum(dim=-2, keepdim=True)
        self.z = k_sum if self.z is None else self.z + k_sum

    def _reset_memory(self, device=None, dtype=None):
        """Clear the compressive memory and its normaliser.

        Args:
            device: Device for the fresh state; defaults to the device of
                ``self.betas``.
            dtype: Dtype for the fresh state; defaults to the dtype of
                ``self.betas``.
        """
        device = self.betas.device if device is None else device
        dtype = self.betas.dtype if dtype is None else dtype

        # Both start at zero; ``eps`` keeps the first read's division finite.
        self.memory = torch.zeros(
            1, self.num_heads, self.dim_k, self.dim_v, device=device, dtype=dtype
        )
        self.z = torch.zeros(
            1, self.num_heads, 1, self.dim_k, device=device, dtype=dtype
        )

In [None]:
# Small stand-alone instance of the attention module for a quick manual check.
dim_input = 128
num_heads = 8
dim_k = dim_input // num_heads
dim_v = dim_input // num_heads
segment_len = 64

# NOTE: `model` here is the bare attention module; a later cell rebinds the
# same name to the full language model.
model = InfiniAttentionGPT2(
    config,
    num_heads,
    dim_k,
    dim_v,
    dim_input,
    segment_len,
)
# Random input with batch size 2 and sequence length 8, which is shorter than
# segment_len (64).
batch = torch.randn(2, 8, dim_input)
# print(batch[0])
# print(model(batch)[0])

### Tokenizer

In [None]:
from transformers import GPT2Tokenizer, AutoTokenizer

tokenizer_path = (
    "/content/drive/MyDrive/Colab Notebooks/nlp_unicamp/final_project/tokenizer/"
)

# Only needed by the commented-out custom GPT2Tokenizer below.
vocab_file = tokenizer_path + "vocab.json"
merges_file = tokenizer_path + "merges.txt"

# Use the pretrained GPT-2 tokenizer from the Hub.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
# tokenizer = GPT2Tokenizer(vocab_file, merges_file)
tokenizer.model_max_length = config.n_positions
# No dedicated padding token is configured here, so the end-of-sequence token
# doubles as the pad token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
# Pad on the left so the last position of a padded sequence is a real token;
# the inference cells read the prediction at index -1.
tokenizer.padding_side = "left"


bos_id = tokenizer.bos_token_id
eos_id = tokenizer.eos_token_id
pad_id = tokenizer.pad_token_id

### Model

In [None]:
# Baseline (unmodified GPT-2), kept for reference:
# model = GPT2LMHeadModel(config).to(device)

# Infini-attention variant: a randomly initialised GPT-2 whose attention
# sub-module is replaced in every transformer block.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2LMHeadModel(config)
# The replacement attention returns only the attention output (no key/value
# cache), so caching is switched off.
model.config.use_cache = False

dim_input = config.hidden_size
num_heads = config.num_attention_heads
dim_k = config.hidden_size // config.num_attention_heads
dim_v = config.hidden_size // config.num_attention_heads
# Each sequence of n_positions tokens is processed as four segments.
segment_len = config.n_positions // 4

for block in model.transformer.h:
    block.attn = InfiniAttentionGPT2(
        config,
        num_heads=num_heads,
        dim_k=dim_k,
        dim_v=dim_v,
        dim_input=dim_input,
        segment_len=segment_len,
    )

model.resize_token_embeddings(len(tokenizer))
model.generation_config.pad_token_id = tokenizer.pad_token_id
model = model.to(device)

In [None]:
def count_parameters(model):
    """Return the total element count of all parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# Report the number of trainable parameters of the model built above.
print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 152,760,576 trainable parameters


### Dataset

In [None]:
from datasets import Dataset
import torch

# # ----- TEST DATASET -----
# dataset = [
#     "Repita: a cor é Azul." * 1000,
#     "Amarelo é a cor do sol." * 1000,
#     "Verde é a cor do mar." * 1000
# ]

# train_input_ids = tokenizer(
#     dataset,
#     padding="max_length",
#     truncation=True,
#     max_length=model.config.n_positions,
#     return_tensors="pt",
# )

# _dataset = {
#     "input_ids": train_input_ids["input_ids"],
#     "attention_mask": train_input_ids["attention_mask"],
# }

# train_dataset = Dataset.from_dict(_dataset)
# train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
# test_dataset = train_dataset

# ----- END TEST DATASET -----

path_dataset = (
    "/content/drive/MyDrive/Colab Notebooks/nlp_unicamp/final_project/all_1024_gpt2"
)

# Fixed seed so the train/test membership is identical on every run; without
# it the split is reshuffled each time and the eval numbers are not comparable.
SPLIT_SEED = 42

# Distinct names for each stage instead of rebinding `train_dataset` to three
# different objects (full dataset -> split dict -> train subset).
full_dataset = Dataset.load_from_disk(path_dataset)
dataset_splits = full_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Tiny subsets (2 test / 10 train examples) for a quick end-to-end check.
test_dataset = dataset_splits["test"].select(range(2))
train_dataset = dataset_splits["train"].select(range(10))

In [None]:
# Return the two model inputs as torch tensors when the datasets are indexed.
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

In [None]:
# Display the training subset summary (features and row count).
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 10
})

In [None]:
# Display the test subset summary (features and row count).
test_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 2
})

In [None]:
import torch.nn.functional as F


def generate(model, idx, attention_mask, max_new_tokens):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    At every step the most probable next token is appended. The attention mask
    is extended by one "attend" entry per generated token so that it stays
    aligned with ``idx``; otherwise the mask would lag behind the ids from the
    second step onwards.

    Args:
        model: Callable language model. It must expose ``config.n_positions``
            and ``eval()``, and return an object with a ``logits`` attribute
            of shape ``(B, T, C)``.
        idx: ``(B, T)`` tensor of token ids.
        attention_mask: ``(B, T)`` mask aligned with ``idx``.
        max_new_tokens: Number of tokens to append.

    Returns:
        ``(B, T + max_new_tokens)`` tensor: the prompt followed by the
        generated tokens. The model is left in eval mode.
    """
    model.eval()
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Crop both tensors to the size of the positional embedding table.
            idx_crop = idx[:, -model.config.n_positions :]
            attention_mask_crop = attention_mask[:, -model.config.n_positions :]
            logits = model(idx_crop, attention_mask=attention_mask_crop).logits
            # Only the prediction at the last time step is needed.
            logits_last_timestep = logits[:, -1, :]
            probs = F.softmax(input=logits_last_timestep, dim=-1)
            # Greedy choice: most probable token, shape (B, 1).
            idx_next = torch.argmax(input=probs, dim=-1).unsqueeze(-1)
            idx = torch.cat((idx, idx_next), dim=1)
            # The new token is real content, so it must be attended to.
            attention_mask = torch.cat(
                (attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))),
                dim=1,
            )
        return idx

### Inference

In [None]:
sample = "Rodovias são constantemente consideradas Rodovias são constantemente consideradas Rodovias são constantemente consideradas"

# Pad to the full context length so the input has a fixed size.
sample_tokenized = tokenizer(
    sample,
    padding="max_length",
    truncation=True,
    max_length=model.config.n_positions,
    return_tensors="pt",
)

sample = sample_tokenized["input_ids"].to(device)
sample_attention_mask = sample_tokenized["attention_mask"].to(device)

# Inference only: eval() disables dropout (a freshly constructed module is in
# training mode) and no_grad() avoids building an autograd graph.
model.eval()
with torch.no_grad():
    logits = model(sample, attention_mask=sample_attention_mask).logits

# Most probable token at the last position, decoded back to text.
tokenizer.decode(F.softmax(logits[:, -1, :], dim=-1).argmax())

' Trigger'

In [None]:
# sample = train_dataset["input_ids"][:2].to(device)
# sample_attention_mask = train_dataset["attention_mask"][:2].to(device)

# for _input_ids, _attention_mask in zip(sample, sample_attention_mask):
#     sample_generation = generate(model, _input_ids.unsqueeze(0), _attention_mask.unsqueeze(0), 5)

#     for generated in sample_generation:
#         print(tokenizer.decode(generated.tolist(), skip_special_tokens=True))

### Training

In [None]:
# from transformers import DataCollatorForLanguageModeling
# from transformers import TrainingArguments


# from datasets import Dataset


# output_dir = (
#     "/content/drive/MyDrive/Colab Notebooks/nlp_unicamp/final_project/models/output_dir"
# )


# logging_dir = (
#     "/content/drive/MyDrive/Colab Notebooks/nlp_unicamp/final_project/models/logs"
# )


# model_save_dir = (
#     "/content/drive/MyDrive/Colab Notebooks/nlp_unicamp/final_project/models/"
# )

In [None]:
from torch.utils.data import DataLoader

# One example per batch; only the training data is shuffled.
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# Batch counts, passed as `total` to the progress bars in the training loop.
total_train_batch = len(train_dataloader)
total_test_batch = len(test_dataloader)

### Manually compute the loss

In [None]:
# sample = train_dataset[0]
# sample_input_ids = sample["input_ids"].to(device)
# sample_attention_mask = sample["attention_mask"].to(device)
# labels = sample["input_ids"].to(device)

# output = model(sample_input_ids, attention_mask=sample_attention_mask, labels=labels)

# # print the loss
# lm_logits = output.logits

# shift_logits = lm_logits[..., :-1, :].contiguous()
# shift_labels = labels[..., 1:].contiguous()

# loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
# print(loss)
# print(output.loss)

In [None]:
from tqdm.auto import tqdm

# Training hyperparameters.
epochs = 10
lr = 1e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# Mean training loss of every epoch, appended by the training loop.
losses = []

# print("Computing Initial Metrics")
# with torch.no_grad():
#     model.eval()
#     epoch_loss = []
#     for batch in tqdm(train_dataloader, desc="Train Batchs", total=total_train_batch):
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["input_ids"].to(device)
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         epoch_loss.append(loss.item())
#     train_loss = sum(epoch_loss) / len(epoch_loss)
#     train_ppl = torch.exp(torch.Tensor([train_loss]))[0].item()


#     for batch in tqdm(test_dataloader, desc="Test Batchs", total=total_test_batch):
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["input_ids"].to(device)
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss

#         epoch_loss.append(loss.item())
#     eval_loss = sum(epoch_loss) / len(epoch_loss)
#     eval_ppl = torch.exp(torch.Tensor([eval_loss]))[0].item()

#     print(f"Initial metrics: train_loss: {train_loss}, train_ppl: {train_ppl}, eval_loss: {eval_loss}, eval_ppl: {eval_ppl}\n")

# print("Training")
# Train for `epochs` epochs; after each one, evaluate on the test loader and
# print the mean loss and perplexity of both splits.
for epoch in tqdm(range(epochs), desc="Epochs"):
    # ---- training pass ----
    model.train()
    epoch_losses = []
    for batch in tqdm(train_dataloader, desc="Train Batches", total=total_train_batch):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Labels are the inputs themselves (the model shifts them internally).
        # Padded positions are set to -100, the index the loss ignores, so
        # padding does not contribute to loss or perplexity.
        labels = input_ids.masked_fill(attention_mask == 0, -100)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())

    train_loss = sum(epoch_losses) / len(epoch_losses)
    train_ppl = torch.exp(torch.Tensor([train_loss]))[0].item()

    # ---- evaluation pass ----
    model.eval()
    with torch.no_grad():
        epoch_loss = []
        for batch in tqdm(test_dataloader, desc="Test Batches", total=total_test_batch):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = input_ids.masked_fill(attention_mask == 0, -100)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            epoch_loss.append(loss.item())
        eval_loss = sum(epoch_loss) / len(epoch_loss)
        eval_ppl = torch.exp(torch.Tensor([eval_loss]))[0].item()

    print(
        f"epoch: {epoch}, loss: {train_loss}, train_ppl: {train_ppl}, eval_loss: {eval_loss}, eval_ppl: {eval_ppl}"
    )

    losses.append(train_loss)

# Leave the model in eval mode for the generation cells that follow.
model.eval()
print()

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Train Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Test Batches:   0%|          | 0/2 [00:00<?, ?it/s]

epoch: 0, loss: 6.674548244476318, train_ppl: 791.989501953125, eval_loss: 7.267024517059326, eval_ppl: 1432.2823486328125


Train Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Test Batches:   0%|          | 0/2 [00:00<?, ?it/s]

epoch: 1, loss: 6.300175476074219, train_ppl: 544.6676025390625, eval_loss: 7.19720983505249, eval_ppl: 1335.69873046875


Train Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Test Batches:   0%|          | 0/2 [00:00<?, ?it/s]

epoch: 2, loss: 5.871802425384521, train_ppl: 354.8880310058594, eval_loss: 7.073591709136963, eval_ppl: 1180.3800048828125


Train Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Test Batches:   0%|          | 0/2 [00:00<?, ?it/s]

epoch: 3, loss: 5.353797578811646, train_ppl: 211.4095916748047, eval_loss: 7.027628421783447, eval_ppl: 1127.3538818359375


Train Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Test Batches:   0%|          | 0/2 [00:00<?, ?it/s]

epoch: 4, loss: 4.807580184936524, train_ppl: 122.43496704101562, eval_loss: 7.029416799545288, eval_ppl: 1129.3720703125


Train Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Test Batches:   0%|          | 0/2 [00:00<?, ?it/s]

epoch: 5, loss: 4.284088802337647, train_ppl: 72.53640747070312, eval_loss: 7.017535448074341, eval_ppl: 1116.032470703125


Train Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Test Batches:   0%|          | 0/2 [00:00<?, ?it/s]

epoch: 6, loss: 3.7961266040802, train_ppl: 44.52837371826172, eval_loss: 7.031916379928589, eval_ppl: 1132.198486328125


Train Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Test Batches:   0%|          | 0/2 [00:00<?, ?it/s]

epoch: 7, loss: 3.2632187843322753, train_ppl: 26.133522033691406, eval_loss: 7.076834201812744, eval_ppl: 1184.213623046875


Train Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Test Batches:   0%|          | 0/2 [00:00<?, ?it/s]

epoch: 8, loss: 2.9010246992111206, train_ppl: 18.192779541015625, eval_loss: 7.078364610671997, eval_ppl: 1186.0269775390625


Train Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Test Batches:   0%|          | 0/2 [00:00<?, ?it/s]

epoch: 9, loss: 2.431860899925232, train_ppl: 11.380040168762207, eval_loss: 7.119774103164673, eval_ppl: 1236.1708984375



### Test generation

In [None]:
import torch.nn.functional as F


def generate(model, idx, attention_mask, max_new_tokens):
    """Extend ``idx`` by ``max_new_tokens`` tokens sampled from the model.

    At every step the next token is drawn from the softmax distribution of the
    last time step. The attention mask is extended by one "attend" entry per
    generated token so that it stays aligned with ``idx``; otherwise the mask
    would lag behind the ids from the second step onwards.

    Args:
        model: Callable language model. It must expose ``config.n_positions``
            and ``eval()``, and return an object with a ``logits`` attribute
            of shape ``(B, T, C)``.
        idx: ``(B, T)`` tensor of token ids.
        attention_mask: ``(B, T)`` mask aligned with ``idx``.
        max_new_tokens: Number of tokens to append.

    Returns:
        ``(B, T + max_new_tokens)`` tensor: the prompt followed by the
        sampled tokens. The model is left in eval mode.
    """
    model.eval()
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Crop both tensors to the size of the positional embedding table.
            idx_crop = idx[:, -model.config.n_positions :]
            attention_mask_crop = attention_mask[:, -model.config.n_positions :]
            logits = model(idx_crop, attention_mask=attention_mask_crop).logits
            # Only the prediction at the last time step is needed.
            logits_last_timestep = logits[:, -1, :]
            probs = F.softmax(input=logits_last_timestep, dim=-1)
            # Stochastic choice: one sample per row, shape (B, 1).
            idx_next = torch.multinomial(input=probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
            # The new token is real content, so it must be attended to.
            attention_mask = torch.cat(
                (attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))),
                dim=1,
            )
        return idx

In [None]:
# TEST 1
# Decode the first training example back to text, re-tokenize it and let the
# model continue it by 100 tokens.
sample = train_dataset["input_ids"][0]
original_sentence = tokenizer.decode(sample, skip_special_tokens=True)
print("Original complete sentence:")
print(original_sentence)
# Collapse every run of whitespace (including newlines) into a single space.
original_sentence = " ".join(original_sentence.split())
print("Original splited sentence:")
print(original_sentence)

tokenized_sample = tokenizer(
    original_sentence,
    padding="max_length",
    truncation=True,
    max_length=model.config.n_positions,
    return_tensors="pt",
)
# From here on `sample` holds the token-id tensor rather than the raw example.
sample = tokenized_sample["input_ids"].to(device)
sample_attention_mask = tokenized_sample["attention_mask"].to(device)

# Iterate over the rows of the batch; each row is given a batch axis again
# before being passed to generate().
for _input_ids, _attention_mask in zip(sample, sample_attention_mask):
    sample_generation = generate(
        model, _input_ids.unsqueeze(0), _attention_mask.unsqueeze(0), 100
    )

    for generated in sample_generation:
        print("Generated sentence:")
        print(tokenizer.decode(generated.tolist(), skip_special_tokens=True))

Original complete sentence:
o Campeões do Mundo. A unificação por parte da imprensa é puramente simbólica já que essa competição sempre foi chamada de Mundial Interclubes desde os primórdios, até a CONMEBOL faz isso. LuanCastle (discussão) 17h00min de 4 de março de 2021 (UTC) Vamos por partes: 1º A CBF ao falar sobre problemas com dinheiro não abdica da organização ao indicar várias alternativas, como a de um Brasileiro regionalizado(tipo Taça Brasil) para evitar custos. 2º A CBF deu o título do Grupo Amarelo para o Sport baseando-se na melhor campanha. Em 1986, o Eurico teve a ideia de uma disputa nacional com apenas 16 clubes. No ano seguinte, fez o acordo com a entidade, 32 clubes divididos em 2 grupos de 16, tendo o C13 delegado poderes para que ele "resolvesse o problema" (ver vídeo acima com o Aidar - texto em inglês) 3º Simbólico em reportagens antigas da FIFA que aprovou apenas o status de título mundial, mas a imprensa sempre unifica intercontinental com o mundial da entidade.

In [None]:
# get next token

sample = "Repita: "

# Pad to the full context length so the input has a fixed size.
sample_tokenized = tokenizer(
    sample,
    padding="max_length",
    truncation=True,
    max_length=model.config.n_positions,
    return_tensors="pt",
)

sample = sample_tokenized["input_ids"].to(device)
sample_attention_mask = sample_tokenized["attention_mask"].to(device)

# Inference only: make eval mode explicit (so the cell does not depend on what
# ran before it) and skip building an autograd graph.
model.eval()
with torch.no_grad():
    logits = model(sample, attention_mask=sample_attention_mask).logits

# Most probable token at the last position, decoded back to text.
tokenizer.decode(F.softmax(logits[:, -1, :], dim=-1).argmax())