In [1]:
import torch
from transformers import AutoTokenizer  # We need to have a shared tokenizer
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute device: the CUDA GPU when PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [16]:
# Both RNN and Transformers need to share the same tokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
# BERT tokenizers already define a pad token ([PAD]) and have no EOS token, so
# copying eos_token unconditionally would overwrite pad_token with None.
# Only fall back to EOS when a pad token is actually missing and an EOS exists.
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token

In [17]:
# Evaluation corpus: the "test" split of the raw WikiText-2 configuration.
test = load_dataset(
    path="wikitext",
    name="wikitext-2-raw-v1",
    split="test",
)
# Alternative local corpus, kept for reference:
# dataset = load_dataset("text", data_files="valenciano.txt")

# Join every row into one long document (rows separated by a blank line) and
# encode it in a single call. The tokenizer warns that the result exceeds the
# model's maximum length; the evaluation loop later slices it into windows.
encodings = tokenizer(
    "\n\n".join(test["text"]),
    return_tensors="pt",
)

Generating test split: 100%|██████████| 4358/4358 [00:00<00:00, 55331.72 examples/s]
Generating train split: 100%|██████████| 36718/36718 [00:00<00:00, 1184227.78 examples/s]
Generating validation split: 100%|██████████| 3760/3760 [00:00<00:00, 660575.65 examples/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (281597 > 512). Running this sequence through the model will result in indexing errors


In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from accelerate.test_utils.testing import get_backend

# Load the language model to evaluate and place it on the detected device.
# This cell rebinds both `device` and `tokenizer`, replacing the values that
# earlier cells assigned to those names.
device, _, _ = (
    get_backend()
)  # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
# NOTE(review): `model_id` names a BERT checkpoint, yet it is loaded through the
# GPT-2 model and tokenizer classes below. Presumably the checkpoint weights do
# not map onto the GPT-2 architecture, which would leave the model untrained and
# make any perplexity computed from it meaningless — confirm the intended
# checkpoint/class pairing.
model_id = "google-bert/bert-base-cased"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
# NOTE(review): this overwrites the shared tokenizer that produced `encodings`
# in the earlier cell — confirm the overwrite is intended.
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

Given a tokenized sequence $X=(x_1, x_2, \ldots, x_t)$, its perplexity is the exponentiated average negative log-likelihood of its tokens:

$$
PPL = \exp\left(-\frac{1}{t}\sum_{i=1}^{t} \log{p(x_i | x_1, \ldots, x_{i-1})}\right)
$$

In [None]:
import torch
from tqdm import tqdm

# Sliding-window perplexity over the pre-tokenized corpus held in `encodings`.
# Each window contains at most `max_length` tokens and starts `stride` tokens
# after the previous one. Tokens already scored by an earlier window are kept
# as context only (label -100), so every token is scored exactly once.
max_length = model.config.n_positions  # window size, read from the model config
stride = 512
seq_len = encodings.input_ids.size(1)

nll_sum = 0.0  # running sum of per-token negative log-likelihoods
n_tokens = 0  # number of tokens that contributed to nll_sum
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100  # context-only positions are ignored by the loss

    # The model shifts the labels left by one internally, so the label at
    # position 0 of the window is never scored. Count the labels that survive
    # that shift: subtracting one per row is only right when position 0 is an
    # unmasked label (e.g. the first window); on overlapping windows position 0
    # is already -100 and the subtraction would under-count by one.
    num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()

    # A window with no scored labels would yield a NaN mean loss, and NaN * 0
    # would poison the running sum, so such windows are skipped.
    if num_loss_tokens > 0:
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        neg_log_likelihood = outputs.loss

        # Undo the per-window averaging so windows are weighted by token count.
        nll_sum += neg_log_likelihood * num_loss_tokens
        n_tokens += num_loss_tokens

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

if n_tokens == 0:
    raise ValueError(
        "No tokens were scored: the encoded text must contain at least two tokens."
    )

avg_nll = nll_sum / n_tokens  # average negative log-likelihood per token
ppl = torch.exp(avg_nll)