In [None]:
!pip install torch

In [27]:
from pprint import pprint
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


def to_tokens_and_logprobs(model, tokenizer, input_texts):
    input_ids = tokenizer(input_texts, padding=True, return_tensors="pt").input_ids
    outputs = model(input_ids)
    probs = torch.log_softmax(outputs.logits, dim=-1).detach()

    outputs = model.generate(input_ids)
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Output text:", decoded_output)

    # collect the probability of the generated token -- probability at index 0 corresponds to the token at index 1
    probs = probs[:, :-1, :]
    input_ids = input_ids[:, 1:]
    gen_probs = torch.gather(probs, 2, input_ids[:, :, None]).squeeze(-1)

    batch = []
    for input_sentence, input_probs in zip(input_ids, gen_probs):
        text_sequence = []
        for token, p in zip(input_sentence, input_probs):
            if token not in tokenizer.all_special_ids:
                text_sequence.append((tokenizer.decode(token), p.item()))
        batch.append(text_sequence)
    return batch


tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("gpt2")
model.config.pad_token_id = model.config.eos_token_id

input_texts = ["what is your favorite colour","One plus one is two", "Good morning", "Hello, how are you?", "what is your favorite game"]

batch = to_tokens_and_logprobs(model, tokenizer, input_texts)
pprint(batch)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output text: what is your favorite colour?

I love the colour of the sun. I love the
[[('what', -9.269655227661133),
  (' is', -1.9438844919204712),
  (' your', -3.602717876434326),
  (' favorite', -1.775857925415039),
  (' colour', -6.8239946365356445)],
 [('One', -5.882714748382568),
  (' plus', -9.785088539123535),
  (' one', -0.7229113578796387),
  (' is', -2.4940872192382812),
  (' two', -6.137444496154785)],
 [('Good', -7.579043865203857), (' morning', -1.826709270477295)],
 [(',', -2.3431499004364014),
  (' how', -4.3397216796875),
  (' are', -2.6824662685394287),
  (' you', -0.41092735528945923),
  ('?', -1.8950951099395752)],
 [('what', -9.269655227661133),
  (' is', -1.9438844919204712),
  (' your', -3.602717876434326),
  (' favorite', -1.775857925415039),
  (' game', -3.8326237201690674)]]


In [35]:
from evaluate import load
perplexity = load("perplexity", module_type="metric")
input_texts = ["lorem ipsum", "Happy Birthday!", "Bienvenue"]

results = perplexity.compute(model_id='mistral',
                             add_start_token=False,
                             predictions=input_texts)
pprint(results)

OSError: mistral is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`