In [None]:
!pip install torch

In [27]:
from pprint import pprint
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


def to_tokens_and_logprobs(model, tokenizer, input_texts):
    input_ids = tokenizer(input_texts, padding=True, return_tensors="pt").input_ids
    outputs = model(input_ids)
    probs = torch.log_softmax(outputs.logits, dim=-1).detach()

    outputs = model.generate(input_ids)
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Output text:", decoded_output)

    # collect the probability of the generated token -- probability at index 0 corresponds to the token at index 1
    probs = probs[:, :-1, :]
    input_ids = input_ids[:, 1:]
    gen_probs = torch.gather(probs, 2, input_ids[:, :, None]).squeeze(-1)

    batch = []
    for input_sentence, input_probs in zip(input_ids, gen_probs):
        text_sequence = []
        for token, p in zip(input_sentence, input_probs):
            if token not in tokenizer.all_special_ids:
                text_sequence.append((tokenizer.decode(token), p.item()))
        batch.append(text_sequence)
    return batch


tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("gpt2")
model.config.pad_token_id = model.config.eos_token_id

input_texts = ["what is your favorite colour","One plus one is two", "Good morning", "Hello, how are you?", "what is your favorite game"]

batch = to_tokens_and_logprobs(model, tokenizer, input_texts)
pprint(batch)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output text: what is your favorite colour?

I love the colour of the sun. I love the
[[('what', -9.269655227661133),
  (' is', -1.9438844919204712),
  (' your', -3.602717876434326),
  (' favorite', -1.775857925415039),
  (' colour', -6.8239946365356445)],
 [('One', -5.882714748382568),
  (' plus', -9.785088539123535),
  (' one', -0.7229113578796387),
  (' is', -2.4940872192382812),
  (' two', -6.137444496154785)],
 [('Good', -7.579043865203857), (' morning', -1.826709270477295)],
 [(',', -2.3431499004364014),
  (' how', -4.3397216796875),
  (' are', -2.6824662685394287),
  (' you', -0.41092735528945923),
  ('?', -1.8950951099395752)],
 [('what', -9.269655227661133),
  (' is', -1.9438844919204712),
  (' your', -3.602717876434326),
  (' favorite', -1.775857925415039),
  (' game', -3.8326237201690674)]]


In [23]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

def to_tokens_and_logprobs(model, tokenizer, input_texts):
    # Encode the input text
    input_ids = tokenizer(input_texts, padding=True, return_tensors="pt").input_ids
    # Get model outputs
    outputs = model(input_ids)
    # Calculate log probabilities
    probs = torch.log_softmax(outputs.logits, dim=-1)

    # Convert logits to probabilities for easier interpretation
    probs = probs.exp().detach()

    # Get the most probable token IDs for each position in the input text
    predicted_ids = torch.argmax(probs, dim=-1)

    # Decode the token IDs to text
    for i, input_id in enumerate(input_ids):
        output_text = tokenizer.decode(predicted_ids[i], skip_special_tokens=True)
        print(f"Input text: {input_texts[i]}")
        print(f"Predicted output: {output_text}")

# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("gpt2")
model.config.pad_token_id = model.config.eos_token_id
# Example input text
input_texts = ["One plus one is two", "Good morning", "Hello, how are you?", "what is your favorite "]

# Generate and print output text
to_tokens_and_logprobs(model, tokenizer, input_texts)


Input text: One plus one is two
Predicted output: 
 of one. that.
Input text: Good morning
Predicted output: 
TheTheThe news,
Input text: Hello, how are you?
Predicted output: , I about you doing

Input text: what is your favorite 
Predicted output: 
 is the favorite part 


In [35]:
from evaluate import load
perplexity = load("perplexity", module_type="metric")
input_texts = ["lorem ipsum", "Happy Birthday!", "Bienvenue"]

results = perplexity.compute(model_id='mistral',
                             add_start_token=False,
                             predictions=input_texts)
pprint(results)

OSError: mistral is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [29]:
!pip install -U evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp38-cp38-win_amd64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py38-none-any.whl.metadata (7.1 kB)
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
Collecting pyarrow>=12.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-15.0.1-cp38-cp38-win_amd64.whl.metadata (3.1 kB)
Collecting pyarrow-hotfix (from datasets>=2.0.0->evaluate)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting aiohttp (from datasets>=2.0.0->evaluate)
  Downloading aiohttp-3.9.3-cp38-cp38-win_amd64.whl.metadata (7.6 kB)
Collecting aiosign

In [42]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the pre-trained model and tokenizer
model_name = "gpt2"  # Replace with the desired model name or path
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the input text X and the corresponding output text Y
input_text = "What is the capital of France?"
output_text = "The capital of France is Paris."

# Tokenize the input and output texts
input_ids = tokenizer.encode(input_text, return_tensors="pt")
output_ids = tokenizer.encode(output_text, return_tensors="pt")

# Concatenate the input and output token IDs
input_output_ids = torch.cat((input_ids, output_ids), dim=-1)
# Create attention mask
attention_mask = torch.ones_like(input_output_ids)

# Compute the perplexity
with torch.no_grad():
    outputs = model(input_output_ids, attention_mask=attention_mask, labels=input_output_ids)
    loss = outputs.loss
    perplexity = torch.exp(loss)

print(f"Perplexity: {perplexity.item():.2f}")

Perplexity: 24.39


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the pre-trained model and tokenizer
model_name = "gpt2"  # Replace with the desired model name or path
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the input text X and the corresponding output text Y
input_text = "What is the capital of France?"
output_text = "The capital of France is Paris."

# Tokenize the input and output texts
input_ids = tokenizer.encode(input_text, return_tensors="pt")
output_ids = tokenizer.encode(output_text, return_tensors="pt")

# Create attention mask
attention_mask = torch.ones_like(output_ids)

# Compute the perplexity
with torch.no_grad():
    outputs = model(input_ids=input_ids, labels=output_ids, attention_mask=attention_mask)
    loss = outputs.loss
    perplexity = torch.exp(loss)

print(f"Perplexity: {perplexity.item():.2f}")

In [4]:
print([i+1 for i in [1,2,3,234]])

[2, 3, 4, 235]


In [6]:
print([x for x in range(10)])
y = [x for x in range(10)]
y[:-3]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


[0, 1, 2, 3, 4, 5, 6]