In [None]:
import pandas as pd


# Hugging Face documentation

In [None]:
from transformers import AutoTokenizer, TFGPT2LMHeadModel
import tensorflow as tf

# Load the GPT-2 tokenizer and the TensorFlow LM-head model from the
# "openai-community/gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2")

# Tokenize one example sentence as TensorFlow tensors and run a forward pass.
inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
outputs = model(inputs)
# Raw (unnormalised) scores from the LM head; presumably shaped
# (batch, seq_len, vocab_size) — TODO confirm.
logits = outputs.logits

# ChatGPT code

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Load pre-trained GPT-2 model and tokenizer
# NOTE: this rebinds the notebook-level names `tokenizer` and `model`, which
# the earlier TensorFlow cell also assigns; calculate_log_probability() below
# reads both names as globals, so this cell must have run before it is called.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [None]:
# Define a function to calculate log probability of a sentence
def calculate_log_probability(sentence):
    """Return the log-probability of ``sentence`` under the global GPT-2 model.

    The score is the sum over positions ``i >= 1`` of
    ``log P(token_i | token_0 .. token_{i-1})``. The first token is used only
    as context: no BOS token is prepended, so nothing predicts it.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        sentence: Text to score.

    Returns:
        The summed log-probability as a Python float. ``0.0`` when the
        sentence tokenizes to fewer than two tokens (nothing to predict).
    """
    # Tokenize the sentence -> tensor of shape (1, seq_len)
    input_ids = tokenizer.encode(sentence, return_tensors="pt")

    # With zero or one token there is no (context, next-token) pair to score.
    if input_ids.shape[-1] < 2:
        return 0.0

    # Forward pass without gradient tracking; element 0 of the output is the logits.
    with torch.no_grad():
        logits = model(input_ids=input_ids)[0]

    # Log-softmax over the vocabulary for every position of the single sequence.
    token_log_probs = torch.log_softmax(logits[0], dim=-1)

    # A causal LM's logits at position i score the token at position i + 1,
    # so predictions [0 .. n-2] must be paired with targets [1 .. n-1].
    # (Pairing position i with token i, as before, scores each token against
    # a distribution that was conditioned on that very token.)
    targets = input_ids[0, 1:]
    predictions = token_log_probs[:-1]
    picked = predictions.gather(-1, targets.unsqueeze(-1))

    return picked.sum().item()

# Example usage: score one fixed sentence with calculate_log_probability()
# and print the result.
sentence = "A message has been sent to your account"
log_probability = calculate_log_probability(sentence)
print("Log probability of the sentence:", log_probability)


# Article function

Environment setup: `mamba install -c conda-forge ipywidgets`

In [None]:
from model_functions import model_factory
# Build the article's GPT-2 wrapper through the project-local factory.
# gpu_id=None presumably selects CPU — TODO confirm in model_functions.
# NOTE(review): this rebinds the notebook-level name `model`, which
# calculate_log_probability() (defined earlier) reads as a global; re-running
# that function after this cell will use this object instead.
model = model_factory(name = 'gpt2', gpu_id = None)
sentence = "A message has been sent to your account"
# sent_prob() is defined in model_functions; presumably it returns a
# log-probability (per the label printed below) — TODO confirm.
log_probability = model.sent_prob(sentence)
print("Log probability of the sentence:", log_probability)

In [None]:
from model_functions import model_factory
# NOTE(review): this cell repeats the previous cell verbatim (same factory
# call, same sentence, same print); it rebuilds the model and recomputes the
# same score.
model = model_factory(name = 'gpt2', gpu_id = None)
sentence = "A message has been sent to your account"
# sent_prob() is defined in model_functions; presumably it returns a
# log-probability (per the label printed below) — TODO confirm.
log_probability = model.sent_prob(sentence)
print("Log probability of the sentence:", log_probability)

## Handmade version

In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
import torch

  torch.utils._pytree._register_pytree_node(


In [4]:
class GPTContainer:
    """Bundle a GPT-2 tokenizer/model pair with a two-way split of the vocabulary.

    Attributes set by the constructor:
        device:    CUDA device when available, otherwise CPU.
        tokenizer: ``GPT2Tokenizer`` loaded from the "gpt2" checkpoint.
        model:     ``GPT2LMHeadModel`` loaded from "gpt2", moved to ``device``.
        starts:    token ids whose decoded text begins with a space or a ".".
        suffs:     every other token id.
    """

    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        self.model = GPT2LMHeadModel.from_pretrained("gpt2").to(self.device)
        self.starts = []
        self.suffs = []

        # Walk every id in the vocabulary once and file it by the first
        # character of its decoded text. Anything that is not a space/period
        # token necessarily does not start with a space, so the split is a
        # plain either/or.
        vocab_size = len(self.tokenizer.get_vocab())
        for token_id in range(vocab_size):
            first_char = self.tokenizer.decode(token_id)[0]
            bucket = self.starts if first_char in (" ", ".") else self.suffs
            bucket.append(token_id)

In [5]:
# Instantiate once and reuse: the constructor loads the GPT-2 tokenizer and
# model and decodes every vocabulary id to build the starts/suffs lists.
container = GPTContainer()

In [8]:
logsoftmax = torch.nn.LogSoftmax(dim=-1)


def _first_positions(token_ids):
    """Map each token id to the index of its first occurrence in ``token_ids``."""
    positions = {}
    for pos, tok in enumerate(token_ids):
        positions.setdefault(tok, pos)
    return positions


def gpt2_sent_prob(sent):
    """Score a sentence with GPT-2 using group-restricted next-token distributions.

    The sentence is wrapped as ``". " + sent + "."`` and tokenized. For every
    token after the first, the logits produced at the previous position are
    restricted to the token's own group (``container.starts`` or
    ``container.suffs``), log-softmaxed within that group only, and the entry
    for the observed token is taken. The per-token values are summed.

    Relies on the notebook-level ``container`` (tokenizer, model, device and
    the two vocabulary groups).

    Args:
        sent: Sentence text, without the leading/trailing period this
            function adds.

    Returns:
        The summed score as returned by ``np.sum`` (a NumPy float).

    Raises:
        ValueError: If a token id belongs to neither vocabulary group.
            Previously such a token silently reused the preceding token's
            score (or hit an unbound local on the first token).
    """
    tokenizer = container.tokenizer
    model = container.model
    device = container.device

    starts = container.starts
    suffs = container.suffs

    # Built once per call so the per-token work is an O(1) dict lookup
    # instead of an `in` + `.index` scan over a vocabulary-sized list, and so
    # the group index is not re-converted from a Python list for every token.
    start_pos = _first_positions(starts)
    suff_pos = _first_positions(suffs)
    start_idx = torch.tensor(starts, dtype=torch.long, device=device)
    suff_idx = torch.tensor(suffs, dtype=torch.long, device=device)

    sent = ". " + sent + "."

    tokens = tokenizer.encode(sent)
    inputs = torch.tensor(tokens).to(device)

    with torch.no_grad():
        out = model(inputs)

    # First output element holds the logits; indexed below by position.
    unsoft = out[0]

    probs = []
    for x in range(len(tokens) - 1):

        # Logits at position x are used to score the token at position x + 1.
        lab = int(tokens[x + 1])
        unsoft1 = unsoft[x]

        if lab in start_pos:
            soft = logsoftmax(unsoft1[start_idx])
            prob = soft[start_pos[lab]].item()
        elif lab in suff_pos:
            soft = logsoftmax(unsoft1[suff_idx])
            prob = soft[suff_pos[lab]].item()
        else:
            raise ValueError(
                f"token id {lab} is in neither container.starts nor container.suffs"
            )

        probs.append(prob)

    prob = np.sum(probs)

    return prob


In [9]:
# Score the same example sentence used in the earlier sections, this time with
# the handmade gpt2_sent_prob().
sentence = "A message has been sent to your account"
log_probability = gpt2_sent_prob(sentence)

In [10]:
# Display the score computed by gpt2_sent_prob() in the previous cell.
print(log_probability)

-27.383871644735336


# GitHub code

https://gist.github.com/yuchenlin/eb63e2d0513f70cfc9bb85fa5a78953b

In [None]:
import torch
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
from scipy.special import softmax
 
def model_init(model_string, cuda):
    """Load a tokenizer/model pair for the given checkpoint name.

    Checkpoint names beginning with "gpt2" use the GPT-2 classes; any other
    name uses the OpenAI GPT classes.

    Args:
        model_string: Checkpoint name passed to ``from_pretrained``.
        cuda: When truthy, the model is moved to the 'cuda' device.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has had ``eval()`` called.
    """
    is_gpt2 = model_string.startswith("gpt2")
    tokenizer_cls = GPT2Tokenizer if is_gpt2 else OpenAIGPTTokenizer
    model_cls = GPT2LMHeadModel if is_gpt2 else OpenAIGPTLMHeadModel

    # Tokenizer first, then model — same load order for both families.
    tokenizer = tokenizer_cls.from_pretrained(model_string)
    model = model_cls.from_pretrained(model_string)

    model.eval()
    if cuda:
        model.to('cuda')

    print("Model init")
    return model, tokenizer


def sent_scoring(model_tokenizer, text, cuda):
    """Run ``text`` through the model with itself as labels and return the loss.

    Args:
        model_tokenizer: Indexable pair ``(model, tokenizer)``.
        text: Sentence to score.
        cuda: When truthy, the input ids are moved to the 'cuda' device.

    Returns:
        ``loss.item()`` — the first element of the model output as a Python
        number. NOTE(review): with labels supplied this is presumably the
        language-modeling loss (an average negative log-likelihood, so lower
        means more likely), not a probability — confirm against the model docs.
    """
    model, tokenizer = model_tokenizer[0], model_tokenizer[1]
    assert model is not None
    assert tokenizer is not None

    # Encode and add a leading batch dimension of size 1.
    token_ids = tokenizer.encode(text)
    batch = torch.tensor(token_ids).unsqueeze(0)
    if cuda:
        batch = batch.to('cuda')

    # The inputs double as the labels.
    with torch.no_grad():
        result = model(batch, labels=batch)

    # Only the loss is used; the second element (logits) is discarded.
    loss, _logits = result[:2]
    return loss.item()
 

if __name__ == '__main__':
    # Demo driver: load GPT-2 on CPU and print the score of each sample sentence.
    # model, tokenizer = model_init('openai-gpt', False) 
    model, tokenizer = model_init('gpt2', False)
    for demo_sentence in (
        "A message has been sent to your account",
        "They are barely able to handle Delhi properly",
    ):
        print(sent_scoring((model, tokenizer), demo_sentence, False))



# Hugging Face code

https://discuss.huggingface.co/t/generation-probabilities-how-to-compute-probabilities-of-output-scores-for-gpt2/3175

In [None]:
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer


# Sampling is stochastic: fix the RNG so the three continuations (and every
# number derived from them) are reproducible on "Restart & Run All".
torch.manual_seed(0)

gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

input_ids = tokenizer("A message has been sent to your account", return_tensors="pt").input_ids

# `return_dict_in_generate` is a generation option, so it is passed to
# generate() rather than smuggled into the model config via from_pretrained().
generated_outputs = gpt2.generate(
    input_ids,
    do_sample=True,
    num_return_sequences=3,
    output_scores=True,
    return_dict_in_generate=True,
)

# only use id's that were generated (drop the prompt tokens)
# gen_sequences has shape [3, num_generated_tokens]
gen_sequences = generated_outputs.sequences[:, input_ids.shape[-1]:]

# let's stack the scores produced at each generation step into one tensor and
# transform them to probs -> shape [3, num_generated_tokens, vocab_size]
probs = torch.stack(generated_outputs.scores, dim=1).softmax(-1)

# now we need to collect the probability of the generated token
# we need to add a dummy dim in the end to make gather work
gen_probs = torch.gather(probs, 2, gen_sequences[:, :, None]).squeeze(-1)

# now we can do all kinds of things with the probs

# 1) the probs that exactly those sequences are generated again
# those are normally going to be very small
unique_prob_per_sequence = gen_probs.prod(-1)

# 2) normalize the probs over the three sequences
normed_gen_probs = gen_probs / gen_probs.sum(0)
# Compare with a tolerance: a float32 sum of three ratios is only
# approximately 1, so exact `== 1.0` can fail spuriously. Every generation
# step is checked, not just the first.
step_totals = normed_gen_probs.sum(0)
assert torch.allclose(step_totals, torch.ones_like(step_totals)), "probs should be normalized"

# 3) compare normalized probs to each other like in 1)
unique_normed_prob_per_sequence = normed_gen_probs.prod(-1)