### Generating one token at a time

In [1]:
# import the models

from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
# load a pretrained model and tokenizer from HuggingFace

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")


In [3]:
# create a sentence to tokenize it
text = "She only cooks eggs when I am not"
inputs = tokenizer(text, return_tensors="pt")

In [4]:
# show tokens as numbers 
inputs["input_ids"]

tensor([[ 3347,   691, 38383,  9653,   618,   314,   716,   407]])

In [5]:
# importing pandas to show tokenization
import pandas as pd

# function to show tokenization result
def show_tokenization(inputs):
    return pd.DataFrame(
        [(id, tokenizer.decode(id)) for id in inputs["input_ids"][0]],
        columns=["id", "token"],
    )

show_tokenization(inputs)

Unnamed: 0,id,token
0,tensor(3347),She
1,tensor(691),only
2,tensor(38383),cooks
3,tensor(9653),eggs
4,tensor(618),when
5,tensor(314),I
6,tensor(716),am
7,tensor(407),not


In [6]:
# calculate probabilities for the next token

import torch

with torch.no_grad():
    logits = model(**inputs).logits[:, -1, :]
    probabilities = torch.nn.functional.softmax(logits[0], dim=-1)

def show_next(probabilities, top_n=5):
    return pd.DataFrame(
        [
            (id, tokenizer.decode(id), p.item())
            for id, p in enumerate(probabilities)
            if p.item()
        ],
        columns=["id", "token", "probability"]
    ).sort_values("probability", ascending=False).head(top_n)

show_next(probabilities)

Unnamed: 0,id,token,probability
10801,10801,cooking,0.145505
287,287,in,0.065937
1363,1363,home,0.064642
1762,1762,working,0.06276
379,379,at,0.059329


In [7]:
# the token id for the most probable next token
next_token_id = torch.argmax(probabilities).item()

print(f"Next token id: {next_token_id}")
print(f"Next token: {tokenizer.decode(next_token_id)}")

Next token id: 10801
Next token:  cooking


In [8]:
# appending the text
text = text + tokenizer.decode(next_token_id)
text

'She only cooks eggs when I am not cooking'

In [9]:
# generating more tokens
# importing display

from IPython.display import display, Markdown

# show text
print(f"Text: {text}")

#convert into tokens
inputs = tokenizer(text, return_tensors="pt")

# calculate probabilities for the next token
with torch.no_grad():
    logits = model(**inputs).logits[:, -1, :]
    probabilities = torch.nn.functional.softmax(logits[0], dim=-1)

# show the texts
display(Markdown("**Next tokens probabilities:**"))
display (show_next(probabilities))

# the token id for the most probable next token
next_token_id = torch.argmax(probabilities).item()

text = text + tokenizer.decode(next_token_id)



Text: She only cooks eggs when I am not cooking


**Next tokens probabilities:**

Unnamed: 0,id,token,probability
606,606,them,0.13122
13,13,.,0.104529
11,11,",",0.084879
340,340,it,0.068265
553,553,",""",0.051667


In [10]:
# use generate method
output = model.generate(**inputs, max_length=100, pad_token_id=tokenizer.eos_token_id)

# show generated text
display(Markdown(tokenizer.decode(output[0])))

She only cooks eggs when I am not cooking them. I don't cook eggs when I am not cooking them. I don't cook eggs when I am not cooking them. I don't cook eggs when I am not cooking them. I don't cook eggs when I am not cooking them. I don't cook eggs when I am not cooking them. I don't cook eggs when I am not cooking them. I don't cook eggs when I am not cooking them. I don't cook eggs