## Text Generation with Tokens in Generative AI

In [1]:
# import transformers for the tokens
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# load a pretrained model and tokenizer from HuggingFace

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# generate text
text = "Mr Macharia is generating a text to observe the results of the"
inputs = tokenizer(text, return_tensors="pt")

# show tokens as numbers 
inputs["input_ids"]

tensor([[ 5246,  7080, 10312,   318, 15453,   257,  2420,   284, 12414,   262,
          2482,   286,   262]])

In [3]:
# Importing pamdas to view the sentnce
import pandas as pd

def show_tokenization(inputs):
    return pd.DataFrame(
        [(id, tokenizer.decode(id)) for id in inputs["input_ids"][0]],
        columns=["id", "token"]
    )

show_tokenization(inputs)


Unnamed: 0,id,token
0,tensor(5246),Mr
1,tensor(7080),Mach
2,tensor(10312),aria
3,tensor(318),is
4,tensor(15453),generating
5,tensor(257),a
6,tensor(2420),text
7,tensor(284),to
8,tensor(12414),observe
9,tensor(262),the


In [4]:
# claculate probabilities for the next token

import torch

In [5]:
# claculate probabilities for the next token
with torch.no_grad():
    logits = model(**inputs).logits[:, -1, :]
    probabilities = torch.nn.functional.softmax(logits[0], dim=-1)

# show the next top 5 tokens and their probabilities
def show_next_token(probabilities, top_n=5):
    return pd.DataFrame(
        [
            (id, tokenizer.decode(id), p.item())
            for id, p in enumerate(probabilities)
            if p.item() > 0
        ],
        columns=["id", "token", "probability"]
    ).sort_values("probability", ascending=False)[:top_n]

show_next_token(probabilities)


Unnamed: 0,id,token,probability
2050,2050,study,0.032744
5526,5526,survey,0.025201
3071,3071,election,0.019222
3278,3278,poll,0.013906
717,717,first,0.0112


In [6]:
# obtain the token id for the most probable token
next_token_id = torch.argmax(probabilities).item()

# show the results
print(f"Next token id: {next_token_id}")
print(f"Next token: {tokenizer.decode(next_token_id)}")

Next token id: 2050
Next token:  study


In [7]:
# appending the most probable token
text = text + tokenizer.decode(next_token_id)
print(text)

Mr Macharia is generating a text to observe the results of the study


In [8]:
# import modules to display output
from IPython.display import display, Markdown


In [9]:
# generate more tokens
inputs = tokenizer(text, return_tensors="pt")

# use generate method to generate more texts
outputs = model.generate(**inputs, max_length=100, pad_token_id=tokenizer.eos_token_id)

# decode and display the results
display(Markdown(tokenizer.decode(outputs[0])))

Mr Macharia is generating a text to observe the results of the study.

"We are very pleased to see that the results of the study are now being published in the journal Nature Communications. We are also very pleased to see that the results of the study are being published in the journal Nature Communications. We are very pleased to see that the results of the study are being published in the journal Nature Communications. We are very pleased to see that the results of the study are being published in the