## GPT Tokenizers

In [None]:
! pip install -q tiktoken==0.12.0 pandas==2.2.2 numpy==2.0.2 torch==2.9.0

In [None]:
import tiktoken
import pandas as pd

# Display label -> name of the tiktoken encoding compared under that label.
encoding_names = {
    "GPT-2": "gpt2",
    "GPT-3": "p50k_base",
    "GPT-4": "cl100k_base",
    "GPT-4o": "o200k_base",
}

# Load every encoding once up front; insertion order fixes the column order
# of the comparison tables built further down.
encodings = {
    label: tiktoken.get_encoding(name)
    for label, name in encoding_names.items()
}

 
# Sample sentence to tokenize. It mixes English, Hindi, Japanese and an emoji
# so the encodings can be compared on non-Latin scripts as well as plain ASCII.
# Defined here explicitly so the cell runs on a fresh kernel.
text = "Learning never stops | सीखना कभी नहीं रुकता | 学びは止まらない 😊"

print("Input text:")
print(text)
print()

# Tokenize the sample text once with every encoding, collecting
#   * summary:        one stats record per model
#   * decoded_tokens: model -> list of printable token pieces
summary = []
decoded_tokens = {}

for model, enc in encodings.items():
    ids = enc.encode(text)

    # Decode each id on its own so token boundaries are visible; repr() adds
    # quotes so leading spaces inside a piece can be seen.
    decoded_tokens[model] = [repr(enc.decode([tid])) for tid in ids]

    summary.append({
        "Model": model,
        "Vocab size": enc.n_vocab,
        "Token count": len(ids),
        # True when decoding the full id sequence reproduces the input exactly.
        "Round-trip OK": enc.decode(ids) == text,
    })

# Longest token sequence across all models (0 when there are no encodings);
# used as the row count of the aligned comparison table.
max_len = max((len(pieces) for pieces in decoded_tokens.values()), default=0)

# Per-model overview: vocabulary size, token count and round-trip check.
summary_df = pd.DataFrame(summary)
print("Summary:")
display(summary_df)

# Side-by-side token table: one row per token position, one column per model.
# Models whose sequence is shorter than max_len are padded with "".
rows = [
    {
        "Idx": idx,
        **{
            model: (pieces[idx] if idx < len(pieces) else "")
            for model, pieces in decoded_tokens.items()
        },
    }
    for idx in range(max_len)
]
tokens_df = pd.DataFrame(rows)

print("Token pieces comparison:")
display(tokens_df)

Input text:
Learning never stops | सीखना कभी नहीं रुकता | 学びは止まらない 😊

Summary:


Unnamed: 0,Model,Vocab size,Token count,Round-trip OK
0,GPT-2,50257,51,True
1,GPT-3,50281,51,True
2,GPT-4,100277,38,True
3,GPT-4o,200019,20,True


Token pieces comparison:


Unnamed: 0,Idx,GPT-2,GPT-3,GPT-4,GPT-4o
0,0,'Learning','Learning','Learning','Learning'
1,1,' never',' never',' never',' never'
2,2,' stops',' stops',' stops',' stops'
3,3,' |',' |',' |',' |'
4,4,' ÔøΩ',' ÔøΩ',' ‡§∏',' ‡§∏‡•Ä‡§ñ'
5,5,'ÔøΩ','ÔøΩ','‡•Ä','‡§®‡§æ'
6,6,'ÔøΩ','ÔøΩ','ÔøΩ',' ‡§ï‡§≠‡•Ä'
7,7,'ÔøΩ','ÔøΩ','ÔøΩ',' ‡§®‡§π‡•Ä‡§Ç'
8,8,'ÔøΩ','ÔøΩ','‡§®',' ‡§∞'
9,9,'ÔøΩ','ÔøΩ','‡§æ','‡•Å‡§ï'


## Token Embeddings

In [3]:
import torch
import torch.nn as nn
# Seed the RNG before building the layer so its initial weights are reproducible.
torch.manual_seed(123)

# Illustrative sizes only: number of rows in the lookup table (vocabulary size)
# and length of each embedding vector.
vocab_size, embedding_dim = 50000, 128

# Lookup table holding one embedding_dim-sized vector per token id.
embed_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

In [4]:
# Inspect the embedding table: a trainable parameter (requires_grad=True) with
# one row per token id and one column per embedding dimension.
print(embed_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035,  ...,  1.2774, -1.4596, -2.1595],
        [-0.2582, -2.0407, -0.8016,  ...,  1.3337,  0.0771, -0.0522],
        [ 0.2386,  0.1411, -1.3354,  ..., -1.7984, -0.6822, -0.5191],
        ...,
        [ 0.7284, -0.8894, -0.5148,  ..., -0.6396, -0.1302,  0.0185],
        [ 1.2806,  0.6254,  1.5523,  ...,  1.7538, -0.9652,  2.0230],
        [-0.0881,  0.7107,  2.4718,  ..., -0.0083,  1.2321, -0.1318]],
       requires_grad=True)


In [5]:
# Toy input: a single sequence of four token ids. unsqueeze(0) adds the batch
# axis, giving shape (batch_size=1, seq_length=4).
token_ids = torch.tensor([40, 1842, 616, 3290]).unsqueeze(0)

# The lookup replaces every token id with its embedding vector, appending an
# embedding_dim axis to the input shape.
embedded_sequence = embed_layer(token_ids)
print(embedded_sequence.size())

torch.Size([1, 4, 128])
