## 2.2 Tokenizing text

In [1]:
import os
import urllib.request

# Fetch the sample text only when it is not already in the working directory.
# One variable names the file for both the existence check and the download
# target, so the two cannot drift apart.
file_path = 'the-verdict.txt'
if not os.path.exists(file_path):
    url = 'https://raw.githubusercontent.com/GlebTanaka/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt'
    urllib.request.urlretrieve(url, file_path)

In [2]:
# Load the whole story into memory, then report its length and a short preview
with open('the-verdict.txt', 'r', encoding='utf-8') as file:
    the_verdict_text = file.read()

char_count = len(the_verdict_text)
print(f"Total number of characters in the file: {char_count}")
print(the_verdict_text[:99])

Total number of characters in the file: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [3]:
import re

# Demo sentence containing punctuation and parentheses
sample_sentence = "Hello, world! This is a simple example of tokenization (using regex)."

# Lower-case the sentence, then collect every maximal run of word characters
# (\w covers letters, digits and underscore); punctuation and spaces are dropped.
lowered = sample_sentence.lower()
tokens = [match.group() for match in re.finditer(r'\w+', lowered)]

print("Original text:", sample_sentence)
print("\nTokens:", tokens)
print("Number of tokens:", len(tokens))

Original text: Hello, world! This is a simple example of tokenization (using regex).

Tokens: ['hello', 'world', 'this', 'is', 'a', 'simple', 'example', 'of', 'tokenization', 'using', 'regex']
Number of tokens: 11


In [4]:
# A few example texts with different patterns
texts = [
    "Hello world",                    # Simple space-separated
    "Hello, world!",                  # With punctuation
    "Is this--a test?",              # With double dash
    "Word. Another word... Final"     # With multiple dots
]

# Try different patterns
patterns = [
    r'\w+',                          # Just words
    r'[A-Za-z]+',                    # Only letters
    r'([,.:;?_!"()\']|--|\s)',      # Separators, captured so re.split keeps them
    r'\S+'                           # Non-whitespace chunks
]

# Test each pattern on each text
for text in texts:
    print(f"\nOriginal text: '{text}'")
    for pattern in patterns:
        print(f"\nPattern '{pattern}':")
        # Decide by the number of capturing groups rather than by the first
        # character: a pattern such as '(?:a|b)+' starts with '(' but captures
        # nothing, and a group may appear later in the pattern.
        if re.compile(pattern).groups:
            # Separator pattern: split on it; captured separators stay in the result.
            # `t and ...` skips the None that re.split yields for an unmatched group.
            tokens = [t.strip() for t in re.split(pattern, text) if t and t.strip()]
        else:
            # Token pattern: every match is a token
            tokens = re.findall(pattern, text)
        print(f"Tokens: {tokens}")


Original text: 'Hello world'

Pattern '\w+':
Tokens: ['Hello', 'world']

Pattern '[A-Za-z]+':
Tokens: ['Hello', 'world']

Pattern '([,.:;?_!"()\']|--|\s)':
Tokens: ['Hello', 'world']

Pattern '\S+':
Tokens: ['Hello', 'world']

Original text: 'Hello, world!'

Pattern '\w+':
Tokens: ['Hello', 'world']

Pattern '[A-Za-z]+':
Tokens: ['Hello', 'world']

Pattern '([,.:;?_!"()\']|--|\s)':
Tokens: ['Hello', ',', 'world', '!']

Pattern '\S+':
Tokens: ['Hello,', 'world!']

Original text: 'Is this--a test?'

Pattern '\w+':
Tokens: ['Is', 'this', 'a', 'test']

Pattern '[A-Za-z]+':
Tokens: ['Is', 'this', 'a', 'test']

Pattern '([,.:;?_!"()\']|--|\s)':
Tokens: ['Is', 'this', '--', 'a', 'test', '?']

Pattern '\S+':
Tokens: ['Is', 'this--a', 'test?']

Original text: 'Word. Another word... Final'

Pattern '\w+':
Tokens: ['Word', 'Another', 'word', 'Final']

Pattern '[A-Za-z]+':
Tokens: ['Word', 'Another', 'word', 'Final']

Pattern '([,.:;?_!"()\']|--|\s)':
Tokens: ['Word', '.', 'Another', 'word', '.',

In [5]:
# Punctuation marks, the double dash and any whitespace act as separators;
# the capture group makes re.split return them as pieces of their own.
pattern = r'([,.:;?_!"()\']|--|\s)'

# Split the story, trim every piece and drop the ones that were only whitespace
pieces = (piece.strip() for piece in re.split(pattern, the_verdict_text))
tokens = [piece for piece in pieces if piece]

print("\nTokens:", tokens[:30])
print("Number of tokens:", len(tokens))



Tokens: ['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
Number of tokens: 4690


## 2.3 Converting tokens into token IDs

In [6]:
# Vocabulary = the distinct tokens, in sorted order
vocab = sorted(set(tokens))

print("Vocabulary (unique tokens):", vocab[:5])
print("Vocabulary size:", len(vocab))

# Number the vocabulary entries 0..N-1 to obtain each token's ID
token_to_id = dict(zip(vocab, range(len(vocab))))

# Translate the token stream into the matching ID sequence
token_ids = [token_to_id[token] for token in tokens]

# Preview the first five entries of the mapping
print("\nFirst 5 token-to-ID mappings:")
for token, id_num in list(token_to_id.items())[:5]:
    print(f"'{token}' -> {id_num}")

# Preview the first ten IDs
print("\nFirst 10 tokens as IDs:", token_ids[:10])

Vocabulary (unique tokens): ['!', '"', "'", '(', ')']
Vocabulary size: 1130

First 5 token-to-ID mappings:
'!' -> 0
'"' -> 1
''' -> 2
'(' -> 3
')' -> 4

First 10 tokens as IDs: [53, 44, 149, 1003, 57, 38, 818, 115, 256, 486]


In [7]:
class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Args:
        vocab: dict mapping token strings to integer IDs.
    """

    def __init__(self, vocab):
        # Copy the mapping: if the caller later adds entries to its own dict,
        # str_to_int and int_to_str would otherwise disagree with each other.
        self.str_to_int = dict(vocab)  # token -> ID
        self.int_to_str = {i: s for s, i in self.str_to_int.items()}  # ID -> token

    def encode(self, text):
        """Split `text` into tokens and return their IDs.

        Punctuation marks, "--" and whitespace are separators; punctuation is
        kept as its own token, whitespace is discarded.

        Raises:
            KeyError: if a token is not in the vocabulary.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Turn a sequence of IDs back into a single string.

        Tokens are joined with single spaces; the space in front of
        , . : ; ? ! " ( ) ' is then removed.

        Raises:
            KeyError: if an ID is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # encode() splits on ':' and ';' as well, so strip the space before
        # them too; otherwise "a; b" comes back as "a ; b".
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [8]:
# Example usage with a tiny hand-written vocabulary.
# Note: this rebinds `vocab`, which held the sorted token list in section 2.3.
vocab = {'hello': 0, 'world': 1, '!': 2}
tokenizer = SimpleTokenizerV1(vocab)

# Encoding
ids = tokenizer.encode("hello world!")  # Returns: [0, 1, 2]

# Decoding (the literal list equals `ids` from the line above)
text = tokenizer.decode([0, 1, 2])  # Returns: "hello world!"
print(text)

hello world!


In [9]:
# Use the tokenizer with the vocabulary built from the story in section 2.3.
# SimpleTokenizerV1.encode looks every token up with [] and would raise
# KeyError for a word that is missing from token_to_id.
tokenizer_2 = SimpleTokenizerV1(token_to_id)

text_section = """"It's the last he painted, you know,"
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer_2.encode(text_section)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [10]:
# Decode the integers back into text.
# decode() joins tokens with single spaces and only removes the space in front
# of punctuation, so the line break and the original spacing around quotes and
# apostrophes are not restored.
tokenizer_2.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [11]:
# Round trip: text -> IDs -> text.
# encode() converts the text into a list of token IDs and decode() converts
# those IDs back into a string. Comparing the result with the input shows how
# much of the original text the tokenizer preserves (whitespace is normalised,
# so the result is not byte-identical to text_section).
tokenizer_2.decode(tokenizer_2.encode(text_section))

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

## 2.4 Adding special context tokens

In [12]:
# Vocabulary size before the special tokens are added
len(token_to_id)

1130

In [13]:
# Append the special tokens after the regular vocabulary:
#   <|endoftext|>  marks the boundary between two unrelated texts
#   <|unk|>        stands in for words that are not in the vocabulary
# setdefault leaves an existing entry untouched, so re-running this cell keeps
# the IDs stable instead of assigning new, larger ones.
for special_token in ('<|endoftext|>', '<|unk|>'):
    # len() is evaluated before the insert, i.e. it is the next free ID
    token_to_id.setdefault(special_token, len(token_to_id))

# Print the new tokens and their IDs
print("Special tokens added:")
print(f"<|endoftext|> token ID: {token_to_id['<|endoftext|>']}")
print(f"<|unk|> token ID: {token_to_id['<|unk|>']}")

Special tokens added:
<|endoftext|> token ID: 1130
<|unk|> token ID: 1131


In [14]:
# Vocabulary size after the special tokens were added
len(token_to_id)

1132

In [15]:
# Show the last five vocabulary entries; the two special tokens come last
for item in list(token_to_id.items())[-5:]:
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [16]:
# adjust the tokenizer to use unknown words:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary words to '<|unk|>'.

    Args:
        vocab: dict mapping token strings to integer IDs; must contain the
            key '<|unk|>'.

    Raises:
        KeyError: if '<|unk|>' is missing from `vocab`.
    """

    def __init__(self, vocab):
        # Copy the mapping so later changes to the caller's dict cannot leave
        # str_to_int and int_to_str out of sync.
        self.str_to_int = dict(vocab)
        self.int_to_str = {i: s for s, i in self.str_to_int.items()}
        # Store the '<|unk|>' token ID for easy access
        self.unk_token_id = self.str_to_int['<|unk|>']

    def encode(self, text):
        """Split `text` into tokens and return their IDs.

        Tokens that are not in the vocabulary get the '<|unk|>' ID.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        # Use get() with unk_token_id as default for unknown words
        ids = [self.str_to_int.get(s, self.unk_token_id) for s in preprocessed]
        return ids

    def decode(self, ids):
        """Turn a sequence of IDs back into a single string.

        Tokens are joined with single spaces; the space in front of
        , . : ; ? ! " ( ) ' is then removed.

        Raises:
            KeyError: if an ID is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # encode() splits on ':' and ';' as well, so strip the space before
        # them too; otherwise "a; b" comes back as "a ; b".
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [17]:
# Rebinds `tokenizer` to a V2 instance that uses the story vocabulary plus the special tokens
tokenizer = SimpleTokenizerV2(token_to_id)
# This will work even with words not in our vocabulary: each of them is
# encoded as the '<|unk|>' ID instead of raising KeyError.
test_text = "hello nonexistentword world"
encoded = tokenizer.encode(test_text)
decoded = tokenizer.decode(encoded)
print(decoded)

<|unk|> <|unk|> <|unk|>


In [18]:
# example with sentences from the verdict
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [19]:
# Words missing from the vocabulary are encoded as the <|unk|> ID; the
# <|endoftext|> separator is a single token with its own ID.
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [20]:
# Round trip: unknown words cannot be recovered and come back as '<|unk|>'
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

## 2.5 Byte pair encoding

In [21]:
# Record the installed tiktoken version alongside the outputs below
import tiktoken
tiktoken.__version__

'0.9.0'

In [22]:
import tiktoken

# Load the byte-pair-encoding tokenizer that GPT-2 uses
encoding = tiktoken.get_encoding("gpt2")

# Sentence to tokenize
text = "Hello world! This is an example of tiktoken tokenization."

# Text -> list of integer token IDs
tokens = encoding.encode(text)
print("Encoded tokens:", tokens)

# Token IDs -> text
decoded_text = encoding.decode(tokens)
print("\nDecoded text:", decoded_text)

# How many tokens the sentence was split into
token_count = len(tokens)
print(f"\nNumber of tokens: {token_count}")

# Decode the IDs one at a time to see which piece of text each one stands for
print("\nToken-by-token breakdown:")
for token in tokens:
    piece = encoding.decode([token])
    print(f"Token {token}: {piece!r}")

Encoded tokens: [15496, 995, 0, 770, 318, 281, 1672, 286, 256, 1134, 30001, 11241, 1634, 13]

Decoded text: Hello world! This is an example of tiktoken tokenization.

Number of tokens: 14

Token-by-token breakdown:
Token 15496: 'Hello'
Token 995: ' world'
Token 0: '!'
Token 770: ' This'
Token 318: ' is'
Token 281: ' an'
Token 1672: ' example'
Token 286: ' of'
Token 256: ' t'
Token 1134: 'ik'
Token 30001: 'token'
Token 11241: ' token'
Token 1634: 'ization'
Token 13: '.'


In [23]:
# List the encodings that ship with tiktoken
print("Available encodings:", tiktoken.list_encoding_names())

# Build one encoder per encoding we want to compare
gpt2_encoder = tiktoken.get_encoding("gpt2")
cl100k_encoder = tiktoken.get_encoding("cl100k_base")  # Used by GPT-4

# Encode the same sentence with both and print the IDs side by side
text = "Hello world! Let's try different encoders."
for label, encoder in (("\nGPT-2 tokens:", gpt2_encoder), ("CL100K tokens:", cl100k_encoder)):
    print(label, encoder.encode(text))

Available encodings: ['gpt2', 'r50k_base', 'p50k_base', 'p50k_edit', 'cl100k_base', 'o200k_base']

GPT-2 tokens: [15496, 995, 0, 3914, 338, 1949, 1180, 2207, 375, 364, 13]
CL100K tokens: [9906, 1917, 0, 6914, 596, 1456, 2204, 3289, 53598, 13]


## 2.6 Data sampling with sliding window

The code demonstrates sliding window sampling by:
- Taking a section of encoded text
- Creating input-target pairs
- Showing how context can be built up progressively
- Demonstrating both the numerical (token IDs) and text representations

This helps in understanding how language models process text as a sequence of tokens and how they use context to predict the next token.

In [24]:
# Load the story text from disk
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Encode the full text with the tokenizer currently bound to `tokenizer`
enc_text = tokenizer.encode(raw_text)
print(f"Total number of tokens in the text: {len(enc_text)}")

# Skip the first 50 token IDs and work with the remainder
enc_sample = enc_text[50:]
# Number of tokens that form one context window
context_size = 4

# First example pair: the target is the input shifted one position to the right
# (x covers positions 0..3, y covers positions 1..4)
x, y = enc_sample[:context_size], enc_sample[1:context_size+1]

print("\nDemonstrating basic input-target pair:")
print(f"Input sequence (x):  {x}")
print(f"Target sequence (y): {y}")

print("\nDemonstrating progressive context building with token IDs:")
# Grow the context one token at a time; the token right after it is the target
for i in range(1, context_size+1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(f"Context {i} tokens: {context} ----> Next token: {desired}")

print("\nDemonstrating progressive context building with decoded text:")
# The same pairs again, decoded back to text for readability
for i in range(1, context_size+1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(f"Context: '{tokenizer.decode(context)}' ----> Next token: '{tokenizer.decode([desired])}'")

Total number of tokens in the text: 4690

Demonstrating basic input-target pair:
Input sequence (x):  [568, 115, 1066, 727]
Target sequence (y): [115, 1066, 727, 988]

Demonstrating progressive context building with token IDs:
Context 1 tokens: [568] ----> Next token: 115
Context 2 tokens: [568, 115] ----> Next token: 1066
Context 3 tokens: [568, 115, 1066] ----> Next token: 727
Context 4 tokens: [568, 115, 1066, 727] ----> Next token: 988

Demonstrating progressive context building with decoded text:
Context: 'in' ----> Next token: 'a'
Context: 'in a' ----> Next token: 'villa'
Context: 'in a villa' ----> Next token: 'on'
Context: 'in a villa on' ----> Next token: 'the'


In [25]:
# Record the installed PyTorch version alongside the outputs below
import torch
print("PyTorch version:", torch.__version__)

PyTorch version: 2.7.1


In [26]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. Each sample is a window of `max_length`
    token IDs, and its target is the same window shifted one position to the
    right. Consecutive windows start `stride` tokens apart.

    Args:
        txt: text to tokenize.
        tokenizer: object whose `encode(txt, allowed_special=...)` returns a
            sequence of token IDs.
        max_length: number of token IDs per sample.
        stride: distance between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Encode the complete text in one pass
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Window start offsets; the last window still leaves one token for its target
        starts = range(0, len(ids) - max_length, stride)
        self.input_ids = [torch.tensor(ids[s:s + max_length]) for s in starts]
        self.target_ids = [torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts]

    def __len__(self):
        """Number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input, target) tensor pair for window `idx`."""
        return self.input_ids[idx], self.target_ids[idx]

In [27]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over sliding windows of GPT-2 token IDs.

    Args:
        txt: text to tokenize with tiktoken's "gpt2" encoding.
        batch_size: number of windows per batch.
        max_length: number of token IDs per window.
        stride: distance between the starts of consecutive windows.
        shuffle: passed through to DataLoader.
        drop_last: passed through to DataLoader.
        num_workers: passed through to DataLoader.

    Returns:
        A DataLoader that yields (inputs, targets) batches from GPTDatasetV1.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [28]:
# Read the story text into raw_text (same file and variable as in the
# sliding-window demo above; repeated so this part can run on its own).
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [29]:
# Smallest possible setup to inspect what the loader yields
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,      # Only one sequence per batch
    max_length=4,      # Each sequence is 4 tokens long
    stride=1,          # Move window by 1 token each time
    shuffle=False      # Keep sequences in order
)

# Each batch is an [inputs, targets] pair; targets are the inputs shifted by one token
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [30]:
# With stride=1 the next window starts one token after the previous one
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


## 2.7 Creating Token embeddings

In [31]:
import torch.nn as nn

# Let's say we have a small vocabulary of 5 words
vocab_size = 5        # Total unique tokens in our vocabulary
embedding_dim = 3     # Size of the embedding vector for each token

# nn.Embedding draws its initial weights at random. Seed the generator right
# before creating the layer so the values printed below can be reproduced.
torch.manual_seed(123)

# Create an embedding layer: a lookup table with one row per token ID
embedding_layer = nn.Embedding(
    num_embeddings=vocab_size,  # Size of the vocabulary
    embedding_dim=embedding_dim  # Size of the embedding vector
)

# Example input: batch of token IDs
# Let's say we have 2 sequences, each with 4 tokens
token_ids = torch.tensor([
    [0, 2, 1, 3],  # First sequence
    [1, 1, 4, 2]   # Second sequence
])

# Get embeddings for these tokens: every ID is replaced by its row of the table
embedded_tokens = embedding_layer(token_ids)

print("Input shape:", token_ids.shape)
print("Output shape:", embedded_tokens.shape)
print("\nEmbeddings for first sequence:")
print(embedded_tokens[0])

Input shape: torch.Size([2, 4])
Output shape: torch.Size([2, 4, 3])

Embeddings for first sequence:
tensor([[ 1.1997, -2.2683,  0.6205],
        [ 0.6375,  0.7446, -0.1135],
        [ 0.1926, -0.2064,  0.0356],
        [ 0.4293, -0.0239,  0.6841]], grad_fn=<SelectBackward0>)


## 2.8 Encoding word positions

In [32]:
import tiktoken

# GPT-2 byte-pair tokenizer (rebinds the name `tokenizer`)
tokenizer = tiktoken.get_encoding("gpt2")

# Text to embed
text = "Hello world!"

# Encode the text and wrap the ID list in an outer list so the tensor has
# shape [batch, seq_len] with a batch of one
token_ids = torch.tensor([tokenizer.encode(text)])

# Embedding table: 50257 rows (one per vocabulary entry, so valid IDs are
# 0..50256), each row a vector of 768 values
gpt2_embedding = nn.Embedding(50257, 768)

# Replace every token ID by its row => shape [batch, seq_len, embedding_dim]
embeddings = gpt2_embedding(token_ids)

print("Token IDs shape:", token_ids.shape)      # e.g., [1, 3]
print("Embeddings shape:", embeddings.shape)    # e.g., [1, 3, 768]

Token IDs shape: torch.Size([1, 3])
Embeddings shape: torch.Size([1, 3, 768])


### Example from the book: learnable token + position embeddings

In [33]:
# Token embedding configuration
vocab_size = 50257  # tokenizer vocab size
output_dim = 256    # model hidden size (embedding dimension)

# Learnable token embedding table: maps token IDs -> vectors in R^{output_dim}
# (weights are randomly initialized; no seed is set here, so values differ per run)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

# Suppose our training pipeline feeds batches of fixed-length sequences (max_length tokens each)
# We'll draw a single batch from the dataloader just to demonstrate shapes.
# stride == max_length, so consecutive windows do not overlap.
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)  # inputs: [batch, seq_len]
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape) # e.g., torch.Size([8, 4])

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [34]:
# Embed the tokens: result has shape [batch, seq_len, output_dim]
# (each token ID in `inputs` is replaced by its row of the embedding table)
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)
print(token_embeddings)

torch.Size([8, 4, 256])
tensor([[[ 0.7935, -1.5521, -0.0757,  ...,  0.1973,  0.3964,  0.3699],
         [-0.3152, -0.6270,  0.5026,  ...,  1.3847, -0.7100,  0.1380],
         [ 1.0433, -0.6563,  1.1417,  ..., -1.7185,  1.1614, -1.0819],
         [-0.3306,  1.0400, -0.8803,  ..., -1.6823, -0.1525, -0.2416]],

        [[ 0.8192,  1.2286,  1.2404,  ...,  0.2220,  0.5345,  0.1483],
         [-1.5531,  1.8293, -0.6717,  ..., -0.8943, -0.3155,  0.0232],
         [-0.6936, -0.2899, -1.1548,  ...,  2.1094, -1.6945, -0.6705],
         [ 0.2116,  0.0452, -0.9042,  ..., -0.6992, -2.1959, -0.1711]],

        [[ 0.8561,  0.6646, -0.1296,  ..., -0.8893, -1.0118,  1.0650],
         [ 0.0484,  0.0066, -0.7802,  ..., -1.4375,  0.5547,  1.2714],
         [ 1.0910,  1.3781, -0.9815,  ...,  1.2189, -0.2399, -0.8618],
         [ 0.0476,  0.6166,  0.4904,  ...,  0.5319,  1.4706,  0.8851]],

        ...,

        [[-0.4251, -1.3139, -1.6530,  ..., -0.1279, -0.8907,  0.7626],
         [-1.1603, -0.6090, -1.09

In [36]:
# Create a learnable position embedding table:
# - We allocate one vector per absolute position in the context (0..context_length-1)
# - context_length must be >= the longest sequence length you feed the model
# - It has the same width (output_dim) as the token embeddings so the two can be added
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# Inspect the randomly initialized position vectors (shape: [context_length, output_dim])
print("Raw position table (learnable weights):")
print(pos_embedding_layer.weight)

Parameter containing:
tensor([[-0.2214, -0.6219, -0.0931,  ...,  0.7471, -0.1968, -0.0575],
        [ 1.6479, -0.9182, -0.3797,  ...,  0.0520,  0.4360,  0.0520],
        [ 2.8840, -1.7412,  0.9570,  ..., -1.5924, -0.3774, -0.6991],
        [ 0.7843,  0.2158, -0.1136,  ..., -2.7918, -1.4832, -0.7604]],
       requires_grad=True)


In [37]:
# Build position indices for the sequence length [0, 1, 2, ..., seq_len-1]
# For a seq_len of 4, this is tensor([0, 1, 2, 3])
# (max_length equals context_length here, so every index is inside the table)
pos_indices = torch.arange(max_length)

# Lookup position embeddings => shape: [seq_len, output_dim]
pos_embeddings = pos_embedding_layer(pos_indices)
print("Position embeddings shape:", pos_embeddings.shape)  # e.g., [4, 256]
print(pos_embeddings)

torch.Size([4, 256])
tensor([[-0.2214, -0.6219, -0.0931,  ...,  0.7471, -0.1968, -0.0575],
        [ 1.6479, -0.9182, -0.3797,  ...,  0.0520,  0.4360,  0.0520],
        [ 2.8840, -1.7412,  0.9570,  ..., -1.5924, -0.3774, -0.6991],
        [ 0.7843,  0.2158, -0.1136,  ..., -2.7918, -1.4832, -0.7604]],
       grad_fn=<EmbeddingBackward0>)


In [38]:
# Combine token and position information:
# - token_embeddings shape: [batch, seq_len, output_dim]
# - pos_embeddings   shape: [seq_len, output_dim]
# PyTorch will broadcast pos_embeddings across the batch dimension automatically:
# [batch, seq_len, output_dim] + [seq_len, output_dim] -> [batch, seq_len, output_dim]
# If you prefer being explicit, you can unsqueeze: pos_embeddings[None, :, :] (shape [1, seq_len, output_dim])
# The same position vector is therefore added to every sequence in the batch.
input_embeddings = token_embeddings + pos_embeddings
print("Final input embeddings shape:", input_embeddings.shape)  # e.g., [8, 4, 256]
print(input_embeddings)

torch.Size([8, 4, 256])
tensor([[[ 0.5720, -2.1740, -0.1688,  ...,  0.9444,  0.1996,  0.3124],
         [ 1.3327, -1.5452,  0.1228,  ...,  1.4366, -0.2740,  0.1900],
         [ 3.9273, -2.3975,  2.0987,  ..., -3.3110,  0.7839, -1.7810],
         [ 0.4537,  1.2559, -0.9938,  ..., -4.4741, -1.6357, -1.0020]],

        [[ 0.5977,  0.6067,  1.1473,  ...,  0.9691,  0.3378,  0.0908],
         [ 0.0948,  0.9111, -1.0515,  ..., -0.8423,  0.1205,  0.0752],
         [ 2.1904, -2.0311, -0.1978,  ...,  0.5170, -2.0719, -1.3697],
         [ 0.9959,  0.2610, -1.0178,  ..., -3.4910, -3.6791, -0.9315]],

        [[ 0.6347,  0.0427, -0.2227,  ..., -0.1422, -1.2086,  1.0075],
         [ 1.6962, -0.9116, -1.1600,  ..., -1.3855,  0.9907,  1.3233],
         [ 3.9750, -0.3631, -0.0245,  ..., -0.3736, -0.6174, -1.5609],
         [ 0.8320,  0.8324,  0.3768,  ..., -2.2599, -0.0125,  0.1246]],

        ...,

        [[-0.6465, -1.9357, -1.7461,  ...,  0.6192, -1.0874,  0.7051],
         [ 0.4876, -1.5272, -1.47

# Notes and tips:
- These are learned “absolute” position embeddings. They work up to context_length tokens.
  If you later use longer sequences than context_length, you must enlarge the table and re-train or re-init.
- For variable-length batches with padding, ensure position indices only cover true tokens (not pad), or mask attention accordingly so pads don’t affect learning.
- Alternative schemes include sinusoidal or rotary embeddings; the add step stays the same: token + position.

