In [1]:
# Quick sanity check: show installed versions used in this notebook.
# This helps when results differ across environments.
from importlib.metadata import version

# Report each package's installed version, one line per package.
for pkg in ("torch", "tiktoken"):
    print(f"{pkg} version:", version(pkg))

torch version: 2.7.1
tiktoken version: 0.9.0


In [2]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """
    Sliding-window dataset of (input, target) token-ID pairs for next-token prediction.

    The text is tokenized once with ``tokenizer.encode``. Every window of
    ``max_length`` tokens starting at offsets ``0, stride, 2 * stride, ...``
    becomes one input, and the same window shifted right by one token becomes
    its target. Only complete pairs are kept: a window is emitted only if the
    token that follows it exists, so a text with at most ``max_length`` tokens
    produces an empty dataset.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token IDs (e.g. a tiktoken encoding).
        max_length: Number of tokens per sample; must be >= 1.
        stride: Offset between consecutive window starts; must be >= 1.
            ``stride < max_length`` gives overlapping windows.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not a positive number.
    """
    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject degenerate windows up front: a non-positive max_length would
        # silently yield empty or mismatched chunks, and a non-positive stride
        # would yield no samples (or an opaque range() error for 0).
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text once.
        # allowed_special lets "<|endoftext|>" be encoded as a special token
        # when it appears in the text.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # The last valid start index is len(token_ids) - max_length - 1, because
        # the target window needs one extra token beyond the input window.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]             # [t_i, ..., t_{i+L-1}]
            target_chunk = token_ids[i + 1:i + max_length + 1]    # [t_{i+1}, ..., t_{i+L}]
            # Explicit integer dtype: these tensors are used as embedding indices.
            self.input_ids.append(torch.tensor(input_chunk, dtype=torch.long))    # shape: (max_length,)
            self.target_ids.append(torch.tensor(target_chunk, dtype=torch.long))  # shape: (max_length,)

    def __len__(self):
        """Return the number of (input, target) pairs produced by the sliding window."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the tuple (x, y) at position ``idx``, where y is x shifted by one token."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size, max_length, stride,
                         shuffle=True, drop_last=True, num_workers=0):
    """
    Build a PyTorch DataLoader over sliding-window samples of `txt`.

    The text is tokenized with the tiktoken "gpt2" encoding and sliced into
    (input, target) pairs by GPTDatasetV1; the returned loader batches them.

    Args:
        txt: Raw text to tokenize and slice.
        batch_size: Number of sequences per batch.
        max_length: Number of tokens in each sample.
        stride: Step between consecutive window starts (controls overlap).
        shuffle: Whether the loader reshuffles the samples every epoch.
        drop_last: Whether to drop a final batch smaller than `batch_size`.
        num_workers: Number of subprocesses used for data loading.

    Returns:
        A DataLoader whose batches are (x, y) pairs, each of shape
        (batch_size, max_length) for full batches.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

# Load a small text for demonstration.
# Assumes "the-verdict.txt" is present in the current working directory.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Seed the global RNG so the randomly initialized embedding weights and the
# shuffled batch order are reproducible under Restart Kernel -> Run All.
torch.manual_seed(123)

# Embedding hyperparameters:
# - vocab_size: tokenizer-dependent (GPT-2 uses 50257)
# - output_dim: embedding width (model hidden size)
# - context_length: number of positions the positional embedding table holds
vocab_size = 50257
output_dim = 256
context_length = 1024

# Token and positional embedding layers:
# - token_embedding_layer maps token IDs to vectors of size output_dim
# - pos_embedding_layer maps positions [0..context_length-1] to vectors
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# Create a small DataLoader to visualize shapes and the embedding addition.
batch_size = 8
max_length = 4

# Positions 0..max_length-1 are looked up in pos_embedding_layer later on,
# so the sample length must fit inside the positional table.
assert max_length <= context_length, "max_length must not exceed context_length"

dataloader = create_dataloader_v1(
    raw_text,
    batch_size=batch_size,
    max_length=max_length,
    stride=max_length  # no overlap for simplicity (stride == max_length)
)

In [3]:
# Build input embeddings from the first batch only:
# - x holds token IDs, shape (batch_size, max_length)
# - y holds the next-token labels, shape (batch_size, max_length)
for batch in dataloader:
    x, y = batch

    # One learned vector per position 0..max_length-1: (max_length, output_dim)
    positions = torch.arange(max_length)
    pos_embeddings = pos_embedding_layer(positions)

    # One learned vector per token ID: (batch_size, max_length, output_dim)
    token_embeddings = token_embedding_layer(x)

    # Summing broadcasts the positional vectors over the batch dimension,
    # so the result keeps the shape (batch_size, max_length, output_dim).
    input_embeddings = token_embeddings + pos_embeddings

    break  # a single batch is enough for this demonstration

In [4]:
# Sanity check: the result should be laid out as (batch_size, sequence_length, embedding_dim).
print(input_embeddings.size())

torch.Size([8, 4, 256])
