In [13]:
# Load the full contents of the input file into a single string
with open("the-verdict.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

# Sanity check: report the character count and preview the opening characters
print("Total number of characters:", len(raw_text))
print(raw_text[0:20])  # first 20 characters only

# ---------------- Tokenizer Definition ----------------

# Import regular expressions module
import re

# Simple tokenizer that uses a custom vocabulary to encode/decode text
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed string-to-id vocabulary.

    Text is split on whitespace, on the punctuation characters
    ``, . ? _ ! " ( ) '`` and on the double dash ``--``. Each non-empty piece
    is then looked up in the vocabulary.

    Args:
        vocab: Mapping from token string to integer id.
    """

    # Compiled once for the class instead of being re-specified on every call.
    # The capture group keeps the delimiters themselves in the split result.
    _SPLIT_PATTERN = re.compile(r'([,.?_!"()\']|--|\s)')
    # Whitespace that directly precedes one of these punctuation characters.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.?!"()\'])')

    def __init__(self, vocab):
        self.str_to_int = vocab  # token string -> id
        self.int_to_str = {i: s for s, i in vocab.items()}  # id -> token string

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: String to tokenize.

        Returns:
            List of integer ids, one per token, in order of appearance.

        Raises:
            KeyError: If a token is not present in the vocabulary. The message
                names the offending token.
        """
        pieces = self._SPLIT_PATTERN.split(text)
        # Whitespace delimiters and empty split artefacts carry no token.
        tokens = [piece.strip() for piece in pieces if piece.strip()]

        ids = []
        for token in tokens:
            try:
                ids.append(self.str_to_int[token])
            except KeyError:
                # Same exception type as a plain dict lookup, but with a
                # message that says what went wrong.
                raise KeyError(
                    f"token {token!r} is not in the vocabulary"
                ) from None
        return ids

    def decode(self, ids):
        """Convert a sequence of token ids back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        the characters ``, . ? ! " ( ) '`` is removed.

        Args:
            ids: Iterable of integer token ids.

        Returns:
            The reconstructed string.

        Raises:
            KeyError: If an id is not present in the vocabulary. The message
                names the offending id.
        """
        tokens = []
        for token_id in ids:
            try:
                tokens.append(self.int_to_str[token_id])
            except KeyError:
                raise KeyError(
                    f"id {token_id!r} is not in the vocabulary"
                ) from None

        text = " ".join(tokens)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

Total number of characters: 20479
I HAD always thought


In [1]:
!pip install tiktoken

Collecting tiktoken
  Using cached tiktoken-0.9.0-cp312-cp312-win_amd64.whl.metadata (6.8 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Using cached regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Using cached tiktoken-0.9.0-cp312-cp312-win_amd64.whl (894 kB)
Using cached regex-2024.11.6-cp312-cp312-win_amd64.whl (273 kB)
Installing collected packages: regex, tiktoken

   -------------------- ------------------- 1/2 [tiktoken]
   ---------------------------------------- 2/2 [tiktoken]

Successfully installed regex-2024.11.6 tiktoken-0.9.0


In [7]:
import tiktoken

# GPT-2 encoding provided by tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

# Sample sentence that contains the <|endoftext|> marker and a made-up word
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."

# Text -> token ids, with <|endoftext|> permitted as a special token
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

# Token ids -> text (round trip)
strings = tokenizer.decode(integers)
print(strings)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.


In [19]:
!pip install torch

Collecting torch
  Using cached torch-2.7.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting filelock (from torch)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Using cached torch-2.7.1-cp312-cp312-win_amd64.whl (216.1 MB)
Using cached sympy-1.14.0-py3-none-any.whl (6.3 MB)
Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Using cached filelock-3.18.0-py3-none-any.whl (16 kB)
Using cached fsspec-2025.5.1-py3-none-any.whl (199 kB)
Downloading networkx-3.5-py3-none-any.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ----

In [29]:
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken

# Defined at module level rather than inside create_gpt_dataloader: a
# function-local class cannot be pickled (which DataLoader worker processes
# require under the "spawn" start method) and would be rebuilt on every call.
# NOTE(review): in a notebook, spawn-based workers may additionally need this
# class to live in an importable .py module - confirm before using
# num_workers > 0 on Windows/macOS.
class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensor pairs.

    The text is tokenized once. Each sample is a window of ``max_length``
    token ids, paired with the same window shifted one position to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer ids.
        max_length: Number of token ids per window.
        stride: Distance between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text in one pass
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stopping at len - max_length (exclusive) keeps start + max_length + 1
        # within bounds, so every target window is full length.
        for start in range(0, len(token_ids) - max_length, stride):
            end = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:end]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:end + 1]))

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors of window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_gpt_dataloader(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
    tokenizer_name="gpt2",
    tokenizer=None,
):
    """Build a DataLoader of next-token-prediction samples from raw text.

    Args:
        txt: Text to tokenize.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Number of token ids per input/target window.
        stride: Distance between the starts of consecutive windows; a value
            smaller than ``max_length`` yields overlapping windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.
        tokenizer_name: Encoding name passed to ``tiktoken.get_encoding`` when
            ``tokenizer`` is not supplied.
        tokenizer: Optional ready-made tokenizer exposing
            ``encode(text, allowed_special=...)``. When given it is used as is
            and ``tokenizer_name`` is ignored.

    Returns:
        A ``DataLoader`` over a ``GPTDataset``; each batch is an
        ``(inputs, targets)`` pair of stacked token-id tensors.
    """
    # Only load an encoding by name when the caller did not provide one
    if tokenizer is None:
        tokenizer = tiktoken.get_encoding(tokenizer_name)

    dataset = GPTDataset(txt, tokenizer, max_length, stride)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [41]:
# The loader shuffles by default; seed torch's global RNG first so the batch
# order, and therefore the sample printed below, is the same on every run.
torch.manual_seed(123)

dataloader = create_gpt_dataloader(raw_text, batch_size=4, max_length=64, stride=32)

# Peek at the first batch only: show the first four ids of its first sample.
# The target ids are the input ids shifted one position to the right.
for inputs, targets in dataloader:
    print("Input IDs:", inputs[0][:4])
    print("Target IDs:", targets[0][:4])
    break

Input IDs: tensor([ 198, 6653, 6563, 2951])
Target IDs: tensor([6653, 6563, 2951, 6348])
