# Tokenizer experiments

In [2]:
from gpt.tokenizer import train, Tokenizer
import json
import base64
from config import DATA_DIR, EOT


# Confirm the import cell ran and show which data directory is in use.
print(
    "Imports and global definitions loaded.",
    f"Data directory: {DATA_DIR}",
    sep="\n",
)

Imports and global definitions loaded.
Data directory: /Users/jg/code/cs336/gpt/data


## TinyStories tokenizer

### Training

In [2]:
def train_tinystories_tok():
    """Train a BPE tokenizer on the TinyStories train split and persist it.

    Calls ``train`` on ``DATA_DIR / "TinyStoriesV2-GPT4-train.txt"`` with a
    vocabulary size of 10000 and ``[EOT]`` as the third argument, then writes
    two files into ``DATA_DIR / "tinystories_bpe"``:

    * ``vocab.json``  - JSON object mapping ``str(id)`` to the base64 text of
      the token's bytes.
    * ``merges.txt``  - one merge per line, written as two base64 strings
      separated by a single space.

    Base64 keeps both files plain ASCII even though tokens are raw bytes.
    Returns nothing.
    """

    def _b64(raw):
        # bytes -> ASCII base64 text
        return base64.b64encode(raw).decode("ascii")

    vocab, merges = train(DATA_DIR / "TinyStoriesV2-GPT4-train.txt", 10000, [EOT])

    target_dir = DATA_DIR / "tinystories_bpe"
    target_dir.mkdir(exist_ok=True)

    # Vocabulary: token id -> token bytes (base64-encoded).
    with open(target_dir / "vocab.json", "w") as vocab_file:
        encoded_vocab = {
            str(token_id): _b64(token_bytes)
            for token_id, token_bytes in vocab.items()
        }
        json.dump(encoded_vocab, vocab_file)

    # Merges: "left right" per line, each side base64-encoded.
    with open(target_dir / "merges.txt", "w") as merges_file:
        merges_file.writelines(
            f"{_b64(left)} {_b64(right)}\n" for left, right in merges
        )

# Run the training job, then report where the artifacts were written.
train_tinystories_tok()

print(
    "Tokenizer training complete.",
    f"Out directory: {DATA_DIR / 'tinystories_bpe'}",
    sep="\n",
)

Tokenizer training complete.
Out directory: /Users/jg/code/cs336/gpt/data/tinystories_bpe


### Compression ratio

In [6]:
import random


# Load the tokenizer from the vocab/merges files under "tinystories_bpe";
# [EOT] is passed as the third argument, as it was to train().
t = Tokenizer.from_files(
    DATA_DIR / "tinystories_bpe" / "vocab.json",
    DATA_DIR / "tinystories_bpe" / "merges.txt",
    [EOT],
)


def analyze_compression(filename, split_by_eot=False, num_samples=10, seed=None):
    """Print bytes-per-token statistics for a random sample of documents.

    Reads ``DATA_DIR / filename`` into documents, encodes a random sample of
    them with the module-level tokenizer ``t``, and prints per-document and
    aggregate compression ratios (UTF-8 bytes divided by token count).

    Args:
        filename: File name relative to ``DATA_DIR``.
        split_by_eot: If true, the whole file is read and the documents are
            the non-empty, stripped chunks between ``EOT`` markers. Otherwise
            every non-empty, stripped line is treated as one document.
        num_samples: Maximum number of documents to sample. It is clamped to
            the number of documents available, so small files no longer fail.
        seed: Optional seed for a private ``random.Random`` so the sample is
            repeatable. With ``None`` the global ``random`` state is used,
            exactly as before.

    Raises:
        ValueError: If the file contains no non-empty documents.
    """
    # Decode explicitly as UTF-8 so the text matches the UTF-8 byte counts
    # computed below instead of depending on the platform default encoding.
    with open(DATA_DIR / filename, "r", encoding="utf-8") as f:
        if split_by_eot:
            # Documents are delimited by the EOT marker.
            content = f.read()
            documents = [doc.strip() for doc in content.split(EOT) if doc.strip()]
        else:
            # One document per non-empty line.
            documents = [line.strip() for line in f if line.strip()]

    if not documents:
        raise ValueError(f"No documents found in {DATA_DIR / filename}")

    # Use a private RNG only when a seed is requested; otherwise keep drawing
    # from the global generator so existing callers see unchanged behaviour.
    rng = random if seed is None else random.Random(seed)
    sampled_docs = rng.sample(documents, min(num_samples, len(documents)))

    # Encode documents and accumulate totals.
    total_bytes = 0
    total_tokens = 0

    print(f"\nAnalyzing {filename}:")
    print("-" * 80)

    for i, doc in enumerate(sampled_docs, 1):
        # Original size in bytes (UTF-8).
        doc_bytes = len(doc.encode("utf-8"))
        # Encode to token ids.
        tokens = t.encode(doc)
        num_tokens = len(tokens)

        total_bytes += doc_bytes
        total_tokens += num_tokens

        # Guard the division: report NaN rather than crash on zero tokens.
        doc_ratio = doc_bytes / num_tokens if num_tokens else float("nan")

        print(f"Document {i}:")
        print(f"Original text: {doc[:100]}...")
        print(f"Original size: {doc_bytes} bytes")
        print(f"Number of tokens: {num_tokens}")
        print(f"Compression ratio: {doc_ratio:.2f} bytes/token")
        print("-" * 80)

    overall_ratio = total_bytes / total_tokens if total_tokens else float("nan")

    print(f"\nOverall statistics for {filename}:")
    print(f"Total bytes: {total_bytes}")
    print(f"Total tokens: {total_tokens}")
    print(f"Average compression ratio: {overall_ratio:.2f} bytes/token")


# Analyze both datasets. Each call draws its own random sample of documents,
# and both files are split into documents on the EOT marker. The tokenizer `t`
# used inside analyze_compression was loaded from the "tinystories_bpe" files,
# so the second call measures that same tokenizer on owt_valid.txt.
analyze_compression("TinyStoriesV2-GPT4-train.txt", split_by_eot=True)
analyze_compression("owt_valid.txt", split_by_eot=True)


Analyzing TinyStoriesV2-GPT4-train.txt:
--------------------------------------------------------------------------------
Document 1:
Original text: Once upon a time, there was a little girl who loved eating her mother's cooking. Every day, the mom ...
Original size: 900 bytes
Number of tokens: 195
Compression ratio: 4.62 bytes/token
--------------------------------------------------------------------------------
Document 2:
Original text: Once there was a small girl called Amy. She was three years old and loved to repeat the same things ...
Original size: 725 bytes
Number of tokens: 164
Compression ratio: 4.42 bytes/token
--------------------------------------------------------------------------------
Document 3:
Original text: Once upon a time, there was a green ant named Andy. Andy loved to march all day long. One day, as An...
Original size: 625 bytes
Number of tokens: 148
Compression ratio: 4.22 bytes/token
--------------------------------------------------------------------------

### Encode

In [4]:
import numpy as np
import os
import time

# DATA_DIR, EOT and Tokenizer come from the imports cell; the tokenizer `t`
# is (re)loaded below, so this cell does not rely on an earlier `t`.
# [EOT] is handed to Tokenizer.from_files as its third argument
# (presumably the special-token list - confirm against gpt.tokenizer).

# Label -> file name (relative to DATA_DIR) of each text file to tokenize.
datasets_to_encode = dict(
    tinystories_train="TinyStoriesV2-GPT4-train.txt",
    tinystories_valid="TinyStoriesV2-GPT4-valid.txt",
)

t = Tokenizer.from_files(
    DATA_DIR / "tinystories_bpe" / "vocab.json",
    DATA_DIR / "tinystories_bpe" / "merges.txt",
    [EOT],
)

print("Starting tokenization of datasets...")

# Tokenize every file listed in datasets_to_encode and store the ids next to
# the input as "<input stem>_tokens.npy". A failure on one file is reported
# and the loop moves on to the next file (deliberate best-effort behaviour).
for filename in datasets_to_encode.values():
    input_path = DATA_DIR / filename
    # Path.stem drops only the final extension, so names containing extra
    # dots are not truncated (e.g. TinyStoriesV2-GPT4-train_tokens.npy).
    output_path = DATA_DIR / f"{input_path.stem}_tokens.npy"

    if not input_path.exists():
        print(f"Input file {input_path} not found. Skipping.")
        continue

    print(f"\nEncoding {input_path} to {output_path}...")

    # perf_counter is a monotonic clock intended for measuring durations.
    start_time = time.perf_counter()

    try:
        with open(input_path, "r", encoding="utf-8") as f:
            # Stream ids from encode_iterable straight into a uint16 array
            # instead of first collecting hundreds of millions of Python ints
            # in a list. uint16 only holds ids below 65536, which covers the
            # 10000-entry vocabulary trained earlier in this notebook.
            token_array = np.fromiter(t.encode_iterable(f), dtype=np.uint16)
    except Exception as e:
        print(f"Error encoding file {input_path}: {e}")
        continue

    # Stop the clock before saving so the reported "Encoding time" and
    # throughput measure tokenization only, not the disk write.
    duration = time.perf_counter() - start_time

    try:
        np.save(output_path, token_array)
        print(f"Saved {len(token_array)} tokens to {output_path}.")
    except Exception as e:
        print(f"Error saving file {output_path}: {e}")
        continue

    try:
        file_size_bytes = os.path.getsize(input_path)
        file_size_mb = file_size_bytes / (1024 * 1024)
        throughput_mb_per_second = (
            file_size_mb / duration if duration > 0 else float("inf")
        )

        print(f"Original file size: {file_size_mb:.2f} MB")
        print(f"Encoding time: {duration:.2f} seconds")
        print(f"Throughput: {throughput_mb_per_second:.2f} MB/s")
    except Exception as e:
        print(f"Error calculating throughput for {input_path}: {e}")

    print("-" * 40)

print("\nDataset tokenization complete.")

Starting tokenization of datasets...

Encoding /Users/jg/code/cs336/gpt/data/TinyStoriesV2-GPT4-train.txt to /Users/jg/code/cs336/gpt/data/TinyStoriesV2-GPT4-train_tokens.npy...
Saved 540796778 tokens to /Users/jg/code/cs336/gpt/data/TinyStoriesV2-GPT4-train_tokens.npy.
Original file size: 2124.55 MB
Encoding time: 1332.68 seconds
Throughput: 1.59 MB/s
----------------------------------------

Encoding /Users/jg/code/cs336/gpt/data/TinyStoriesV2-GPT4-valid.txt to /Users/jg/code/cs336/gpt/data/TinyStoriesV2-GPT4-valid_tokens.npy...
Saved 5461210 tokens to /Users/jg/code/cs336/gpt/data/TinyStoriesV2-GPT4-valid_tokens.npy.
Original file size: 21.46 MB
Encoding time: 12.64 seconds
Throughput: 1.70 MB/s
----------------------------------------

Dataset tokenization complete.
