# BPE Tokenizer Training

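This notebook shows how to train a BPE (byte-pair encoding) tokenizer on a sample of the WikiText-103 dataset, encode a test sentence with it, save it to disk, and load it back.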

In [1]:
from llm_trainer.tokenizer import create_tokenizer

# Build a fresh tokenizer instance; the "bpe" key selects the BPE implementation.
tokenizer_type = "bpe"
tokenizer = create_tokenizer(tokenizer_type)

In [3]:
from datasets import load_dataset
# Training corpus: the raw WikiText-103 train split.
dataset = load_dataset('wikitext', 'wikitext-103-raw-v1', split='train')
text_column = 'text'  # dataset field that holds the raw text

# Keep the demo run small: a 3200-entry vocabulary with max_samples capped at 1000.
train_options = dict(
    vocab_size=3200,
    max_samples=1000,
    text_column=text_column,
    verbose=True,
)
tokenizer.train(dataset=dataset, **train_options)


Dataset size: 1801350, using 1000 samples


Loading texts:   0%|          | 1000/1801350 [00:00<01:42, 17549.90it/s]


Loaded 647 texts
Training BPE tokenizer...
Collecting word frequencies...


Processing texts: 100%|██████████| 647/647 [00:00<00:00, 2239.70it/s]


Found 4170 unique words
Initial vocabulary size: 120
Learning 3080 BPE merges...


Learning merges: 100%|██████████| 3080/3080 [01:01<00:00, 49.96it/s]

Final vocabulary size: 3200
Learned 3080 merges





In [4]:
# Smoke-test the trained tokenizer on a short sentence.
test_text = "The quick brown fox jumps over the lazy dog."
token_ids = tokenizer.encode(test_text)

# Preview only the first few IDs; append "..." only when the list was actually
# cut, so a short encoding is not shown as if it had been truncated.
preview_limit = 20
truncation_marker = "..." if len(token_ids) > preview_limit else ""
print(f"Text: {test_text}")
print(f"Token IDs: {token_ids[:preview_limit]}{truncation_marker}")
print(f"Total tokens: {len(token_ids)}")

Text: The quick brown fox jumps over the lazy dog.
Token IDs: [2, 185, 1, 330, 787, 1, 844, 2705, 1, 65, 74, 521, 1, 69, 256, 472, 1, 551, 1, 126]...
Total tokens: 29


In [5]:
# Persist the trained tokenizer to disk at a local path so it can be reloaded later.
save_dir = "./saved_tokenizer"
tokenizer.save_pretrained(save_dir)
print("Tokenizer saved!")

Tokenizer saved!


In [6]:
# Rebuild a tokenizer from the previously saved path and report its vocabulary size.
saved_path = "./saved_tokenizer"
loaded_tokenizer = create_tokenizer("bpe", pretrained_path=saved_path)
loaded_vocab_size = loaded_tokenizer.vocab_size
print(f"Loaded tokenizer vocab size: {loaded_vocab_size}")

Loaded tokenizer vocab size: 3200
