# Tokenizer Basics

This notebook demonstrates the basics of using tokenizers in LLM Trainer.

In [1]:
from llm_trainer.tokenizer import create_tokenizer

# Tiny training corpus: three short sentences are enough for a demo vocabulary.
texts = [
    "Hello world! This is a test.",
    "Tokenization breaks text into tokens.",
    "Each token gets a unique ID.",
]

# Instantiate the "simple" tokenizer variant, then fit its vocabulary on the
# corpus. verbose=True makes train() report its progress and a vocabulary
# summary; vocab_size is the requested size passed to train().
tokenizer = create_tokenizer("simple")
tokenizer.train(texts, vocab_size=1000, verbose=True)

Building simple word-level vocabulary...


Counting words: 100%|██████████| 3/3 [00:00<00:00, 7752.87it/s]

Vocabulary size: 20 words
Special tokens: 4
Regular words: 16





In [2]:
# Turn a short string into token IDs using the tokenizer trained above,
# then report the input, the resulting IDs, and how many there are.
text = "Hello world!"
token_ids = tokenizer.encode(text)
print(
    f"Text: {text}",
    f"Token IDs: {token_ids}",
    f"Number of tokens: {len(token_ids)}",
    sep="\n",
)

Text: Hello world!
Token IDs: [2, 5, 6, 3]
Number of tokens: 4


In [3]:
# Decode back to text
# Round-trip check: feed the IDs produced by encode() in the previous cell
# back through decode() and show the recovered string.
decoded = tokenizer.decode(token_ids)
print(f"Decoded: {decoded}")

Decoded: Hello world!


In [4]:
# Inspect the trained vocabulary: its total size and the first ten token
# strings returned by get_vocab().
print(
    f"Vocabulary size: {tokenizer.vocab_size}",
    f"Sample tokens: {list(tokenizer.get_vocab().keys())[:10]}",
    sep="\n",
)

Vocabulary size: 20
Sample tokens: ['<pad>', '<unk>', '<bos>', '<eos>', 'a', 'Hello', 'world!', 'This', 'is', 'test.']
