# All Tokenizers Demo
This notebook demonstrates all available tokenizer types in the `llm-trainer` framework.


In [None]:
from llm_trainer.tokenizer import create_tokenizer
import os

# One short sentence mixing words, punctuation and digits; every tokenizer
# cell in this notebook trains on and encodes this same string.
sample_text = "The quick brown fox jumps over the lazy dog. Hello world! 12345."
print("Original text:", sample_text)


## 1. BPE Tokenizer (Byte Pair Encoding)
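BPE is the most widely used subword tokenization algorithm in modern LLMs: it starts from individual characters (or bytes) and repeatedly merges the most frequent adjacent pair into a new token.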


In [None]:
# Build a BPE tokenizer and fit it on the single demo sentence.
bpe_tokenizer = create_tokenizer("bpe")
bpe_tokenizer.train([sample_text], vocab_size=100)

# Round trip: encode the sentence, then decode the whole result back.
encoded = bpe_tokenizer.encode(sample_text)
decoded = bpe_tokenizer.decode(encoded)

print(f"BPE Encoded: {encoded}")
print(f"BPE Decoded: {decoded}")

# Decode each encoded element on its own to show the text piece it maps to.
bpe_pieces = [bpe_tokenizer.decode([token]) for token in encoded]
print(f"BPE Tokens: {bpe_pieces}")


## 2. WordPiece Tokenizer
WordPiece is the subword tokenizer commonly used in BERT models.


In [None]:
# Build a WordPiece tokenizer and fit it on the single demo sentence.
wp_tokenizer = create_tokenizer("wordpiece")
wp_tokenizer.train([sample_text], vocab_size=100)

encoded = wp_tokenizer.encode(sample_text)

# Decode each encoded element on its own to show the text piece it maps to.
wp_pieces = [wp_tokenizer.decode([token]) for token in encoded]
print(f"WordPiece Tokens: {wp_pieces}")


## 3. SentencePiece Tokenizer
SentencePiece is commonly used in Llama and T5 models.


In [None]:
# Build a SentencePiece tokenizer and fit it on the single demo sentence.
# NOTE(review): vocab_size=100 may be more than a one-sentence corpus can
# support for a SentencePiece-style trainer; confirm this cell runs as-is.
sp_tokenizer = create_tokenizer("sentencepiece")
sp_tokenizer.train([sample_text], vocab_size=100)

encoded = sp_tokenizer.encode(sample_text)

# Decode each encoded element on its own to show the text piece it maps to.
sp_pieces = [sp_tokenizer.decode([token]) for token in encoded]
print(f"SentencePiece Tokens: {sp_pieces}")


## 4. Character and Byte-level BPE
Character-level tokenization and GPT-2 style byte-level BPE.


In [None]:
# Character-level tokenizer: trained without a vocab_size argument.
char_tokenizer = create_tokenizer("char")
char_tokenizer.train([sample_text])
char_encoded = char_tokenizer.encode(sample_text)
char_pieces = [char_tokenizer.decode([token]) for token in char_encoded]
print(f"Char Tokens: {char_pieces}")

# Byte-level BPE tokenizer, trained with the same vocab_size as the other
# subword tokenizers in this notebook.
byte_tokenizer = create_tokenizer("bytebpe")
byte_tokenizer.train([sample_text], vocab_size=100)
byte_encoded = byte_tokenizer.encode(sample_text)
byte_pieces = [byte_tokenizer.decode([token]) for token in byte_encoded]
print(f"ByteBPE Tokens: {byte_pieces}")


## 5. HuggingFace Pretrained Tokenizer
Load any pretrained tokenizer from the HuggingFace Hub by passing its model identifier (here `gpt2`) as `pretrained_path`.


In [None]:
# Load the pretrained GPT-2 tokenizer; no train() call is made in this cell.
# NOTE(review): presumably downloads from the Hub on first use, so this cell
# likely needs network access; confirm.
hf_tokenizer = create_tokenizer("hf", pretrained_path="gpt2")

encoded = hf_tokenizer.encode(sample_text)

# Decode each encoded element on its own to show the text piece it maps to.
hf_pieces = [hf_tokenizer.decode([token]) for token in encoded]
print(f"HF GPT-2 Tokens: {hf_pieces}")
