# Comparing Different Tokenizers

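This notebook trains the `simple`, `char`, and `bpe` tokenizers on a small sample corpus and compares their vocabulary size, token count, and characters-per-token compression on a single test sentence, to help you choose the right one.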

In [None]:
from llm_trainer.tokenizer import create_tokenizer, get_available_tokenizers

# Ask the library which tokenizer types it offers. The result is iterated as a
# mapping of tokenizer type -> human-readable description.
available = get_available_tokenizers()
print("Available tokenizers:")
for tokenizer_type, description in available.items():
    # The bullet was previously mojibake ("â€¢": a UTF-8 "•" decoded as cp1252).
    print(f"  • {tokenizer_type}: {description}")

In [None]:
# Sentence that each tokenizer encodes in the comparison cells
test_text = "The quick brown fox jumps over the lazy dog. Hello world!"

# A single newline-separated print: the text line, the character count,
# and the trailing blank line that visually separates this cell's output.
print(
    f"Test text: {test_text}",
    f"Character count: {len(test_text)}\n",
    sep="\n",
)

In [None]:
# Compare tokenizers
#
# For each tokenizer type: build it, train it on the same small corpus, encode
# `test_text`, decode the IDs back, and record the figures in `results`
# (keyed by tokenizer type) for the summary cell.
tokenizer_types = ["simple", "char", "bpe"]
results = {}

# Sample training data shared by every tokenizer. It does not depend on the
# loop variable, so it is built once here rather than on each iteration.
training_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Hello world! This is a test.",
    "Tokenization is important for NLP."
] * 10  # Repeat for more data

TRAIN_VOCAB_SIZE = 500  # vocab_size passed to train() for every tokenizer
PREVIEW_TOKENS = 10     # number of leading token IDs stored and printed
BANNER = "=" * 50

for tokenizer_type in tokenizer_types:
    print(f"\n{BANNER}")
    print(f"Testing {tokenizer_type.upper()} tokenizer")
    print(BANNER)

    # Create the tokenizer for this type and train it with the same settings
    # as the others so the numbers are comparable.
    tokenizer = create_tokenizer(tokenizer_type)
    tokenizer.train(training_texts, vocab_size=TRAIN_VOCAB_SIZE, verbose=False)

    # Encode the test sentence, then decode the IDs to show the round trip.
    # NOTE(review): add_special_tokens=False presumably keeps the token count
    # limited to the text itself - confirm against the tokenizer API.
    token_ids = tokenizer.encode(test_text, add_special_tokens=False)
    decoded = tokenizer.decode(token_ids)

    # Compute each figure once; the prints below read from the stored entry.
    entry = {
        "vocab_size": tokenizer.vocab_size,
        "num_tokens": len(token_ids),
        "token_ids": token_ids[:PREVIEW_TOKENS],  # preview only
        "decoded": decoded
    }
    results[tokenizer_type] = entry

    print(f"Vocabulary size: {entry['vocab_size']}")
    print(f"Number of tokens: {entry['num_tokens']}")
    print(f"First {PREVIEW_TOKENS} token IDs: {entry['token_ids']}")
    print(f"Decoded text: {entry['decoded']}")

In [None]:
# Summary comparison
# Banner (blank line, rule, title, rule) emitted with one newline-separated print.
print("\n" + "="*60, "SUMMARY COMPARISON", "="*60, sep="\n")
print(f"{'Tokenizer':<15} {'Vocab Size':<15} {'Tokens':<10} {'Compression'}")
print("-"*60)

for tokenizer_type, result in results.items():
    # Compression is characters of test_text per token; it falls back to 0
    # when a tokenizer produced no tokens, to avoid dividing by zero.
    if result["num_tokens"] > 0:
        compression = len(test_text) / result["num_tokens"]
    else:
        compression = 0
    print(f"{tokenizer_type:<15} {result['vocab_size']:<15} {result['num_tokens']:<10} {compression:.2f}")