In [1]:
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer

print("="*80)
print("TOKENIZER EXAMPLE")
print("="*80)


TOKENIZER EXAMPLE


In [2]:
# Simulate dataset
def get_all_sentences(ds, lang):
    """Yields sentences from dataset"""
    sentences = [
        "I love machine learning",
        "I love deep learning",
        "transformers are amazing",
        "attention is all you need",
        "I love transformers"
    ]
    for sentence in sentences:
        yield sentence

# Config
config = {
    'tokenizer_file': './tokenizer_{0}.json'
}
lang = "en"

In [3]:
# Build tokenizer
def get_or_build_tokenizer(config, ds, lang):
    tokenizer_path = Path(config['tokenizer_file'].format(lang))
    print(f"\nTokenizer path: {tokenizer_path}")
    
    if not Path.exists(tokenizer_path):
        print("Building new tokenizer...")
        
        # Create tokenizer
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        
        # Create trainer
        trainer = WordLevelTrainer(
            special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
            min_frequency=2
        )
        
        # Train
        print("Training tokenizer on sentences...")
        tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
        
        # Save
        tokenizer.save(str(tokenizer_path))
        print(f"Tokenizer saved to {tokenizer_path}")
    else:
        print("Loading existing tokenizer...")
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    
    return tokenizer

In [4]:
# Get tokenizer
tokenizer = get_or_build_tokenizer(config, None, lang)

# Test tokenizer
print("\n" + "="*80)
print("TESTING TOKENIZER")
print("="*80)


Tokenizer path: tokenizer_en.json
Building new tokenizer...
Training tokenizer on sentences...
Tokenizer saved to tokenizer_en.json

TESTING TOKENIZER


In [5]:
# Get vocabulary
vocab = tokenizer.get_vocab()
print(f"\nVocabulary size: {len(vocab)}")
print("\nFirst 15 tokens:")
for token, id in list(vocab.items())[:15]:
    print(f"  {token:>15s}: {id}")


Vocabulary size: 8

First 15 tokens:
            [PAD]: 1
             love: 5
         learning: 6
            [SOS]: 2
            [UNK]: 0
            [EOS]: 3
                I: 4
     transformers: 7


In [6]:
# Encode text
text = "I love transformers"
encoded = tokenizer.encode(text)
print(f"\n\nOriginal text: '{text}'")
print(f"Token IDs: {encoded.ids}")
print(f"Tokens: {encoded.tokens}")



Original text: 'I love transformers'
Token IDs: [4, 5, 7]
Tokens: ['I', 'love', 'transformers']


In [7]:
# Decode back
decoded = tokenizer.decode(encoded.ids)
print(f"Decoded text: '{decoded}'")

Decoded text: 'I love transformers'


In [8]:

# Unknown word
text_unk = "I love python"  # 'python' not in training data
encoded_unk = tokenizer.encode(text_unk)
print(f"\n\nText with unknown word: '{text_unk}'")
print(f"Token IDs: {encoded_unk.ids}")
print(f"Tokens: {encoded_unk.tokens}")



Text with unknown word: 'I love python'
Token IDs: [4, 5, 0]
Tokens: ['I', 'love', '[UNK]']


### COMPLETE HUGGING FACE TOKENIZERS LIBRARY TUTORIAL

In [13]:
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import WordLevel, BPE, WordPiece, Unigram
from tokenizers.pre_tokenizers import Whitespace, ByteLevel, CharDelimiterSplit
from tokenizers.trainers import WordLevelTrainer, BpeTrainer, WordPieceTrainer, UnigramTrainer
from tokenizers.normalizers import NFD, Lowercase, StripAccents, Sequence
from tokenizers.processors import TemplateProcessing

### STEP1 -GIVE THE DEFINITION OF A TOKEN (what does one token mean -is it a word ,character ?? etc.)

In [14]:
print("="*80)
print("PART 1: TOKENIZER MODELS (How tokens are created)")
print("="*80)

PART 1: TOKENIZER MODELS (How tokens are created)


In [17]:
# ============================================================================
# 1. TOKENIZER MODELS
# ============================================================================

print("\n" + "-"*80)
print("1.1 WordLevel (Simplest - Each word = One token)")
print("-"*80)

"""
WordLevel Tokenizer
-------------------
- Treats each WHOLE WORD as a single token
- Unknown words → [UNK]
- Vocabulary grows with unique words
- Simple but large vocabulary

Syntax:
    from tokenizers.models import WordLevel
    model = WordLevel(unk_token="[UNK]")

Parameters:
    - unk_token (str): Token for unknown words. Default: "[UNK]"
    - vocab (Dict[str, int], optional): Pre-built vocabulary

When to use:
    ✓ Small, controlled vocabulary
    ✓ Words are meaningful units (not subwords)
    ✗ Large datasets (huge vocab)
    ✗ Handling rare words (all become [UNK])
"""

tokenizer_wl = Tokenizer(WordLevel(unk_token="[UNK]"))
print("Created WordLevel tokenizer")
print(f"  Model type: {type(tokenizer_wl.model)}")
print(f"  Unknown token: [UNK]")


# Example vocabulary
example_vocab_wl = {
    "[UNK]": 0, "[PAD]": 1, "hello": 2, "world": 3, "cat": 4, "dog": 5
}
print(f"\nExample vocabulary: {example_vocab_wl}")
print("Tokenizing 'hello world':")
print("  Result: ['hello', 'world'] → IDs [2, 3]")
print("Tokenizing 'hello python' (python not in vocab):")
print("  Result: ['hello', '[UNK]'] → IDs [2, 0]")


--------------------------------------------------------------------------------
1.1 WordLevel (Simplest - Each word = One token)
--------------------------------------------------------------------------------
Created WordLevel tokenizer
  Model type: <class 'tokenizers.models.WordLevel'>
  Unknown token: [UNK]

Example vocabulary: {'[UNK]': 0, '[PAD]': 1, 'hello': 2, 'world': 3, 'cat': 4, 'dog': 5}
Tokenizing 'hello world':
  Result: ['hello', 'world'] → IDs [2, 3]
Tokenizing 'hello python' (python not in vocab):
  Result: ['hello', '[UNK]'] → IDs [2, 0]


In [18]:
print("\n" + "-"*80)
print("1.2 BPE (Byte-Pair Encoding - Subword tokenization)")
print("-"*80)

"""
BPE (Byte-Pair Encoding)
------------------------
- Learns subword units by merging frequent character pairs
- Can represent any word using subwords
- Used by GPT-2, RoBERTa

Syntax:
    from tokenizers.models import BPE
    model = BPE(unk_token="[UNK]")

Parameters:
    - unk_token (str, optional): Unknown token
    - continuing_subword_prefix (str): Prefix for continuing subwords. Default: "##"
    - end_of_word_suffix (str): Suffix for end-of-word subwords
    - dropout (float, optional): BPE dropout for regularization

How it works:
    1. Start with characters: h, e, l, l, o
    2. Merge frequent pairs: (l, l) → ll
    3. Continue merging: (he, ll) → hell, etc.
    4. Result: "hello" might become ["hell", "o"] or ["h", "ello"]

When to use:
    ✓ Large vocabulary needed
    ✓ Handle rare/unknown words
    ✓ Open-domain text (internet, books)
    ✗ When interpretability needed
"""

tokenizer_bpe = Tokenizer(BPE(unk_token="[UNK]"))
print("Created BPE tokenizer")
print(f"  Model type: {type(tokenizer_bpe.model)}")
print("\nExample (after training):")
print("  'unhappiness' → ['un', 'happiness'] or ['un', 'happy', 'ness']")
print("  'antidisestablishmentarianism' → subword pieces")


--------------------------------------------------------------------------------
1.2 BPE (Byte-Pair Encoding - Subword tokenization)
--------------------------------------------------------------------------------
Created BPE tokenizer
  Model type: <class 'tokenizers.models.BPE'>

Example (after training):
  'unhappiness' → ['un', 'happiness'] or ['un', 'happy', 'ness']
  'antidisestablishmentarianism' → subword pieces


In [19]:
print("\n" + "-"*80)
print("1.3 WordPiece (Used by BERT)")
print("-"*80)

"""
WordPiece
---------
- Similar to BPE but uses likelihood instead of frequency
- Used by BERT, DistilBERT
- Subword tokenization with ## prefix for continuations

Syntax:
    from tokenizers.models import WordPiece
    model = WordPiece(unk_token="[UNK]")

Parameters:
    - unk_token (str): Unknown token. Default: "[UNK]"
    - continuing_subword_prefix (str): Prefix for subwords. Default: "##"
    - max_input_chars_per_word (int): Max chars per word. Default: 100

Difference from BPE:
    - BPE: Merge most frequent pairs
    - WordPiece: Merge pairs that maximize likelihood

Example tokenization:
    "playing" → ["play", "##ing"]
    "unbelievable" → ["un", "##be", "##liev", "##able"]

When to use:
    ✓ BERT-style models
    ✓ When using pretrained BERT tokenizer
"""

tokenizer_wp = Tokenizer(WordPiece(unk_token="[UNK]"))
print("Created WordPiece tokenizer")
print(f"  Continuing subword prefix: ##")
print("\nExample (after training):")
print("  'playing' → ['play', '##ing']")
print("  'unhappiness' → ['un', '##happiness'] or ['un', '##happy', '##ness']")


--------------------------------------------------------------------------------
1.3 WordPiece (Used by BERT)
--------------------------------------------------------------------------------
Created WordPiece tokenizer
  Continuing subword prefix: ##

Example (after training):
  'playing' → ['play', '##ing']
  'unhappiness' → ['un', '##happiness'] or ['un', '##happy', '##ness']


In [20]:
print("\n" + "-"*80)
print("1.4 Unigram (Probabilistic)")
print("-"*80)

"""
Unigram
-------
- Probabilistic language model
- Each token has a probability
- Used by SentencePiece, T5

Syntax:
    from tokenizers.models import Unigram
    model = Unigram()

How it works:
    - Starts with large vocabulary
    - Removes tokens to maximize likelihood
    - Multiple tokenizations possible (chooses most probable)

When to use:
    ✓ When probability-based tokenization needed
    ✓ SentencePiece-style tokenization
"""

tokenizer_uni = Tokenizer(Unigram())
print("Created Unigram tokenizer")
print("  Uses probabilistic approach")


--------------------------------------------------------------------------------
1.4 Unigram (Probabilistic)
--------------------------------------------------------------------------------
Created Unigram tokenizer
  Uses probabilistic approach


### Step2 - On what Basis to splitting the tokens?

In [21]:
# ============================================================================
# 2. PRE-TOKENIZERS (Split text before tokenization)
# ============================================================================

print("\n\n" + "="*80)
print("PART 2: PRE-TOKENIZERS (How to split text into words)")
print("="*80)

print("\n" + "-"*80)
print("2.1 Whitespace (Split on spaces)")
print("-"*80)

"""
Whitespace Pre-tokenizer
------------------------
- Splits on whitespace: spaces, tabs, newlines
- Simplest pre-tokenizer

Syntax:
    from tokenizers.pre_tokenizers import Whitespace
    tokenizer.pre_tokenizer = Whitespace()

Example:
    Input:  "Hello world\thow are you?"
    Output: ["Hello", "world", "how", "are", "you?"]
    
Note: Punctuation stays attached to words!
"""

from tokenizers.pre_tokenizers import Whitespace
tokenizer_wl.pre_tokenizer = Whitespace()

print("Set Whitespace pre-tokenizer")
print("\nExample splitting:")
print("  Input:  'Hello world! How are you?'")
print("  Output: ['Hello', 'world!', 'How', 'are', 'you?']")
print("  Note: Punctuation attached to words")




PART 2: PRE-TOKENIZERS (How to split text into words)

--------------------------------------------------------------------------------
2.1 Whitespace (Split on spaces)
--------------------------------------------------------------------------------
Set Whitespace pre-tokenizer

Example splitting:
  Input:  'Hello world! How are you?'
  Output: ['Hello', 'world!', 'How', 'are', 'you?']
  Note: Punctuation attached to words


In [22]:
print("\n" + "-"*80)
print("2.2 ByteLevel (GPT-2 style)")
print("-"*80)

"""
ByteLevel Pre-tokenizer
-----------------------
- Used by GPT-2, RoBERTa
- Splits on whitespace but handles bytes
- Adds 'Ġ' prefix for space character

Syntax:
    from tokenizers.pre_tokenizers import ByteLevel
    tokenizer.pre_tokenizer = ByteLevel()

Parameters:
    - add_prefix_space (bool): Add space before first token. Default: True

Example:
    Input:  "Hello world"
    Output: ["Hello", "Ġworld"]  (Ġ = space marker)

When to use:
    ✓ GPT-2 style models
    ✓ When using byte-level encoding
"""

print("ByteLevel pre-tokenizer")
print("\nExample splitting:")
print("  Input:  'Hello world'")
print("  Output: ['Hello', 'Ġworld']  (Ġ represents space)")


--------------------------------------------------------------------------------
2.2 ByteLevel (GPT-2 style)
--------------------------------------------------------------------------------
ByteLevel pre-tokenizer

Example splitting:
  Input:  'Hello world'
  Output: ['Hello', 'Ġworld']  (Ġ represents space)


In [23]:
print("\n" + "-"*80)
print("2.3 CharDelimiterSplit (Custom delimiter)")
print("-"*80)

"""
CharDelimiterSplit
------------------
- Split on custom character delimiter

Syntax:
    from tokenizers.pre_tokenizers import CharDelimiterSplit
    tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter=',')

Parameters:
    - delimiter (str): Character to split on

Example:
    delimiter = ','
    Input:  "apple,banana,cherry"
    Output: ["apple", "banana", "cherry"]
"""

print("CharDelimiterSplit(delimiter=',')")
print("\nExample splitting:")
print("  Input:  'apple,banana,cherry'")
print("  Output: ['apple', 'banana', 'cherry']")


--------------------------------------------------------------------------------
2.3 CharDelimiterSplit (Custom delimiter)
--------------------------------------------------------------------------------
CharDelimiterSplit(delimiter=',')

Example splitting:
  Input:  'apple,banana,cherry'
  Output: ['apple', 'banana', 'cherry']


### NORMALIZERS - Pre processing of Data

In [24]:
# ============================================================================
# 3. NORMALIZERS (Clean/standardize text)
# ============================================================================

print("\n\n" + "="*80)
print("PART 3: NORMALIZERS (Text cleaning/standardization)")
print("="*80)

print("\n" + "-"*80)
print("3.1 Lowercase (Convert to lowercase)")
print("-"*80)

"""
Lowercase Normalizer
--------------------
- Converts all text to lowercase
- Reduces vocabulary size
- Loses case information

Syntax:
    from tokenizers.normalizers import Lowercase
    tokenizer.normalizer = Lowercase()

Example:
    Input:  "Hello WORLD"
    Output: "hello world"

When to use:
    ✓ Case doesn't matter
    ✓ Reduce vocabulary
    ✗ Case is meaningful (names, acronyms)
"""

from tokenizers.normalizers import Lowercase
print("Lowercase normalizer")
print("\nExample:")
print("  Input:  'Hello WORLD'")
print("  Output: 'hello world'")



PART 3: NORMALIZERS (Text cleaning/standardization)

--------------------------------------------------------------------------------
3.1 Lowercase (Convert to lowercase)
--------------------------------------------------------------------------------
Lowercase normalizer

Example:
  Input:  'Hello WORLD'
  Output: 'hello world'


In [25]:
print("\n" + "-"*80)
print("3.2 NFD (Unicode Normalization)")
print("-"*80)

"""
NFD (Unicode Normalization Form D)
-----------------------------------
- Decomposes unicode characters
- Example: é → e + ́ (e + combining accent)

Syntax:
    from tokenizers.normalizers import NFD
    tokenizer.normalizer = NFD()

Example:
    Input:  "café"  (é is single character U+00E9)
    Output: "café"  (e + ́ is two characters U+0065 + U+0301)

When to use:
    ✓ Handling accented characters
    ✓ Before StripAccents
"""

print("NFD normalizer")
print("\nExample:")
print("  Input:  'café' (é as single char)")
print("  Output: 'café' (e + combining accent)")


--------------------------------------------------------------------------------
3.2 NFD (Unicode Normalization)
--------------------------------------------------------------------------------
NFD normalizer

Example:
  Input:  'café' (é as single char)
  Output: 'café' (e + combining accent)


In [26]:
print("\n" + "-"*80)
print("3.3 StripAccents (Remove accents)")
print("-"*80)

"""
StripAccents
------------
- Removes diacritical marks/accents
- Usually used after NFD

Syntax:
    from tokenizers.normalizers import StripAccents
    tokenizer.normalizer = StripAccents()

Example:
    Input:  "café naïve"
    Output: "cafe naive"

When to use:
    ✓ Ignore accents
    ✓ Treat café = cafe
"""

print("StripAccents normalizer")
print("\nExample:")
print("  Input:  'café naïve résumé'")
print("  Output: 'cafe naive resume'")


--------------------------------------------------------------------------------
3.3 StripAccents (Remove accents)
--------------------------------------------------------------------------------
StripAccents normalizer

Example:
  Input:  'café naïve résumé'
  Output: 'cafe naive resume'


In [32]:
print("\n" + "-"*80)
print("3.4 Sequence (Combine multiple normalizers)")
print("-"*80)

"""
Sequence
--------
- Chains multiple normalizers
- Applies in order

Syntax:
    from tokenizers.normalizers import Sequence, NFD, Lowercase, StripAccents
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])

Example pipeline:
    Input:  "CAFÉ"
    → NFD:  "CAFÉ" (decompose é)
    → Lowercase: "café"
    → StripAccents: "cafe"
"""
print()
from tokenizers.normalizers import Sequence, NFD, StripAccents
normalizer_seq = Sequence([NFD(), Lowercase(), StripAccents()])

print("Sequence normalizer (NFD → Lowercase → StripAccents)")
print("\nExample pipeline:")
print("  Input:  'CAFÉ Naïve'")
print("  → NFD:  'CAFÉ Naïve' (decompose)")
print("  → Lowercase: 'café naïve'")
print("  → StripAccents: 'cafe naive'")


--------------------------------------------------------------------------------
3.4 Sequence (Combine multiple normalizers)
--------------------------------------------------------------------------------

Sequence normalizer (NFD → Lowercase → StripAccents)

Example pipeline:
  Input:  'CAFÉ Naïve'
  → NFD:  'CAFÉ Naïve' (decompose)
  → Lowercase: 'café naïve'
  → StripAccents: 'cafe naive'


### Trainers

In [33]:
# ============================================================================
# 4. TRAINERS (How to build vocabulary)
# ============================================================================

print("\n\n" + "="*80)
print("PART 4: TRAINERS (Building vocabulary)")
print("="*80)

print("\n" + "-"*80)
print("4.1 WordLevelTrainer")
print("-"*80)

"""
WordLevelTrainer
----------------
- Trains WordLevel tokenizer
- Builds vocabulary from whole words

Syntax:
    from tokenizers.trainers import WordLevelTrainer
    trainer = WordLevelTrainer(
        special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
        min_frequency=2,
        vocab_size=30000
    )

Parameters:
    - special_tokens (List[str]): Special tokens to add. Default: []
    - min_frequency (int): Minimum word frequency. Default: 0
    - vocab_size (int, optional): Maximum vocabulary size
    - show_progress (bool): Show progress bar. Default: True

Training process:
    1. Scan all sentences
    2. Count word frequencies
    3. Keep words with frequency ≥ min_frequency
    4. Assign unique ID to each word
"""

from tokenizers.trainers import WordLevelTrainer

trainer_wl = WordLevelTrainer(
    special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
    min_frequency=2,
    vocab_size=10000
)

print("WordLevelTrainer created")
print("  Special tokens: [UNK], [PAD], [SOS], [EOS]")
print("  min_frequency: 2 (ignore words appearing once)")
print("  vocab_size: 10000 (max vocabulary size)")
print("\nTraining process:")
print("  1. Count: {'cat': 5, 'dog': 3, 'python': 1, ...}")
print("  2. Filter: Keep cat (5≥2), dog (3≥2), drop python (1<2)")
print("  3. Build vocab: {[UNK]:0, [PAD]:1, [SOS]:2, [EOS]:3, cat:4, dog:5, ...}")



PART 4: TRAINERS (Building vocabulary)

--------------------------------------------------------------------------------
4.1 WordLevelTrainer
--------------------------------------------------------------------------------
WordLevelTrainer created
  Special tokens: [UNK], [PAD], [SOS], [EOS]
  min_frequency: 2 (ignore words appearing once)
  vocab_size: 10000 (max vocabulary size)

Training process:
  1. Count: {'cat': 5, 'dog': 3, 'python': 1, ...}
  2. Filter: Keep cat (5≥2), dog (3≥2), drop python (1<2)
  3. Build vocab: {[UNK]:0, [PAD]:1, [SOS]:2, [EOS]:3, cat:4, dog:5, ...}


In [34]:
print("\n" + "-"*80)
print("4.2 BpeTrainer")
print("-"*80)

"""
BpeTrainer
----------
- Trains BPE tokenizer
- Learns subword merges

Syntax:
    from tokenizers.trainers import BpeTrainer
    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[PAD]"],
        vocab_size=30000,
        min_frequency=2
    )

Parameters:
    - special_tokens (List[str]): Special tokens
    - vocab_size (int): Target vocabulary size. Default: 30000
    - min_frequency (int): Min frequency. Default: 0
    - show_progress (bool): Show progress. Default: True
    - continuing_subword_prefix (str): Subword prefix. Default: "##"

Training process:
    1. Start with characters
    2. Count pair frequencies
    3. Merge most frequent pair
    4. Repeat until vocab_size reached
"""

from tokenizers.trainers import BpeTrainer

trainer_bpe = BpeTrainer(
    special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
    vocab_size=30000,
    min_frequency=2
)

print("BpeTrainer created")
print("  vocab_size: 30000")
print("  min_frequency: 2")
print("\nTraining process:")
print("  1. Start: {h, e, l, o, ...}")
print("  2. Most frequent pair: (l, o) → merge to 'lo'")
print("  3. Continue: {h, e, lo, ...} → (h, e) → 'he'")
print("  4. Final vocab: {a, b, c, ..., he, lo, hello, ...}")


--------------------------------------------------------------------------------
4.2 BpeTrainer
--------------------------------------------------------------------------------
BpeTrainer created
  vocab_size: 30000
  min_frequency: 2

Training process:
  1. Start: {h, e, l, o, ...}
  2. Most frequent pair: (l, o) → merge to 'lo'
  3. Continue: {h, e, lo, ...} → (h, e) → 'he'
  4. Final vocab: {a, b, c, ..., he, lo, hello, ...}


In [35]:
print("\n" + "-"*80)
print("4.3 WordPieceTrainer")
print("-"*80)

"""
WordPieceTrainer
----------------
- Trains WordPiece tokenizer
- BERT-style subword tokenization

Syntax:
    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"],
        vocab_size=30000
    )

Parameters:
    - special_tokens (List[str]): Special tokens
    - vocab_size (int): Vocabulary size. Default: 30000
    - min_frequency (int): Min frequency. Default: 0
    - continuing_subword_prefix (str): Prefix. Default: "##"
"""

from tokenizers.trainers import WordPieceTrainer

trainer_wp = WordPieceTrainer(
    special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"],
    vocab_size=30000
)

print("WordPieceTrainer created")
print("  Special tokens: BERT-style [CLS], [SEP], [MASK]")
print("  Uses ## prefix for subwords")



--------------------------------------------------------------------------------
4.3 WordPieceTrainer
--------------------------------------------------------------------------------
WordPieceTrainer created
  Special tokens: BERT-style [CLS], [SEP], [MASK]
  Uses ## prefix for subwords


In [36]:
print("\n" + "-"*80)
print("4.4 UnigramTrainer")
print("-"*80)

"""
UnigramTrainer
--------------
- Trains Unigram tokenizer
- Probabilistic approach

Syntax:
    from tokenizers.trainers import UnigramTrainer
    trainer = UnigramTrainer(
        special_tokens=["[UNK]", "[PAD]"],
        vocab_size=8000
    )

Parameters:
    - special_tokens (List[str]): Special tokens
    - vocab_size (int): Vocabulary size. Default: 8000
"""

from tokenizers.trainers import UnigramTrainer

trainer_uni = UnigramTrainer(
    special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
    vocab_size=8000
)

print("UnigramTrainer created")
print("  vocab_size: 8000")
print("  Probabilistic tokenization")


--------------------------------------------------------------------------------
4.4 UnigramTrainer
--------------------------------------------------------------------------------
UnigramTrainer created
  vocab_size: 8000
  Probabilistic tokenization


### POST PROCESSING

In [37]:
# ============================================================================
# 5. POST-PROCESSORS (Add special tokens around sequences)
# ============================================================================

print("\n\n" + "="*80)
print("PART 5: POST-PROCESSORS (Add special tokens to sequences)")
print("="*80)

"""
TemplateProcessing
------------------
- Adds special tokens around tokenized sequences
- Used for BERT, GPT, etc.

Syntax:
    from tokenizers.processors import TemplateProcessing
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B [SEP]",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)]
    )

Parameters:
    - single (str): Template for single sequence
        $A = first sequence
    - pair (str): Template for sequence pair
        $A = first sequence, $B = second sequence
    - special_tokens (List[Tuple[str, int]]): Special tokens and their IDs

Example:
    Single sequence:
        Input: "Hello world"
        Template: "[CLS] $A [SEP]"
        Output: "[CLS] Hello world [SEP]"
    
    Sequence pair:
        Input: ("Hello", "world")
        Template: "[CLS] $A [SEP] $B [SEP]"
        Output: "[CLS] Hello [SEP] world [SEP]"
"""

print("\nTemplateProcessing")
print("-" * 80)
print("Adds special tokens around sequences")
print("\nBERT-style example:")
print("  Template: '[CLS] $A [SEP]'")
print("  Input: 'Hello world'")
print("  Output: '[CLS] Hello world [SEP]'")
print("\nSequence pair (for classification):")
print("  Template: '[CLS] $A [SEP] $B [SEP]'")
print("  Input: ('This is sentence 1', 'This is sentence 2')")
print("  Output: '[CLS] This is sentence 1 [SEP] This is sentence 2 [SEP]'")



PART 5: POST-PROCESSORS (Add special tokens to sequences)

TemplateProcessing
--------------------------------------------------------------------------------
Adds special tokens around sequences

BERT-style example:
  Template: '[CLS] $A [SEP]'
  Input: 'Hello world'
  Output: '[CLS] Hello world [SEP]'

Sequence pair (for classification):
  Template: '[CLS] $A [SEP] $B [SEP]'
  Input: ('This is sentence 1', 'This is sentence 2')
  Output: '[CLS] This is sentence 1 [SEP] This is sentence 2 [SEP]'


In [43]:
# ============================================================================
# 6. COMPLETE WORKING EXAMPLE
# ============================================================================

print("\n\n" + "="*80)
print("PART 6: COMPLETE WORKING EXAMPLE ")
print("="*80)

# Sample training data
training_data = [
    "I love machine learning and deep learning",
    "Transformers are amazing for NLP tasks",
    "I love natural language processing",
    "Deep learning models are powerful",
    "Machine learning is everywhere",
    "I love transformers and attention mechanisms"
]

print("\nTraining data:")
for i, text in enumerate(training_data, 1):
    print(f"  {i}. {text}")

# Create and train tokenizer
print("\n" + "-"*80)
print("Building WordLevel tokenizer...")
print("-"*80)

tokenizer_object = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer_object.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
tokenizer_object.pre_tokenizer = Whitespace()

trainer_object = WordLevelTrainer(
    special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
    min_frequency=1
)

print("\nTraining tokenizer...")
tokenizer_object.train_from_iterator(training_data, trainer=trainer_object)

# Add post-processor
tokenizer_object.post_processor = TemplateProcessing(
    single="[SOS] $A [EOS]",
    special_tokens=[("[SOS]", 2), ("[EOS]", 3)]
)

print("✓ Tokenizer trained successfully!")



PART 6: COMPLETE WORKING EXAMPLE 

Training data:
  1. I love machine learning and deep learning
  2. Transformers are amazing for NLP tasks
  3. I love natural language processing
  4. Deep learning models are powerful
  5. Machine learning is everywhere
  6. I love transformers and attention mechanisms

--------------------------------------------------------------------------------
Building WordLevel tokenizer...
--------------------------------------------------------------------------------

Training tokenizer...
✓ Tokenizer trained successfully!


In [44]:
# Show vocabulary
vocab = tokenizer_object.get_vocab()
print(f"\nVocabulary size: {len(vocab)}")
print("\nVocabulary:")
for token, idx in sorted(vocab.items(), key=lambda x: x[1]):
    print(f"  {idx:2d}: {token}")


Vocabulary size: 25

Vocabulary:
   0: [UNK]
   1: [PAD]
   2: [SOS]
   3: [EOS]
   4: learning
   5: i
   6: love
   7: and
   8: are
   9: deep
  10: machine
  11: transformers
  12: amazing
  13: attention
  14: everywhere
  15: for
  16: is
  17: language
  18: mechanisms
  19: models
  20: natural
  21: nlp
  22: powerful
  23: processing
  24: tasks


In [46]:
# Test tokenization
print("\n" + "-"*80)
print("Testing tokenization")
print("-"*80)

test_texts = [
    "I love transformers",
    "Deep learning is amazing",
    "Python programming"  # 'python' and 'programming' not in training data
]

for text in test_texts:
    encoded = tokenizer_object.encode(text)
    print(f"\nText: '{text}'")
    print(f"  Tokens: {encoded.tokens}")
    print(f"  IDs: {encoded.ids}")
    decoded = tokenizer_object.decode(encoded.ids)
    print(f"  Decoded: '{decoded}'")


--------------------------------------------------------------------------------
Testing tokenization
--------------------------------------------------------------------------------

Text: 'I love transformers'
  Tokens: ['[SOS]', 'i', 'love', 'transformers', '[EOS]']
  IDs: [2, 5, 6, 11, 3]
  Decoded: 'i love transformers'

Text: 'Deep learning is amazing'
  Tokens: ['[SOS]', 'deep', 'learning', 'is', 'amazing', '[EOS]']
  IDs: [2, 9, 4, 16, 12, 3]
  Decoded: 'deep learning is amazing'

Text: 'Python programming'
  Tokens: ['[SOS]', '[UNK]', '[UNK]', '[EOS]']
  IDs: [2, 0, 0, 3]
  Decoded: ''


In [47]:

# Save and load
print("\n" + "-"*80)
print("Saving and loading tokenizer")
print("-"*80)

save_path = "./example_tokenizer.json"
tokenizer_object.save(save_path)
print(f"✓ Saved to: {save_path}")

loaded_tokenizer = Tokenizer.from_file(save_path)
print(f"✓ Loaded from: {save_path}")

# Verify loaded tokenizer works
test_text = "I love machine learning"
original_encoded = tokenizer_object.encode(test_text)
loaded_encoded = loaded_tokenizer.encode(test_text)

print(f"\nVerification:")
print(f"  Original: {original_encoded.ids}")
print(f"  Loaded:   {loaded_encoded.ids}")
print(f"  Match: {original_encoded.ids == loaded_encoded.ids} ✓")


--------------------------------------------------------------------------------
Saving and loading tokenizer
--------------------------------------------------------------------------------
✓ Saved to: ./example_tokenizer.json
✓ Loaded from: ./example_tokenizer.json

Verification:
  Original: [2, 5, 6, 10, 4, 3]
  Loaded:   [2, 5, 6, 10, 4, 3]
  Match: True ✓


In [48]:

# ============================================================================
# 7. COMMON METHODS REFERENCE
# ============================================================================

print("\n\n" + "="*80)
print("PART 7: COMMON TOKENIZER METHODS")
print("="*80)

print("""
Tokenizer Object Methods:
------------------------

1. tokenizer.encode(text)
   - Encodes text to token IDs
   - Returns: Encoding object
   - Example: tokenizer.encode("Hello world")

2. tokenizer.encode_batch(texts)
   - Encodes multiple texts
   - Returns: List[Encoding]
   - Example: tokenizer.encode_batch(["Hello", "World"])

3. tokenizer.decode(ids)
   - Decodes token IDs back to text
   - Returns: str
   - Example: tokenizer.decode([1, 2, 3])

4. tokenizer.decode_batch(sequences)
   - Decodes multiple sequences
   - Returns: List[str]

5. tokenizer.get_vocab()
   - Returns vocabulary dictionary
   - Returns: Dict[str, int]
   - Example: {"hello": 0, "world": 1}

6. tokenizer.get_vocab_size()
   - Returns vocabulary size
   - Returns: int

7. tokenizer.token_to_id(token)
   - Get ID of specific token
   - Returns: int or None

8. tokenizer.id_to_token(id)
   - Get token from ID
   - Returns: str or None

9. tokenizer.save(path)
   - Save tokenizer to file
   - Example: tokenizer.save("tokenizer.json")

10. Tokenizer.from_file(path)
    - Load tokenizer from file
    - Example: Tokenizer.from_file("tokenizer.json")

11. tokenizer.train_from_iterator(iterator, trainer)
    - Train from text iterator
    - Example: tokenizer.train_from_iterator(texts, trainer)

12. tokenizer.enable_padding(pad_id, pad_token)
    - Enable padding
    - Example: tokenizer.enable_padding(pad_id=1, pad_token="[PAD]")

13. tokenizer.enable_truncation(max_length)
    - Enable truncation
    - Example: tokenizer.enable_truncation(max_length=512)

Encoding Object Properties:
---------------------------
encoding.ids          - List of token IDs
encoding.tokens       - List of token strings
encoding.offsets      - Character offsets
encoding.attention_mask - Attention mask (for padding)
encoding.type_ids     - Token type IDs (for BERT)
""")

print("="*80)
print("TUTORIAL COMPLETE!")
print("="*80)
print("\nKey takeaways:")
print("  1. Models: WordLevel, BPE, WordPiece, Unigram")
print("  2. Pre-tokenizers: Split text (Whitespace, ByteLevel, etc.)")
print("  3. Normalizers: Clean text (Lowercase, StripAccents, etc.)")
print("  4. Trainers: Build vocabulary (min_frequency, vocab_size)")
print("  5. Post-processors: Add special tokens ([CLS], [SEP], etc.)")
print("  6. Save/Load: .save() and .from_file()")
print("\nYou now have a complete understanding of Hugging Face tokenizers!")




PART 7: COMMON TOKENIZER METHODS

Tokenizer Object Methods:
------------------------

1. tokenizer.encode(text)
   - Encodes text to token IDs
   - Returns: Encoding object
   - Example: tokenizer.encode("Hello world")

2. tokenizer.encode_batch(texts)
   - Encodes multiple texts
   - Returns: List[Encoding]
   - Example: tokenizer.encode_batch(["Hello", "World"])

3. tokenizer.decode(ids)
   - Decodes token IDs back to text
   - Returns: str
   - Example: tokenizer.decode([1, 2, 3])

4. tokenizer.decode_batch(sequences)
   - Decodes multiple sequences
   - Returns: List[str]

5. tokenizer.get_vocab()
   - Returns vocabulary dictionary
   - Returns: Dict[str, int]
   - Example: {"hello": 0, "world": 1}

6. tokenizer.get_vocab_size()
   - Returns vocabulary size
   - Returns: int

7. tokenizer.token_to_id(token)
   - Get ID of specific token
   - Returns: int or None

8. tokenizer.id_to_token(id)
   - Get token from ID
   - Returns: str or None

9. tokenizer.save(path)
   - Save token