This notebook covers the basics of building and inspecting a simple
WordLevel tokenizer with Hugging Face Tokenizers, trained on a small dummy dataset.
It follows the official Hugging Face Tokenizers quick tour:
https://huggingface.co/docs/tokenizers/quicktour

In [1]:
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

In [2]:
# Tiny in-memory parallel corpus: each record holds one sentence pair under
# "translation", keyed by language code ("en" = English, "el" = Greek).
dummy_ds = [
    {"translation": {"en": "hello world", "el": "γειά σου κόσμε"}},
    {"translation": {"en": "hello there", "el": "γειά σου"}},
    {"translation": {"en": "machine learning is fun", "el": "η μηχανική μάθηση είναι ωραία"}},
]

In [4]:
def get_all_sentences(ds, lang):
    """Lazily yield the `lang` sentence of every record in `ds`.

    Args:
        ds: Iterable of records shaped like ``{"translation": {<lang>: <text>}}``.
        lang: Language key to read from each record's "translation" mapping.

    Yields:
        The sentence stored under ``record["translation"][lang]``, in dataset order.

    Raises:
        KeyError: If a record has no "translation" entry or lacks `lang`
            (raised when that record is reached, not when the function is called).
    """
    yield from (record["translation"][lang] for record in ds)

In [5]:
# materialize the generator to preview the English ("en") sentences
list(get_all_sentences(dummy_ds, "en"))

['hello world', 'hello there', 'machine learning is fun']

In [6]:
# materialize the generator to preview the Greek ("el") sentences
list(get_all_sentences(dummy_ds, "el"))

['γειά σου κόσμε', 'γειά σου', 'η μηχανική μάθηση είναι ωραία']

In [7]:
# create a WordLevel tokenizer (1 word → 1 token);
# words missing from the vocab are mapped to the "[UNK]" token
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))

In [8]:
# inspect the freshly created tokenizer: the vocab is still empty and no pre-tokenizer is set
tokenizer

Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[], normalizer=None, pre_tokenizer=None, post_processor=None, decoder=None, model=WordLevel(vocab={}, unk_token="[UNK]"))

In [9]:
# Test it before training. This is expected to fail: the vocabulary is still
# empty, so even the [UNK] fallback token cannot be resolved.
# The error is caught and printed so that it is shown as the demonstration
# without aborting a Restart Kernel -> Run All.
# The library raises a plain `Exception` here, hence the broad except clause.
try:
    print(tokenizer.encode("hello").ids)
except Exception as exc:
    print(f"{type(exc).__name__}: {exc}")

Exception: WordLevel error: Missing [UNK] token from the vocabulary

In [10]:
# add a pre-tokenizer: it splits raw text into word-level pieces before the model maps them to ids
tokenizer.pre_tokenizer = Whitespace()

In [12]:
# preview the split: each piece comes back with its (start, end) character offsets
tokenizer.pre_tokenizer.pre_tokenize_str("hello world")

[('hello', (0, 5)), ('world', (6, 11))]

In [13]:
# define the trainer
trainer = WordLevelTrainer(
    # special tokens are always added to the vocab (they get ids 0-3, in the order listed)
    special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
    # drop rare words: 'hello' appears twice in the English sentences, so it is kept;
    # 'world' appears only once, so it is dropped and later encodes as [UNK]
    min_frequency=2,
)

In [15]:
# inspect the trainer configuration
# NOTE(review): the `words` counts in the saved output look like they were captured after the
# training cell had already run (execution count 15 here vs. 14 for training). On a fresh
# Restart & Run All this cell runs before training and presumably would not show those
# counts — consider moving this cell after the training cell.
trainer

WordLevelTrainer(WordLevelTrainer(min_frequency=2, vocab_size=30000, show_progress=True, special_tokens=[AddedToken(content="[UNK]", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True), AddedToken(content="[PAD]", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True), AddedToken(content="[SOS]", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True), AddedToken(content="[EOS]", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True)], words={"is":1, "hello":2, "world":1, "learning":1, "machine":1, "there":1, "fun":1}))

In [14]:
# train the tokenizer on the English ("en") sentences only;
# get_all_sentences is a generator, so the sentences are produced lazily as they are consumed
tokenizer.train_from_iterator(
    get_all_sentences(dummy_ds, "en"),
    trainer=trainer
)

In [16]:
# inspect the vocab: with min_frequency=2 only the special tokens and 'hello' are kept
tokenizer.get_vocab()

{'[PAD]': 1, '[UNK]': 0, '[SOS]': 2, 'hello': 4, '[EOS]': 3}

In [17]:
# encode example 1: a word that is in the vocab maps to its id
tokenizer.encode("hello").ids

[4]

In [18]:
# encode example 2: 'world' is not in the vocab, so it falls back to [UNK] (id 0)
tokenizer.encode("hello world").ids

[4, 0]