In [285]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import normalizers
from tokenizers.normalizers import NFKC, NFKD, Nmt
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast

In [287]:
# Fetch only the "train" split of the raw WikiText-2 configuration.
# NOTE: despite its name, `df` holds whatever `load_dataset` returns (a
# Hugging Face dataset), not a pandas DataFrame.
df = load_dataset(
    path="wikitext",
    name="wikitext-2-raw-v1",
    split="train",
)

In [289]:
def get_training_corpus(batch_size=1000):
    """Yield the training texts from the module-level dataset ``df`` in batches.

    The dataset is walked front to back in consecutive slices of
    ``batch_size`` rows, and the ``"text"`` field of each slice is yielded.
    The final batch is shorter when ``len(df)`` is not a multiple of
    ``batch_size``.

    Args:
        batch_size: Number of rows per yielded batch. Defaults to 1000,
            the value that was previously hard-coded.

    Yields:
        ``df[start:start + batch_size]["text"]`` for each batch start offset.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the error surfaces on the first iteration rather than
            at call time.
    """
    # A zero step would make range() fail with an unrelated message, and a
    # negative one would silently yield nothing; reject both explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    total_rows = len(df)
    for start in range(0, total_rows, batch_size):
        yield df[start:start + batch_size]["text"]

In [291]:
# Build the tokenizer around an untrained BPE model. "[UNK]" is registered as
# the model's unknown token.
bpe_model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(bpe_model)

In [293]:
# Normalization steps, applied to the text in the order listed.
# NOTE(review): NFKC is immediately followed by NFKD, so the text should end
# up in decomposed (NFKD) form and the NFKC step looks redundant. Confirm
# which Unicode form is actually intended.
normalization_steps = [NFKC(), NFKD(), Nmt()]
tokenizer.normalizer = normalizers.Sequence(normalization_steps)

In [295]:
# Split the text into words before the BPE model sees it. As the sample
# encodings in this notebook show, punctuation comes out as separate pieces.
word_splitter = Whitespace()
tokenizer.pre_tokenizer = word_splitter

In [297]:
# Tokens reserved in the vocabulary regardless of what the corpus contains.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

# NOTE(review): the vocabulary cap is one million entries. The sample
# encoding in this notebook already contains an id above 80,000, so many
# whole words receive their own id. Confirm such a large cap is intended.
trainer = BpeTrainer(
    vocab_size=1_000_000,
    special_tokens=special_tokens,
)

In [299]:
# Train the tokenizer on the batched corpus. A new generator is created each
# time this cell runs, since an exhausted generator cannot be reused.
corpus_batches = get_training_corpus()
tokenizer.train_from_iterator(corpus_batches, trainer=trainer)






In [301]:
# Wrap every encoding in [CLS] ... [SEP] markers. The ids are looked up from
# the tokenizer's vocabulary, so this cell must run after training.
cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")

tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    # Without an explicit pair template, two-sequence inputs are joined with
    # no special tokens at all. ":1" assigns type id 1 to the second segment
    # and its closing [SEP].
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", cls_id),
        ("[SEP]", sep_id),
    ],
)

In [303]:
# Quick check: encode one sentence and show the resulting token strings,
# including the [CLS]/[SEP] markers added by the post-processor.
sample_text = "Hello World! My name is Jay, what's yours?"
encoding = tokenizer.encode(sample_text)
print(f"Tokens: {encoding.tokens}")

Tokens: ['[CLS]', 'Hello', 'World', '!', 'My', 'name', 'is', 'Jay', ',', 'what', "'", 's', 'y', 'ours', '?', '[SEP]']


In [305]:
# Write the trained tokenizer to a JSON file. The path is relative, so the
# file lands in the kernel's current working directory.
tokenizer_path = "my_tokenizer.json"
tokenizer.save(tokenizer_path)

In [307]:
# Round-trip check: reload the tokenizer from disk and encode the same
# sentence; the tokens should match those produced before saving.
loaded_tokenizer = Tokenizer.from_file("my_tokenizer.json")
roundtrip_text = "Hello World! My name is Jay, what's yours?"
encoding = loaded_tokenizer.encode(roundtrip_text)
print(f"Tokens: {encoding.tokens}")

Tokens: ['[CLS]', 'Hello', 'World', '!', 'My', 'name', 'is', 'Jay', ',', 'what', "'", 's', 'y', 'ours', '?', '[SEP]']


In [309]:
# Expose the trained tokenizer through the transformers fast-tokenizer API.
# The wrapper is not told which tokens play the special roles unless they are
# passed explicitly. Without them, attributes such as `pad_token` are unset
# and padding fails, so each role is declared here using the same strings the
# trainer reserved.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
# Last expression of the cell: display the ids for the sample sentence.
wrapped_tokenizer.encode("Hello World! My name is Jay, what's yours?")

[2, 80283, 1700, 5, 2899, 1730, 894, 6348, 16, 1729, 11, 87, 93, 2333, 35, 3]