

# <h3 align="center"> $\underline{\text{ Tokenizer}}$</h3>

<h3 align="center">$ \text{Tokenization for Large Language Models}$</h3>








In [None]:
!pip install transformers

In [None]:
!pip install datasets

This notebook builds BERT, GPT-2, and XLNet tokenizers, covering the three main tokenization algorithms: WordPiece (BERT), BPE (GPT-2), and Unigram (XLNet).

# Building a BERT tokenizer (WordPiece)

# 1. Gather a Corpus
Dataset: Wikitext

In [None]:
import transformers
from datasets import load_dataset

# Load the training split of the raw WikiText-2 configuration; it is the corpus
# used to train every tokenizer in this notebook.
dataset = load_dataset(
    "wikitext",
    name="wikitext-2-raw-v1",
    split="train",
)

To train the tokenizer, we need a generator that will yield batches of 1,000 texts.

In [9]:
def getTCorpus(batch_size=1000):
    """Yield the ``"text"`` column of the global ``dataset`` in consecutive batches.

    Args:
        batch_size: Number of dataset rows per yielded batch. Defaults to 1000,
            the value that was previously hard-coded, so existing calls to
            ``getTCorpus()`` behave as before.

    Yields:
        ``dataset[start : start + batch_size]["text"]`` for each consecutive
        slice of the dataset (presumably a list of strings; confirm against
        the ``datasets`` version in use). The last batch may be shorter.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A non-positive step would either crash inside range() or silently yield nothing.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for start in range(0, len(dataset), batch_size):
        yield dataset[start : start + batch_size]["text"]

Generating a text file that contains all the input texts from the dataset, to use locally

In [11]:
# Write the text of every dataset row to a local UTF-8 file, one row per line.
with open("wikitext-2.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(
        dataset[row_idx]["text"] + "\n" for row_idx in range(len(dataset))
    )

# 2. Create a backend_tokenizer with *tokenizers*

In [15]:
from tokenizers import (decoders,models,normalizers,pre_tokenizers,processors,trainers,Tokenizer,)
# Start from an untrained WordPiece model that uses "[UNK]" as its unknown token.
wordpiece_model = models.WordPiece(unk_token="[UNK]")
tokenizer = Tokenizer(wordpiece_model)

a) Normalization: By hand you can compose several normalizers using *Sequence()*

In [34]:
# Compose the normalizer by hand: NFD decomposition, lowercasing, then accent stripping.
# (normalizers.BertNormalizer(lowercase=True) is the prebuilt alternative.)
normalization_steps = [
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.StripAccents(),
]
tokenizer.normalizer = normalizers.Sequence(normalization_steps)

In [29]:
# Sanity-check the normalizer on a string with uppercase and accented characters.
normalized_sample = tokenizer.normalizer.normalize_str("HÖw Äre YÖü?")
print(normalized_sample)

how are you?


b) Pre-tokenization

In [35]:
# Use the Whitespace pre-tokenizer.
# (pre_tokenizers.BertPreTokenizer() is the prebuilt alternative.)
whitespace_pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.pre_tokenizer = whitespace_pre_tokenizer

In [39]:
# Splitting on whitespace and punctuation:
sample_text = "I'm fine, thanks!"
tokenizer.pre_tokenizer.pre_tokenize_str(sample_text)

[('I', (0, 1)),
 ("'", (1, 2)),
 ('m', (2, 3)),
 ('fine', (4, 8)),
 (',', (8, 9)),
 ('thanks', (10, 16)),
 ('!', (16, 17))]

In [41]:
# Splitting only on whitespace (punctuation stays attached to the words):
pre_tokenizer = pre_tokenizers.WhitespaceSplit()
sample_text = "I'm fine, thanks!"
pre_tokenizer.pre_tokenize_str(sample_text)

[("I'm", (0, 3)), ('fine,', (4, 9)), ('thanks!', (10, 17))]

In [44]:
# Sequence composes several pre-tokenizers, applied in the order listed:
split_steps = [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
pre_tokenizer = pre_tokenizers.Sequence(split_steps)
pre_tokenizer.pre_tokenize_str("I'm fine, thanks-so-much!")

[('I', (0, 1)),
 ("'", (1, 2)),
 ('m', (2, 3)),
 ('fine', (4, 8)),
 (',', (8, 9)),
 ('thanks', (10, 16)),
 ('-', (16, 17)),
 ('so', (17, 19)),
 ('-', (19, 20)),
 ('much', (20, 24)),
 ('!', (24, 25))]

c) Model

 We require the *WordPieceTrainer*. You need to pass all the special tokens you intend to use

In [46]:
# Configure the WordPiece trainer: target vocabulary size plus every special
# token the tokenizer will use. Keep the list order as is; it appears to
# determine the IDs assigned to the special tokens.
special_tokens = ["[SEP]", "[MASK]", "[UNK]", "[PAD]", "[CLS]"]
trainer = trainers.WordPieceTrainer(
    vocab_size=25000,
    special_tokens=special_tokens,
)

In [48]:
# Train the WordPiece model on the batched corpus generator.
corpus_batches = getTCorpus()
tokenizer.train_from_iterator(corpus_batches, trainer=trainer)

In [51]:
# Try the trained tokenizer on a short sentence and show the resulting tokens.
sample_text = "Testing the tokenizer!"
encod = tokenizer.encode(sample_text)
print(encod.tokens)

['testing', 'the', 'tok', '##eni', '##zer', '!']


In [53]:
# The post-processor template needs the vocabulary IDs of [CLS] and [SEP].
cls_id, sep_id = (tokenizer.token_to_id(tok) for tok in ("[CLS]", "[SEP]"))
print(cls_id, sep_id)

4 0


TemplateProcessing: specify how to treat a single sentence and a pair of sentences

In [55]:
# Templates for adding the special tokens. $A is the first (or only) sentence,
# $B the second sentence of a pair; the number after ":" is the token type ID.
single_template = "[CLS]:0 $A:0 [SEP]:0"
pair_template = "[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1"
tokenizer.post_processor = processors.TemplateProcessing(
    single=single_template,
    pair=pair_template,
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)

In [63]:
# Encode a sentence pair and show the tokens followed by their token type IDs.
encod = tokenizer.encode("Testing the tokenizer!", "Yes, why not?")
print(encod.tokens, encod.type_ids, sep="\n")

['[CLS]', 'testing', 'the', 'tok', '##eni', '##zer', '!', '[SEP]', 'yes', ',', 'why', 'not', '?', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]


d) Postprocessor

Including a decoder:

In [64]:
# WordPiece decoder configured with the "##" continuation prefix.
wordpiece_decoder = decoders.WordPiece(prefix="##")
tokenizer.decoder = wordpiece_decoder

In [66]:
# Decode the IDs of the last encoding back into text.
token_ids = encod.ids
tokenizer.decode(token_ids)

'testing the tokenizer! yes, why not?'

Saving our tokenizer in a single JSON:

In [67]:
# Serialize the tokenizer to a single JSON file.
tokenizer_path = "tokenizer.json"
tokenizer.save(tokenizer_path)

# 3. Load the backend_tokenizer in a *transformers tokenizer*

In [68]:
# Reload the tokenizer from the JSON file written earlier.
tokenizer_path = "tokenizer.json"
load_tokenizer = Tokenizer.from_file(tokenizer_path)

You can then use this tokenizer like any other tokenizer.

In [69]:
from transformers import PreTrainedTokenizerFast
# Wrap the trained backend tokenizer in a transformers fast tokenizer and
# declare every special token explicitly.
# Only `tokenizer_object` is passed: supplying `tokenizer_file="tokenizer.json"`
# as well was redundant, since the in-memory object is the one that gets used.
# To load from disk instead, pass tokenizer_file="tokenizer.json" and omit tokenizer_object.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Building GPT-2 Tokenizer (BPE)

# 2. Create a backend_tokenizer with *tokenizers*

GPT-2 uses byte-level BPE

GPT-2 does not use a normalizer

GPT-2's only special token is the end-of-text token

In [70]:
# Start from an untrained BPE model.
bpe_model = models.BPE()
tokenizer = Tokenizer(bpe_model)

In [71]:
# Pre-tokenization: byte-level, without adding a space before the first word.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
    add_prefix_space=False,
)

In [72]:
# Inspect how the byte-level pre-tokenizer splits a sample sentence.
sample_text = "Testing pre-tokenization!"
tokenizer.pre_tokenizer.pre_tokenize_str(sample_text)

[('Testing', (0, 7)),
 ('Ġpre', (7, 11)),
 ('-', (11, 12)),
 ('tokenization', (12, 24)),
 ('!', (24, 25))]

In [74]:
# Configure the BPE trainer (end-of-text is the only special token) and train
# on the batched corpus generator.
trainer = trainers.BpeTrainer(
    vocab_size=25000,
    special_tokens=["<|endoftext|>"],
)
corpus_batches = getTCorpus()
tokenizer.train_from_iterator(corpus_batches, trainer=trainer)

In [75]:
# Alternative training path: reset to a fresh BPE model and train from the
# local text file instead of the in-memory generator.
tokenizer.model = models.BPE()
corpus_files = ["wikitext-2.txt"]
tokenizer.train(corpus_files, trainer=trainer)

In [76]:
# Try the trained BPE tokenizer on a short sentence and show the tokens.
sample_text = "Testing the super-tokenizer."
encod = tokenizer.encode(sample_text)
print(encod.tokens)

['T', 'est', 'ing', 'Ġthe', 'Ġsuper', '-', 't', 'oken', 'izer', '.']


In [77]:
# Byte-level post-processing for the GPT-2 tokenizer; offsets are left untrimmed.
tokenizer.post_processor = processors.ByteLevel(
    trim_offsets=False,
)

In [81]:
# Map the token at index 4 back to the span of the original sentence it covers.
sentence = "Testing the super-tokenizer."
encoding = tokenizer.encode(sentence)
token_span = encoding.offsets[4]
start, end = token_span
sentence[start:end]

' super'

In [82]:
# Attach the byte-level decoder that matches the byte-level pre-tokenizer.
byte_level_decoder = decoders.ByteLevel()
tokenizer.decoder = byte_level_decoder

In [83]:
# Decode the IDs of the last encoding back into text.
token_ids = encoding.ids
tokenizer.decode(token_ids)

'Testing the super-tokenizer.'

In [84]:
# Wrap the trained BPE tokenizer in a transformers fast tokenizer; the
# end-of-text token serves as both the BOS and the EOS token.
from transformers import PreTrainedTokenizerFast
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
)

# Building XLNet (Unigram)

# 2. Create a backend_tokenizer with *tokenizers*

In [85]:
# Start from an untrained Unigram model.
unigram_model = models.Unigram()
tokenizer = Tokenizer(unigram_model)

In [86]:
from tokenizers import Regex

# Normalization pipeline: turn `` and '' into a double quote, apply NFKD,
# strip accents, then collapse runs of two or more spaces into one.
normalization_steps = [
    normalizers.Replace("``", '"'),
    normalizers.Replace("''", '"'),
    normalizers.NFKD(),
    normalizers.StripAccents(),
    normalizers.Replace(Regex(" {2,}"), " "),
]
tokenizer.normalizer = normalizers.Sequence(normalization_steps)

In [87]:
# Use the Metaspace pre-tokenizer.
metaspace_pre_tokenizer = pre_tokenizers.Metaspace()
tokenizer.pre_tokenizer = metaspace_pre_tokenizer

In [88]:
# Inspect how the Metaspace pre-tokenizer splits a sample sentence.
sample_text = "Testing the pre-tokenizer!"
tokenizer.pre_tokenizer.pre_tokenize_str(sample_text)

[('▁Testing', (0, 7)), ('▁the', (7, 11)), ('▁pre-tokenizer!', (11, 26))]

In [90]:
# Configure the Unigram trainer with its special tokens and the unknown token,
# then train on the batched corpus generator.
special_tokens = ["<cls>", "<sep>", "<unk>", "<pad>", "<mask>", "<s>", "</s>"]
trainer = trainers.UnigramTrainer(
    vocab_size=25000,
    special_tokens=special_tokens,
    unk_token="<unk>",
)
corpus_batches = getTCorpus()
tokenizer.train_from_iterator(corpus_batches, trainer=trainer)

In [91]:
# Alternative training path: reset to a fresh Unigram model and train from the
# local text file instead of the in-memory generator.
tokenizer.model = models.Unigram()
corpus_files = ["wikitext-2.txt"]
tokenizer.train(corpus_files, trainer=trainer)

In [92]:
# Try the trained Unigram tokenizer on a short sentence and show the tokens.
sample_text = "Testing this tokenizer."
encoding = tokenizer.encode(sample_text)
print(encoding.tokens)

['▁Test', 'ing', '▁this', '▁to', 'ken', 'izer', '.']


In [95]:
# Look up the IDs of <cls> and <sep>, then register the template: the special
# tokens go at the end of the sequence and <cls> gets token type ID 2.
cls_id, sep_id = (tokenizer.token_to_id(tok) for tok in ("<cls>", "<sep>"))
tokenizer.post_processor = processors.TemplateProcessing(
    single="$A:0 <sep>:0 <cls>:2",
    pair="$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2",
    special_tokens=[("<sep>", sep_id), ("<cls>", cls_id)],
)

In [96]:
# Encode a sentence pair and show the tokens followed by their token type IDs.
encoding = tokenizer.encode("Test the tokenizer.!", "on a pair of sentences...")
print(encoding.tokens, encoding.type_ids, sep="\n")

['▁Test', '▁the', '▁to', 'ken', 'izer', '.', '!', '<sep>', '▁', 'on', '▁', 'a', '▁pair', '▁of', '▁sentence', 's', '.', '.', '.', '<sep>', '<cls>']
[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]


In [97]:
# Attach the Metaspace decoder that matches the Metaspace pre-tokenizer.
metaspace_decoder = decoders.Metaspace()
tokenizer.decoder = metaspace_decoder

In [98]:
from transformers import PreTrainedTokenizerFast

# Wrap the trained Unigram tokenizer in a transformers fast tokenizer,
# declaring all special tokens and padding on the left.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    cls_token="<cls>",
    sep_token="<sep>",
    mask_token="<mask>",
    padding_side="left",
)