# Xây dựng từng khối tokenizer

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

## WordPiece

In [2]:
from datasets import load_dataset
# Load the "train" split of the "wikitext-2-raw-v1" configuration of the "wikitext" dataset.
dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")
def get_training_corpus(batch_size=1000):
    """Yield the training corpus as successive batches of text.

    Args:
        batch_size: Number of dataset rows per yielded batch. Defaults to 1000,
            the chunk size that was previously hard-coded.

    Yields:
        The ``"text"`` column of each consecutive ``batch_size``-row slice of the
        module-level ``dataset``; the last batch may be shorter.
    """
    for start in range(0, len(dataset), batch_size):
        # Slice the rows first, then keep only the text column of that slice.
        yield dataset[start : start + batch_size]["text"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [84]:
# Write the corpus to a UTF-8 text file (each row's text followed by a newline) so it can
# later be passed to `tokenizer.train` as a file path.
with open("wikitext-2.txt", "w", encoding="utf-8") as f:
    # Iterate over the rows directly instead of indexing the dataset by position.
    for row in dataset:
        f.write(row["text"] + "\n")

In [85]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

# Create a tokenizer around a WordPiece model constructed without a vocabulary;
# "[UNK]" is passed as its unknown token.
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

### Xây dựng từng block của normalize, pre-tokenizer

*   Sử dụng Sequence trong từng block để kết hợp các thuộc tính
*   Phải tự định nghĩa special tokens cho Trainer và `PreTrainedTokenizerFast`



In [86]:
# Chain three normalizers: NFD, Lowercase, then StripAccents.
tokenizer.normalizer = normalizers.Sequence([
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.StripAccents(),
])

In [87]:
# Sanity-check the normalizer on a string containing capitals and accented characters.
print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))

hello how are u?


In [88]:
# Attach the Whitespace pre-tokenizer to the tokenizer.
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

In [89]:
# Standalone pre-tokenizer for comparison; it is not attached to `tokenizer`.
# In the recorded output, punctuation stays glued to the words ("Let's", 'pre-tokenizer.').
pre_tokenizer = pre_tokenizers.WhitespaceSplit()
pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[("Let's", (0, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre-tokenizer.', (14, 28))]

In [90]:
# Standalone demo (not attached to `tokenizer`): chain WhitespaceSplit with Punctuation
# and run the combination on the sample sentence.
pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.WhitespaceSplit(),
    pre_tokenizers.Punctuation(),
])
pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

In [91]:
# Run the pre-tokenizer that is actually attached to `tokenizer` on the same sentence.
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

In [92]:
# Special tokens are listed explicitly for the trainer. The ids printed later in the
# notebook ([CLS] -> 2, [SEP] -> 3) match the positions of those tokens in this list.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

In [93]:
# Train on the text batches yielded by get_training_corpus().
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

In [94]:
# Encode a sample sentence and print its tokens (no post-processor has been set yet).
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.']


In [95]:
# Look up the ids of [CLS] and [SEP]; the post-processing template needs them.
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

2 3


In [96]:
# Template: [CLS] A [SEP] for a single sentence, [CLS] A [SEP] B [SEP] for a pair.
# The number after each colon is the token type id assigned to that piece.
# Plain string literals are used because the templates contain nothing to interpolate.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [97]:
# Re-encode the sample sentence; the recorded output now starts with [CLS] and ends with [SEP].
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.', '[SEP]']


In [98]:
# Encode a sentence pair and print both the tokens and their token type ids.
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print(encoding.tokens)
print(encoding.type_ids)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '...', '[SEP]', 'on', 'a', 'pair', 'of', 'sentences', '.', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]


In [99]:
# WordPiece decoder; "##" is the prefix carried by the continuation pieces seen in the token lists.
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [100]:
# Decode the ids of the most recent `encoding` (the sentence pair) back into text.
tokenizer.decode(encoding.ids)

"let ' s test this tokenizer... on a pair of sentences."

In [101]:
# Save the tokenizer to "tokenizer.json".
tokenizer.save("tokenizer.json")

In [102]:
from transformers import PreTrainedTokenizerFast

# Reload the saved tokenizer and wrap it for use with Transformers.
new_tokenizer = Tokenizer.from_file("tokenizer.json")
wrapped_tokenizer = PreTrainedTokenizerFast(
    # Pass the object loaded above instead of `tokenizer_file="tokenizer.json"`, which
    # read the same file a second time and left `new_tokenizer` unused.
    tokenizer_object=new_tokenizer,
    # With the generic wrapper, every special token has to be named explicitly.
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [103]:
# Encode a sample sentence with the wrapped tokenizer, then print the encoding,
# its tokens, and the ids those tokens convert back to.
encode = wrapped_tokenizer("Let's test pre-tokenization.")
print(encode)
print(encode.tokens())
print(wrapped_tokenizer.convert_tokens_to_ids(encode.tokens()))

{'input_ids': [2, 3019, 11, 61, 3611, 1637, 17, 24318, 18903, 9862, 7048, 1337, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'let', "'", 's', 'test', 'pre', '-', 'tok', '##eni', '##za', '##ti', '##on', '.', '[SEP]']
[2, 3019, 11, 61, 3611, 1637, 17, 24318, 18903, 9862, 7048, 1337, 18, 3]


### Xây dựng nhanh với kiến trúc sẵn có

*   Tất cả đóng gói sẵn
*   Hạn chế can thiệp vào những thuộc tính sẵn



In [106]:
# Replace the hand-built normalizer sequence with the packaged BertNormalizer (lowercasing enabled).
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

In [107]:
# Same sample string as in the hand-built section, for comparison.
print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))

hello how are u?


In [108]:
# Replace the pre-tokenizer with the packaged BertPreTokenizer.
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [109]:
# Run the attached pre-tokenizer on the sample sentence used earlier.
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

In [110]:
# `tokenizer` is reused from the previous section, so reset it to a fresh WordPiece model
# before retraining, as the BPE and Unigram quick-build cells do for their models.
tokenizer.model = models.WordPiece(unk_token="[UNK]")
# Train from the text file written earlier instead of from the in-memory iterator.
tokenizer.train(["wikitext-2.txt"], trainer=trainer)

In [111]:
# Encode a sample sentence with the retrained tokenizer.
# NOTE(review): the recorded output already contains [CLS]/[SEP], presumably because the
# post-processor assigned in the previous section is still attached — confirm.
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.', '[SEP]']


In [113]:
# Look up the ids of [CLS] and [SEP] again after retraining.
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

2 3


In [114]:
# Template: [CLS] A [SEP] for a single sentence, [CLS] A [SEP] B [SEP] for a pair.
# The number after each colon is the token type id assigned to that piece.
# Plain string literals are used because the templates contain nothing to interpolate.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [115]:
# Encode the sample sentence again after (re)assigning the post-processor.
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.', '[SEP]']


In [116]:
# Encode a sentence pair and print both the tokens and their token type ids.
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print(encoding.tokens)
print(encoding.type_ids)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.', '.', '.', '[SEP]', 'on', 'a', 'pair', 'of', 'sentences', '.', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]


In [117]:
# WordPiece decoder; "##" is the prefix carried by the continuation pieces seen in the token lists.
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [118]:
# Decode the ids of the most recent `encoding` (the sentence pair) back into text.
tokenizer.decode(encoding.ids)

"let ' s test this tokenizer... on a pair of sentences."

In [119]:
# Save to "tokenizer.json", the same path used by the hand-built WordPiece section.
tokenizer.save("tokenizer.json")

In [73]:
# Reload the tokenizer from the JSON file.
new_tokenizer = Tokenizer.from_file("tokenizer.json")

In [121]:
from transformers import BertTokenizerFast

# Model-specific wrapper: unlike the PreTrainedTokenizerFast call earlier, no special
# tokens are passed explicitly here.
wrapped_tokenizer = BertTokenizerFast(tokenizer_object=new_tokenizer)

In [122]:
# Encode a sample sentence with the wrapped tokenizer, then print the encoding,
# its tokens, and the ids those tokens convert back to.
encode = wrapped_tokenizer("Let's test pre-tokenization.")
print(encode)
print(encode.tokens())
print(wrapped_tokenizer.convert_tokens_to_ids(encode.tokens()))

{'input_ids': [2, 3019, 11, 61, 3611, 1637, 17, 24318, 18903, 9862, 7048, 1337, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'let', "'", 's', 'test', 'pre', '-', 'tok', '##eni', '##za', '##ti', '##on', '.', '[SEP]']
[2, 3019, 11, 61, 3611, 1637, 17, 24318, 18903, 9862, 7048, 1337, 18, 3]


## Byte Pair Encoding

### Xây dựng từng block của normalize, pre-tokenizer

*   Sử dụng Sequence trong từng block để kết hợp các thuộc tính
*   Phải tự định nghĩa special tokens cho Trainer và `PreTrainedTokenizerFast`

In [123]:
# Rebind `tokenizer` to a new tokenizer around a BPE model constructed with no arguments.
tokenizer = Tokenizer(models.BPE())

In [124]:
# Byte-level pre-tokenizer with add_prefix_space=False; in the recorded output the
# first word ('Let') carries no 'Ġ' marker while later words do.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

In [125]:
# Inspect how the byte-level pre-tokenizer splits a sample sentence.
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test pre-tokenization!")

[('Let', (0, 3)),
 ("'s", (3, 5)),
 ('Ġtest', (5, 10)),
 ('Ġpre', (10, 14)),
 ('-', (14, 15)),
 ('tokenization', (15, 27)),
 ('!', (27, 28))]

In [126]:
# Rebind `trainer` to a BPE trainer whose only special token is "<|endoftext|>",
# then train on the text batches yielded by get_training_corpus().
trainer = trainers.BpeTrainer(vocab_size=25000, special_tokens=["<|endoftext|>"])
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

In [127]:
# Encode a sample sentence with the trained BPE tokenizer and print its tokens.
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['L', 'et', "'", 's', 'Ġtest', 'Ġthis', 'Ġto', 'ken', 'izer', '.']


In [128]:
# Map a token back to the source text through its offsets. Index 4 is 'Ġtest' in the
# token list printed above; the recorded slice (' test') includes the leading space.
sentence = "Let's test this tokenizer."
encoding = tokenizer.encode(sentence)
start, end = encoding.offsets[4]
sentence[start:end]

' test'

In [129]:
# Byte-level post-processor with offset trimming disabled.
# NOTE(review): presumably this keeps the leading space inside the offsets of 'Ġ' tokens — confirm.
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

In [130]:
# Byte-level decoder, the counterpart of the byte-level pre-tokenizer set above.
tokenizer.decoder = decoders.ByteLevel()

In [131]:
# Decode the ids of the most recent `encoding` back into text.
tokenizer.decode(encoding.ids)

"Let's test this tokenizer."

In [141]:
# Save the BPE tokenizer to "tokenizer-bpe.json".
tokenizer.save("tokenizer-bpe.json")

In [145]:
from transformers import PreTrainedTokenizerFast
# Reload the saved BPE tokenizer and wrap it; "<|endoftext|>" serves as both the
# beginning-of-sequence and end-of-sequence token.
new_tokenizer = Tokenizer.from_file("tokenizer-bpe.json")
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=new_tokenizer,
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
)

In [146]:
# Encode a sample sentence with the wrapped tokenizer, then print the encoding,
# its tokens, and the ids those tokens convert back to.
encode = wrapped_tokenizer("Let's test pre-tokenization.")
print(encode)
print(encode.tokens())
print(wrapped_tokenizer.convert_tokens_to_ids(encode.tokens()))

{'input_ids': [44, 269, 7, 83, 2859, 577, 13, 84, 9777, 1995, 14], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['L', 'et', "'", 's', 'Ġtest', 'Ġpre', '-', 't', 'oken', 'ization', '.']
[44, 269, 7, 83, 2859, 577, 13, 84, 9777, 1995, 14]


### Xây dựng nhanh với kiến trúc sẵn có

*   Tất cả đóng gói sẵn
*   Hạn chế can thiệp vào những thuộc tính sẵn



In [134]:
# Swap in a fresh BPE model and retrain, this time reading the corpus from the text file.
# `trainer` is whatever trainer object is currently bound to that name.
tokenizer.model = models.BPE()
tokenizer.train(["wikitext-2.txt"], trainer=trainer)

In [136]:
# Encode a sample sentence with the retrained BPE tokenizer and print its tokens.
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['L', 'et', "'", 's', 'Ġtest', 'Ġthis', 'Ġto', 'ken', 'izer', '.']


In [137]:
# Map a token back to the source text through its offsets. Index 4 is 'Ġtest' in the
# token list printed above; the recorded slice (' test') includes the leading space.
sentence = "Let's test this tokenizer."
encoding = tokenizer.encode(sentence)
start, end = encoding.offsets[4]
sentence[start:end]

' test'

In [138]:
# Byte-level post-processor with offset trimming disabled.
# NOTE(review): presumably this keeps the leading space inside the offsets of 'Ġ' tokens — confirm.
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

In [139]:
# Byte-level decoder, the counterpart of the byte-level pre-tokenizer.
tokenizer.decoder = decoders.ByteLevel()

In [140]:
# Decode the ids of the most recent `encoding` back into text.
tokenizer.decode(encoding.ids)

"Let's test this tokenizer."

In [None]:
# Save to "tokenizer-bpe.json", the same path used by the hand-built BPE section.
tokenizer.save("tokenizer-bpe.json")

In [135]:
from transformers import GPT2TokenizerFast

# Model-specific wrapper: the in-memory tokenizer is passed directly and no special
# tokens are given explicitly.
wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)

In [147]:
# Encode a sample sentence with the wrapped tokenizer, then print the encoding,
# its tokens, and the ids those tokens convert back to.
encode = wrapped_tokenizer("Let's test pre-tokenization.")
print(encode)
print(encode.tokens())
print(wrapped_tokenizer.convert_tokens_to_ids(encode.tokens()))

{'input_ids': [44, 269, 7, 83, 2859, 577, 13, 84, 9777, 1995, 14], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['L', 'et', "'", 's', 'Ġtest', 'Ġpre', '-', 't', 'oken', 'ization', '.']
[44, 269, 7, 83, 2859, 577, 13, 84, 9777, 1995, 14]


## Unigram

### Xây dựng từng block của normalize, pre-tokenizer

*   Sử dụng Sequence trong từng block để kết hợp các thuộc tính
*   Phải tự định nghĩa special tokens cho Trainer và `PreTrainedTokenizerFast`

In [168]:
# Rebind `tokenizer` to a new tokenizer around a Unigram model constructed with no arguments.
tokenizer = Tokenizer(models.Unigram())

In [169]:
from tokenizers import Regex

# Normalization steps, chained with Sequence in the order listed.
tokenizer.normalizer = normalizers.Sequence(
    [
        # Replace `` and '' with a plain double-quote character.
        normalizers.Replace("``", '"'),
        normalizers.Replace("''", '"'),
        # Unicode NFKD normalization followed by accent stripping.
        normalizers.NFKD(),
        normalizers.StripAccents(),
        # Collapse any run of two or more spaces into a single space.
        normalizers.Replace(Regex(" {2,}"), " "),
    ]
)

In [170]:
# Sanity-check the normalizer; the recorded output shows this sample comes back unchanged.
tokenizer.normalizer.normalize_str("Let's test the pre-tokenizer!")

"Let's test the pre-tokenizer!"

In [171]:
# Metaspace pre-tokenizer with default settings.
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()

In [172]:
# Inspect the split: in the recorded output each piece starts with '▁', including the first word.
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test the pre-tokenizer!")

[("▁Let's", (0, 5)),
 ('▁test', (5, 10)),
 ('▁the', (10, 14)),
 ('▁pre-tokenizer!', (14, 29))]

In [173]:
# Special tokens for the Unigram tokenizer. The ids printed later (<cls> -> 0, <sep> -> 1)
# match the positions of those tokens in this list.
special_tokens = ["<cls>", "<sep>", "<unk>", "<pad>", "<mask>", "<s>", "</s>"]
# The unknown token is given to the trainer here; the Unigram model above was built without one.
trainer = trainers.UnigramTrainer(
    vocab_size=25000, special_tokens=special_tokens, unk_token="<unk>"
)
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

In [174]:
# Encode a sample sentence with the trained Unigram tokenizer and print its tokens.
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['▁Let', "'", 's', '▁test', '▁this', '▁to', 'ken', 'izer', '.']


In [175]:
# Encode a sentence pair; no post-processor is set yet, so the recorded output contains
# no special tokens, only type ids 0 and 1 for the two sentences.
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences!")
print(encoding.tokens)
print(encoding.type_ids)

['▁Let', "'", 's', '▁test', '▁this', '▁to', 'ken', 'izer', '.', '.', '.', '▁', 'on', '▁', 'a', '▁pair', '▁of', '▁sentence', 's', '!']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [176]:
# Look up the ids of <cls> and <sep>; the post-processing template needs them.
cls_token_id = tokenizer.token_to_id("<cls>")
sep_token_id = tokenizer.token_to_id("<sep>")
print(cls_token_id, sep_token_id)

0 1


In [177]:
# Template that appends the special tokens at the END of the sequence:
# A <sep> <cls> for a single sentence, A <sep> B <sep> <cls> for a pair.
# The number after each colon is the token type id; <cls> gets type id 2.
tokenizer.post_processor = processors.TemplateProcessing(
    single="$A:0 <sep>:0 <cls>:2",
    pair="$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2",
    special_tokens=[("<sep>", sep_token_id), ("<cls>", cls_token_id)],
)

In [178]:
# Metaspace decoder, the counterpart of the Metaspace pre-tokenizer set above.
tokenizer.decoder = decoders.Metaspace()

In [179]:
# Save the Unigram tokenizer to "tokenizer-unigram.json".
tokenizer.save("tokenizer-unigram.json")

In [180]:
from transformers import PreTrainedTokenizerFast

# Wrap the in-memory Unigram tokenizer; with the generic wrapper every special token is
# named explicitly, and padding is configured to go on the left.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    cls_token="<cls>",
    sep_token="<sep>",
    mask_token="<mask>",
    padding_side="left",
)

In [182]:
# Encode a sample sentence with the wrapped tokenizer, then print the encoding,
# its tokens, and the ids those tokens convert back to.
encode = wrapped_tokenizer("Let's test pre-tokenization.")
print(encode)
print(encode.tokens())
print(wrapped_tokenizer.convert_tokens_to_ids(encode.tokens()))

{'input_ids': [6428, 8030, 9, 1091, 1047, 28, 37, 293, 53, 2615, 11, 1, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['▁Let', "'", 's', '▁test', '▁pre', '-', 't', 'ok', 'en', 'ization', '.', '<sep>', '<cls>']
[6428, 8030, 9, 1091, 1047, 28, 37, 293, 53, 2615, 11, 1, 0]


### Xây dựng nhanh với kiến trúc sẵn có

*   Tất cả đóng gói sẵn
*   Hạn chế can thiệp vào những thuộc tính sẵn



In [183]:
# Swap in a fresh Unigram model and retrain, this time reading the corpus from the text file.
# `trainer` is whatever trainer object is currently bound to that name.
tokenizer.model = models.Unigram()
tokenizer.train(["wikitext-2.txt"], trainer=trainer)

In [184]:
# Encode a sample sentence with the retrained tokenizer.
# NOTE(review): the recorded output already ends with <sep>/<cls>, presumably because the
# post-processor assigned in the previous section is still attached — confirm.
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['▁Let', "'", 's', '▁test', '▁this', '▁to', 'ken', 'izer', '.', '<sep>', '<cls>']


In [185]:
# Look up the ids of <cls> and <sep> again after retraining.
cls_token_id = tokenizer.token_to_id("<cls>")
sep_token_id = tokenizer.token_to_id("<sep>")
print(cls_token_id, sep_token_id)

0 1


In [186]:
# Template that appends the special tokens at the END of the sequence:
# A <sep> <cls> for a single sentence, A <sep> B <sep> <cls> for a pair.
# The number after each colon is the token type id; <cls> gets type id 2.
tokenizer.post_processor = processors.TemplateProcessing(
    single="$A:0 <sep>:0 <cls>:2",
    pair="$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2",
    special_tokens=[("<sep>", sep_token_id), ("<cls>", cls_token_id)],
)

In [187]:
# Metaspace decoder, the counterpart of the Metaspace pre-tokenizer.
tokenizer.decoder = decoders.Metaspace()

In [188]:
# Save the Unigram tokenizer under its own file name. The previous target,
# "tokenizer-bpe.json", overwrote the BPE tokenizer saved in the BPE section.
tokenizer.save("tokenizer-unigram.json")

In [189]:
from transformers import XLNetTokenizerFast

# Model-specific wrapper: the in-memory tokenizer is passed directly and no special
# tokens are given explicitly.
wrapped_tokenizer = XLNetTokenizerFast(tokenizer_object=tokenizer)

In [190]:
# Encode a sample sentence with the wrapped tokenizer, then print the encoding,
# its tokens, and the ids those tokens convert back to.
encode = wrapped_tokenizer("Let's test pre-tokenization.")
print(encode)
print(encode.tokens())
print(wrapped_tokenizer.convert_tokens_to_ids(encode.tokens()))

{'input_ids': [6422, 7794, 9, 1093, 1055, 28, 37, 292, 53, 2617, 11, 1, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['▁Let', "'", 's', '▁test', '▁pre', '-', 't', 'ok', 'en', 'ization', '.', '<sep>', '<cls>']
[6422, 7794, 9, 1093, 1055, 28, 37, 292, 53, 2617, 11, 1, 0]


In [191]:
# NOTE(review): this cell repeats the code of the preceding cell verbatim and records
# the same output; consider removing it.
encode = wrapped_tokenizer("Let's test pre-tokenization.")
print(encode)
print(encode.tokens())
print(wrapped_tokenizer.convert_tokens_to_ids(encode.tokens()))

{'input_ids': [6422, 7794, 9, 1093, 1055, 28, 37, 292, 53, 2617, 11, 1, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['▁Let', "'", 's', '▁test', '▁pre', '-', 't', 'ok', 'en', 'ization', '.', '<sep>', '<cls>']
[6422, 7794, 9, 1093, 1055, 28, 37, 292, 53, 2617, 11, 1, 0]
