In [116]:
from datasets import load_dataset

dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")


In [117]:
def get_training_corpus():
    for i in range(0,len(dataset),1000):
        yield dataset[i:i+1000]['text']

In [118]:
from tokenizers import normalizers, pre_tokenizers, models, trainers,processors,decoders, Tokenizer



#### Building a WordPiece tokenizer from scratch

In [119]:
# Normalization

In [120]:
tokenizer = Tokenizer(models.WordPiece(unk_token='<unknown>'))

In [121]:
tokenizer.normalizer = normalizers.BertNormalizer()

In [122]:
tokenizer.normalizer = normalizers.Sequence([normalizers.NFD(),normalizers.Lowercase(),normalizers.StripAccents()])

In [123]:
tokenizer.normalizer.normalize_str("Héllò hôw are ü?")  ## Test 

'hello how are u?'

In [124]:
## Pre Tokenizer 
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [125]:
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

In [126]:
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

In [127]:
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.Whitespace(),pre_tokenizers.Punctuation()
])


In [128]:
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

In [129]:
# Trainer

In [130]:
special_tokens = ["<unknown>", "<padding>", "<start>", "<sep>", "<mask>"]
trainer = trainers.WordPieceTrainer(vocab_size=25000,special_tokens=special_tokens,continuing_subword_prefix="<ss>")

In [131]:
tokenizer.train_from_iterator(get_training_corpus(),trainer=trainer,)






In [132]:
encoding = tokenizer.encode('This is Pranav Jha, I live in Banglore')
print(encoding.tokens)

['this', 'is', 'pra', '<ss>na', '<ss>v', 'j', '<ss>ha', ',', 'i', 'live', 'in', 'bang', '<ss>lore']


In [133]:
# Post Processing

In [134]:
start_token_id = tokenizer.token_to_id('<start>')
sep_token_id = tokenizer.token_to_id('<sep>')
print(start_token_id, sep_token_id)

2 3


In [135]:
tokenizer.post_processor = processors.TemplateProcessing(
single= f"<start>:0 $A:0 <sep>:0",
pair=  f"<start>:0 $A:0 <sep>:0 $B:1 <sep>:1",
special_tokens= [('<start>',start_token_id),('<sep>',sep_token_id)],
)

In [136]:
encoding = tokenizer.encode('This is Pranav Jha, I live in Banglore',"I work in Ericsson")

print(encoding.tokens)

['<start>', 'this', 'is', 'pra', '<ss>na', '<ss>v', 'j', '<ss>ha', ',', 'i', 'live', 'in', 'bang', '<ss>lore', '<sep>', 'i', 'work', 'in', 'ericsson', '<sep>']


In [137]:
encoding.type_ids

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

In [138]:
tokenizer.decoder = decoders.WordPiece(prefix='<ss>')

In [140]:
tokenizer.decode(encoding.ids)

'this is pranav jha, i live in banglore i work in ericsson'

In [141]:
## Save
tokenizer.save("tokenizer.json")

In [142]:
## Reload 
new_tokenizer = Tokenizer.from_file("tokenizer.json")

In [143]:
## Transformer 

from transformers import PreTrainedTokenizerFast

In [149]:
wrapped_tokenizer = PreTrainedTokenizerFast(
#     tokenizer_object=tokenizer,
    tokenizer_file = "tokenizer.json",    unk_token="<unknown>",
    pad_token="<padding>",
    cls_token="<start>",
    sep_token="<sep>",
    mask_token="<mask>",model_max_length=512)

In [150]:
wrapped_tokenizer

PreTrainedTokenizerFast(name_or_path='', vocab_size=25000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '<unknown>', 'sep_token': '<sep>', 'pad_token': '<padding>', 'cls_token': '<start>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True)

In [152]:
from transformers import BertTokenizerFast

wrapped_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)

#### Building a BPE tokenizer from scratch

In [156]:
from tokenizers import normalizers,pre_tokenizers,models,trainers,processors,Tokenizer

In [158]:
# Init
tokenizer = Tokenizer(model= models.BPE(unk_token='<unknown>',continuing_subword_prefix='<sw>'))

In [159]:
# Normalize  -- > GPT2 DOES NOT NEED NORMALIZER


In [163]:
# PRE TOKENIZER
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.pre_tokenizer.pre_tokenize_str('This is Pranav, I am learning ByteLevel tokenization from hugging face')

[('This', (0, 4)),
 ('Ġis', (4, 7)),
 ('ĠPranav', (7, 14)),
 (',', (14, 15)),
 ('ĠI', (15, 17)),
 ('Ġam', (17, 20)),
 ('Ġlearning', (20, 29)),
 ('ĠByteLevel', (29, 39)),
 ('Ġtokenization', (39, 52)),
 ('Ġfrom', (52, 57)),
 ('Ġhugging', (57, 65)),
 ('Ġface', (65, 70))]

In [165]:
## Trainer
trainer = trainers.BpeTrainer(vocab_size=25000,special_tokens=["<|endoftext|>"])


In [166]:
tokenizer.train_from_iterator(get_training_corpus(),trainer=trainer)






In [175]:
encoding = tokenizer.encode("This is a sample text to experiment with the gpt-2 tekenizer")

In [177]:
print(encoding.tokens)

['T', 'h', 'is', 'Ġis', 'Ġa', 'Ġsample', 'Ġtext', 'Ġto', 'Ġexperiment', 'Ġwith', 'Ġthe', 'Ġg', 'pt', '-', '2', 'Ġte', 'ken', 'izer']


In [196]:
# Post processing
tokenizer.post_process = processors.ByteLevel(trim_offsets=True)

In [197]:
sentence = "This is a sample text to experiment with the gpt-2 tekenizer"
encoding = tokenizer.encode(sentence)
print(encoding.tokens)

['T', 'h', 'is', 'Ġis', 'Ġa', 'Ġsample', 'Ġtext', 'Ġto', 'Ġexperiment', 'Ġwith', 'Ġthe', 'Ġg', 'pt', '-', '2', 'Ġte', 'ken', 'izer']


In [198]:
start,end = encoding.offsets[9]
sentence[start:end]

' with'

In [199]:
## decoder 
tokenizer.decoder = decoders.ByteLevel()
tokenizer.decode(encoding.ids)

'This is a sample text to experiment with the gpt-2 tekenizer'

In [201]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
)

#    --------------- or ---------------------
from transformers import GPT2TokenizerFast

wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)

#### Building a Unigram tokenizer from scratch

In [202]:
tokenizer = Tokenizer(models.Unigram())

In [204]:
from tokenizers import Regex

tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.Replace("``", '"'),
        normalizers.Replace("''", '"'),
        normalizers.NFKD(),
        normalizers.StripAccents(),
        normalizers.Replace(Regex(" {2,}"), " "),
    ]
)
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test the pre-tokenizer!")

[("▁Let's", (0, 5)),
 ('▁test', (5, 10)),
 ('▁the', (10, 14)),
 ('▁pre-tokenizer!', (14, 29))]

In [205]:
special_tokens = ["<cls>", "<sep>", "<unk>", "<pad>", "<mask>", "<s>", "</s>"]
trainer = trainers.UnigramTrainer(
    vocab_size=25000, special_tokens=special_tokens, unk_token="<unk>"
)
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)





In [206]:
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['▁Let', "'", 's', '▁test', '▁this', '▁to', 'ken', 'izer', '.']


In [207]:
cls_token_id = tokenizer.token_to_id("<cls>")
sep_token_id = tokenizer.token_to_id("<sep>")
print(cls_token_id, sep_token_id)

0 1


In [208]:
tokenizer.post_processor = processors.TemplateProcessing(
    single="$A:0 <sep>:0 <cls>:2",
    pair="$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2",
    special_tokens=[("<sep>", sep_token_id), ("<cls>", cls_token_id)],
)

In [209]:
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences!")
print(encoding.tokens)
print(encoding.type_ids)

['▁Let', "'", 's', '▁test', '▁this', '▁to', 'ken', 'izer', '.', '.', '.', '<sep>', '▁', 'on', '▁', 'a', '▁pair', '▁of', '▁sentence', 's', '!', '<sep>', '<cls>']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]


In [211]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    cls_token="<cls>",
    sep_token="<sep>",
    mask_token="<mask>",
    padding_side="left",
)