In [42]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
# Start from an untrained BPE model that uses '<unk>' as its unknown token,
# then wrap it in a Tokenizer so the remaining pipeline stages can be attached.
bpe_model = BPE(unk_token='<unk>')
tokenizer = Tokenizer(bpe_model)

In [43]:
from tokenizers.trainers import BpeTrainer
# Trainer settings: the target vocabulary size and the reserved special tokens.
# The order of the special tokens matters to later cells: '<pad>' is referred
# to by a numeric id when padding is enabled.
trainer = BpeTrainer(
    vocab_size=4_096,
    special_tokens=[
        '<unk>',
        '<sep>',
        '<pad>',
        '<mask>',
    ],
)

In [44]:
from tokenizers.pre_tokenizers import Whitespace
# Attach the library's Whitespace pre-tokenizer so the text is split into
# smaller pieces before the BPE model is trained on it or applied to it.
tokenizer.pre_tokenizer = Whitespace()

In [45]:
# Fit the BPE vocabulary on the corpus file, using the trainer configured in
# an earlier cell.
tokenizer.train(files=['./sentences.txt'], trainer=trainer)

In [46]:
# Sanity check: the trained vocabulary size should equal the vocab_size
# requested from the trainer (4096).
tokenizer.get_vocab_size()

4096

In [47]:
# Encode a sample sentence. No post-processor is attached at this point, so
# the encoding holds only pieces derived from the text itself.
output = tokenizer.encode('السلام عليكم و رحمة الله ...')
output

Encoding(num_tokens=9, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [48]:
# Token strings of the sample encoding, in sequence order.
output.tokens

['السلام', 'علي', 'كم', 'و', 'رح', 'مة', 'الله', '..', '.']

In [49]:
# Vocabulary ids of the sample encoding, one per entry of output.tokens.
output.ids

[3808, 1823, 1786, 477, 3669, 1695, 1943, 3580, 17]

In [50]:
from tokenizers.processors import TemplateProcessing
# Surround every encoded sequence with '<sep>' markers. In the pair template
# the second segment and its closing marker carry the ':1' type-id suffix.
sep_token_id = tokenizer.token_to_id('<sep>')
single_template = '<sep> $A <sep>'
pair_template = '<sep> $A <sep> $B:1 <sep>:1'

tokenizer.post_processor = TemplateProcessing(
    single=single_template,
    pair=pair_template,
    special_tokens=[('<sep>', sep_token_id)],
)

In [51]:
# Re-encode the same sample now that the template post-processor is attached;
# the encoding gains one '<sep>' at each end (11 tokens instead of 9).
output = tokenizer.encode('السلام عليكم و رحمة الله ...')
output

Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [52]:
# Token strings after post-processing: '<sep>' opens and closes the sequence.
output.tokens

['<sep>', 'السلام', 'علي', 'كم', 'و', 'رح', 'مة', 'الله', '..', '.', '<sep>']

In [53]:
# Ids after post-processing: the leading and trailing 1 are the '<sep>' id.
output.ids

[1, 3808, 1823, 1786, 477, 3669, 1695, 1943, 3580, 17, 1]

In [54]:
# Look the padding id up from the trained vocabulary instead of hard-coding
# it: a literal id silently goes stale if the trainer's special_tokens list is
# ever reordered, and the '<sep>' id is already resolved via token_to_id.
pad_token = '<pad>'
pad_id = tokenizer.token_to_id(pad_token)
if pad_id is None:
    raise ValueError(f'{pad_token!r} is missing from the tokenizer vocabulary')

# Pad batched encodings with '<pad>' and cap every encoding at 512 tokens.
tokenizer.enable_padding(pad_id=pad_id, pad_token=pad_token)
tokenizer.enable_truncation(max_length=512)

In [55]:
from pathlib import Path

# Persist the full tokenizer configuration to a single JSON file. Create the
# target directory first so that a missing '../models' folder does not make
# the save fail after training has already completed.
tokenizer_path = '../models/nano-gpt-tokenizer.json'
Path(tokenizer_path).parent.mkdir(parents=True, exist_ok=True)
tokenizer.save(tokenizer_path)