In [1]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
     Regex,
    AddedToken, 
)
from tokenizers.normalizers import Normalizer
from transformers import AutoTokenizer, PreTrainedTokenizerFast
import re

# Normalizers

1. Unicode normalization 
2. Mapping from 'y' to 'i (bác sỹ -> bác sĩ) except when 'y' is alone or when it's in the compound 'uy' 

In [2]:
# test = "thy"
# test_pattern = re.compile(r"(?:[th])*y")

# print(re.findall(test_pattern, test))

In [3]:
# pattern = re.compile(r"(?<=\b[hklmnst])(?:\S*)(?<=[^a\á\à\ạ\ả\ã\â\ấ\ầ\ậ\ẩ\ẫ\ă\ắ\ằ\ẳ\ẵ\ặu\ú\ù\ụ\ủ\ũ\s])([y\ỷ\ỹ\ỵ\ỳ\ý])\b")
# test_text = "bác sỹ, thạc sỹ và ca sỹ, thý thoy mình bị suy dinh dưỡng."


# print(re.findall(pattern, test_text))

In [4]:
# from encodings import normalize_encoding


# class CustomNormalizer: 
#     def __init__(self): 
#         self.pattern = re.compile(r"(?<=\b[hklmnst])(?:\S*)(?<=[^a\á\à\ạ\ả\ã\â\ấ\ầ\ậ\ẩ\ẫ\ă\ắ\ằ\ẳ\ẵ\ặu\ú\ù\ụ\ủ\ũ\s])([y\ỷ\ỹ\ỵ\ỳ\ý])\b")
#         self.y_to_i_map = {
#             'y': 'i',
#             'ỷ': 'ỉ',
#             'ỹ': 'ĩ',
#             'ỵ': 'ị',
#             'ỳ': 'ì',
#             'ý': 'í'
#         }

#     def normalize(self, normalized: NormalizedString): 
#         original = normalized.original
#         matches = self.pattern.findall(original)

#         for match in matches: 
#             normalized.replace(match, self.y_to_i_map[match])

#         print(normalized.normalized)
#         normalized.nfc()

In [5]:
# test_text = "bác sỹ, thạc sỹ và ca sỹ, thý thoy mình bị suy dinh dưỡng."
# norm = NormalizedString(test_text)
# norm.lowercase()

# norm.normalized

In [6]:
# re.findall(re.compile(r"(?<=\b[hklmnst])(?:\S*)(?<=[^a\á\à\ạ\ả\ã\â\ấ\ầ\ậ\ẩ\ẫ\ă\ắ\ằ\ẳ\ẵ\ặu\ú\ù\ụ\ủ\ũ\s])(ý)\b"), test_text)

In [7]:
tokenizer = Tokenizer(models.BPE(unk_token=None, fuse_unk=False, dropout=None, end_of_word_suffix="", continuing_subword_prefix="", byte_fallback=True,))

In [8]:
normalizer_sequence = normalizers.Sequence([
    normalizers.NFC(), 
])

# Pre-tokenizers


In [9]:
pretokenizer_sequence = pre_tokenizers.Sequence([
    pre_tokenizers.Split(Regex("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"), behavior="isolated", invert=False),
    pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False)
])

# Decoder

In [10]:
decoder = decoders.ByteLevel()

# Generate special tokens

In [11]:
cls = AddedToken(content="[CLS]", lstrip=False, normalized=False, rstrip=False, single_word=True)
pad = AddedToken(content="[PAD]", lstrip=False, normalized=False, rstrip=False, single_word=True)
mask = AddedToken(content="[MASK]", lstrip=False, normalized=False, rstrip=False, single_word=True)
sep = AddedToken(content="[SEP]", lstrip=False, normalized=False, rstrip=False, single_word=True)
unk = AddedToken(content="[UNK]", lstrip=False, normalized=False, rstrip=False, single_word=True)

# Create dataset generator

In [12]:
from datasets import load_dataset

raw_dataset = load_dataset("arrow", data_files="../data_all/data/data_00000.arrow")

In [13]:
def get_training_corpus():
    dataset = raw_dataset["train"]
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        yield samples["text"]

In [14]:
training_corpus = get_training_corpus()

# Combine & Train

In [15]:
tokenizer.normalizer = normalizer_sequence
tokenizer.pre_tokenizer = pretokenizer_sequence
tokenizer.decoder = decoder

In [16]:
# (30000 + 261 + 64 - 53) 
# vocab_size = 30272
vocab_size = 500
max_token_length = 2048

In [17]:
trainer = trainers.BpeTrainer(vocab_size=vocab_size, show_progress=True, max_token_length=max_token_length) 

In [18]:
tokenizer.train_from_iterator(training_corpus, trainer=trainer)






# Config tokenizer post processors

In [19]:
# add special tokens 

tokenizer.add_special_tokens([cls, pad, mask, sep, unk])

5

In [20]:
# enable padding 
tokenizer.enable_padding(pad_id=vocab_size+1, pad_type_id=vocab_size+1, pad_token="[PAD]")

In [21]:
# enable truncation 
tokenizer.enable_truncation(max_length=max_token_length)

In [22]:
# set postprocessors
postprocessor_sequence = processors.Sequence([
    processors.ByteLevel(trim_offsets=False), 
    processors.TemplateProcessing(
    single="[CLS] $0 [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", vocab_size), ("[SEP]", vocab_size+3)]),
])

tokenizer.post_processor = postprocessor_sequence

In [23]:
# https://github.com/huggingface/tokenizers/issues/282

# Convert to Fast Tokenizer and Save

In [24]:
tokenizer.save("./trained_tokenizer.json")

In [25]:
wrapped_tokenizer = PreTrainedTokenizerFast(
    # tokenizer,
    tokenizer_file="./trained_tokenizer.json",    
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [26]:
wrapped_tokenizer.model_max_length = max_token_length
wrapped_tokenizer.save_pretrained("./tokenizer", legacy_format=False)

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/tokenizer.json')

# Test fast tokenizer

In [27]:
test_tokenizer = AutoTokenizer.from_pretrained("./tokenizer")

In [32]:
tokens = test_tokenizer(raw_dataset["train"]["text"][0:100], return_tensors="pt", truncation=True, padding=True)

In [33]:
tokens

{'input_ids': tensor([[500,  32,  69,  ..., 501, 501, 501],
        [500, 437, 296,  ..., 501, 501, 501],
        [500,  43, 260,  ..., 501, 501, 501],
        ...,
        [500,  49, 228,  ..., 501, 501, 501],
        [500,  49,  79,  ..., 501, 501, 501],
        [500,  43,  68,  ..., 501, 501, 501]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [34]:
test_tokenizer.batch_decode(tokens["input_ids"])

['[CLS]Chuyện đã hơn chục năm nay, nhưng lão Khổ còn đau mãi tới giờ.Vụ lúa chiêm năm ấy càng về cuối càng thuận.[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]',
 '[CLS]Đúng vào kỳ lúa đỏ đuôi, vòm trời thật nở nang.[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]',
 '[CLS]Nắng đến sướng.[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]