In [1]:
import os

from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders

In [2]:
# Training hyper-parameters shared by every tokenizer in this notebook.

# Target vocabulary size handed to the trainer (special tokens included in the list below).
VOCAB_SIZE = 8_192

# The order fixes the ids used by init_tokenizer ([PAD]=1, [BOS]=2, [EOS]=3).
SPECIAL_TOKENS = [
    '[UNK]',
    '[PAD]',
    '[BOS]',
    '[EOS]',
]

# Passed as `min_frequency` to the WordPiece trainer.
MIN_FREQUENCY = 32

In [3]:
def init_tokenizer():
    """Build a fresh, untrained WordPiece tokenizer with the shared pipeline.

    The pipeline consists of NFD + lowercase + accent-stripping normalization,
    whitespace pre-tokenization, '[BOS] ... [EOS]' wrapping of single
    sequences, '[PAD]' padding and a WordPiece decoder.

    Returns:
        Tokenizer: a new tokenizer instance that still has to be trained.
    """
    # The post-processor and the padding config need numeric ids. Derive them
    # from the position in SPECIAL_TOKENS (the list handed to the trainer)
    # instead of repeating 1/2/3 here, so the ids cannot drift apart if the
    # list is ever reordered or extended.
    pad_id = SPECIAL_TOKENS.index('[PAD]')
    bos_id = SPECIAL_TOKENS.index('[BOS]')
    eos_id = SPECIAL_TOKENS.index('[EOS]')

    # Init the WordPiece based tokenizer with unk_token='[UNK]'
    tokenizer = Tokenizer(WordPiece(unk_token='[UNK]'))
    # Setup the normalization pipeline
    tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = Whitespace()
    # Wrap every single sequence in begin/end-of-sentence markers
    tokenizer.post_processor = TemplateProcessing(
        single='[BOS] $A [EOS]',
        special_tokens=[('[BOS]', bos_id), ('[EOS]', eos_id)],
    )
    tokenizer.enable_padding(pad_id=pad_id, pad_token='[PAD]')
    tokenizer.decoder = decoders.WordPiece()
    return tokenizer

# A single trainer configuration, reused for every tokenizer trained below
trainer = WordPieceTrainer(
    vocab_size=VOCAB_SIZE,
    min_frequency=MIN_FREQUENCY,
    special_tokens=SPECIAL_TOKENS,
)

In [4]:
# You can train the tokenizer from the raw WikiText-103 files.
# Download and unpack the corpus first:
## wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
## unzip wikitext-103-raw-v1.zip
tokenizer_train_files = [f'./Data/wikitext-103-raw-v1/wikitext-103-raw/wiki.{split}.raw' for split in ['train', 'valid', 'test']]

# Make sure the output directory exists before the expensive training step,
# so the save below cannot fail on a missing folder once training is done.
os.makedirs('./Model', exist_ok=True)

tokenizer_wiki = init_tokenizer()
tokenizer_wiki.train(tokenizer_train_files, trainer)

tokenizer_wiki.save('./Model/tokenizer-wiki.json')

In [5]:
# Or train from iterator
# Let's load the dataset first
import os

# Offline mode must be requested BEFORE `datasets` is imported: the library
# reads this environment variable at import time, so setting it afterwards
# has no effect on the already-imported module.
os.environ['HF_DATASETS_OFFLINE'] = '1'

from datasets import load_dataset

# Load the WMT19 zh-en translation dataset from huggingface
dataset = load_dataset('wmt19', 'zh-en')

Using the latest cached version of the module from D:\Archives\HuggingfaceCache\modules\datasets_modules\datasets\wmt19\29e210fae5690e843cae5dc43b53db36c4e02f927db50cd5235a22ab42dde90a (last modified on Sat Apr  1 21:51:16 2023) since it couldn't be found locally at wmt19., or remotely on the Hugging Face Hub.
Found cached dataset wmt19 (D:/Archives/HuggingfaceCache/datasets/wmt19/zh-en/1.0.0/29e210fae5690e843cae5dc43b53db36c4e02f927db50cd5235a22ab42dde90a)


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Define an iterator to get batches of sentences from the dataset
def batch_iterator(batch_size=1000, language='en', subset_rows=250000, data=None):
    """Yield batches of sentences in one language from the translation dataset.

    The 'train' split is limited to its first `subset_rows` rows; the
    'validation' split is always consumed completely.

    Args:
        batch_size: maximum number of sentences per yielded batch.
        language: key of the sentence to pick from each translation pair.
        subset_rows: upper bound on the number of 'train' rows to use.
        data: object exposing `num_rows[split]` and
            `data[split][i:j]['translation']`; defaults to the module-level
            `dataset`.

    Yields:
        list: up to `batch_size` sentences in the requested language.
    """
    source = dataset if data is None else data
    for split in ['train', 'validation']:
        total_rows = source.num_rows[split]
        # Never read past the end of the split: an oversized `subset_rows`
        # would otherwise produce a tail of empty batches.
        limit = total_rows if split == 'validation' else min(subset_rows, total_rows)
        rows = source[split]
        for start in range(0, limit, batch_size):
            # Clamp the last batch so 'train' never exceeds `subset_rows`
            # when it is not a multiple of `batch_size`.
            stop = min(start + batch_size, limit)
            batch = rows[start:stop]['translation']
            yield [pair[language] for pair in batch]

# Example of what the iterator yields:
# print(next(batch_iterator(2)))
# Output: ['1929 or 1989?', 'PARIS – As the economic crisis deepens and widens, the world has been searching for historical analogies to help us understand what has been happening.']

# Make sure the output directory exists before the expensive training runs,
# so the saves below cannot fail on a missing folder once training is done.
os.makedirs('./Model', exist_ok=True)

# English tokenizer trained on the English side of the translation pairs
tokenizer_wmt19_en = init_tokenizer()
tokenizer_wmt19_en.train_from_iterator(batch_iterator(language='en'), trainer=trainer)
tokenizer_wmt19_en.save('./Model/tokenizer-wmt19-en.json')

# Chinese tokenizer trained on the Chinese side of the translation pairs
tokenizer_wmt19_zh = init_tokenizer()
tokenizer_wmt19_zh.train_from_iterator(batch_iterator(language='zh'), trainer=trainer)
tokenizer_wmt19_zh.save('./Model/tokenizer-wmt19-zh.json')


In [7]:
# Reload the WikiText tokenizer from disk and round-trip one sample sentence
tokenizer_wiki = Tokenizer.from_file('Model/tokenizer-wiki.json')

sample_text = "When encoding multiple sentences, you can automatically pad the outputs to the longest sentence present by using Tokenizer."
output = tokenizer_wiki.encode(sample_text)

# Sub-word pieces, then their vocabulary ids
print(output.tokens)
print(output.ids)

# Text reconstructed from the ids
decoded_text = tokenizer_wiki.decode(output.ids)
print(decoded_text)

['[BOS]', 'when', 'en', '##c', '##od', '##ing', 'm', '##ult', '##ip', '##le', 'se', '##n', '##ten', '##ces', ',', 'you', 'can', 'aut', '##om', '##at', '##ically', 'p', '##ad', 'the', 'out', '##p', '##ut', '##s', 'to', 'the', 'long', '##est', 'se', '##n', '##ten', '##ce', 'pres', '##ent', 'by', 'us', '##ing', 'to', '##ke', '##n', '##iz', '##er', '.', '[EOS]']
[2, 7418, 7254, 4063, 7206, 7123, 54, 7400, 7259, 7150, 7203, 4062, 7706, 7476, 15, 7988, 7427, 7915, 7132, 7114, 8032, 57, 7144, 7107, 7404, 4082, 7162, 4075, 7127, 7107, 7708, 7196, 7203, 4062, 7706, 7185, 7583, 7137, 7179, 7336, 7123, 7127, 7414, 4062, 7384, 7108, 17, 3]
when encoding multiple sentences, you can automatically pad the outputs to the longest sentence present by using tokenizer.


In [8]:
# Reload both WMT19 tokenizers from disk
tokenizer_wmt19_en = Tokenizer.from_file('./Model/tokenizer-wmt19-en.json')
tokenizer_wmt19_zh = Tokenizer.from_file('./Model/tokenizer-wmt19-zh.json')

# One parallel sentence pair to try them on
test_sentence_en = 'It was later realized that the signal they had detected could be entirely attributed to interstellar dust.'
test_sentence_zh = '但后来他们逐渐意识到所探测到的信号可能完全来源于星际尘埃。'

encoded_en = tokenizer_wmt19_en.encode(test_sentence_en)
encoded_zh = tokenizer_wmt19_zh.encode(test_sentence_zh)

# Tokens and ids, English first, then Chinese
for encoding in (encoded_en, encoded_zh):
    print(encoding.tokens)
    print(encoding.ids)

# Decode the ids back to text with the matching tokenizer
for tok, encoding in ((tokenizer_wmt19_en, encoded_en), (tokenizer_wmt19_zh, encoded_zh)):
    print(tok.decode(encoding.ids))

['[BOS]', 'it', 'was', 'later', 'realized', 'that', 'the', 'signal', 'they', 'had', 'det', '##ected', 'could', 'be', 'entirely', 'attrib', '##uted', 'to', 'inter', '##st', '##ell', '##ar', 'd', '##ust', '.', '[EOS]']
[2, 234, 362, 2169, 6641, 226, 189, 6150, 364, 662, 1132, 979, 545, 235, 3637, 5322, 2165, 205, 359, 220, 465, 203, 42, 311, 17, 3]
['[BOS]', '但', '##后', '##来', '##他', '##们', '##逐', '##渐', '##意', '##识', '##到', '##所', '##探', '##测', '##到', '##的', '##信', '##号', '##可', '##能', '##完', '##全', '##来', '##源', '##于', '##星', '##际', '##尘', '##埃', '。', '[EOS]']
[2, 307, 5030, 4950, 4923, 4824, 5571, 5119, 4849, 4825, 4743, 4642, 6103, 5114, 4743, 4650, 4813, 4669, 4731, 4740, 4832, 4827, 4950, 4945, 4834, 5826, 4944, 7169, 5224, 147, 3]
it was later realized that the signal they had detected could be entirely attributed to interstellar dust.
但后来他们逐渐意识到所探测到的信号可能完全来源于星际尘埃 。
