In [1]:
# In this notebook, you learn:
#
# 1) What is tokenization?
# 2) What is Byte Pair Encoding?
# 3) How to use HuggingFace's Tokenizer class to train a byte pair encoding tokenizer? 
#
# NOTE: If you just want a simple, get-it-done tokenization solution and do not need to train a Byte Pair
# Encoding tokenizer yourself, use the pre-trained spacy tokenizers (step_2_tokenization_with_spacy) instead.

In [2]:
# Resources to learn about Tokenization:
#
# 1) https://www.youtube.com/watch?v=zduSFxRajkE
#       -- Explains tokenization and creates a byte pair encoding tokenizer from scratch.
#       -- Best video by far to understand byte level BPE tokenization.
#       -- Very long video.
#       -- MUST WATCH - MUST WATCH - MUST WATCH - MUST WATCH - MUST WATCH - MUST WATCH.
# 2) https://realpython.com/introduction-to-python-generators/
#       -- Excellent resource to learn about Generators and yield statement in python.
# 3) https://github.com/huggingface/tokenizers/blob/14a07b06e4a8bd8f80d884419ae4630f5a3d8098/bindings/python/py_src/tokenizers/implementations/byte_level_bpe.py#L10
#       -- ByteLevelBPETokenizer class in github. 
#       -- Could not find any official HuggingFace documentation for this class. So, had to refer to the source code.

In [5]:
# Standard library imports.
import os
from typing import Generator, Iterable, Mapping, Optional

# HuggingFace imports to train my own byte level BPE tokenizer.
import datasets
from datasets import load_from_disk
from tokenizers import ByteLevelBPETokenizer # type: ignore

In [None]:
# Root folder that holds the AI4Bharat English-Telugu data.
AI4_BHARAT_DATA_PATH = "../../Data/AI4Bharat"

# Folders the trained tokenizer files (vocab.json / merges.txt) are written to and read back from,
# one folder per language.
ENGLISH_TOKENIZER_SAVE_PATH = "../../Data/trained_models/tokenizers/bpe/bpe_english_tokenizer"
TELUGU_TOKENIZER_SAVE_PATH = "../../Data/trained_models/tokenizers/bpe/bpe_telugu_tokenizer"

# Vocabulary size (number of tokens) requested when training each tokenizer.
# NOTE(review): the outputs recorded in this notebook report a vocabulary size of 32000, so they appear
# to come from a run with different values -- re-run the notebook after changing these constants.
ENGLISH_VOCAB_SIZE = 30_000
TELUGU_VOCAB_SIZE = 30_000

In [7]:
# The dataset used to train both tokenizers lives in a sub-folder of the AI4Bharat data folder.
train_dataset_path = f"{AI4_BHARAT_DATA_PATH}/full_en_te_dataset"
# load_from_disk takes the dataset folder as its first argument.
tokenizer_train_dataset = load_from_disk(train_dataset_path)

In [8]:
# We need an iterator to iterate on the datapoints in the tokenizer dataset. In general, the datasets could
# be arbitrarily large and we do not want to load the entire dataset into memory at once. Using a lazy iterator 
# (generator) ensures that not all the data is loaded into memory at once.
# The general syntax for a generator type hint is Generator[YieldType, SendType, ReturnType]. In our case, 
# we are only yielding strings and not sending or returning anything, so we use Generator[str, None, None].
def get_data_iterator(input_dataset: Iterable[Mapping[str, str]], language: str) -> Generator[str, None, None]:
    """Lazily yield the sentences of one language from a parallel English-Telugu dataset.

    Args:
        input_dataset: Any iterable of datapoints, where each datapoint maps "src" to the English
            sentence and "tgt" to the Telugu sentence (e.g. a HuggingFace Dataset).
        language: "en" to read the "src" field, "te" to read the "tgt" field.

    Returns:
        A generator producing one sentence per datapoint, in dataset order. Nothing is read from
        `input_dataset` until the generator is consumed.

    Raises:
        ValueError: If `language` is neither "en" nor "te". The check runs immediately, so a typo
            cannot silently train a tokenizer on the wrong language.
    """
    field_by_language = {"en": "src", "te": "tgt"}
    if language not in field_by_language:
        raise ValueError(
            f"Unsupported language {language!r}; expected one of {sorted(field_by_language)}."
        )
    # Resolve the field once instead of re-checking the language for every datapoint.
    field = field_by_language[language]
    return (datapoint[field] for datapoint in input_dataset)

In [9]:
def train_tokenizer(train_dataset: datasets.arrow_dataset.Dataset, language: str, vocab_size: Optional[int] = ENGLISH_VOCAB_SIZE) -> ByteLevelBPETokenizer:
    """Train a byte-level BPE tokenizer on the sentences of one language.

    Args:
        train_dataset: Parallel dataset whose datapoints are read through get_data_iterator.
        language: Language selector forwarded to get_data_iterator ("en" for English, "te" for Telugu).
        vocab_size: Number of tokens the trained vocabulary should contain. None is treated as
            "use the default" and resolves to ENGLISH_VOCAB_SIZE.

    Returns:
        The trained ByteLevelBPETokenizer.
    """
    # The signature admits None, but None is not a usable vocabulary size for the trainer, so fall
    # back to the declared default instead of forwarding it.
    if vocab_size is None:
        vocab_size = ENGLISH_VOCAB_SIZE
    # Use BPE to train a ByteLevel BPE tokenizer.
    tokenizer = ByteLevelBPETokenizer()
    # train_from_iterator pulls sentences from the generator one at a time, so the training text is
    # streamed rather than collected into a list first.
    # The special tokens are passed in a fixed order; in the recorded run they received the ids
    # <sos>=0, <eos>=1, <pad>=2, <unk>=3.
    tokenizer.train_from_iterator(iterator=get_data_iterator(input_dataset=train_dataset, language=language),
                                  vocab_size=vocab_size,
                                  special_tokens=["<sos>", "<eos>", "<pad>", "<unk>"])
    return tokenizer

In [10]:
# Training two tokenizers is a design choice, not a requirement: a single tokenizer trained on the whole
# dataset would learn the vocabulary of both languages. Here each language gets its own tokenizer, so the
# merges and tokens each one learns are based only on the language it is trained on.

# English tokenizer.
en_tokenizer = train_tokenizer(tokenizer_train_dataset, "en", ENGLISH_VOCAB_SIZE)

# Telugu tokenizer.
te_tokenizer = train_tokenizer(tokenizer_train_dataset, "te", TELUGU_VOCAB_SIZE)









In [11]:
# Show each trained tokenizer, the size of its vocabulary and its Python type (English first, then Telugu).
for trained_tokenizer in (en_tokenizer, te_tokenizer):
    print(trained_tokenizer, trained_tokenizer.get_vocab_size(), type(trained_tokenizer))

Tokenizer(vocabulary_size=32000, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False) 32000 <class 'tokenizers.implementations.byte_level_bpe.ByteLevelBPETokenizer'>
Tokenizer(vocabulary_size=32000, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False) 32000 <class 'tokenizers.implementations.byte_level_bpe.ByteLevelBPETokenizer'>


In [None]:
# With the HuggingFace tokenizer there is no separate "build the vocabulary" step: the vocabulary is a
# product of training and can simply be read back with get_vocab(). This differs from the spacy-based
# approach, where we build our own vocabulary from the dataset. spacy does ship with a vocabulary, but it
# is based on the corpus used to train the word-based spacy tokenizer and not on our training dataset.
en_vocab = en_tokenizer.get_vocab()
te_vocab = te_tokenizer.get_vocab()

# Peek at the container type and the first ten (token, id) pairs of each vocabulary.
for vocab in (en_vocab, te_vocab):
    print(type(vocab), list(vocab.items())[:10])

<class 'dict'> [('ĠSadananda', 25660), ('Ġdoll', 6059), ('Ġdefended', 24073), ('Ġhung', 10345), ('ĠJaitley', 5551), ('Ġclusters', 20086), ('Ġfranchise', 10877), ('Ġembroiled', 27552), ('eep', 983), ('Ġbr', 1019)]
<class 'dict'> [('randomization', 22668), ('à°³à°¹à°°', 29099), ('à°¦à°Ł', 550), ('à°¦à°Ĺà°²', 3050), ('Ġà°°à°¡', 9637), ('munity', 13471), ('à°°à°¯à°ªà°Ł', 16107), ('ette', 25747), ('Ġà°Ķà°°à°¯', 27160), ('1951', 6895)]


In [13]:
# The special tokens are part of both vocabularies, so each of them resolves to an id.
special_tokens = ("<sos>", "<eos>", "<unk>", "<pad>")
print(*(en_tokenizer.token_to_id(token) for token in special_tokens))
print(*(te_tokenizer.token_to_id(token) for token in special_tokens))

# token_to_id only succeeds for strings that are directly present in the vocabulary and returns None
# otherwise. 'pet' is a vocabulary entry; 'petssss' is not, so its lookup gives None.
pet_encoding = en_tokenizer.encode("pet")
print(en_tokenizer.token_to_id("pet"), pet_encoding.ids[0], en_tokenizer.token_to_id("petssss"))

# encode(), in contrast, splits 'petssss' into the tokens 'pet', 'ss' and 'ss' and returns the id of
# each piece.
petssss_encoding = en_tokenizer.encode("petssss")
print(petssss_encoding.ids, petssss_encoding.tokens)

0 1 3 2
0 1 3 2
5463 5463 None
[5463, 779, 779] ['pet', 'ss', 'ss']


In [14]:
# Round trip for an English sentence: text -> token ids / tokens -> text.
english_sentence = "I hope people will find this repository useful."
encoded_english_sentence = en_tokenizer.encode(english_sentence)
print("sentence_encoded_to_token_ids: ", encoded_english_sentence.ids)
print("sentence_encoded_to_tokens: ", encoded_english_sentence.tokens)

# Decoding the ids should give the sentence back.
decoded_english_sentence = en_tokenizer.decode(encoded_english_sentence.ids)
print("token_ids_decoded_to_sentence: ", decoded_english_sentence)

sentence_encoded_to_token_ids:  [44, 3089, 577, 393, 2188, 394, 1267, 28779, 8204, 17]
sentence_encoded_to_tokens:  ['I', 'Ġhope', 'Ġpeople', 'Ġwill', 'Ġfind', 'Ġthis', 'Ġrep', 'ository', 'Ġuseful', '.']
token_ids_decoded_to_sentence:  I hope people will find this repository useful.


In [15]:
# Round trip for a Telugu sentence: text -> token ids / tokens -> text.
telugu_sentence = "అంతర్జాతీయ విమాన సర్వీసులపై నిషేధాన్ని కేంద్ర ప్రభుత్వం అక్టోబర్ 31 వరకు పొడగించింది"
encoded_telugu_sentence = te_tokenizer.encode(telugu_sentence)
print("sentence_encoded_to_token_ids: ", encoded_telugu_sentence.ids)

# The printed tokens do not look like Telugu text. Presumably (author's assumption, not verified here)
# this is because ByteLevelBPETokenizer works on the bytes of the text rather than on whole characters,
# so a token is a combination of bytes that need not be a valid Telugu character and is therefore not
# human readable. Decoding the ids with the tokenizer still returns the original Telugu sentence.
print("sentence_encoded_to_tokens: ", encoded_telugu_sentence.tokens)

decoded_telugu_sentence = te_tokenizer.decode(encoded_telugu_sentence.ids)
print("token_ids_decoded_to_sentence: ", decoded_telugu_sentence)

sentence_encoded_to_token_ids:  [325, 269, 370, 263, 303, 265, 272, 283, 274, 291, 264, 281, 265, 268, 457, 263, 280, 283, 278, 266, 496, 305, 299, 264, 302, 276, 308, 265, 268, 263, 268, 264, 289, 364, 273, 263, 267, 285, 263, 388, 266, 272, 263, 280, 269, 512, 263, 277, 275, 560, 263, 1379, 537, 266, 285, 310, 610, 294, 286, 294, 273, 264]
sentence_encoded_to_tokens:  ['à°ħ', 'à°Ĥ', 'à°¤à°°', 'à±į', 'à°ľ', 'à°¾', 'à°¤', 'à±Ģ', 'à°¯', 'Ġà°µ', 'à°¿', 'à°®', 'à°¾', 'à°¨', 'Ġà°¸à°°', 'à±į', 'à°µ', 'à±Ģ', 'à°¸', 'à±ģ', 'à°²à°ª', 'à±Ī', 'Ġà°¨', 'à°¿', 'à°·', 'à±ĩ', 'à°§', 'à°¾', 'à°¨', 'à±į', 'à°¨', 'à°¿', 'Ġà°ķ', 'à±ĩà°Ĥ', 'à°¦', 'à±į', 'à°°', 'Ġà°ª', 'à±į', 'à°°à°Ń', 'à±ģ', 'à°¤', 'à±į', 'à°µ', 'à°Ĥ', 'Ġà°ħà°ķ', 'à±į', 'à°Ł', 'à±ĭ', 'à°¬à°°', 'à±į', 'Ġ31', 'Ġà°µà°°à°ķ', 'à±ģ', 'Ġà°ª', 'à±Ĭ', 'à°¡à°Ĺ', 'à°¿à°Ĥ', 'à°ļ', 'à°¿à°Ĥ', 'à°¦', 'à°¿']
token_ids_decoded_to_sentence:  అంతర్జాతీయ విమాన సర్వీసులపై నిషేధాన్ని కేంద్ర ప్రభుత్వం అక్టోబర్ 31 వరకు పొడగించింది


In [16]:
# Save the trained tokenizers to disk. save_model writes vocab.json and merges.txt into the folder it is
# given and returns the paths of the files it wrote.
# The target folders are nested and may not exist yet (e.g. on a fresh checkout), so create them first;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(ENGLISH_TOKENIZER_SAVE_PATH, exist_ok=True)
os.makedirs(TELUGU_TOKENIZER_SAVE_PATH, exist_ok=True)
en_tokenizer.save_model(ENGLISH_TOKENIZER_SAVE_PATH)
te_tokenizer.save_model(TELUGU_TOKENIZER_SAVE_PATH)

['../../Data/trained_models/tokenizers/telugu_tokenizer/vocab.json',
 '../../Data/trained_models/tokenizers/telugu_tokenizer/merges.txt']

In [17]:
# Load the saved English tokenizer from disk. from_file rebuilds the tokenizer from the two files that
# save_model wrote into the English tokenizer folder.
en_tokenizer_loaded = ByteLevelBPETokenizer.from_file(vocab_filename=f"{ENGLISH_TOKENIZER_SAVE_PATH}/vocab.json", 
                                                      merges_filename=f"{ENGLISH_TOKENIZER_SAVE_PATH}/merges.txt")
# Confirm that the loaded tokenizer is working as expected: encode a sentence, show the ids, and check
# that decoding those ids gives the sentence back. The assertion makes the cell fail loudly on a broken
# load instead of relying on someone comparing the printed ids by eye.
english_check_sentence = "I hope people will find this repository useful."
encoded_english_sentence_2 = en_tokenizer_loaded.encode(english_check_sentence)
print("sentence_encoded_to_token_ids: ", encoded_english_sentence_2.ids)
assert en_tokenizer_loaded.decode(encoded_english_sentence_2.ids) == english_check_sentence, \
    "Loaded English tokenizer does not round-trip the check sentence."

sentence_encoded_to_token_ids:  [44, 3089, 577, 393, 2188, 394, 1267, 28779, 8204, 17]


In [18]:
# Load the saved Telugu tokenizer from disk. from_file rebuilds the tokenizer from the two files that
# save_model wrote into the Telugu tokenizer folder.
te_tokenizer_loaded = ByteLevelBPETokenizer.from_file(vocab_filename=f"{TELUGU_TOKENIZER_SAVE_PATH}/vocab.json", 
                                                      merges_filename=f"{TELUGU_TOKENIZER_SAVE_PATH}/merges.txt")
# Confirm that the loaded tokenizer is working as expected: encode a sentence, show the ids, and check
# that decoding those ids gives the sentence back. The assertion makes the cell fail loudly on a broken
# load instead of relying on someone comparing the printed ids by eye.
telugu_check_sentence = "అంతర్జాతీయ విమాన సర్వీసులపై నిషేధాన్ని కేంద్ర ప్రభుత్వం అక్టోబర్ 31 వరకు పొడగించింది"
encoded_telugu_sentence_2 = te_tokenizer_loaded.encode(telugu_check_sentence)
print("sentence_encoded_to_token_ids: ", encoded_telugu_sentence_2.ids)
assert te_tokenizer_loaded.decode(encoded_telugu_sentence_2.ids) == telugu_check_sentence, \
    "Loaded Telugu tokenizer does not round-trip the check sentence."

sentence_encoded_to_token_ids:  [325, 269, 370, 263, 303, 265, 272, 283, 274, 291, 264, 281, 265, 268, 457, 263, 280, 283, 278, 266, 496, 305, 299, 264, 302, 276, 308, 265, 268, 263, 268, 264, 289, 364, 273, 263, 267, 285, 263, 388, 266, 272, 263, 280, 269, 512, 263, 277, 275, 560, 263, 1379, 537, 266, 285, 310, 610, 294, 286, 294, 273, 264]
