In [1]:
# In this notebook, you learn:
#
# 1) How to use spacy tokenizers to tokenize text?
# 2) How to build vocabulary from a text corpus?
#

In [2]:
# Useful resources:
#
# 1) https://spacy.io/usage/spacy-101
#       -- A detailed overview of spacy.
# 2) https://spacy.io/usage/models
#       -- Explains how to use pretrained spacy tokenizer models.
# 3) https://realpython.com/python-for-loop/
#       -- To understand iter, iterator and iterables in python.

In [13]:
from datasets import load_from_disk
# torchtext is being deprecated, but I am using this for now since it makes it very easy to build the vocab and 
# use pre-built Spacy tokenizers. In our model, we will use the trained BPETokenizer from Huggingface.
from torchtext.vocab import build_vocab_from_iterator
from typing import Generator, List

import datasets
import pickle
import spacy

In [4]:
AI4_BHARAT_DATA_PATH = "../../Data/AI4Bharat"

In [5]:
# Load the tokenizer train dataset which we already saved to the disk in 'step_1_data_exploration.ipynb' notebook.
tokenizer_train_dataset = load_from_disk(dataset_path=f"{AI4_BHARAT_DATA_PATH}/full_en_te_dataset")
print(tokenizer_train_dataset)
print(type(tokenizer_train_dataset))
print(tokenizer_train_dataset[0])
print(tokenizer_train_dataset[10000])
print(tokenizer_train_dataset[13234])

Dataset({
    features: ['idx', 'src', 'tgt'],
    num_rows: 4946035
})
<class 'datasets.arrow_dataset.Dataset'>
{'idx': 0, 'src': 'Have you heard about Foie gras?', 'tgt': 'ఇక ఫ్రూట్ ఫ్లైస్ గురించి మీరు విన్నారా?'}
{'idx': 10000, 'src': 'You eat ants?', 'tgt': 'మీరు చీమలు తినడానికి?'}
{'idx': 13234, 'src': 'Thats an interesting one.', 'tgt': 'ఇందులో ఆసక్తికరమైంది ఒకటుంది.'}


## Loading Spacy Tokenizer models and tokenizing the sentences

In [6]:
# Load spacy models for English text tokenization.
en_model = spacy.load("en_core_web_sm")
en_tokenizer = en_model.tokenizer
print(en_tokenizer, type(en_tokenizer))
# Load spacy model for Telugu text tokenization.
te_model = spacy.blank("te")
te_tokenizer = te_model.tokenizer
print(te_tokenizer, type(te_tokenizer))

<spacy.tokenizer.Tokenizer object at 0x7f13ffb36710> <class 'spacy.tokenizer.Tokenizer'>
<spacy.tokenizer.Tokenizer object at 0x7f140d34cc10> <class 'spacy.tokenizer.Tokenizer'>


In [7]:
# Tokenizer the first sentences in the tokenizer train dataset.
en_tokens = [token.text for token in en_tokenizer(tokenizer_train_dataset[0]["src"])]
print("english_tokens: ", en_tokens)
te_tokens = [token.text for token in te_tokenizer(tokenizer_train_dataset[0]["tgt"])]
print("telugu_tokens: ", te_tokens)

english_tokens:  ['Have', 'you', 'heard', 'about', 'Foie', 'gras', '?']
telugu_tokens:  ['ఇక', 'ఫ్రూట్', 'ఫ్లైస్', 'గురించి', 'మీరు', 'విన్నారా', '?']


## Building Vocabulary

In [8]:
def tokenize(input_text: str, tokenizer: spacy.tokenizer.Tokenizer) -> List[str]:
    """Tokenizes the input text using the provided tokenizer and returns individual tokens.

    Args:
        input_text (str): Text to be tokenized.
        tokenizer (spacy.tokenizer.Tokenizer): Spacy Tokenizer to tokenize the input text.

    Returns:
        list: A list containing tokens of the input text.
    """
    return [token.text for token in tokenizer(input_text)]

def get_text(input: dict, language: str) -> str:
    """Extracts the text from the input dictionary based on the language provided.

    Args:
        input (dict): Dictionary corresponding a single translation example containing the text data.
        language (str): Language of the text to be extracted.

    Returns:
        str: Text extracted from the input dictionary based on the language provided.
    """
    if language == "en":
        return input["src"]
    else:
        return input["tgt"]

# Create a Generator function that yields tokens.
# This function returns a generator object which is a type of iterator and can be used to iterate.
def yield_tokens(data_iterator: datasets.arrow_dataset.Dataset, tokenizer: spacy.tokenizer.Tokenizer, language: str) -> Generator[List[str], None, None]:
    """Yields tokens of the corresponding language for each example in the data_iterator.

    Args:
        data_iterator (datasets.arrow_dataset.Dataset): Input Hugging Face translation dataset shortened for tokenizer training.
        tokenizer (spacy.tokenizer.Tokenizer): Spacy tokenizer to tokenize the text.
        language (str): language of the text for which the tokens need to be tokenized.

    Yields:
        Generator[List[str], None, None]: generator that yields tokens of the corresponding language for each example in the data_iterator.
    """
    for en_te_example in data_iterator:
        yield tokenize(input_text=get_text(input=en_te_example, language=language), tokenizer=tokenizer)

In [9]:
# Run this cell just to see how yield_tokens function works.
# We need to SKIP THIS CELL while building vocabulary below. If not, we loose the data for the rows already iterated 
# in this cell.
en_token_generator = yield_tokens(data_iterator=tokenizer_train_dataset, tokenizer=en_tokenizer, language="en")
print(f"English tokens: {next(en_token_generator)}")
print(f"English tokens: {next(en_token_generator)}\n\n")
te_token_generator = yield_tokens(data_iterator=tokenizer_train_dataset, tokenizer=te_tokenizer, language="te")
print(f"Telugu tokens: {next(te_token_generator)}")
print(f"Telugu tokens: {next(te_token_generator)}")

English tokens: ['Have', 'you', 'heard', 'about', 'Foie', 'gras', '?']
English tokens: ['I', 'never', 'thought', 'of', 'acting', 'in', 'films', '.']


Telugu tokens: ['ఇక', 'ఫ్రూట్', 'ఫ్లైస్', 'గురించి', 'మీరు', 'విన్నారా', '?']
Telugu tokens: ['సూర్య', 'సినిమాల్లో', 'నటించాలని', 'ఎప్పుడూ', 'అనుకోలేదు', '.']


In [10]:
# Build the vocabulary from the training dataset.
# min_freq: Minimum frequency needed for a token to be included in the vocabulary.
# max_tokens: Maximum number of tokens to be included in the vocabulary.
#       -- Unlike in Byte level BPE, here every other token that is not part of the vocabulary is replaced with <unk>.
#       -- So, it is better to keep the vocabulary size as large as possible. However, it is computationally expensive
#          since it increases the number of parameters in the embedding layer for the transformer model.
# specials: Special tokens to be added in the vocabulary.
# special_first: If True, special tokens are added at the beginning of the vocabulary.
en_vocab = build_vocab_from_iterator(iterator=yield_tokens(data_iterator=tokenizer_train_dataset, tokenizer=en_tokenizer, language="en"), 
                                     min_freq=2, 
                                     max_tokens=None,
                                     specials=["<sos>", "<eos>", "<pad>", "<unk>"], 
                                     special_first=True)
te_vocab = build_vocab_from_iterator(iterator=yield_tokens(data_iterator=tokenizer_train_dataset, tokenizer=te_tokenizer, language="te"), 
                                     min_freq=2,  
                                     max_tokens=None,
                                     specials=["<sos>", "<eos>", "<pad>", "<unk>"], 
                                     special_first=True)

In [None]:
# For spacy, if the max_tokens is None, then the vocabulary size is the number of unique tokens in the dataset.
# This is around 1,81,141 for English and 6,68,560 for Telugu.
print(en_vocab, type(en_vocab), len(en_vocab))
print(te_vocab, type(te_vocab), len(te_vocab))

Vocab() <class 'torchtext.vocab.vocab.Vocab'> 181141
Vocab() <class 'torchtext.vocab.vocab.Vocab'> 668560


In [13]:
# Explore the built vocabulary.
te_vocab_dict = te_vocab.get_stoi()
print(type(te_vocab_dict))
print("Length of Telugu vocabulary: ", len(te_vocab_dict))
print(list(te_vocab_dict.items())[:5])
print(te_vocab_dict["<sos>"], te_vocab_dict["<eos>"], te_vocab_dict["<pad>"], te_vocab_dict["<unk>"])
print("-" * 150)
en_vocab_dict = en_vocab.get_stoi()
print(type(en_vocab_dict))
print("Length of English vocabulary: ", len(en_vocab_dict))
print(list(en_vocab_dict.items())[:5])
print(en_vocab_dict["<sos>"], en_vocab_dict["<eos>"], en_vocab_dict["<pad>"], en_vocab_dict["<unk>"])

<class 'dict'>
Length of Telugu vocabulary:  30000
[('ఇంచార్జ్', 29998), ('ఆహ్వానాలు', 29997), ('ఆర్టీసీకి', 29995), ('అన్\u200cలిమిటెడ్', 29987), ('అనుసరిస్తున్నారు', 29985)]
0 1 2 3
------------------------------------------------------------------------------------------------------------------------------------------------------
<class 'dict'>
Length of English vocabulary:  30000
[('leveraged', 29998), ('lavender', 29997), ('jubilee', 29993), ('incorporates', 29991), ('hereafter', 29990)]
0 1 2 3


In [14]:
# Apparently, this word is not part of the vocabulary and is raising an error.
print(te_vocab["ప్రతిష్టానం"])

RuntimeError: Token ప్రతిష్టానం not found and default index is not set
Exception raised from __getitem__ at /__w/text/text/pytorch/text/torchtext/csrc/vocab.cpp:43 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4c1de9e897 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x7f4c1de4eb25 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #2: torchtext::Vocab::__getitem__(c10::basic_string_view<char> const&) const + 0x384 (0x7f4b6fd420c4 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/lib/python3.10/site-packages/torchtext/lib/libtorchtext.so)
frame #3: <unknown function> + 0x1e263 (0x7f4c2a8b5263 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/lib/python3.10/site-packages/torchtext/_torchtext.so)
frame #4: <unknown function> + 0x3e757 (0x7f4c2a8d5757 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/lib/python3.10/site-packages/torchtext/_torchtext.so)
frame #5: <unknown function> + 0x15cb2e (0x564d080f2b2e in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #6: _PyObject_MakeTpCall + 0x25b (0x564d080e92db in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #7: <unknown function> + 0x16b55b (0x564d0810155b in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #8: <unknown function> + 0x1c57e1 (0x564d0815b7e1 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #9: <unknown function> + 0x1c52be (0x564d0815b2be in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #10: _PyEval_EvalFrameDefault + 0xbfd (0x564d080dbe0d in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #11: <unknown function> + 0x1c548e (0x564d0815b48e in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #12: <unknown function> + 0x1c52be (0x564d0815b2be in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #13: _PyEval_EvalFrameDefault + 0xbfd (0x564d080dbe0d in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #14: <unknown function> + 0x142016 (0x564d080d8016 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #15: PyEval_EvalCode + 0x86 (0x564d081cd8b6 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #16: <unknown function> + 0x23d5fd (0x564d081d35fd in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #17: <unknown function> + 0x15d689 (0x564d080f3689 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #18: _PyEval_EvalFrameDefault + 0x6bc (0x564d080db8cc in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #19: <unknown function> + 0x17a8b0 (0x564d081108b0 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #20: _PyEval_EvalFrameDefault + 0x26f4 (0x564d080dd904 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #21: <unknown function> + 0x17a8b0 (0x564d081108b0 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #22: _PyEval_EvalFrameDefault + 0x26f4 (0x564d080dd904 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #23: <unknown function> + 0x17a8b0 (0x564d081108b0 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #24: <unknown function> + 0x257fef (0x564d081edfef in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #25: <unknown function> + 0x168d1a (0x564d080fed1a in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #26: _PyEval_EvalFrameDefault + 0x8ab (0x564d080dbabb in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #27: _PyFunction_Vectorcall + 0x7c (0x564d080f342c in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #28: _PyEval_EvalFrameDefault + 0x6bc (0x564d080db8cc in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #29: _PyFunction_Vectorcall + 0x7c (0x564d080f342c in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #30: _PyEval_EvalFrameDefault + 0x8ab (0x564d080dbabb in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #31: <unknown function> + 0x16b281 (0x564d08101281 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #32: PyObject_Call + 0x122 (0x564d08101f22 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #33: _PyEval_EvalFrameDefault + 0x285e (0x564d080dda6e in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #34: <unknown function> + 0x16b281 (0x564d08101281 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #35: _PyEval_EvalFrameDefault + 0x1983 (0x564d080dcb93 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #36: <unknown function> + 0x17a8b0 (0x564d081108b0 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #37: _PyEval_EvalFrameDefault + 0x26f4 (0x564d080dd904 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #38: <unknown function> + 0x17a8b0 (0x564d081108b0 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #39: _PyEval_EvalFrameDefault + 0x26f4 (0x564d080dd904 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #40: <unknown function> + 0x17a8b0 (0x564d081108b0 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #41: _PyEval_EvalFrameDefault + 0x26f4 (0x564d080dd904 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #42: <unknown function> + 0x17a8b0 (0x564d081108b0 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #43: _PyEval_EvalFrameDefault + 0x26f4 (0x564d080dd904 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #44: <unknown function> + 0x17a8b0 (0x564d081108b0 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #45: _PyEval_EvalFrameDefault + 0x26f4 (0x564d080dd904 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #46: <unknown function> + 0x17a8b0 (0x564d081108b0 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #47: <unknown function> + 0x928e (0x7f4c8a4ba28e in /usr/lib/python3.10/lib-dynload/_asyncio.cpython-310-x86_64-linux-gnu.so)
frame #48: <unknown function> + 0xa49b (0x7f4c8a4bb49b in /usr/lib/python3.10/lib-dynload/_asyncio.cpython-310-x86_64-linux-gnu.so)
frame #49: <unknown function> + 0x15c574 (0x564d080f2574 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #50: <unknown function> + 0x239505 (0x564d081cf505 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #51: <unknown function> + 0x2b5e82 (0x564d0824be82 in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #52: <unknown function> + 0x15020b (0x564d080e620b in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #53: _PyEval_EvalFrameDefault + 0x285e (0x564d080dda6e in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #54: _PyFunction_Vectorcall + 0x7c (0x564d080f342c in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #55: _PyEval_EvalFrameDefault + 0x8ab (0x564d080dbabb in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #56: _PyFunction_Vectorcall + 0x7c (0x564d080f342c in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #57: _PyEval_EvalFrameDefault + 0x8ab (0x564d080dbabb in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #58: _PyFunction_Vectorcall + 0x7c (0x564d080f342c in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #59: _PyEval_EvalFrameDefault + 0x8ab (0x564d080dbabb in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #60: _PyFunction_Vectorcall + 0x7c (0x564d080f342c in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #61: _PyEval_EvalFrameDefault + 0x8ab (0x564d080dbabb in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #62: _PyFunction_Vectorcall + 0x7c (0x564d080f342c in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)
frame #63: _PyEval_EvalFrameDefault + 0x8ab (0x564d080dbabb in /home/maneesh/Learning/AI/GenAI/Projects/attention_is_all_you_need/.attention_venv/bin/python)


In [37]:
# We are the setting the default index to the index associated with '<unk>' token.
# This makes sure that we get the index corresponding to '<unk>' if some text that is not present in the vocabulary is queried.
te_vocab.set_default_index(te_vocab["<unk>"])

In [38]:
# It rightly prints '3' which is the index corresponding to the token '<unk>'.
print(te_vocab(["ప్రతిష్టానం"]))
# Notice that we can pass a list of tokens to get the corresponding indices all at once.
print(te_vocab(["ప్రతిష్టానం", "హోషంగాబాద్", "హోల్డర్"]))

[3]
[3, 3, 20789]


In [40]:
# Moving from index to tokens.
print(te_vocab.lookup_tokens([3, 545, 6767]))
print(en_vocab.lookup_tokens([3, 545, 6767]))

['<unk>', 'నలుగురు', 'పోతుంది']
['<unk>', 'suicide', '87']


In [None]:
# Lets try to save the vocabulary to the disk in a pickle file.
EN_VOCAB_FILEPATH = f"{AI4_BHARAT_DATA_PATH}/trained_models/tokenizers/spacy/en_vocab.pkl"
file_obj = open(EN_VOCAB_FILEPATH, 'wb')
pickle.dump(en_vocab, file_obj)
file_obj.close()

AttributeError: 'Vocab' object has no attribute 'to_disk'