In [1]:
# Branch of the NeMo GitHub repository used by the install and script-download cells.
BRANCH = "main"

In [2]:
"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""
# If you're using Google Colab and not running locally, run this cell

# install NeMo
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[nlp]

Collecting nemo_toolkit[nlp]
  Cloning https://github.com/NVIDIA/NeMo.git (to revision main) to /tmp/pip-install-gw09d6le/nemo-toolkit
  Running command git clone -q https://github.com/NVIDIA/NeMo.git /tmp/pip-install-gw09d6le/nemo-toolkit
Collecting onnx>=1.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/36/ee/bc7bc88fc8449266add978627e90c363069211584b937fd867b0ccc59f09/onnx-1.7.0-cp36-cp36m-manylinux1_x86_64.whl (7.4MB)
[K     |████████████████████████████████| 7.4MB 7.9MB/s 
[?25hCollecting pytorch-lightning==0.9.0
[?25l  Downloading https://files.pythonhosted.org/packages/ed/af/2f10c8ee22d7a05fe8c9be58ad5c55b71ab4dd895b44f0156bfd5535a708/pytorch_lightning-0.9.0-py3-none-any.whl (408kB)
[K     |████████████████████████████████| 409kB 49.9MB/s 
Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Collecting ruamel.yaml
[?25l  Downloading https://files.pythonhosted.org

In [3]:
import os
import wget
from nemo.collections import nlp as nemo_nlp
from nemo.collections import common as nemo_common
from omegaconf import OmegaConf

[NeMo W 2020-09-09 01:32:32 experimental:28] Module <class 'nemo.collections.nlp.modules.common.megatron.megatron_bert.MegatronBertEncoder'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-09-09 01:32:32 experimental:28] Module <class 'nemo.collections.nlp.modules.common.sequence_token_classifier.SequenceTokenClassifier'> is experimental, not ready for production and is not fully supported. Use at your own risk.




# Tokenizers Background

For Natural Language Processing, tokenization is an essential part of data preprocessing. It is the process of splitting a string into a list of tokens. One can think of a token as a part of the text, in the same way that a word is a token in a sentence.
Depending on the application, some tokenizers are more suitable than others. 


For example, a WordTokenizer that splits the string on any whitespace would tokenize the following string as:

"My first program, Hello World." -> ["My", "first", "program,", "Hello", "World."]

To turn the tokens into numerical model input, the standard method is to use a vocabulary and one-hot vectors for [word embeddings](https://en.wikipedia.org/wiki/Word_embedding). If a token appears in the vocabulary, its index is returned; if not, the index of the unknown token is returned to handle out-of-vocabulary (OOV) tokens.




# Tokenizers in NeMo

In NeMo, we support the most used tokenization algorithms. We offer a wrapper around [HuggingFace's AutoTokenizer](https://huggingface.co/transformers/model_doc/auto.html#autotokenizer) - a factory class that gives access to all HuggingFace tokenizers. This includes particularly all BERT-like model tokenizers, such as BertTokenizer, AlbertTokenizer, RobertaTokenizer, GPT2Tokenizer. Apart from that, we also support other tokenizers such as WordTokenizer, CharTokenizer, and [Google's SentencePieceTokenizer](https://github.com/google/sentencepiece).  


We make sure that all tokenizers are compatible with BERT-like models, e.g. BERT, Roberta, Albert, and Megatron. For that, we provide a high-level user API `get_tokenizer()`, which allows the user to instantiate a tokenizer model with only four input arguments: 
* `tokenizer_name: str`
* `tokenizer_model: Optional[str] = None`
* `vocab_file: Optional[str] = None`
* `special_tokens: Optional[Dict[str, str]] = None`

HuggingFace and Megatron tokenizers (which use HuggingFace underneath) can be automatically instantiated from `tokenizer_name` alone, which downloads the corresponding `vocab_file` from the internet in the background. 
For SentencePieceTokenizer, WordTokenizer, and CharTokenizer, `tokenizer_model` and/or `vocab_file` can be generated offline in advance using [`scripts/process_asr_text_tokenizer.py`](https://github.com/NVIDIA/NeMo/blob/main/scripts/process_asr_text_tokenizer.py).

The tokenizers in NeMo are designed to be used interchangeably, especially when
used in combination with a BERT-based model.

Let's take a look at the list of available tokenizers:

In [4]:
# Show the names of the available tokenizers (the bare expression is rendered as the cell output).
nemo_nlp.modules.get_tokenizer_list()

['sentencepiece',
 'char',
 'word',
 'TurkuNLP/bert-base-finnish-cased-v1',
 'distilgpt2',
 'megatron-bert-345m-cased',
 'facebook/mbart-large-cc25',
 'facebook/mbart-large-en-ro',
 'xlm-mlm-tlm-xnli15-1024',
 'roberta-large-mnli',
 'albert-xxlarge-v1',
 'allenai/longformer-large-4096',
 'TurkuNLP/bert-base-finnish-uncased-v1',
 'distilbert-base-cased',
 'retribert-base-uncased',
 'bert-base-cased',
 'roberta-base',
 'bert-large-cased-whole-word-masking',
 'camembert-base',
 'allenai/longformer-base-4096',
 'albert-large-v2',
 'bert-large-cased-whole-word-masking-finetuned-squad',
 't5-small',
 'facebook/bart-large',
 'xlm-roberta-large',
 'google/electra-small-discriminator',
 'roberta-base-openai-detector',
 'cl-tohoku/bert-base-japanese-whole-word-masking',
 'transfo-xl-wt103',
 'gpt2',
 'xlm-mlm-ende-1024',
 'yjernite/bart_eli5',
 'xlm-roberta-large-finetuned-conll03-german',
 'xlnet-large-cased',
 'google/electra-large-generator',
 'flaubert/flaubert_large_cased',
 'distilbert-bas

# HuggingFace AutoTokenizer

In [5]:
# Build the tokenizer wrapper from nothing but the name of a pretrained model
tokenizer1 = nemo_nlp.modules.get_tokenizer(
    tokenizer_name="bert-base-cased",
)

# The wrapped HuggingFace tokenizer is reachable through `.tokenizer`
print(tokenizer1.tokenizer)

# Uncomment to inspect the vocabulary
# print(tokenizer1.tokenizer.vocab)

# List the special tokens, if the tokenizer has any
print(tokenizer1.tokenizer.all_special_tokens)

# Build a second tokenizer on top of a custom vocabulary file (one entry per line)
vocab_file = "myvocab.txt"
vocab = ["he", "llo", "world"]
with open(vocab_file, 'w') as vocab_out:
    vocab_out.write("\n".join(vocab))
tokenizer2 = nemo_nlp.modules.get_tokenizer(
    tokenizer_name="bert-base-cased",
    vocab_file=vocab_file,
)

# The special tokens were not overwritten, so both tokenizers should report the same ones
print(
    tokenizer1.tokenizer.all_special_tokens
    == tokenizer2.tokenizer.all_special_tokens
)

# Overwriting the special tokens of HuggingFace models is not recommended, because the
# defaults are the commonly used values.
# If you still want to overwrite them, specify some of the following keys
special_tokens_dict = {
    "unk_token": "<UNK>",
    "sep_token": "<SEP>",
    "pad_token": "<PAD>",
    "bos_token": "<CLS>",
    "mask_token": "<MASK>",
    "eos_token": "<SEP>",
    "cls_token": "<CLS>",
}
tokenizer3 = nemo_nlp.modules.get_tokenizer(
    tokenizer_name="bert-base-cased",
    vocab_file=vocab_file,
    special_tokens=special_tokens_dict,
)

# Print the newly set special tokens and confirm they differ from the defaults
print(tokenizer3.tokenizer.all_special_tokens)
print(
    tokenizer3.tokenizer.all_special_tokens
    != tokenizer1.tokenizer.all_special_tokens
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…


<transformers.tokenization_bert.BertTokenizer object at 0x7fe29bee8470>
['[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]']


Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained.


True


Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained.


[NeMo I 2020-09-09 01:38:28 auto_tokenizer:148] 4 special tokens added, resize your model accordingly.
['<CLS>', '<SEP>', '<UNK>', '<PAD>', '<MASK>']
True


## Megatron model tokenizer

In [6]:
# Megatron tokenizers are instances of the HuggingFace BertTokenizer.
# Here the tokenizer is requested by its pretrained model name only.
tokenizer4 = nemo_nlp.modules.get_tokenizer(tokenizer_name="megatron-bert-cased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




# Train custom tokenizer model and vocabulary from text file 

We use the [`scripts/process_asr_text_tokenizer.py`](https://github.com/NVIDIA/NeMo/blob/main/scripts/process_asr_text_tokenizer.py) script to create a custom tokenizer model with its own vocabulary from an input file

In [7]:
# Fetch the tokenizer training script from the selected NeMo branch,
# unless a copy is already present in the working directory
script_file = "process_asr_text_tokenizer.py"
script_url = f"https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/process_asr_text_tokenizer.py"

if os.path.exists(script_file):
    print('Script already exists')
else:
    print('Downloading script file...')
    wget.download(script_url)

Downloading script file...


In [9]:
# Toy corpus and settings consumed by the tokenizer training script

tokenizer_spe_type = "bpe"
vocab_size = 32
data_file = "data.txt"

# The corpus is a single line of text: the fragments are joined by one space each
data_text = " ".join((
    "NeMo is a toolkit for creating Conversational AI applications.",
    "NeMo toolkit makes it possible for researchers to easily compose complex neural network architectures",
    "for conversational AI using reusable components - Neural Modules.",
    "Neural Modules are conceptual blocks of neural networks that take typed inputs and produce typed outputs.",
    "Such modules typically represent data layers, encoders, decoders, language models, loss functions, or methods of combining activations.",
    "The toolkit comes with extendable collections of pre-built modules and ready-to-use models for automatic speech recognition (ASR),",
    "natural language processing (NLP) and text synthesis (TTS).",
    "Built for speed, NeMo can utilize NVIDIA's Tensor Cores and scale out training to multiple GPUs and multiple nodes.",
))

# Persist the corpus so the training script can read it from disk
with open(data_file, 'w') as corpus_out:
    corpus_out.write(data_text)

In [10]:
# Run the downloaded script on the toy corpus. IPython replaces $data_file, $vocab_size and
# $tokenizer_spe_type with the values of the Python variables of the same name.
! python process_asr_text_tokenizer.py --data_file=$data_file --data_root=. --vocab_size=$vocab_size --tokenizer=spe --spe_type=$tokenizer_spe_type

2020-09-09 01:39:37.223338: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
[NeMo I 2020-09-09 01:39:38 sentencepiece_tokenizer:211] Processing data.txt and store at ./tokenizer_spe_v32
sentencepiece_trainer.cc(170) LOG(INFO) Running command: --input=data.txt --model_prefix=./tokenizer_spe_v32/tokenizer --vocab_size=32 --shuffle_input_sentence=true --hard_vocab_limit=false --model_type=bpe --bos_id=-1 --eos_id=-1 --normalization_rule_name=nmt_nfkc_cf
sentencepiece_trainer.cc(75) LOG(INFO) Starts training with : 
trainer_spec {
  input: data.txt
  input_format: 
  model_prefix: ./tokenizer_spe_v32/tokenizer
  model_type: BPE
  vocab_size: 32
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_

In [11]:
# See created tokenizer model and vocabulary
spe_model_dir=f"tokenizer_spe_v{vocab_size}"
! ls $spe_model_dir

tokenizer.model  tokenizer.vocab  vocab.txt


In [12]:
# To tokenize at unigram, char or word boundaries, change --spe_type accordingly. For more details, see https://github.com/google/sentencepiece#train-sentencepiece-model

# Use custom tokenizer for data preprocessing
## Example: SentencePiece for BPE

In [13]:
# Load the created tokenizer model (it inherently includes the vocabulary)
# and pass the optional special tokens
tokenizer_spe = nemo_nlp.modules.get_tokenizer(
    tokenizer_name="sentencepiece",
    tokenizer_model=f"{spe_model_dir}/tokenizer.model",
    special_tokens=special_tokens_dict,
)

# The specified special tokens are added to the vocabulary
print(tokenizer_spe.vocab_size)

37


## Example: WordTokenizer from Vocabulary

In [15]:
# For a simple tokenizer that does not need a generated tokenizer.model,
# the alternative classes WordTokenizer and CharTokenizer take a user vocabulary as input

# Build the tokenizer from the vocabulary file and pass the optional special tokens
tokenizer_word = nemo_nlp.modules.get_tokenizer(
    tokenizer_name="word",
    vocab_file=vocab_file,
    special_tokens=special_tokens_dict,
)

# The specified special tokens are added to the vocabulary
print(tokenizer_word.vocab_size)

8


# Using any tokenizer to tokenize text into BERT compatible input


In [16]:
text = "hello world"

# Wrap the text tokens between the beginning- and end-of-sequence tokens
tokenized = [
    tokenizer_word.bos_token,
    *tokenizer_word.text_to_tokens(text),
    tokenizer_word.eos_token,
]
print(tokenized)

# Map the tokens to input_ids for a neural model, such as BERTModule
print(tokenizer_word.tokens_to_ids(tokenized))

['<CLS>', '<UNK>', 'world', '<SEP>']
[6, 3, 2, 4]
