In [1]:
# default_exp tokenizers

### Explore Tokenizers

>Exploring tokenizers offered and used by transformers models


In [2]:
#export
from fastai.core import *
from transformers import AutoModel, AutoTokenizer, RobertaTokenizer
import tokenizers

In [3]:
tokenizers.__version__

'0.5.2'

In [3]:
PRETRAINED_TYPE = "roberta-base"
# NOTE(review): `do_lower_case=True` conflicts with the later cell comment stating that
# roberta-base needs `do_lower_case=False`. The saved `tokenizer_config.json` is read back
# further down (`tok_conf_args['do_lower_case']`), so this flag presumably propagates
# through `save_pretrained` to the tokenizers built later — TODO confirm which is intended.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_TYPE, do_lower_case=True)

In [4]:
# Base directory for locally saved tokenizers; each pretrained type gets its own sub-folder.
PRETRAINED_TOKENIZERS_PATH = Path("../tokenizers")
# `parents=True` also creates the base directory, so one call covers both levels;
# `exist_ok=True` keeps the cell re-runnable.
(PRETRAINED_TOKENIZERS_PATH/PRETRAINED_TYPE).mkdir(parents=True, exist_ok=True)

In [5]:
(PRETRAINED_TOKENIZERS_PATH).ls(), (PRETRAINED_TOKENIZERS_PATH/PRETRAINED_TYPE).ls()

([PosixPath('../tokenizers/roberta-base'),
  PosixPath('../tokenizers/tf-roberta')],
 [PosixPath('../tokenizers/roberta-base/tokenizer_config.json'),
  PosixPath('../tokenizers/roberta-base/special_tokens_map.json'),
  PosixPath('../tokenizers/roberta-base/merges.txt'),
  PosixPath('../tokenizers/roberta-base/vocab.json')])

In [6]:
tokenizer.save_pretrained(PRETRAINED_TOKENIZERS_PATH/PRETRAINED_TYPE)

('../tokenizers/roberta-base/vocab.json',
 '../tokenizers/roberta-base/merges.txt',
 '../tokenizers/roberta-base/special_tokens_map.json',
 '../tokenizers/roberta-base/added_tokens.json')

In [94]:
s = "Roberta uses GPT-2 tokenizer model - ByteLevelBPETokenizer"

In [95]:
q1 = "Which tokenizer model Roberta uses?"
q2 = "Which model uses GPT-2 tokenizer?"

In [96]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [97]:
tokenizer.decode(tokenizer.encode(s), skip_special_tokens=False)

'<s> Roberta uses GPT-2 tokenizer model - ByteLevelBPETokenizer</s>'

In [98]:
tokenizer.vocab_size

50265

In [99]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [100]:
# we need do_lower_case=False for roberta-base
tokenizer = RobertaTokenizer.from_pretrained(str(PRETRAINED_TOKENIZERS_PATH/PRETRAINED_TYPE), 
                                             do_lower_case=False)

In [101]:
tokenizer.decode(tokenizer.encode(s), skip_special_tokens=False)

'<s> Roberta uses GPT-2 tokenizer model - ByteLevelBPETokenizer</s>'

In [102]:
tokenizer.vocab_size

50265

In [103]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

### Tokenizers

The `tokenizers` package provides the following tokenizers:

- ByteLevelBPETokenizer
- CharBPETokenizer
- SentencePieceBPETokenizer
- BertWordPieceTokenizer

Many pretrained models are offered, and each of them uses a specific tokenizer.

In [110]:
from tse.squad_utils import *

In [107]:
PRETRAINED_TOK_PATH = PRETRAINED_TOKENIZERS_PATH/PRETRAINED_TYPE

In [108]:
PRETRAINED_TOK_PATH.ls()

[PosixPath('../tokenizers/roberta-base/tokenizer_config.json'),
 PosixPath('../tokenizers/roberta-base/special_tokens_map.json'),
 PosixPath('../tokenizers/roberta-base/merges.txt'),
 PosixPath('../tokenizers/roberta-base/vocab.json')]

In [158]:
from tokenizers import (ByteLevelBPETokenizer, CharBPETokenizer,
                        SentencePieceBPETokenizer, BertWordPieceTokenizer)
from tokenizers.processors import RobertaProcessing, BertProcessing

In [146]:
# Read the config stored next to the saved tokenizer files so that the raw `tokenizers`
# implementation below is built with the same `do_lower_case` setting.
tok_conf_args = read_json_as_dict(PRETRAINED_TOK_PATH/'tokenizer_config.json')
# Build a byte-level BPE tokenizer directly from the saved vocab/merges files.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=str(PRETRAINED_TOK_PATH/'vocab.json'), 
    merges_file=str(PRETRAINED_TOK_PATH/'merges.txt'), 
    lowercase=tok_conf_args['do_lower_case'],
    add_prefix_space=True
)

![](../images/tokenizers_pipeline.png)

From encoded string we can access the following attributes:

- `ids`: token ids
- `attention_mask`: binary indicator of what's padded or not
- `type_ids`: token type ids for some models, such as BERT
- `offsets`: offsets to map tokens back to original string positions
- `original_str`: original string
- `normalized_str`: original string after normalizer step

Also the following methods:

-  `truncate()`: to truncate text to a length
-  `pad()`: to pad text to a length

In [141]:
toks = tokenizer.encode(s)

In [142]:
print(toks)

Encoding(num_tokens=18, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing, original_str, normalized_str])


In [144]:
array(toks.ids), toks.attention_mask, array(toks.type_ids)

(array([ 4533,  6747,   102,  2939,   821,  3320,    12,   176, 19233,  6315,  1421,   111, 47893,  4483,   428, 13713,
        22036,  6315]),
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

This tokenizer does not handle special tokens yet: the encoding above contains no `<s>` or `</s>` ids.

In [150]:
special_tokens = read_json_as_dict(PRETRAINED_TOK_PATH/'special_tokens_map.json')

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [151]:
for k,v in special_tokens.items(): print(v, tokenizer.token_to_id(v))

<s> 0
</s> 2
<unk> 3
</s> 2
<pad> 1
<s> 0
<mask> 50264


The vocab already seems to contain these special tokens, since we are using a pretrained tokenizer's `vocab.json`. Let's add them as attributes to our `tokenizer` instance for ease of use.

In [154]:
for k,v in special_tokens.items(): setattr(tokenizer, k, v)

In [157]:
(tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token, tokenizer.sep_token, tokenizer.pad_token, 
tokenizer.cls_token, tokenizer.mask_token)

('<s>', '</s>', '<unk>', '</s>', '<pad>', '<s>', '<mask>')

In [159]:
BertProcessing.num_special_tokens_to_add

<method 'num_special_tokens_to_add' of 'PostProcessor' objects>

### Transformers Tokenizers

We can alternatively use tokenizer and processors implemented in `transformers` without much hassle. We will use `AutoTokenizer` to load the same pretrained tokenizer. This will handle special tokens for us.

In [176]:
#export
from transformers.data.processors.squad import SquadV2Processor, squad_convert_examples_to_features

In [177]:
PRETRAINED_TOK_PATH.ls()

[PosixPath('../tokenizers/roberta-base/tokenizer_config.json'),
 PosixPath('../tokenizers/roberta-base/special_tokens_map.json'),
 PosixPath('../tokenizers/roberta-base/config.json'),
 PosixPath('../tokenizers/roberta-base/merges.txt'),
 PosixPath('../tokenizers/roberta-base/vocab.json')]

In [168]:
!cp {PRETRAINED_TOK_PATH/'tokenizer_config.json'} {PRETRAINED_TOK_PATH/'config.json'}

In [170]:
tokenizer = AutoTokenizer.from_pretrained(str(PRETRAINED_TOK_PATH))

In [174]:
tokenizer.decode(tokenizer.encode(s, q1))

'<s> roberta uses gpt-2 tokenizer model - bytelevelbpetokenizer</s></s> which tokenizer model roberta uses?</s>'

We can also directly use SQUAD processors.

In [193]:
#export
squad_processor = SquadV2Processor()

In [178]:
SQUAD_DATA_PATH = Path("../squad_data/")

In [180]:
# The validation split is also loaded with `get_train_examples` — presumably so that its
# examples keep their answer labels (it is later converted with is_training=True).
# Only the test split goes through `get_dev_examples`.
train_examples = squad_processor.get_train_examples(SQUAD_DATA_PATH, 'train_squad_data_0.json')
valid_examples = squad_processor.get_train_examples(SQUAD_DATA_PATH, 'valid_squad_data_0.json')
test_examples = squad_processor.get_dev_examples(SQUAD_DATA_PATH, 'test_squad_data.json')

100%|██████████| 21984/21984 [00:00<00:00, 23792.46it/s]
100%|██████████| 5496/5496 [00:00<00:00, 29336.36it/s]
100%|██████████| 3534/3534 [00:00<00:00, 29409.30it/s]


In [281]:
question = train_examples[0].question_text
answer = train_examples[0].answer_text
context = train_examples[0].context_text 
char_to_offset = array(train_examples[0].char_to_word_offset)

In [282]:
question, answer, context, char_to_offset

('negative',
 'Sooo SAD',
 ' Sooo SAD I will miss you here in San Diego!!!',
 array([-1,  0,  0,  0,  0,  0,  1,  1,  1,  1,  2,  2,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  5,  5,  5,  5,  6,  6,
         6,  6,  6,  7,  7,  7,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9]))

In [191]:
#export
# Settings forwarded to `squad_convert_examples_to_features` by `get_squad_dataset`.
MAX_SEQ_LEN = 104    # forwarded as `max_seq_length`
MAX_QUERY_LEN = 5    # forwarded as `max_query_length`
DOC_STRIDE = 200     # forwarded as `doc_stride`; useful for LM modeling


def get_squad_dataset(examples, tokenizer, is_training):
    """
    Convert SQuAD-style examples into features and a PyTorch dataset.

    Thin wrapper around `squad_convert_examples_to_features` that fixes the
    sequence-length settings to the module-level constants and uses
    `defaults.cpus` worker threads.

    Args:
        examples: examples produced by the SQuAD processor.
        tokenizer: tokenizer used to encode question/context pairs.
        is_training: forwarded unchanged as `is_training`.

    Returns:
        Whatever `squad_convert_examples_to_features` returns for
        `return_dataset="pt"`; callers in this notebook unpack it as
        `(features, dataset)`.
    """
    conversion_kwargs = dict(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LEN,
        doc_stride=DOC_STRIDE,
        max_query_length=MAX_QUERY_LEN,
        is_training=is_training,
        return_dataset="pt",
        threads=defaults.cpus,
    )
    return squad_convert_examples_to_features(**conversion_kwargs)

In [196]:
train_features, train_dataset = get_squad_dataset(train_examples, tokenizer, True)
valid_features, valid_dataset = get_squad_dataset(valid_examples, tokenizer, True)
test_features, test_dataset = get_squad_dataset(test_examples, tokenizer, False)

convert squad examples to features: 100%|██████████| 21984/21984 [00:06<00:00, 3339.75it/s]
add example index and unique id: 100%|██████████| 21984/21984 [00:00<00:00, 838891.33it/s]
convert squad examples to features: 100%|██████████| 5496/5496 [00:01<00:00, 3617.62it/s]
add example index and unique id: 100%|██████████| 5496/5496 [00:00<00:00, 830968.41it/s]
convert squad examples to features: 100%|██████████| 3534/3534 [00:00<00:00, 4321.36it/s]
add example index and unique id: 100%|██████████| 3534/3534 [00:00<00:00, 723374.67it/s]


Here we can see that `input_ids`, `attention_mask` and `token_type_ids` (not used by Roberta) can be extracted for modeling. `token_to_orig_map` can be used to map tokens back to the original string. Note that this dictionary starts at token index `4`, since the first 4 tokens are our question tokens `['<s>', 'negative', '</s>', '</s>']`; the last token is also excluded since it is the eos token `'</s>'`. We need to take this into account when mapping back from tokens.

In [325]:
token_to_orig_map = train_features[0].token_to_orig_map
context_start, context_end = min(token_to_orig_map), max(token_to_orig_map)
len(token_to_orig_map), token_to_orig_map

(14,
 {4: 0,
  5: 0,
  6: 1,
  7: 1,
  8: 2,
  9: 3,
  10: 4,
  11: 5,
  12: 6,
  13: 7,
  14: 8,
  15: 9,
  16: 9,
  17: 9})

In [328]:
tokens = array(train_features[0].tokens)
len(tokens), tokens

(19,
 array(['<s>', 'negative', '</s>', '</s>', 'so', 'oo', 's', 'ad', 'i', 'will', 'miss', 'you', 'here', 'in', 'san',
        'die', 'go', '!!!', '</s>'], dtype='<U8'))

In [331]:
len(tokens[context_start:context_end+1]), tokens[context_start:context_end+1]

(14,
 array(['so', 'oo', 's', 'ad', 'i', 'will', 'miss', 'you', 'here', 'in', 'san', 'die', 'go', '!!!'], dtype='<U8'))

Start and end predictions can be any index in `[0, MAX_SEQ_LEN)`, so we need some post-processing to map them back to the original string at inference time.

In [332]:
train_tensors = train_dataset.tensors

input_ids = train_tensors[0][0]
attention_mask = train_tensors[1][0]
token_type_ids = train_tensors[2][0]
start_position = train_tensors[3][0].item()
end_position = train_tensors[4][0].item()

In [333]:
import torch
start_logits = torch.randn_like(input_ids, dtype=torch.float)
end_logits = torch.randn_like(input_ids, dtype=torch.float)
start_logits.shape, end_logits.shape

(torch.Size([104]), torch.Size([104]))

We need to filter `start_logits` and `end_logits` before finding the best start and end idxs:

- Filter by `attention_mask` to exclude padding
- Exclude the question tokens (the first 4 tokens in our case, i.e. `[4:]`); for variable question lengths the context boundaries can be obtained from the `min()` and `max()` keys of `token_to_orig_map`
- Exclude final token [:-1] which is the `eos` token

After these filters `start_logits` and `end_logits` will both have a length of `len(token_to_orig_map)`. So the best start and end idxs take values in `[0, len(token_to_orig_map)-1]`.

In [334]:
context_start, context_end = min(token_to_orig_map), max(token_to_orig_map)

_start_logits = start_logits[attention_mask.bool()][context_start:context_end+1]
_end_logits = end_logits[attention_mask.bool()][context_start:context_end+1]

We find the best idxs such that `start_idx <= end_idx` and `start_logit + end_logit` is maximal.

In [338]:
#export
def find_best_start_end_idxs(_start_logits, _end_logits):
    """
    Find the span `(start_idx, end_idx)` with the highest combined logit.

    Every pair with `start_idx <= end_idx` is scored as
    `_start_logits[start_idx] + _end_logits[end_idx]` and the best-scoring pair
    is returned. On ties the first pair encountered wins (smallest start, then
    smallest end), because only a strictly greater score replaces the current best.

    Args:
        _start_logits: 1-D sequence of start scores (tensor, array or list of numbers).
        _end_logits: 1-D sequence of end scores, index-aligned with `_start_logits`.

    Returns:
        `(start_idx, end_idx)` as a tuple of ints, or `None` when no pair has a
        score greater than -inf (e.g. the inputs are empty).
    """
    # Start from -inf instead of an arbitrary floor such as -1e6, so a span is
    # still found when every logit sum is very negative.
    best_logit = float("-inf")
    best_idxs = None
    for start_idx, start_logit in enumerate(_start_logits):
        # Slicing from `start_idx` enforces start_idx <= end_idx.
        for offset, end_logit in enumerate(_end_logits[start_idx:]):
            # float() accepts 0-dim tensors, numpy scalars and plain Python numbers.
            logit_sum = float(start_logit + end_logit)
            if logit_sum > best_logit:
                best_logit = logit_sum
                best_idxs = (start_idx, start_idx + offset)
    return best_idxs

In [340]:
# Call the helper under the name it is defined with in this notebook
# (`find_best_start_end_idxs`); `get_best_start_end_idxs` is not defined anywhere visible.
start_idx, end_idx = find_best_start_end_idxs(_start_logits, _end_logits)

In [341]:
# NOTE(review): `context_tokens` is not assigned in any visible cell, so this line would
# raise NameError on a fresh kernel unless `from tse.squad_utils import *` provides it —
# TODO confirm and define it explicitly before this cell.
tokenizer.decode(context_tokens[start_idx:end_idx+1])

'iwillmissyouherein'

Once we have our best start and end idxs, we need to shift them by the `context_start` offset so that we can look up the original positions from `token_to_orig_map`.

In [342]:
tok_orig_char_start = token_to_orig_map[start_idx+context_start] 
tok_orig_char_end = token_to_orig_map[end_idx+context_start]

Now we can iterate over the original context string char by char to get the answer. This approach is more robust because it retains the characteristics of the original context and is not prone to artifacts of one-way transformations (e.g. lowercasing during tokenization).

In [343]:
assert len(context) == len(char_to_offset)

In [344]:
#export
def answer_from_orig_context(context, char_to_offset, tok_orig_char_start, tok_orig_char_end):
    """
    Extract the answer text directly from the original context string.

    Walks `context` and `char_to_offset` in parallel (as in example.context and
    example.char_to_word_offset) and keeps every character whose offset id lies
    in the inclusive range [tok_orig_char_start, tok_orig_char_end]. The kept
    characters are joined and surrounding whitespace is stripped.
    """
    kept = []
    for ch, offset_id in zip(context, char_to_offset):
        # Inclusive on both ends: the start and end offsets belong to the answer.
        if tok_orig_char_start <= offset_id <= tok_orig_char_end:
            kept.append(ch)
    return "".join(kept).strip()

In [345]:
answer_from_orig_context(context, char_to_offset, tok_orig_char_start, tok_orig_char_end)

'I will miss you here in'

### Conclusion

For our datasets we will need the following:

#### Examples (Validation/Inference)

- `examples[i].context_text`
- `examples[i].char_to_word_offset`
- `examples[i].answer_text`


#### Features (Validation/Inference)

- `features[i].token_to_orig_map`

#### Dataset Tensors (Validation/Training/Inference)

- `input_ids = train_tensors[0][i]`
- `attention_mask = train_tensors[1][i]`
- `token_type_ids = train_tensors[2][i]`
- `start_position = train_tensors[3][i]`
- `end_position = train_tensors[4][i]`



### export

In [348]:
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01-squad-utils.ipynb.
Converted 02-tokenizers.ipynb.
Converted 03-datasets.ipynb.
