# directly using tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModel

# The tokenizer must be loaded from the same checkpoint as the model it feeds,
# so that the token IDs of its vocabulary line up with the model's embedding indices.
raw_text = "using transformer is simple!"
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Calling the tokenizer directly returns three fields:
#   input_ids      - the vocabulary ID of every token
#   attention_mask - the mask used to filter out '[PAD]' tokens
#   token_type_ids - the segment each token belongs to, for tasks with several
#                    text segments such as question answering
output = tokenizer(raw_text)
print(output)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'input_ids': [101, 1606, 11303, 1200, 1110, 3014, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


# Steps behind the tokenizer call

In [None]:
# Step 1: split the raw text into tokens.
#
# BERT uses a subword tokenizer: it keeps splitting a word until every piece can be
# represented by its vocabulary. That is why "transformer" becomes two tokens:
# "transform" and "##er".
# Subword tokenization algorithms rely on the principle that frequently used words
# should not be split into smaller subwords, while rare words should be decomposed
# into meaningful subwords.

token = tokenizer.tokenize(raw_text)
print(token)

['using', 'transform', '##er', 'is', 'simple', '!']


In [None]:
# Step 2: map every token to its vocabulary ID.

token_id = tokenizer.convert_tokens_to_ids(token)
print(token_id)

# Compared with the output of tokenizer(raw_text), this result lacks the first and
# the last ID. The direct tokenizer(raw_text) call also adds the model's special
# tokens around the sequence. Converting its input_ids back to tokens shows them:
tokenizer.convert_ids_to_tokens(output["input_ids"])

[1606, 11303, 1200, 1110, 3014, 106]


['[CLS]', 'using', 'transform', '##er', 'is', 'simple', '!', '[SEP]']

In [None]:
# Reproduce the tokenizer() output by hand.
#
# The token list is rebuilt from raw_text on every run, so re-executing this cell does
# not keep wrapping `token` in additional special tokens. The special tokens are read
# from the tokenizer instead of being hard-coded, so the cell follows the checkpoint.

token = [tokenizer.cls_token] + tokenizer.tokenize(raw_text) + [tokenizer.sep_token]
token_id = tokenizer.convert_tokens_to_ids(token)
print('input_ids:', token_id)     # now the same as the input_ids returned by tokenizer(raw_text)

# 1 for every real token, 0 for every padding token
attention_mask = [0 if ele == tokenizer.pad_token else 1 for ele in token]
print('attention_mask:', attention_mask)

# a single sentence belongs entirely to segment 0
token_type_ids = [0] * len(token)
print('token_type_ids:', token_type_ids)

input_ids: [101, 1606, 11303, 1200, 1110, 3014, 106, 102]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1]
token_type_ids: [0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
# Create a batch input tensor from sequences of different lengths.

from transformers import AutoTokenizer, AutoModel
import torch

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_text1 = "This movie is scary! Great!"
raw_text2 = "This movie is scary!"
raw_text_batch = [raw_text1, raw_text2]

# tokenize() / convert_tokens_to_ids() are applied to one sequence at a time here,
# so the batch is processed with list comprehensions.
token_batch = [list(tokenizer.tokenize(text)) for text in raw_text_batch]
token_ids_batch = [list(tokenizer.convert_tokens_to_ids(tokens)) for tokens in token_batch]
print(token_ids_batch)

# torch.tensor() only accepts rows of equal length, so build one tensor per sequence.
token_ids_batch = [torch.tensor(token_ids) for token_ids in token_ids_batch]
print(token_ids_batch)

# Pad every sequence to the longest one in the batch. The padding value is taken from
# the tokenizer instead of relying on pad_sequence's default of 0.0, which only happens
# to match the ID of '[PAD]' for this checkpoint.
token_ids_batch = torch.nn.utils.rnn.pad_sequence(
    token_ids_batch,
    batch_first=True,
    padding_value=tokenizer.pad_token_id,
)
print(token_ids_batch)

[[1188, 2523, 1110, 13952, 106, 2038, 106], [1188, 2523, 1110, 13952, 106]]
[tensor([ 1188,  2523,  1110, 13952,   106,  2038,   106]), tensor([ 1188,  2523,  1110, 13952,   106])]
tensor([[ 1188,  2523,  1110, 13952,   106,  2038,   106],
        [ 1188,  2523,  1110, 13952,   106,     0,     0]])


In [1]:
# When tokenizer() receives several sentences as separate arguments instead of a single
# list, it treats them as a sentence pair: the sentences are concatenated into one
# sequence with a [SEP] token at the end of each sentence, rather than being handled
# as a batch of independent inputs.

from transformers import AutoTokenizer, AutoModel
import torch

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_text1, raw_text2 = "This movie is scary! Great!", "This movie is scary!"

output = tokenizer(raw_text1, raw_text2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [7]:
# When tokenizer() is given two separate lists, it matches them element by element
# and tokenizes each matched pair as one sentence-pair sequence.

from transformers import AutoTokenizer, AutoModel
import torch

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_text1 = "This movie is scary! Great!"
raw_text2 = "This movie is scary!"
raw_text3 = "This novel is great! love it!"
raw_text4 = "This novel is great!"

# i-th first sentence is paired with the i-th second sentence
first_sentences = [raw_text1, raw_text3]
second_sentences = [raw_text2, raw_text4]
output = tokenizer(first_sentences, second_sentences, padding=True, truncation=True)

print(output)
# decode every padded pair back to text to make the [SEP]/[PAD] placement visible
print(list(map(tokenizer.decode, output['input_ids'])))

{'input_ids': [[101, 1188, 2523, 1110, 13952, 106, 2038, 106, 102, 1188, 2523, 1110, 13952, 106, 102, 0], [101, 1188, 2281, 1110, 1632, 106, 1567, 1122, 106, 102, 1188, 2281, 1110, 1632, 106, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
['[CLS] This movie is scary! Great! [SEP] This movie is scary! [SEP] [PAD]', '[CLS] This novel is great! love it! [SEP] This novel is great! [SEP]']


In [None]:
# decode() vs convert_ids_to_tokens()

raw_text1 = "Transformer is NPL!"
raw_text2 = "Transformer is Auto Robat!"
output = tokenizer(raw_text1, raw_text2)

pair_ids = output['input_ids']
# decode(): a single string with the subwords merged back into text (special tokens included)
print(tokenizer.decode(pair_ids))
# convert_ids_to_tokens(): a list with one token string per ID, subword pieces kept separate
print(tokenizer.convert_ids_to_tokens(pair_ids))

[CLS] Transformer is NPL! [SEP] Transformer is Auto Robat! [SEP]
['[CLS]', 'Trans', '##former', 'is', 'N', '##PL', '!', '[SEP]', 'Trans', '##former', 'is', 'Auto', 'Rob', '##at', '!', '[SEP]']


In [1]:
# Dynamic padding: pad to the maximum length of the current batch instead of the
# maximum length of the whole dataset.

from transformers import AutoTokenizer, AutoModel
from transformers import DataCollatorWithPadding

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

## tokenization on the whole dataset
raw_text1 = "This movie is scary!"
raw_text2 = "This movie is scary! Great!"
raw_text3 = "This movie is scary! I love it!"

whole_set = [raw_text1, raw_text2, raw_text3]
output = tokenizer(whole_set, padding=True, truncation=True)
print("whole_datas 1st tokens", tokenizer.convert_ids_to_tokens(output['input_ids'][0]), sep=":")


# Apply DataCollatorWithPadding to a batch of tokenization results.
#==================================================================
# 1. DataCollatorWithPadding works on top of a batch of tokenization results and pads
#    each item to the maximum length found in that batch.
# 2. It is instantiated with a tokenizer so that it knows which padding token to use
#    and whether the model expects padding on the left or on the right of the inputs.
# 3. Its input must be the result of tokenization, i.e. a dictionary that includes an
#    "input_ids" key (it looks that key up in the input dictionary).
#==================================================================
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

mini_batch1 = tokenizer([raw_text1, raw_text2])
mini_batch2 = tokenizer([raw_text1, raw_text3])
output_batch1 = data_collator(mini_batch1)
output_batch2 = data_collator(mini_batch2)

# show the first sequence of every padded mini-batch
for label, padded_batch in (
    ("mini_batch1 1st tokens", output_batch1),
    ("mini_batch2 1st tokens", output_batch2),
):
    print(label, tokenizer.convert_ids_to_tokens(padded_batch['input_ids'][0]), sep=":")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

whole_datas 1st tokens:['[CLS]', 'This', 'movie', 'is', 'scary', '!', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
mini_batch1 1st tokens:['[CLS]', 'This', 'movie', 'is', 'scary', '!', '[SEP]', '[PAD]', '[PAD]']
mini_batch2 1st tokens:['[CLS]', 'This', 'movie', 'is', 'scary', '!', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


# pros & cons of tokenization methods <br>


 <https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt> <br>

 <br/>

**Word-based**

<p>pros: 1) Preserves word meanings <br>
cons: 1) Very large vocabulary <br>
      &emsp;&emsp;&ensp; 2) Large number of out-of-vocabulary tokens <br>
      &emsp;&emsp;&ensp; 3) Loses the relationship between similar words, e.g. run vs. running; boy vs. boys </p>

<br/>

**Character-based** <br>

<p>pros: 1) smaller vocabulary <br>
&emsp;&emsp;&ensp;2) fewer out-of-vocabulary tokens <br>
cons: 1) very long token_id vectors, increasing computational complexity <br>
&emsp;&emsp;&ensp;2) loses the meaning of words, since a single character carries little meaning on its own </p>

<br/>

**Subword-based** <br>
<p>Combines the advantages of both word-based and character-based tokenization.<br>
pros: 1) preserves the semantic meaning of words, e.g.<br>
&emsp;&emsp;&ensp;“tokenization” is split into “token” and “ization”, two tokens that each carry semantic meaning while remaining space-efficient (only two tokens are needed to represent a long word)<br>
&emsp;&emsp;&ensp;2) fewer out-of-vocabulary tokens <br>
&emsp;&emsp;&ensp;3) smaller vocabulary <br>
&emsp;&emsp;&ensp;4) Subword tokenization algorithms rely on the principle that frequently used words should not be split into smaller subwords, but rare words should be decomposed into meaningful subwords. </p>

In [None]:
# Note on the HTML space entities used for indentation in the markdown notes above.
# The cell holds only this string literal, so running it simply echoes the note.
'''
&nbsp; --> single space
&ensp; --> (2 x &nbsp;)
&emsp; --> (4 x &nbsp;)
'''