## 1. 加载 Tokenizer

In [2]:
from transformers import BertTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded below.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# Bare last expression: display the tokenizer's repr (vocab size, max length, special tokens).
tokenizer



BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## 2. Tokenizer 补充 —— encode_plus

encode_plus 方法，会生成 token_type_ids。
- token_type_ids：0，表示第一句；1，表示第二句。
- 句子对输入一般用于 NSP（Next Sentence Prediction，BERT 的预训练任务之一）。

In [3]:
# Test data: two sample sentences used as tokenizer input in the cells below.
# The variable name keeps its existing spelling because other cells reference it.
test_senteces = [
    "Life is too short to spend time with people who suck the happiness out of you.",
    "In the flood of darkness, hope is the light. It brings comfort, faith, and confidence.",
]

In [4]:
# Single-sentence input: every entry of token_type_ids is 0.
tokenizer(
    text=test_senteces[0],
    max_length=32,
    truncation=True,
)

{'input_ids': [101, 2166, 2003, 2205, 2460, 2000, 5247, 2051, 2007, 2111, 2040, 11891, 1996, 8404, 2041, 1997, 2017, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [5]:
# Sentence-pair input: token_type_ids are 0 for the first sentence and 1 for the second.
# Both sentences share the 32-token budget, so each of them ends up truncated.
# NOTE(review): the transformers docs describe encode_plus as deprecated in favour of
# calling the tokenizer directly -- confirm for the installed version.
batch_input = tokenizer.encode_plus(
    text=test_senteces[0],
    text_pair=test_senteces[1],
    truncation=True,
    max_length=32,
)
batch_input

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


{'input_ids': [101, 2166, 2003, 2205, 2460, 2000, 5247, 2051, 2007, 2111, 2040, 11891, 1996, 8404, 2041, 1997, 102, 1999, 1996, 7186, 1997, 4768, 1010, 3246, 2003, 1996, 2422, 1012, 2009, 7545, 7216, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:
# List the fields present in the sentence-pair encoding.
batch_input.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [7]:
# Map the ids back to text to see the inserted [CLS]/[SEP] markers and where
# truncation cut each of the two sentences.
tokenizer.decode(batch_input['input_ids'])

'[CLS] life is too short to spend time with people who suck the happiness out of [SEP] in the flood of darkness, hope is the light. it brings comfort [SEP]'