In [17]:
from transformers import AutoTokenizer

In [18]:
# Sample sentence reused by the tokenize / encode / decode cells below.
text = "我爱玩英雄联盟"

In [19]:
# Load the tokenizer that belongs to this checkpoint; AutoTokenizer picks the
# concrete class (the repr below shows it resolves to BertTokenizerFast).
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
# Bare name as the last expression -> the notebook displays the tokenizer's repr.
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [20]:
# Step 1: split the raw sentence into vocabulary tokens (no ids, no special tokens yet).
tokens = tokenizer.tokenize(text)
tokens

['我', '爱', '玩', '英', '雄', '联', '盟']

In [21]:
# Display the token -> id mapping used by the tokenizer.
# NOTE(review): this prints the whole mapping (the tokenizer repr reports
# vocab_size=21128); consider showing only its size or a small slice.
tokenizer.vocab

{'##よう': 10177,
 '##颳': 20651,
 '扇': 2794,
 '萼': 5861,
 '链': 7216,
 '##δ': 13383,
 '氤': 3707,
 '辯': 6800,
 '##oman': 13127,
 '##携': 16082,
 '##ean': 10096,
 '宵': 2156,
 '##dr': 11582,
 '偶': 981,
 '##min': 9844,
 'unit': 12816,
 'はしめまして': 13012,
 '罄': 5378,
 '辫': 6797,
 'amana': 12406,
 '揄': 2985,
 'el': 10245,
 '##re': 8358,
 '##椭': 16552,
 '駿': 7695,
 '实': 2141,
 '荫': 5789,
 '##楮': 16569,
 '##泗': 16849,
 '##竖': 18047,
 '##親': 19274,
 '冢': 1095,
 '30': 8114,
 '##辣': 19850,
 '##譁': 19405,
 '評': 6268,
 '境': 1862,
 '##窜': 18029,
 '##選': 19965,
 'cio': 9351,
 '##頷': 20590,
 '##績': 18302,
 '泛': 3793,
 '##腔': 18636,
 '刻': 1174,
 '媾': 2062,
 '禽': 4896,
 '##乓': 13786,
 '513': 13310,
 '滁': 3991,
 '胸': 5541,
 '##凋': 14175,
 '広': 2410,
 '舂': 5642,
 '##my': 9009,
 '麟': 7929,
 'top100': 10124,
 '##咬': 14547,
 '口': 1366,
 'special': 9969,
 '413': 12561,
 '##徠': 15595,
 '忧': 2569,
 '即': 1315,
 '##sio': 12266,
 '##庆': 15469,
 '审': 2144,
 '蘊': 5980,
 '揸': 3003,
 '##恬': 15677,
 '琺': 4436,
 '埃': 1812,
 '

In [22]:
# Step 2: look up each token's integer id in the vocabulary.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[2769, 4263, 4381, 5739, 7413, 5468, 4673]

In [23]:
# Inverse lookup: ids back to tokens (round-trips to the same token list as above).
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

['我', '爱', '玩', '英', '雄', '联', '盟']

In [24]:
# Join the tokens back into one string; note the result is space-separated,
# so it is not byte-identical to the original `text`.
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen

'我 爱 玩 英 雄 联 盟'

In [25]:
# Shortcut: go from raw text to ids in a single call.
ids = tokenizer.encode(text)
ids  # [CLS] (101) is prepended and [SEP] (102) is appended

[101, 2769, 4263, 4381, 5739, 7413, 5468, 4673, 102]

In [26]:
# Shortcut in the other direction: ids back to a string. The special tokens
# added by encode() are kept in the decoded text.
str_sen = tokenizer.decode(ids)
str_sen

'[CLS] 我 爱 玩 英 雄 联 盟 [SEP]'

In [27]:
# Pad the encoded sequence on the right with [PAD] (id 0) up to max_length=15.
# `ids` from this cell is reused by the manual attention-mask cell further down.
ids = tokenizer.encode(text,padding="max_length",max_length=15)
ids

[101, 2769, 4263, 4381, 5739, 7413, 5468, 4673, 102, 0, 0, 0, 0, 0, 0]

In [28]:
# Truncate to at most 5 ids; [CLS] and [SEP] are still present and count
# toward the limit, so only three content tokens survive.
test = tokenizer.encode(text,max_length=5,truncation=True)
test

[101, 2769, 4263, 4381, 102]

In [29]:
# Hand-build the auxiliary model inputs for the padded `ids` produced by the
# padding="max_length" cell, to show what the tokenizer generates automatically.
# The mask is 1 for real tokens and 0 for padding. Compare against the
# tokenizer's own pad id instead of a literal 0 so the mask stays correct for
# vocabularies whose [PAD] token is not id 0.
attention_mask = [1 if idx != tokenizer.pad_token_id else 0 for idx in ids]
# Single-sentence input: every position is assigned to segment 0.
token_type_ids = [0] * len(ids)
ids,attention_mask,token_type_ids

([101, 2769, 4263, 4381, 5739, 7413, 5468, 4673, 102, 0, 0, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [30]:
# encode_plus returns input_ids, token_type_ids and attention_mask together,
# matching the values built by hand in the previous cell.
# NOTE(review): calling the tokenizer object directly (next cell) gives the same
# result and is presumably the preferred spelling in current transformers
# releases; confirm against the installed version.
inputs = tokenizer.encode_plus(text,padding="max_length",max_length=15)
inputs

{'input_ids': [101, 2769, 4263, 4381, 5739, 7413, 5468, 4673, 102, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]}

In [31]:
# Calling the tokenizer object directly yields the same three fields as encode_plus.
inputs = tokenizer(text,padding="max_length",max_length=15)
inputs

{'input_ids': [101, 2769, 4263, 4381, 5739, 7413, 5468, 4673, 102, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]}

批量（Batch）处理：一次向 tokenizer 传入多个句子

In [37]:
# One standalone sentence, plus a three-sentence batch that starts with it.
sen = "我爱你"
sens = [sen, "我想跟你共度余生", "你是我的另一半"]
# Passing a list encodes every sentence in one call. No padding is requested,
# so each sentence keeps its own length in the result.
res = tokenizer(sens)
res

{'input_ids': [[101, 2769, 4263, 872, 102], [101, 2769, 2682, 6656, 872, 1066, 2428, 865, 4495, 102], [101, 872, 3221, 2769, 4638, 1369, 671, 1288, 102]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [38]:
%%time

# Timing: 1000 separate tokenizer calls, each encoding the three-sentence list `sens`.
# The result is discarded on purpose; only the elapsed time matters here.
for i in range(1000):
    tokenizer(sens)

CPU times: total: 328 ms
Wall time: 120 ms


In [39]:
%%time

# Timing: a single batched call over 1000 copies of `sen`. Compare with the
# loop further down that encodes the same sentence one call at a time.
res = tokenizer([sen]*1000)

CPU times: total: 0 ns
Wall time: 9.75 ms


In [40]:
# Re-display the tokenizer repr; is_fast=True shows this is the fast implementation.
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [41]:
%%time

# Timing: the same 1000 sentences as the batched call above, but encoded with
# one tokenizer call per sentence. `res` ends up holding only the last result.
for i in range(1000):
    res = tokenizer(sen)

CPU times: total: 46.9 ms
Wall time: 33.2 ms


In [42]:
# Additionally return an offset pair per token locating it inside `text`;
# the special tokens [CLS]/[SEP] are reported as (0, 0).
inputs = tokenizer(text,return_offsets_mapping=True)
inputs

{'input_ids': [101, 2769, 4263, 4381, 5739, 7413, 5468, 4673, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (0, 0)]}

In [44]:
# Per-token word index for the encoded sentence; None marks the special tokens
# at both ends, which do not come from the input text.
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, None]