Tokenizer 基本使用

In [2]:
from transformers import AutoTokenizer

Step1 加载和保存

In [28]:
# 从HuggingFace加载，输入模型名称，即可加载对于的分词器
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [4]:
# 保存到本地
tokenizer.save_pretrained("./my_tokenizer")

('./my_tokenizer/tokenizer_config.json',
 './my_tokenizer/special_tokens_map.json',
 './my_tokenizer/vocab.txt',
 './my_tokenizer/added_tokens.json',
 './my_tokenizer/tokenizer.json')

In [None]:
# 从本地加载tokenizer
tokenizer = AutoTokenizer.from_pretrained("./my_tokenizer/")

Step2 句子分词

In [5]:
tokens = tokenizer.tokenize("你好，世界！")
print(tokens)

['你', '好', '，', '世', '界', '！']


Step3 查看词典

In [None]:
# 词和他的索引号
tokenizer.vocab

{'lan': 11461,
 '##蠻': 19173,
 '1993': 8516,
 '##滬': 17071,
 'cpa': 11963,
 '2014': 8127,
 '##裙': 19227,
 '##忏': 15614,
 '杂': 3325,
 '##篑': 18122,
 '##贡': 19624,
 '簽': 5087,
 '矿': 4771,
 '##某': 16435,
 '隍': 7388,
 '##fig': 11392,
 '##co': 8588,
 '##gan': 10622,
 'twice': 13054,
 '##豊': 19547,
 '煦': 4211,
 '##；': 21085,
 '##舉': 18704,
 '##呤': 14508,
 '##dio': 12159,
 '６': 8034,
 '##偈': 14027,
 '##赢': 19674,
 '##015': 8851,
 '##с': 13416,
 'victoria': 12445,
 '230': 9111,
 '2765': 9513,
 '漉': 4025,
 '##印': 14370,
 '##畫': 17586,
 'lohas': 9757,
 '##癫': 17683,
 '##gg': 9949,
 '濡': 4091,
 '鑲': 7146,
 '##疼': 17620,
 '##郡': 20008,
 '##搡': 16076,
 '億': 1023,
 '办': 1215,
 '槳': 3551,
 '##cal': 10384,
 '##骋': 20797,
 '祈': 4857,
 '訕': 6247,
 '##輟': 19798,
 '##㎡': 9236,
 '珉': 4395,
 '##噁': 14736,
 '耶': 5456,
 'joseph': 11151,
 '##使': 13943,
 '祉': 4858,
 '##陡': 20424,
 '##前': 14241,
 '熙': 4224,
 '蒜': 5886,
 '##映': 16273,
 '卸': 1319,
 '1200': 8552,
 '功': 1216,
 '蓆': 5899,
 '袂': 6146,
 '##想': 15739,
 

In [7]:
tokenizer.vocab_size

21128

Step4 索引转换

In [None]:
# 将词序列转换为id序列
# 这里只是转换成词典对应的索引,没有起始和结束标记
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[872, 1962, 8024, 686, 4518, 8013]

In [9]:
# 将id序列转换为token序列
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

['你', '好', '，', '世', '界', '！']

In [10]:
# 将token序列转换为string
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen

'你 好 ， 世 界 ！'

In [None]:
# 更便捷的方式
# 将字符串转换为id序列，又称之为编码
# add_special_tokens 每个句子包含开始和结束标记符, 默认为True
sen = "你好，世界！"
ids = tokenizer.encode(sen, add_special_tokens=True)
ids

[101, 872, 1962, 8024, 686, 4518, 8013, 102]

In [15]:
# 将id序列转换为字符串，又称之为解码
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen

'[CLS] 你 好 ， 世 界 ！ [SEP]'

Step5 填充和截断

In [None]:
# 填充
ids = tokenizer.encode(sen, padding="max_length", max_length=10)
ids

[101, 872, 1962, 8024, 686, 4518, 8013, 102, 0, 0]

In [None]:
# 截断
# [cls] + max_length - 2 + [sep]
ids = tokenizer.encode(sen, truncation=True, max_length=5)
ids

[101, 872, 1962, 8024, 102]

Step6 其他输入部分

In [19]:
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 872, 1962, 8024, 686, 4518, 8013, 102, 0, 0, 0, 0, 0, 0, 0]

In [21]:
attention_mask = [1 if idx != 0 else 0 for idx in ids]
attention_mask

[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]

In [22]:
token_type_ids = [0] * len(ids)
token_type_ids

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Step7 快速调用方式

In [None]:
# 一次性将step6中的三个部分都生成,以字典的形式返回
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 872, 1962, 8024, 686, 4518, 8013, 102, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]}

In [None]:
# 一次性将step6中的三个部分都生成,以字典的形式返回
inputs = tokenizer(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 872, 1962, 8024, 686, 4518, 8013, 102, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]}

Step8 处理batch数据

In [None]:
sens = ["你好，世界！", "你好，中国中国！"]
inputs = tokenizer(sens)    # 不使用padding,每个句子长度不一致
inputs

{'input_ids': [[101, 872, 1962, 8024, 686, 4518, 8013, 102], [101, 872, 1962, 8024, 704, 1744, 704, 1744, 8013, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [33]:
%%time
# 以batch的方式处理数据更快
# 单条循环处理
for i in range(10000):
    tokenizer(sen)

CPU times: user 220 ms, sys: 0 ns, total: 220 ms
Wall time: 220 ms


In [34]:
%%time
# 处理batch数据
res = tokenizer([sen] * 1000)

CPU times: user 44.9 ms, sys: 0 ns, total: 44.9 ms
Wall time: 5.05 ms


Fast/Slow Tokenizer

In [35]:
sen = "弱小的我也有大Dreaming!"

In [36]:
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [37]:
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer

BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [38]:
%%time
# 单条循环处理
for i in range(10000):
    fast_tokenizer(sen)

CPU times: user 270 ms, sys: 0 ns, total: 270 ms
Wall time: 270 ms


In [39]:
%%time
# 单条循环处理
for i in range(10000):
    slow_tokenizer(sen)

CPU times: user 943 ms, sys: 0 ns, total: 943 ms
Wall time: 943 ms


In [40]:
%%time
# 处理batch数据
res = fast_tokenizer([sen] * 10000)

CPU times: user 475 ms, sys: 12.8 ms, total: 488 ms
Wall time: 112 ms


In [41]:
%%time
# 处理batch数据
res = slow_tokenizer([sen] * 10000)

CPU times: user 794 ms, sys: 0 ns, total: 794 ms
Wall time: 794 ms


In [None]:
# return_offsets_mapping是一个可选参数，如果设置为True，则返回每个token在原始句子中的位置
# fast_tokenizer专有的
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 10252, 8221, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 12), (12, 15), (15, 16), (0, 0)]}

In [None]:
# sen = "弱小的我也有大Dreaming!"
inputs.word_ids()
# 输出是一个列表，列表中的每个元素表示该token在原始句子中的位置

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

特殊Tokenizer的加载

In [None]:
# trust_remote_code要加上
tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-13B-base", trust_remote_code=True)
tokenizer