# Tokenizer 基本使用

In [1]:
from transformers import AutoTokenizer

In [2]:
# Sample sentence used by the basic tokenizer cells that follow.
sen = "弱小的我也有大梦想!"

## Step1 加载与保存

In [3]:
# Load from the Hugging Face Hub: passing a model name loads the matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [None]:
# Save the tokenizer to a local directory.
tokenizer.save_pretrained("./roberta_tokenizer")

In [None]:
# Load the tokenizer from a local directory instead of the Hub.
tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer/")
tokenizer

## Step2 句子分词

In [4]:
# Split the sentence into a list of string tokens (not ids yet).
tokens = tokenizer.tokenize(sen)
tokens

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']

## Step3 查看词典

In [5]:
# Token-to-id mapping. The output is long: vocab_size reports 21128 for this tokenizer.
tokenizer.vocab

{'##況': 16842,
 '##爐': 17314,
 'ｔ': 8070,
 '漣': 4032,
 '##更': 16348,
 'vision': 10762,
 '阎': 7330,
 '##垃': 14853,
 '##懶': 15811,
 '##渥': 17001,
 '##种': 17962,
 '##辊': 19836,
 '##掸': 16038,
 '鳩': 7853,
 '##host': 12227,
 '桨': 3444,
 '##」': 13655,
 '##拧': 15940,
 'evernote': 13138,
 '蔑': 5912,
 '##均': 14829,
 '颡': 7586,
 '##韻': 20569,
 'iv': 10573,
 'も': 571,
 '##賂': 19590,
 '嗬': 1637,
 '籠': 5096,
 '殿': 3671,
 '伉': 822,
 '##{': 13344,
 '##ц': 13421,
 '##籌': 18149,
 '閡': 7282,
 '绾': 5343,
 '##lia': 10336,
 'mems': 12530,
 '##満': 17027,
 'vc': 9438,
 '##尘': 15269,
 '##賠': 19600,
 '##饋': 20694,
 '##駿': 20752,
 '軸': 6729,
 '##单': 14353,
 '台': 1378,
 '##な': 8730,
 '001': 9263,
 '381': 12185,
 '##唐': 14595,
 '##焚': 17247,
 '##钉': 20209,
 '３０': 10684,
 '##鹉': 20959,
 'ж': 239,
 '100g': 10606,
 '##ena': 12000,
 '##虧': 19057,
 '卿': 1321,
 '壞': 1889,
 '懵': 2753,
 '↓↓': 12558,
 '##蕻': 19001,
 '贪': 6576,
 '瘢': 4605,
 '鹤': 7911,
 '1922': 10209,
 'some': 13048,
 '##疮': 17612,
 '诏': 6405,
 '覇': 6209,
 

In [6]:
# Vocabulary size reported by the tokenizer.
tokenizer.vocab_size

21128

## Step4 索引转换

In [7]:
# Convert the token sequence into an id sequence.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106]

In [8]:
# Convert the id sequence back into a token sequence.
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']

In [9]:
# Join the token sequence back into a single string.
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen

'弱 小 的 我 也 有 大 梦 想!'

### 更便捷的实现方式

In [10]:
# Convert a string directly into an id sequence (encoding).
# add_special_tokens=True wraps the ids in [CLS] (101) and [SEP] (102).
ids = tokenizer.encode(sen, add_special_tokens=True)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102]

In [11]:
# Convert an id sequence directly back into a string (decoding).
# skip_special_tokens=False keeps [CLS] and [SEP] in the decoded text.
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen

'[CLS] 弱 小 的 我 也 有 大 梦 想! [SEP]'

## Step5 填充与截断

In [12]:
# Padding: extend the encoded ids with the [PAD] id up to max_length=15.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]

In [13]:
# Truncation: cut the encoded ids down to max_length=5, special tokens included.
ids = tokenizer.encode(sen, max_length=5, truncation=True)
ids

[101, 2483, 2207, 4638, 102]

## Step6 其他输入部分

In [14]:
# Padded encoding used as the starting point for building the other model inputs by hand.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]

In [15]:
# Build the remaining model inputs by hand from the padded ids.
# Compare against the tokenizer's own pad id rather than a literal 0, so the
# mask stays correct for tokenizers whose padding id is not 0.
attention_mask = [int(idx != tokenizer.pad_token_id) for idx in ids]
# Every position is assigned segment id 0 (single-sentence input).
token_type_ids = [0] * len(ids)
ids, attention_mask, token_type_ids

([101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Step7 快速调用方式

In [16]:
# encode_plus returns input_ids, token_type_ids and attention_mask in one call.
# NOTE(review): the transformers docs describe encode_plus as deprecated in favour
# of calling the tokenizer directly — confirm for the installed version.
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

In [17]:
# Calling the tokenizer object directly gives the same dictionary as encode_plus.
inputs = tokenizer(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

## Step8 处理batch数据

In [18]:
# Passing a list of sentences tokenizes them as a batch.
# No padding is requested, so each sentence keeps its own length.
sens = ["弱小的我也有大梦想",
        "有梦想谁都了不起",
        "追逐梦想的心，比梦想本身，更可贵"]
res = tokenizer(sens)
res

{'input_ids': [[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102], [101, 3300, 3457, 2682, 6443, 6963, 749, 679, 6629, 102], [101, 6841, 6852, 3457, 2682, 4638, 2552, 8024, 3683, 3457, 2682, 3315, 6716, 8024, 3291, 1377, 6586, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [19]:
%%time
# Baseline timing: tokenize the same sentence 1000 times, one call per sentence.
for i in range(1000):
    tokenizer(sen)

CPU times: user 73.3 ms, sys: 661 µs, total: 73.9 ms
Wall time: 73.1 ms


In [20]:
%%time
# Batch timing: tokenize 1000 copies of the sentence in a single call.
res = tokenizer([sen] * 1000)

CPU times: user 139 ms, sys: 16.3 ms, total: 156 ms
Wall time: 13.7 ms


In [21]:
# Display the tokenizer's configuration again.
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

# Fast / Slow Tokenizer

In [22]:
# Mixed Chinese/English sentence used for the fast vs. slow tokenizer comparison.
sen = "弱小的我也有大Dreaming!"

In [23]:
# With default arguments AutoTokenizer returns the fast implementation here
# (BertTokenizerFast, is_fast=True in the repr).
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [24]:
# use_fast=False requests the slow implementation
# (BertTokenizer, is_fast=False in the repr).
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer

BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [25]:
%%time
# Fast tokenizer, one call per sentence, 10000 iterations.
for i in range(10000):
    fast_tokenizer(sen)

CPU times: user 656 ms, sys: 80 µs, total: 656 ms
Wall time: 656 ms


In [26]:
%%time
# Slow tokenizer, one call per sentence, 10000 iterations.
for i in range(10000):
    slow_tokenizer(sen)

CPU times: user 1.82 s, sys: 0 ns, total: 1.82 s
Wall time: 1.82 s


In [27]:
%%time
# Fast tokenizer, a single batched call over 10000 copies of the sentence.
res = fast_tokenizer([sen] * 10000)

CPU times: user 924 ms, sys: 107 ms, total: 1.03 s
Wall time: 191 ms


In [28]:
%%time
# Slow tokenizer, a single batched call over 10000 copies of the sentence.
res = slow_tokenizer([sen] * 10000)

CPU times: user 1.68 s, sys: 0 ns, total: 1.68 s
Wall time: 1.68 s


In [29]:
# return_offsets_mapping=True adds an 'offset_mapping' entry: one (start, end)
# span into the input string per token, with (0, 0) for the special tokens.
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 10252, 8221, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 12), (12, 15), (15, 16), (0, 0)]}

In [30]:
# Word index for each token: None for special tokens, and a repeated index
# where one word was split into several tokens.
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [31]:
# Slow (pure-Python) tokenizers cannot produce offset mappings. The error is the
# point of this cell, so catch it and display it; an uncaught exception would
# stop "Restart Kernel -> Run All" at this cell.
try:
    inputs = slow_tokenizer(sen, return_offsets_mapping=True)
except NotImplementedError as err:
    print(f"{type(err).__name__}: {err}")

NotImplementedError: return_offset_mapping is not available when using Python tokenizers. To use this feature, change your tokenizer to one deriving from transformers.PreTrainedTokenizerFast. More information on available tokenizers at https://github.com/huggingface/transformers/pull/2674

# 特殊Tokenizer的加载

In [34]:
import os
# Point both proxy environment variables at a proxy on localhost port 7890.
# NOTE(review): presumably only needed where the Hub is not reachable directly;
# skip this cell if no proxy is listening on that port.
os.environ.update({
    "http_proxy": "http://127.0.0.1:7890",
    "https_proxy": "http://127.0.0.1:7890",
})


In [35]:
from transformers import AutoTokenizer

In [36]:
# Newer transformers releases (>4.34) fail to load THUDM/chatglm, so the
# Skywork model is used here instead.
# trust_remote_code=True executes tokenization_skywork.py downloaded from the Hub.
# Pin the revision to the commit this notebook was run against, so the remote
# code cannot change silently between runs.
tokenizer = AutoTokenizer.from_pretrained(
    "Skywork/Skywork-13B-base",
    trust_remote_code=True,
    revision="bc35915066fbbf15b77a1a4a74e9b574ab167816",
)
tokenizer

tokenizer_config.json:   0%|          | 0.00/857 [00:00<?, ?B/s]

tokenization_skywork.py:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Skywork/Skywork-13B-base:
- tokenization_skywork.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.model:   0%|          | 0.00/994k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

You are using the legacy behaviour of the <class 'transformers_modules.Skywork.Skywork-13B-base.bc35915066fbbf15b77a1a4a74e9b574ab167816.tokenization_skywork.SkyworkTokenizer'>. This means that tokens that come after special tokens will not be properly handled. 


SkyworkTokenizer(name_or_path='Skywork/Skywork-13B-base', vocab_size=65519, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [None]:
# Save the Skywork tokenizer to a local directory.
tokenizer.save_pretrained("skywork_tokenizer")

In [None]:
# Reload the tokenizer from the local directory; trust_remote_code=True is passed here as well.
tokenizer = AutoTokenizer.from_pretrained("skywork_tokenizer", trust_remote_code=True)

In [37]:
# Round trip: encode the sentence, then decode the ids back to text.
# The decoded text comes back with a leading '<s>' (see output).
tokenizer.decode(tokenizer.encode(sen))

'<s>弱小的我也有大Dreaming!'