In [3]:
import os
os.environ["http_proxy"] = "http://127.0.0.1:8889"
os.environ["https_proxy"] = "http://127.0.0.1:8889"

In [3]:
 from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
sen = "大大的太阳在天上照耀着大地"

## load and save

In [4]:
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [5]:
# save tokenizer
# tokenizer.save_pretrained("./tokenizer")
# tokenizer = AutoTokenizer.from_pretrained("./tokenizer")

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')

## tokenize the sentence

In [5]:
tokens = tokenizer.tokenize(sen)
tokens

['大', '大', '的', '太', '阳', '在', '天', '上', '照', '耀', '着', '大', '地']

## dictionary

In [7]:
tokenizer.vocab

{'50ml': 11538,
 'study': 12685,
 '洁': 3815,
 '##臓': 18682,
 '##揩': 16054,
 '噻': 1699,
 'xxx': 8790,
 '##纰': 18341,
 '##相': 17742,
 '戊': 2763,
 '##佘': 13921,
 '副': 1199,
 '艳': 5683,
 '701': 12656,
 '選': 6908,
 '##拎': 15922,
 '##卿': 14378,
 '旖': 3185,
 '##椿': 16556,
 '##舫': 18719,
 '##13': 8679,
 'sweet': 10598,
 '##賣': 19603,
 '##艺': 18743,
 'pre': 11685,
 '##copyright': 13291,
 '舶': 5667,
 '##休': 13885,
 '##炳': 17211,
 '##鱷': 20879,
 '##ier': 9181,
 'tim': 10505,
 '##lter': 12171,
 '荷': 5792,
 '##疇': 17596,
 '愫': 2701,
 '1902': 11487,
 '︱': 7993,
 'williams': 11475,
 '煎': 4203,
 '##首': 20731,
 '儕': 1028,
 '##vic': 11956,
 '╞': 441,
 '犧': 4304,
 'november': 9756,
 '砷': 4789,
 '麾': 7940,
 '布': 2357,
 '##覺': 19278,
 'chan': 9318,
 '##rt': 8716,
 '遗': 6890,
 '羣': 5407,
 'erp': 9529,
 '##下': 13735,
 '埕': 1816,
 '⑷': 418,
 '駱': 7694,
 '濬': 4094,
 'ｑ': 8067,
 '钥': 7170,
 '##侍': 13949,
 '诲': 6431,
 '┊': 433,
 '##啰': 14631,
 '##嗡': 14688,
 '8': 129,
 '##ab': 11008,
 'cbd': 11249,
 '##徙': 15592

In [8]:
tokenizer.vocab_size

21128

## index transformation

In [9]:
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765]

In [10]:
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

['大', '大', '的', '太', '阳', '在', '天', '上', '照', '耀', '着', '大', '地']

In [11]:
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen

'大 大 的 太 阳 在 天 上 照 耀 着 大 地'

In [12]:
ids = tokenizer.encode(sen) # add_special_tokens=True
ids

[101,
 1920,
 1920,
 4638,
 1922,
 7345,
 1762,
 1921,
 677,
 4212,
 5438,
 4708,
 1920,
 1765,
 102]

In [13]:
str_sen = tokenizer.decode(ids)
str_sen

'[CLS] 大 大 的 太 阳 在 天 上 照 耀 着 大 地 [SEP]'

In [14]:
str_sen = tokenizer.decode(ids, skip_special_tokens=True)
str_sen

'大 大 的 太 阳 在 天 上 照 耀 着 大 地'

## padding and truncating

In [19]:
ids = tokenizer.encode(sen, padding="max_length", max_length=20)
ids 

[101,
 1920,
 1920,
 4638,
 1922,
 7345,
 1762,
 1921,
 677,
 4212,
 5438,
 4708,
 1920,
 1765,
 102,
 0,
 0,
 0,
 0,
 0]

In [17]:
ids = tokenizer.encode(sen, max_length=10, truncation=True)
ids 

[101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 102]

## other inputs

In [21]:
ids = tokenizer.encode(sen, padding="max_length", max_length=20, truncation=True)
attention_mask = [1 if idx !=0 else 0 for idx in ids]
token_type_ids = [0] * len(ids)
ids, attention_mask, token_type_ids

([101,
  1920,
  1920,
  4638,
  1922,
  7345,
  1762,
  1921,
  677,
  4212,
  5438,
  4708,
  1920,
  1765,
  102,
  0,
  0,
  0,
  0,
  0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [22]:
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=20, truncation=True)
inputs

{'input_ids': [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}

In [23]:
inputs = tokenizer(sen, padding="max_length", max_length=20, truncation=True)
inputs

{'input_ids': [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}

## batch

In [25]:
sens = ["今天是星期一",
        "这只小狗好可爱",
        "华为遥遥领先，苹果不太行"]

res = tokenizer(sens, padding="max_length", max_length=15, truncation=True)
res

{'input_ids': [[101, 791, 1921, 3221, 3215, 3309, 671, 102, 0, 0, 0, 0, 0, 0, 0], [101, 6821, 1372, 2207, 4318, 1962, 1377, 4263, 102, 0, 0, 0, 0, 0, 0], [101, 1290, 711, 6898, 6898, 7566, 1044, 8024, 5741, 3362, 679, 1922, 6121, 102, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]}

In [26]:
%%time
for i in range(1000):
    tokenizer(sen)
    

CPU times: user 40.1 ms, sys: 0 ns, total: 40.1 ms
Wall time: 39.9 ms


In [28]:
%%time
tokenizer([sen]*1000)

CPU times: user 49.4 ms, sys: 0 ns, total: 49.4 ms
Wall time: 7.5 ms


{'input_ids': [[101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 

## Fast / Slow Tokenizer

In [29]:
# fast
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")

# slow
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)


In [30]:
fast_tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [31]:
slow_tokenizer

BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [32]:
%%time
for i in range(1000):
    fast_tokenizer(sen)

CPU times: user 36.3 ms, sys: 303 µs, total: 36.6 ms
Wall time: 36.2 ms


In [33]:
%%time
for i in range(1000):
    slow_tokenizer(sen)

CPU times: user 80.8 ms, sys: 0 ns, total: 80.8 ms
Wall time: 80.8 ms


In [34]:
%%time
fast_tokenizer([sen]*1000)

CPU times: user 43.6 ms, sys: 0 ns, total: 43.6 ms
Wall time: 7.04 ms


{'input_ids': [[101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 

In [35]:
%%time
slow_tokenizer([sen]*1000)

CPU times: user 104 ms, sys: 0 ns, total: 104 ms
Wall time: 104 ms


{'input_ids': [[101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 1920, 1920, 4638, 1922, 7345, 1762, 1921, 677, 4212, 5438, 4708, 1920, 1765, 102], [101, 

In [45]:
sen = "我正在dreaming"
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs

{'input_ids': [101, 2769, 3633, 1762, 10252, 8221, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 8), (8, 11), (0, 0)]}

In [46]:
inputs.offset_mapping

[(0, 0), (0, 1), (1, 2), (2, 3), (3, 8), (8, 11), (0, 0)]

In [47]:
inputs.word_ids()

[None, 0, 1, 2, 3, 3, None]

## special tokens

In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
tokenizer

ChatGLMTokenizer(name_or_path='THUDM/chatglm-6b', vocab_size=130344, model_max_length=2048, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<sop>', 'eos_token': '<eop>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [5]:
tokenizer.decode(tokenizer.encode("我正在dreaming"))

'我正在dreaming'