# 子词词元化过程

In [2]:
# Sample inputs for the tokenizer experiments in this notebook.
s1, s2, s3, s4, s5, s6 = (
    'albums sold 124443286539 copies',           # contains a long run of digits
    'technically perfect, melodically correct',  # contains punctuation
    'featuring a previously unheard track',
    'bestselling music artist',
    's1 d1 o1 and o2',                           # short letter+digit fragments
    'asbofwheohwbeif',                           # a single string with no spaces
)

## 0. 实例化 tokenizer

In [3]:
from transformers import BertTokenizer

# Name of the pretrained checkpoint whose tokenizer is used in this notebook.
model_name = 'bert-base-uncased'
# Build the tokenizer from that checkpoint name.
tokenizer = BertTokenizer.from_pretrained(model_name)



## 1. vocab

- `tokenizer.vocab`：OrderedDict 类型，键为 token，值为 id
- `tokenizer.ids_to_tokens`：OrderedDict 类型，键为 id，值为 token
- `len(tokenizer.vocab) == len(tokenizer.ids_to_tokens) == 30522`

In [4]:
# Uncomment one of the lines below to display the full mapping.
# NOTE(review): presumably left commented out because each mapping has 30522 entries.
# tokenizer.vocab
# tokenizer.ids_to_tokens

In [5]:
# Sizes of the token -> id mapping and of the id -> token mapping, shown as a pair.
len(tokenizer.vocab), len(tokenizer.ids_to_tokens)

(30522, 30522)

In [6]:
tokenizer.vocab['[UNK]']   # look up the id of a specific token

100

In [7]:
tokenizer.tokenize('bestselling')   # tokenize maps a word, as far as possible, onto keys of vocab

['best', '##sell', '##ing']

In [8]:
# Count the vocabulary entries whose token starts with '##'.
# Iterating the mapping yields its keys (the tokens) directly, so the ids are
# never fetched. The generator expression also keeps the loop variable local:
# the previous `for token, id in ...` loop rebound the builtin name `id` to an
# int in the notebook's global namespace for every later cell.
cnt_sharp = sum(1 for token in tokenizer.vocab if token.startswith('##'))
print(cnt_sharp)

5828


## 2. 样本子词测试

In [9]:
# Run the tokenizer on the sample s6, then print the resulting ids and, on the
# next line, the tokens those ids stand for.
inputs = tokenizer(s6)
print(
    inputs['input_ids'],
    tokenizer.convert_ids_to_tokens(inputs['input_ids']),
    sep='\n',
)

[101, 2004, 5092, 2546, 2860, 5369, 11631, 2860, 19205, 2546, 102]
['[CLS]', 'as', '##bo', '##f', '##w', '##he', '##oh', '##w', '##bei', '##f', '[SEP]']


## 3. 小结

- tokenizer 轻易不会将一个词处理为 `[UNK]`（id 为 100）：词表中没有的词会先被拆分成若干子词（如上面 `s6` 的结果），只有无法完全拆分为词表中的子词时才会得到 `[UNK]`
- 基于词汇表，tokenize, encode, decode 为三个重要方法
    - tokenize: word => tokens（将 word 尽可能地映射为 vocab 中的 keys）
    - encode: token => id（`tokenizer.encode` 接收的是文本：内部先 tokenize，再把每个 token 映射为 id，并在首尾添加 `[CLS]`、`[SEP]`）
    - decode: id => token => word（decode 要能很好地将 id 还原，尽可能与输入的 word 对齐）