# Tokenizer 详解

- tokenizer 的作用：构造模型需要的输入。
- tokenizer 和 model 相匹配，tokenizer 的输出将作为对应 model 的输入。
- tokenizer 内部执行的过程：
    - `tokenizer(test_senteces[0])`：实际调用的是 `tokenizer.__call__`，其内部完成编码（encode），返回 `input_ids` 和 `attention_mask`
    - `tokenizer.encode` ≈ `tokenizer.tokenize` + `tokenizer.convert_tokens_to_ids`，此外 `encode` 还会在首尾添加特殊 token（`[CLS]`、`[SEP]`）
    - `tokenizer.decode`：tokenizer 的解码函数
- tokenizer 工作的原理其实就是 `tokenizer.vocab`：字典，存储了 `token => id` 的映射关系
    - `tokenizer.special_tokens_map` 是 tokenizer 字典中特殊的 token
- tokenizer 得到的结果示例：
    - input_ids：得到的每一个 token 的 id
    - attention_mask：与 padding 相对应。句子长度不足时会用 padding 补齐，补齐位置的 attention_mask 为 0，其余位置为 1；`len(input_ids) == len(attention_mask)`

> `AutoTokenizer`, `AutoModel` 是通用类型，可以用于加载预训练好的模型。

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'   # name of the pretrained checkpoint

# Load the tokenizer and the model from the same checkpoint name so that the
# tokenizer's output matches what the model expects as input.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)



In [4]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## 1. tokenizer, 构造模型输入

In [5]:
# Test data: a small batch of sentences of different lengths.
test_senteces = [
    'today is not that bad', 
    'today is so bad', 
    'so good',
    'such a wonderful day',
    'this is a white table'
]

In [6]:
# Tokenize the whole batch at once.
# - padding=True pads the shorter sentences so every row has the same length
#   (padded positions get id 0 and attention_mask 0, see the output below).
# - return_tensors='pt' returns PyTorch tensors instead of Python lists.
# - truncation=True enables truncation of over-long inputs (none of these
#   sentences is long enough to be truncated).
batch_input = tokenizer(test_senteces, truncation=True, padding=True, return_tensors='pt')
batch_input   # result produced by the tokenizer

{'input_ids': tensor([[ 101, 2651, 2003, 2025, 2008, 2919,  102],
        [ 101, 2651, 2003, 2061, 2919,  102,    0],
        [ 101, 2061, 2204,  102,    0,    0,    0],
        [ 101, 2107, 1037, 6919, 2154,  102,    0],
        [ 101, 2023, 2003, 1037, 2317, 2795,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1]])}

### 细节探究

In [7]:
tokenizer(test_senteces[0], )   # tokenize a single sentence; without return_tensors the result holds plain lists

{'input_ids': [101, 2651, 2003, 2025, 2008, 2919, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [8]:
tokenizer.encode(test_senteces[0], )   # encode alone also yields the input_ids

[101, 2651, 2003, 2025, 2008, 2919, 102]

In [9]:
# tokenize + convert_tokens_to_ids reproduces the ids returned by
# tokenizer.encode, except for the special tokens ([CLS] = 101, [SEP] = 102)
# that encode adds at the start and end of the sequence.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(test_senteces[0]))

[2651, 2003, 2025, 2008, 2919]

In [10]:
tokenizer.decode([101, 2651, 2003, 2025, 2008, 2919, 102])   # decode ids back into text

'[CLS] today is not that bad [SEP]'

In [11]:
type(tokenizer.vocab), len(tokenizer.vocab)   # the vocabulary is a dict with 30522 entries

(dict, 30522)

In [12]:
tokenizer.special_tokens_map   # maps each special-token role to its token string

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [13]:
list(tokenizer.special_tokens_map.values())

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [14]:
# Ids of the special tokens, in the same order as special_tokens_map.values().
tokenizer.convert_tokens_to_ids(list(tokenizer.special_tokens_map.values()))

[100, 102, 0, 101, 103]

## 2. model，调用模型

In [15]:
import torch
import torch.nn.functional as F

In [16]:
model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.40.2",
  "vocab_size": 30522
}

In [17]:
# Inference only: torch.no_grad() disables gradient tracking for the forward pass.
with torch.no_grad():
    outputs = model(**batch_input)
    print(outputs)   # raw model output; the class scores are in outputs.logits

    # Softmax over the class dimension turns the logits into probabilities.
    scores = F.softmax(outputs.logits, dim=1)
    print(scores)

    # Pick the most probable class index for each sentence ...
    label_ids = torch.argmax(scores, dim=1)
    print(label_ids)
    # ... and map each index to its name through the model config's id2label
    # table. `label_id` is used instead of `id` to avoid shadowing the builtin.
    labels = [model.config.id2label[label_id] for label_id in label_ids.tolist()]
    print(labels)

SequenceClassifierOutput(loss=None, logits=tensor([[-3.4620,  3.6118],
        [ 4.7508, -3.7899],
        [-4.1938,  4.5566],
        [-4.3905,  4.7151],
        [ 1.7080, -1.4675]]), hidden_states=None, attentions=None)
tensor([[8.4632e-04, 9.9915e-01],
        [9.9980e-01, 1.9531e-04],
        [1.5837e-04, 9.9984e-01],
        [1.1103e-04, 9.9989e-01],
        [9.5990e-01, 4.0099e-02]])
tensor([1, 0, 1, 1, 0])
['POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE']
