# tokenizer

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# Relative path to a locally stored checkpoint. The folder name suggests a
# DistilBERT model fine-tuned for SST-2 sentiment classification -- confirm
# against the files on disk.
model_path = '../model/distilbert-base-uncased-finetuned-sst-2-english'
# The tokenizer and the classification model are both loaded from the same
# checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [3]:
# Three short sentences of different lengths, used as the running example in
# the cells below.
test_senteces = [
    'today is not that bad',
    'today is so bad',
    'so good',
]

# Tokenize the whole batch in a single call. padding=True pads the shorter
# sentences so every row has the same length, and return_tensors="pt" returns
# PyTorch tensors that can be passed directly to the model.
test_inputs = tokenizer(
    test_senteces,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [36]:
# Display the batch that was already encoded into `test_inputs` instead of
# running the tokenizer a second time with identical arguments.
# `input_ids` holds the token ids; `attention_mask` is 1 for real tokens and 0
# for the padding positions.
test_inputs

{'input_ids': tensor([[ 101, 2651, 2003, 2025, 2008, 2919,  102],
        [ 101, 2651, 2003, 2061, 2919,  102,    0],
        [ 101, 2061, 2204,  102,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0, 0, 0]])}

In [22]:
# Split the first sentence into token strings only; as the output shows, no
# special tokens are added at this step.
tokenizer.tokenize(test_senteces[0])

['today', 'is', 'not', 'that', 'bad']

In [23]:
# Encode the first sentence directly to vocabulary ids. The result starts with
# id 101 and ends with id 102, which were not part of the plain token list.
tokenizer.encode(test_senteces[0])

[101, 2651, 2003, 2025, 2008, 2919, 102]

In [26]:
# Map the encoded ids back to token strings to reveal which tokens encode()
# wrapped around the sentence.
first_sentence_ids = tokenizer.encode(test_senteces[0])
tokenizer.convert_ids_to_tokens(first_sentence_ids)

['[CLS]', 'today', 'is', 'not', 'that', 'bad', '[SEP]']

In [28]:
# Round trip: ids -> tokens -> ids. As the output shows, this reproduces the
# ids returned by encode().
first_sentence_ids = tokenizer.encode(test_senteces[0])
first_sentence_tokens = tokenizer.convert_ids_to_tokens(first_sentence_ids)
tokenizer.convert_tokens_to_ids(first_sentence_tokens)

[101, 2651, 2003, 2025, 2008, 2919, 102]

In [35]:
# Inspect the special tokens defined by this tokenizer.
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

# 调用模型

In [46]:
import torch
import torch.nn.functional as F

In [62]:
# Forward pass only; gradient tracking is switched off because this is pure
# inference.
with torch.no_grad():
    output = model(**test_inputs)

# Normalise the logits over the last dimension so that each sentence's scores
# sum to 1.
sorces = torch.softmax(output.logits, dim=-1)

# Index of the highest-scoring class for every sentence (the prediction).
ids = sorces.argmax(dim=-1).tolist()

# Translate the class indices into the label names stored in the model config.
labels = [model.config.id2label[class_id] for class_id in ids]

print(ids)
print(labels)

[1, 0, 1]
['POSITIVE', 'NEGATIVE', 'POSITIVE']
