In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Single source of truth for the checkpoint so the tokenizer and the model
# are always loaded from the same pretrained weights/vocabulary.
MODEL_NAME = "bert-base-cased-finetuned-mrpc"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Human-readable label for each output column of the classifier:
# index 0 -> "not paraphrase", index 1 -> "is paraphrase".
classes = ["not paraphrase", "is paraphrase"]

# sequence_0 is the reference sentence. It is paired with sequence_2
# (expected to be a paraphrase) and with sequence_1 (expected not to be).
sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
 
# Notes on the tokenizer call
#
# 1. tokenizer([...]) (one list argument) and tokenizer(sequence_0, sequence_2)
#    (two positional strings) are different call forms and give different
#    results. Per the transformers tokenizer docs, a list is encoded as a batch
#    of separate inputs, while two strings are encoded together as one sentence
#    pair. Note that the pair form used here does return an attention_mask
#    alongside input_ids and token_type_ids (see the printed encodings).
#
# 2. Swapping the arguments, i.e. tokenizer(sequence_2, sequence_0, ...), gives
#    a noticeably different result. The order decides which sentence comes
#    first in input_ids and which one is marked with token_type_ids 0 vs. 1,
#    so the model receives a different input; nothing guarantees that its
#    prediction is symmetric in the two sentences.
#
# 3. return_tensors="pt" asks the tokenizer to return PyTorch tensors.

# Encode sequence_0 together with each candidate second sentence, in the same
# order as before: first the expected paraphrase, then the unrelated sentence.
paraphrase, not_paraphrase = [
    tokenizer(sequence_0, second_sentence, return_tensors="pt")
    for second_sentence in (sequence_2, sequence_1)
]

# Dump both encodings so their fields can be inspected.
for encoding in (paraphrase, not_paraphrase):
    print(encoding)

# Notes on the model call
#
# 1. `**paraphrase` unpacks the tokenizer output into keyword arguments, so
#    every field of the encoding (input_ids, token_type_ids, attention_mask)
#    is passed to the model by name. This matches the call signature
#    model(*input, **kwargs).
#
# 2. It is worth inspecting what model(...) returns; this script only uses
#    element [0] of that return value and treats it as the classification
#    logits.

# Pure inference: no backward pass follows, so skip autograd bookkeeping.
# The computed values are unchanged.
with torch.no_grad():
    paraphrase_classification_logits = model(**paraphrase)[0]
    not_paraphrase_classification_logits = model(**not_paraphrase)[0]

# Softmax over dim=1 turns each row of logits into per-class probabilities.
# tolist()[0] extracts the first row as a plain Python list (each encoding
# above holds a single sentence pair).
paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]


# Print each class label with its probability as a whole-number percentage.
# Labels and probabilities are paired positionally: classes[k] describes
# column k of the softmax output.

# Should be paraphrase
for label, probability in zip(classes, paraphrase_results):
    print(f"{label}: {round(probability * 100)}%")
# Should not be paraphrase
for label, probability in zip(classes, not_paraphrase_results):
    print(f"{label}: {round(probability * 100)}%")

I0911 15:10:05.097481 139933133829952 file_utils.py:39] PyTorch version 1.5.1 available.
I0911 15:10:15.613948 139933133829952 configuration_utils.py:264] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json from cache at /home/liushen/.cache/torch/transformers/70bd73627499d3e6af9210cb6b919b8288952641f5f19da57fa5b72a9938781e.9da767be51e1327499df13488672789394e2ca38b877837e52618a67d7002391
I0911 15:10:15.615402 139933133829952 configuration_utils.py:300] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vo

{'input_ids': tensor([[  101,  1109,  1419, 20164, 10932,  2271,  7954,  1110,  1359,  1107,
          1203,  1365,  1392,   102, 20164, 10932,  2271,  7954,   112,   188,
          3834,  1132,  3629,  1107,  6545,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]])}
{'input_ids': tensor([[  101,  1109,  1419, 20164, 10932,  2271,  7954,  1110,  1359,  1107,
          1203,  1365,  1392,   102,  7302,  1116,  1132,  2108,  2213,  1111,
          1240,  2332,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
not paraphrase: 10%
is paraphrase: 90%
not paraphrase: 94%
is paraphrase: 6%
