## 使用pipeline完成任务

In [1]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
model.safetensors: 100%|██████████| 268M/268M [00:39<00:00, 6.75MB/s] 


[{'label': 'POSITIVE', 'score': 0.9598050713539124},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

### pipeline的作用
- 对数据进行预处理
- 将处理后的数据输入模型
- 将模型输出的结果进行后处理

### 预处理使用tokenizer
- 将输入拆分为token
- 将token转换为id
- 将id转换为tensor

In [2]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"  # 从huggingface上选择需要的模型
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # 通过指定的模型，加载对应的tokenizer

In [3]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, 
                   padding=True,
                   truncation=True, 
                   return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


### 加载模型

In [4]:
from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)

In [5]:
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)
# 结果的表示每个批次有两个数据，每个数据有16个token，每个token有768个特征

torch.Size([2, 16, 768])


### 多分类

In [6]:
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
print(outputs.logits.shape)

torch.Size([2, 2])


### 输出后的处理

In [7]:
print(outputs.logits)  # 输出的是两个数据的logits值

tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)


In [8]:
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)  # 将logits值转换为概率值
print(predictions)

tensor([[4.0195e-02, 9.5981e-01],
        [9.9946e-01, 5.4419e-04]], grad_fn=<SoftmaxBackward0>)


In [10]:
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}