In [2]:
!pip install datasets evaluate transformers[sentencepiece]



In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids)

#This line will fail 因为input_ids 的形状不符合模型的要求。
#BERT 和类似的模型（包括 DistilBERT）期望输入的张量具有 batch_size 和 sequence_length 两个维度，
#而当前传递的 input_ids 仅具有一个维度。
model(input_ids)

In [None]:
#上面一段代码运输异常，这部分是修改后的代码
#这段代码可以成功运行的原因是，在 input_ids 变量中添加了一个额外的维度，使其符合模型输入的要求。
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

input_ids = torch.tensor([ids])
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)


Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [6]:
# 以下分别展示单个序列和批量数据的预测
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

#第一个单个序列，不含padding
sequence1_ids = [[200, 200, 200]]

#第二个单个序列，含padding
padding_id = 100
sequence2_ids = [[200, 200, padding_id]]

#第一个批量序列，含padding
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids)).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.9907, -0.9139]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [None]:
#控制sequence长度,以确保它的长度不超过指定的max_sequence_length
sequence = sequence[:max_sequence_length]

In [7]:
#以下做个小复习

In [9]:
# tokenizer可以传入单个对象，也可以传入多个
#这个代码片演示传入单个对象

from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

model_inputs = tokenizer(sequence)
print(model_inputs)

{'input_ids': [101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [10]:
#这个代码片演示传入多个对象

sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

model_inputs = tokenizer(sequences)
print(model_inputs)

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}


In [11]:
#更多参数的调整如下
model_inputs = tokenizer(sequences, return_tensors="pt", padding=True, truncation=True, return_token_type_ids=True)
print(model_inputs)


{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2061,  2031,  1045,   999,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [12]:
#演示各种padding的方式

#Will pad the sequences up to the maximum sequence length
model_inputs = tokenizer(sequences, padding="longest")

#Will pad the sequences up to the model max length
#512 for BERT or DistilBERT
model_inputs = tokenizer(sequences, padding="max_length")

#Will pad the sequences up to the specified max length
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)


In [15]:
#演示各种truncate方式
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

#Will truncate the sequences that are longer than the model max length
#512 for BERT or DistilBERT
model_inputs = tokenizer(sequences, truncation=True)

#Will truncate the sequences that are longer than the specified max length
model_inputs = tokenizer(sequences, max_length=8, truncation=True)

In [16]:
#演示通过设置return_tensors参数，将输入的文本序列转换成不同深度学习框架或数据处理库所需的格式
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Return PyTorch tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")

#Return Tensorflow tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="tf")

#Return Numpy arrays
model_inputs = tokenizer(sequences, padding=True, return_tensors="np")

In [24]:
sequence = "I've been waiting for a HuggingFace course my whole life."

model_input = tokenizer(sequence)
print(model_input["input_ids"])

#对单个文本序列进行tokenization（分词）和token ID转换
tokens = tokenizer.tokenize(sequence)
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)


[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]
['i', "'", 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.']
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]


In [26]:
#用tokenizer.decode方法将token ID转换回可读的文本
print(tokenizer.decode(model_input["input_ids"]))
print(tokenizer.decode(ids))

[CLS] i've been waiting for a huggingface course my whole life. [SEP]
i've been waiting for a huggingface course my whole life.
