In [2]:
!pip install datasets evaluate transformers[sentencepiece]



In [4]:
import torch
# AdamW comes from PyTorch: the optimizer that `transformers` used to export was
# deprecated in favour of torch.optim.AdamW and is not available in recent releases.
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new

# Add the labels (one per sequence) to the input batch
batch["labels"] = torch.tensor([1, 1])

# Create an AdamW optimizer over the model's parameters so that it can update
# the model weights during training
optimizer = AdamW(model.parameters())

# Compute the loss (classification tasks usually use a cross-entropy loss)
loss = model(**batch).loss
# Backpropagation
loss.backward()
# Optimizer update
optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# GLUE is a collection of benchmark datasets for evaluating natural language
# understanding. It bundles several common NLP tasks (text classification,
# sentence-pair tasks, sentiment analysis, ...), each with its own goal and data.
#
# MRPC is one of GLUE's sentence-pair tasks: pairs of sentences taken from news
# articles, each pair labelled "paraphrase" (same meaning) or "non-paraphrase"
# (different meaning). It is a binary classification task that tests how well a
# model understands sentence pairs.

from datasets import load_dataset

raw_datasets = load_dataset(path="glue", name="mrpc")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [10]:
# Take a look at what an example of this dataset looks like.
#
# In the output:
#   sentence1 is the first sentence of the pair; sentence2 is the second one;
#   label is the label of the pair: 1 means paraphrase, 0 means non-paraphrase;
#   idx is the index of the sample; here idx: 0 means this is the first sample.
#
# These notes are written as comments rather than as a bare string literal,
# which would be evaluated and thrown away every time the cell runs.

raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [11]:
# Display the `features` of the training split: each column together with its type
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [12]:
# Start processing the data

from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# list(...) hands the tokenizer a plain list of strings. Newer `datasets`
# releases return a lazy column object from dataset["column"], which the
# tokenizer does not accept as input; on older releases list() is a cheap copy.
tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))

In [13]:
# Encode two sentences together by passing both of them to the tokenizer
first_sentence = "This is the first sentence."
second_sentence = "This is the second one."
inputs = tokenizer(first_sentence, second_sentence)
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [14]:
# Convert the ids of the encoded pair back to tokens to inspect the encoding
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

In [15]:
# Tokenize all training sentence pairs in one call, with padding and truncation.
# list(...) hands the tokenizer plain lists of strings: newer `datasets` releases
# return a lazy column object from dataset["column"], which the tokenizer does
# not accept as input.
# NOTE(review): the variable name is misspelled ("tokenzied"); it is left
# unchanged here in case other cells refer to it.
tokenzied_dataset = tokenizer(
    list(raw_datasets["train"]["sentence1"]),
    list(raw_datasets["train"]["sentence2"]),
    padding=True,
    truncation=True,
)

In [16]:
def tokenize_function(example):
  """Tokenize the "sentence1" and "sentence2" entries of `example` as pairs.

  Uses the notebook-level `tokenizer` with truncation enabled; no padding is
  requested here. Returns whatever the tokenizer call returns.
  """
  first, second = example["sentence1"], example["sentence2"]
  return tokenizer(first, second, truncation=True)

In [18]:
# .map() applies the given function to every element of the dataset, or to
# every batch of elements (batched=True is passed here)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [20]:
# The following shows how to pad to the longest length within a batch instead of the model's maximum length, which saves computation
# DataCollatorWithPadding is an automatic padding utility that makes sure the examples of a batch fed to the model all have the same length

from transformers import DataCollatorWithPadding
# The tokenizer is passed in so that the collator knows how to pad
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [22]:
# Take the first 8 examples of the train split
first_eight = tokenized_datasets["train"][:8]

# Rebuild the dict without the columns listed below; every other key/value
# pair is carried over unchanged
dropped_columns = ("idx", "sentence1", "sentence2")
samples = {
    column: values
    for column, values in first_eight.items()
    if column not in dropped_columns
}

# Number of tokens of each example after tokenization
list(map(len, samples["input_ids"]))

[50, 59, 47, 67, 59, 50, 62, 32]

In [23]:
# Apply the padding: run the selected samples through the data collator

batch = data_collator(samples)
# Shape of every entry of the collated batch
{name: tensor.shape for name, tensor in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}