In [3]:
!pip install datasets evaluate transformers[sentencepiece]

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6

In [8]:
#Same as before

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Name of the pretrained checkpoint shared by the tokenizer and the model.
checkpoint = "bert-base-uncased"
# Tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# MRPC configuration of the GLUE benchmark.
raw_datasets = load_dataset("glue", "mrpc")

def tokenize_function(example):
  """Tokenize the `sentence1` / `sentence2` fields of `example` as a pair.

  Truncation is enabled; the result is whatever the notebook-level `tokenizer`
  returns for the two inputs.
  """
  first_sentences = example["sentence1"]
  second_sentences = example["sentence2"]
  return tokenizer(first_sentences, second_sentences, truncation=True)

# Collator built from the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Apply tokenize_function to every split, passing examples in batches.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [1]:
# TrainingArguments configures the training run: learning rate, number of epochs,
# batch size, and so on.
from transformers import TrainingArguments

# Use "test-trainer" as the output directory; TrainingArguments stores the model
# files and training outputs (checkpoints, logs, ...) under it.
training_args = TrainingArguments(output_dir="test-trainer")

In [6]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a sequence-classification head configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
#利用Trainer进行fine-tuning

"""
fine-tuning就是在一个已经预训练好的模型基础上，使用特定任务的数据进行再训练，让模型更好的适应你要做的任务；
fine-tuning过程中，主要更改的是模型的参数（数据如训练集、模型权重、偏置等），但并不仅仅只能更改它，还能改：
  超参数（learning rate、batch size、epochs、optimizers）；
  对training data和validation data进行不同的预处理（数据清洗、数据增强、tokenization）；
  模型架构（修改模型输出层——二分类变多分类、层数/隐藏单元数，添加dropout层）；
"""

from transformers import Trainer

# Bundle the model, the training configuration, both tokenized splits, the collator
# and the tokenizer into a single Trainer.
# NOTE(review): recent transformers releases appear to deprecate `tokenizer=` in
# favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [None]:
# Start training with the Trainer configured in the previous cell; as the last
# expression of the cell, the call's return value is displayed.
trainer.train()

In [None]:
# Run prediction on the validation split.
predictions = trainer.predict(tokenized_datasets["validation"])
# Print the shapes of the predicted outputs and of the label ids.
print(predictions.predictions.shape, predictions.label_ids.shape)

上面这行代码的运行输出为：(408, 2) (408,)

  (408, 2)表示408个样本，每个样本有2个logits（每个类别对应一个）；
  (408,)表示408个样本的真实标签（0或1）

In [None]:
# For every sample, pick the index of the largest score along the last axis,
# i.e. the predicted class.
# `preds` is an array holding the predicted class of each sample.
import numpy as np

preds = predictions.predictions.argmax(axis=-1)

In [None]:
import evaluate

# Load the evaluation metric associated with the GLUE MRPC task (accuracy, F1, ...).
metric = evaluate.load(path="glue", config_name="mrpc")
# Score the predicted classes against the reference labels; the result is the
# cell's last expression, so it is displayed.
metric.compute(
    predictions=preds,
    references=predictions.label_ids,
)

{'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542}

In [None]:
def compute_metrics(eval_preds):
  """Score a `(logits, labels)` pair with the GLUE MRPC metric.

  `eval_preds` is unpacked into two items: the model scores and the reference
  labels. The predicted class is the argmax of the scores over the last axis.
  Returns the result of the metric's `compute` call.
  """
  mrpc_metric = evaluate.load("glue", "mrpc")
  scores, gold_labels = eval_preds
  predicted_classes = np.argmax(scores, axis=-1)
  return mrpc_metric.compute(predictions=predicted_classes, references=gold_labels)

In [None]:
# Evaluate on the validation split at the end of every epoch. The output directory
# matches the "test-trainer" directory used by the first TrainingArguments cell.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
# Reload the model from the pretrained checkpoint so this run starts from the
# pretrained weights instead of the model object trained earlier.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Same Trainer setup as before, plus `compute_metrics` so each evaluation
# reports the MRPC metrics.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Train again, this time with the Trainer that evaluates every epoch and reports
# metrics through compute_metrics.
trainer.train()