# 模型微调 - MRPC句子对分类任务

## 任务说明
- **数据集**: GLUE MRPC (Microsoft Research Paraphrase Corpus)
- **任务类型**: 句子对二分类（判断两个句子是否语义相同）
- **模型**: BERT-base-uncased（预训练模型）+ 分类头
- **微调目标**: 让BERT学会判断句子对的语义相似性

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print(tokenized_datasets["train"].column_names)
print(data_collator)

['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']
DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
),

In [None]:
# 查看数据集示例
print("数据集示例：")
print("="*50)
for i in range(3):
    example = raw_datasets["train"][i]
    print(f"示例 {i+1}:")
    print(f"句子1: {example['sentence1']}")
    print(f"句子2: {example['sentence2']}")
    print(f"标签: {example['label']} ({'语义相同' if example['label'] == 1 else '语义不同'})")
    print("-"*50)

In [2]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

In [3]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 模型架构说明

BERT模型的不同用途：
1. **BertModel**: 原始BERT，输出hidden states
2. **BertForMaskedLM**: 用于掩码语言模型任务（MLM）
3. **BertForSequenceClassification**: 用于序列分类任务（本例使用）
4. **BertForTokenClassification**: 用于token分类（如NER）
5. **BertForQuestionAnswering**: 用于问答任务

In [4]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
)

In [5]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=1377, training_loss=0.4420890170937443, metrics={'train_runtime': 38.0527, 'train_samples_per_second': 289.178, 'train_steps_per_second': 36.187, 'total_flos': 405114969714960.0, 'train_loss': 0.4420890170937443, 'epoch': 3.0})

In [6]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

(408, 2) (408,)


In [7]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)

In [9]:
import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8333333333333334, 'f1': 0.8839590443686007}

In [10]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [11]:
training_args = TrainingArguments("test-trainer", eval_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.412911,0.840686,0.885764
2,0.528800,0.495231,0.852941,0.89547
3,0.307100,0.645139,0.865196,0.905336


TrainOutput(global_step=1377, training_loss=0.35462320586781443, metrics={'train_runtime': 43.9034, 'train_samples_per_second': 250.641, 'train_steps_per_second': 31.364, 'total_flos': 405114969714960.0, 'train_loss': 0.35462320586781443, 'epoch': 3.0})

In [13]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)
import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

(408, 2) (408,)


{'accuracy': 0.8651960784313726, 'f1': 0.9053356282271945}

## 原始模型（未微调）的预测准确率

In [None]:
# 加载原始模型（未微调）
original_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# 创建用于评估的Trainer（不训练）
original_trainer = Trainer(
    original_model,
    training_args,
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# 在验证集上进行预测
original_predictions = original_trainer.predict(tokenized_datasets["validation"])
print("原始模型（未微调）的性能：")
print(f"Accuracy: {original_predictions.metrics['test_accuracy']:.4f}")
print(f"F1 Score: {original_predictions.metrics['test_f1']:.4f}")

## 性能对比总结

In [1]:
# 性能对比
print("="*50)
print("模型性能对比：")
print("="*50)

# 原始模型预测
original_preds = np.argmax(original_predictions.predictions, axis=-1)
original_metrics = metric.compute(predictions=original_preds, references=original_predictions.label_ids)

print(f"1. 原始模型（未微调）：")
print(f"   - Accuracy: {original_metrics['accuracy']:.4f}")
print(f"   - F1 Score: {original_metrics['f1']:.4f}")
print()

print(f"2. 第一次微调（无evaluation策略）：")
print(f"   - Accuracy: 0.8333")
print(f"   - F1 Score: 0.8840")
print(f"   - Training Loss: 0.4421")
print()

print(f"3. 第二次微调（有evaluation策略）：")
print(f"   - Accuracy: 0.8652")
print(f"   - F1 Score: 0.9053")
print(f"   - Training Loss: 0.3546")
print()

print("="*50)
print("提升幅度：")
print(f"第二次微调相比原始模型：")
print(f"   - Accuracy提升: {(0.8652 - original_metrics['accuracy']):.4f}")
print(f"   - F1提升: {(0.9053 - original_metrics['f1']):.4f}")

模型性能对比：


NameError: name 'np' is not defined