In [1]:
# finetune_bert_large.py

import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import evaluate # Hugging Face 的评估库

In [2]:
# =======================================================
# 1. 配置
# =======================================================
MODEL_NAME = "bert-large-uncased"
DATASET_NAME = "glue"
DATASET_CONFIG = "sst2"
OUTPUT_DIR = "./bert-large-sst2-finetuned" # 模型保存的目录
LOGGING_DIR = "./logs"

# =======================================================
# 2. 加载数据和分词器
# =======================================================
print(f"Loading tokenizer for '{MODEL_NAME}'...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print(f"Loading dataset '{DATASET_NAME}:{DATASET_CONFIG}'...")
raw_datasets = load_dataset(DATASET_NAME, DATASET_CONFIG)

# =======================================================
# 3. 数据预处理
# =======================================================
def tokenize_function(examples):
    # 对句子进行分词，并进行填充和截断
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=128)

print("Tokenizing datasets...")
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# 移除不需要的列，并重命名 label 列以符合模型期望
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
# tokenized_datasets = tokenized_datasets.rename_column("label", "labels") # Trainer 默认需要 'labels'
tokenized_datasets.set_format("torch")

# 从数据集中分离出训练集、验证集
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42) #.select(range(10000)) # 可以先用一小部分数据快速测试
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42) #.select(range(400))

# =======================================================
# 4. 加载模型和定义评估指标
# =======================================================
print(f"Loading model '{MODEL_NAME}' for sequence classification...")
# num_labels=2 因为 SST-2 是一个二分类任务 (正面/负面)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

Loading tokenizer for 'bert-large-uncased'...
Loading dataset 'glue:sst2'...
Tokenizing datasets...
Loading model 'bert-large-uncased' for sequence classification...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# 加载评估指标
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# =======================================================
# 5. 设置训练参数并开始训练
# =======================================================
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,  # 对于大型模型和SST-2任务，1-3个epoch通常足够
    per_device_train_batch_size=128, # 如果显存不足(OOM), 请减小这个值, e.g., 8
    per_device_eval_batch_size=128,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=LOGGING_DIR,
    logging_steps=100,
    eval_strategy="steps", # 每个 epoch 结束后进行一次评估
    save_strategy="steps",       # 每个 epoch 结束后保存一次模型
    eval_steps=50,
    save_steps=100,
    load_best_model_at_end=True, # 训练结束后加载最佳模型
    fp16=torch.cuda.is_available(), # 如果有GPU，开启fp16加速训练
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

print("Starting training...")
trainer.train()

print("Training complete.")
print(f"Best model saved to {training_args.output_dir}")

# 你也可以手动保存最终的模型
# trainer.save_model(f"{OUTPUT_DIR}/final_model")

Setting up training arguments...


  trainer = Trainer(
[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Starting training...


[34m[1mwandb[0m: Currently logged in as: [33mmaniaamaeovo[0m ([33mmaniaamaeovo-mania[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy
50,No log,0.629704,0.708716
100,0.573200,0.300542,0.90367
150,0.573200,0.216218,0.923165
200,0.249700,0.212816,0.925459
250,0.249700,0.205768,0.918578
300,0.211800,0.187358,0.931193
350,0.211800,0.184492,0.936927
400,0.174300,0.226334,0.922018
450,0.174300,0.224527,0.91055
500,0.175300,0.2379,0.920872


Training complete.
Best model saved to ./bert-large-sst2-finetuned
