### 基于Transformers库全量微调大模型

In [4]:
# 用我们提供的两句话作为样本，来训练句子分类器

import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 和之前一样
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# 带分类头的模型
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# 新增部分
batch["labels"] = torch.tensor([1, 1])  # 两个句子都是positive，给1和1
optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# 下载实际数据集并观察数据集

from datasets import load_dataset
raw_datasets = load_dataset("glue","mrpc")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [6]:
raw_train_dataset = raw_datasets['train']
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [7]:
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

### 预处理数据集

In [8]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sentence1 = tokenizer(raw_train_dataset[0]['sentence1'])
sentence2 = tokenizer(raw_train_dataset[0]['sentence2'])

In [9]:
sentence1

{'input_ids': [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
# 这里说明下 token_type_ids ，一个样本有两个句子时，token_type_ids可以表明哪些token是句子1，哪些是句子2
test = tokenizer(raw_train_dataset[0]['sentence1'],raw_train_dataset[0]['sentence2'],padding=True,truncation=True)
print(test['input_ids'])
print(tokenizer.convert_ids_to_tokens(test['input_ids']))
# 可以看到两个句子之间加入了SEP作为分割

[101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102]
['[CLS]', 'am', '##ro', '##zi', 'accused', 'his', 'brother', ',', 'whom', 'he', 'called', '"', 'the', 'witness', '"', ',', 'of', 'deliberately', 'di', '##stor', '##ting', 'his', 'evidence', '.', '[SEP]', 'referring', 'to', 'him', 'as', 'only', '"', 'the', 'witness', '"', ',', 'am', '##ro', '##zi', 'accused', 'his', 'brother', 'of', 'deliberately', 'di', '##stor', '##ting', 'his', 'evidence', '.', '[SEP]']


In [11]:
# 处理整个数据集
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)

# 以上这种方式会将所有数据加载到内存，效率不高，采用下面这种方式
# 定义处理函数，接受数据字典
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True) 
tokenized_dataset = raw_datasets.map(tokenize_function,batched=True) # map() 方法的工作原理是使用一个函数处理数据集的每个元素
# batched=True  按批次处理数据，将多个样本打包成一个批次（batch），一次性传入tokenize_function处理

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

In [12]:
tokenized_dataset # 注意到返回的数据集是没有padding的，且相当于给每个子数据集添加了三个字段

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [13]:
# 将每个batch句子长度填充到正确长度
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
samples = tokenized_dataset['train'][:8] # 访问前八个元素
samples = {k:v for k,v in samples.items() if k not in ['idx','sentence1','sentence2']}
print(samples)
# 查看这8个样本的每个长度
print([len(x) for x in samples['input_ids']])
# 使用collator进行数据填充
batch = data_collator(samples)

print([len(x) for x in batch['input_ids']]) # 可以看到全都填充到了最大长度

{'label': [1, 0, 1, 0, 1, 1, 0, 1], 'input_ids': [[101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102], [101, 9805, 3540, 11514, 2050, 3079, 11282, 2243, 1005, 1055, 2077, 4855, 1996, 4677, 2000, 3647, 4576, 1999, 2687, 2005, 1002, 1016, 1012, 1019, 4551, 1012, 102, 9805, 3540, 11514, 2050, 4149, 11282, 2243, 1005, 1055, 1999, 2786, 2005, 1002, 6353, 2509, 2454, 1998, 2853, 2009, 2000, 3647, 4576, 2005, 1002, 1015, 1012, 1022, 4551, 1999, 2687, 1012, 102], [101, 2027, 2018, 2405, 2019, 15147, 2006, 1996, 4274, 2006, 2238, 2184, 1010, 5378, 1996, 6636, 2005, 5096, 1010, 2002, 2794, 1012, 102, 2006, 2238, 2184, 1010, 1996, 2911, 1005, 1055, 5608, 2018, 2405, 2019, 15147, 2006, 1996, 4274, 1010, 5378, 1996, 14792, 2005, 5096, 1012, 102], [101, 21

### 进行微调（Transformer.Trainer）

In [14]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# !conda install -c huggingface transformers accelerate



In [16]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer") # 加载超参数配置

In [17]:
# 加载模型
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
from transformers import Trainer

trainer = Trainer(
    model
    ,training_args
    ,train_dataset=tokenized_datasets['train']
    ,eval_dataset=tokenized_datasets['validation']
    ,data_collator=data_collator
    ,tokenizer=tokenizer
)

  trainer = Trainer(


In [19]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

In [20]:
trainer.train()

  0%|          | 0/1377 [00:00<?, ?it/s]

{'loss': 0.5034, 'grad_norm': 1.5853326320648193, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}
{'loss': 0.2727, 'grad_norm': 23.290136337280273, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}
{'train_runtime': 126.259, 'train_samples_per_second': 87.154, 'train_steps_per_second': 10.906, 'train_loss': 0.3264000808837713, 'epoch': 3.0}


TrainOutput(global_step=1377, training_loss=0.3264000808837713, metrics={'train_runtime': 126.259, 'train_samples_per_second': 87.154, 'train_steps_per_second': 10.906, 'total_flos': 405114969714960.0, 'train_loss': 0.3264000808837713, 'epoch': 3.0})

### 模型评估

In [21]:
predictions = trainer.predict(tokenized_datasets["validation"])

  0%|          | 0/51 [00:00<?, ?it/s]

In [None]:

print(predictions.predictions.shape, predictions.label_ids.shape)

[1 0 0 1 0 1 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 1 1 0 1 1 1 0 1 1 1 1 0 0
 0 1 1 0 1 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1
 1 1 0 1 1 1 0 1 1 0 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0 1 1
 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 1 0 0 1 0 1 0 0 1 0 0 1 1
 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 0 1 1 0 1 1 0 0 1 1 0
 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 0 1 1 0 1 1 1 1 0 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1
 0 1 0 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0
 0 0 1 1 1 1 1 1 1 1 0 1 0 0 0 1 1 0 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 1 1 1 0
 1 1 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 0 1 0 1 0 0 1 1 1 1 0 1 1 0 1 1 0 1 0 0
 1 1 1 1 0 0 0 0 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0
 1]
(408, 2) (408,)


In [None]:
import numpy as np
preds = np.argmax(predictions.predictions,axis=1)
preds # 实际预测结果

array([1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,

In [33]:
# 评估预测和真实结果对比
import evaluate

metric = evaluate.load("glue","mrpc")
metric.compute(predictions=preds,references=predictions.label_ids)

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.8455882352941176, 'f1': 0.8930390492359932}

In [None]:
# 打包评估函数
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")  # 加载与 GLUE 基准测试中 MRPC 任务相关的评估指标（Metric）是一组计算逻辑，用于衡量模型预测结果与真实标签的匹配程度（如计算predictions和references的准确率）
    # MRPC：微软研究释义语料库（Microsoft Research Paraphrase Corpus），任务为判断两个句子是否为释义（二分类任务）。
    # 对应指标：MRPC 任务的官方评估指标是准确率（Accuracy）和F1 分数（F1-Score）
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [35]:
# 为了查看模型在每个训练周期结束时的好坏，下面是我们如何使用 compute_metrics() 函数定义一个新的 Trainer
training_args = TrainingArguments("test-trainer",evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=2)

trainer = Trainer(
    model
    ,training_args
    ,train_dataset=tokenized_datasets['train']
    ,eval_dataset=tokenized_datasets['validation']
    ,data_collator=data_collator
    ,tokenizer=tokenizer
    ,compute_metrics=compute_metrics # 添加自己的评估方法，每个epoc评估一次
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [36]:
trainer.train()

  0%|          | 0/1377 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.49888285994529724, 'eval_accuracy': 0.7377450980392157, 'eval_f1': 0.8366412213740458, 'eval_runtime': 4.9545, 'eval_samples_per_second': 82.349, 'eval_steps_per_second': 10.294, 'epoch': 1.0}
{'loss': 0.5846, 'grad_norm': 11.974019050598145, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.4442324638366699, 'eval_accuracy': 0.8186274509803921, 'eval_f1': 0.8724137931034482, 'eval_runtime': 4.6248, 'eval_samples_per_second': 88.219, 'eval_steps_per_second': 11.027, 'epoch': 2.0}
{'loss': 0.4481, 'grad_norm': 1.7519233226776123, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.5072584748268127, 'eval_accuracy': 0.8382352941176471, 'eval_f1': 0.8842105263157894, 'eval_runtime': 3.2548, 'eval_samples_per_second': 125.352, 'eval_steps_per_second': 15.669, 'epoch': 3.0}
{'train_runtime': 138.4389, 'train_samples_per_second': 79.486, 'train_steps_per_second': 9.947, 'train_loss': 0.4726305970612348, 'epoch': 3.0}


TrainOutput(global_step=1377, training_loss=0.4726305970612348, metrics={'train_runtime': 138.4389, 'train_samples_per_second': 79.486, 'train_steps_per_second': 9.947, 'total_flos': 405114969714960.0, 'train_loss': 0.4726305970612348, 'epoch': 3.0})

# 一个完整的全量微调训练（Pytorch手动版）

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



In [None]:
# 模型数据预处理，删除不用的字段，模型训练时默认的标签字段是labels
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1','sentence2','idx'])
tokenized_datasets = tokenized_datasets.rename_column('label','labels')
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})


In [54]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets['train'],shuffle=True,batch_size=8,collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets['validation'],shuffle=True,batch_size=8,collate_fn=data_collator
)

# check 一下
for batch in train_dataloader:
    break
print({k:v.shape for k,v in batch.items()})

{'labels': torch.Size([8]), 'input_ids': torch.Size([8, 70]), 'token_type_ids': torch.Size([8, 70]), 'attention_mask': torch.Size([8, 70])}


In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# check一下
outputs = model(**batch)
print(outputs)
# 现在有了模型和数据加载器，还需要优化器和学习率调度器

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SequenceClassifierOutput(loss=tensor(0.7860, grad_fn=<NllLossBackward0>), logits=tensor([[0.2098, 0.5178],
        [0.2238, 0.4991],
        [0.2280, 0.5289],
        [0.1553, 0.5280],
        [0.2202, 0.4910],
        [0.2120, 0.4698],
        [0.1951, 0.5146],
        [0.2557, 0.4992]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [None]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(),lr=5e-5) # 学习率从最大值 （5e-5） 到 0 的线性衰减

from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs*len(train_dataloader)
# 定义线性均分的调度器，（5e-5） 到 0 是全部变化，总共迭代的数据量*迭代次数 = 总共训练了多少个batch，每个batch的学习率变化量 = 5e-5/总batch次数
lr_scheduler = get_scheduler(
    "linear"
    ,optimizer=optimizer
    ,num_warmup_steps=0
    ,num_training_steps=num_training_steps
)
print(num_training_steps)

1377


In [58]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [59]:
# 开始训练 tqdm是进度条
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))
model.train() # 模型设置到训练模式

for epoch in range(num_epochs):
    for batch in train_dataloader:
        # 张量要转移到gpu上
        batch = {k:v.to(device) for k,v in batch.items()} #{labels:张量数据,intpus:张量数据}
        outputs = model(**batch)
        loss = outputs.loss  # softmax以及crossentropy的损失函数内置在了模型内了
        loss.backward() # 损失函数反向递归，w和b都积累了变动参数
        
        optimizer.step()  # 优化器根据变动参数更新 w，b
        lr_scheduler.step() # 调度器修改优化器的学习率参数
        optimizer.zero_grad() # 更新完了w和b后清空累积的变动参数
        progress_bar.update(1) # 一个批次训练完了进度+1


  0%|          | 0/1377 [00:00<?, ?it/s]

In [66]:
print(range(num_training_steps))

range(0, 1377)


In [67]:
# 评估
import evaluate

metric = evaluate.load("glue","mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k:v.to(device) for k,v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    
    logits = outputs.logits
    predictions = torch.argmax(logits,dim=-1)
    metric.add_batch(predictions=predictions,references=batch['labels'])
metric.compute()

{'accuracy': 0.8578431372549019, 'f1': 0.9003436426116839}

In [None]:
# 以上代码的完全体
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from tqdm.auto import tqdm
from transformers import get_scheduler
import evaluate

model = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
dataset = load_dataset("glue","mrpc")
# data process
def tokenize_func(input):
    return tokenizer(input['sentence1'],input['sentence2'],truncation=True)
dataset = dataset.map(tokenize_func)
dataset = dataset.remove_columns(['sentence1','sentence2','idx'])
dataset = dataset.rename_column('label','labels')

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(
    dataset['train']
    ,batch_size=8
    ,shuffle=True
    ,collate_fn=data_collator
    )

valid_dataloader = DataLoader(
    dataset['validation']
    ,batch_size=8
    ,shuffle=True
    ,collate_fn=data_collator
    )

# train
model.train()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

optimizer = AdamW(model.parameters(),lr=5e-5)
lr_scheduler = get_scheduler(
    "linear"
    ,optimizer=optimizer
    ,num_warmup_steps=0
    ,num_training_steps=num_epochs*len(train_dataloader)
)

num_epochs = 3
process_bar = tqdm(range(num_epochs*len(train_dataloader)))

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k:v.to(device) for k,v in batch.items()}
        outputs = model(**batch)
        outputs.loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        process_bar.update(1)

# evaluate
metric = evaluate.load("glue","mrpc")
model.eval()
for batch in valid_dataloader:
    batch = {k:v.to(device) for k,v in batch.items()}
    predictions = torch.argmax(outputs.logits,dim=-1)
    metric.add_batch(predictions=predictions,references=batch['labels'])
metric.compute()

# Accelerate加速训练（分布式训练）

In [None]:
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler
from accelerate import Accelerator

accelerator = Accelerator()

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(),3e-5)
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device)
train_dataloader,eval_dataloader,model,optimizer = accelerator.prepare(
    train_dataloader,eval_dataloader,model,optimizer
)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # loss.backward()
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the latest cached version of the dataset since glue couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'mrpc' at C:\Users\tassa\.cache\huggingface\datasets\glue\mrpc\0.0.0\bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c (last modified on Thu May 22 15:04:29 2025).


Map:   0%|          | 0/1725 [00:00<?, ? examples/s]