In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
#accelerate用于优化计算资源的使用，加速深度学习模型的训练
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.

In [9]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Load the GLUE "mrpc" dataset and the tokenizer that belongs to the checkpoint.
checkpoint = "bert-base-uncased"
raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the `sentence1` / `sentence2` fields of `example` as a pair.

    Called through `map(..., batched=True)` below, so `example` holds a batch
    of rows. Truncation is enabled; padding is left to the data collator.
    """
    first = example["sentence1"]
    second = example["sentence2"]
    return tokenizer(first, second, truncation=True)


# Tokenize every split, then build the collator that pads the samples of a batch.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
# Drop the columns sentence1, sentence2 and idx, then rename the
# `label` column to `labels`.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
# Have the datasets return PyTorch tensors
tokenized_datasets.set_format("torch")
# Show the column names of the training split
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [13]:
# Wrap the train and validation splits in DataLoaders so they yield batches
# that can be used for training and evaluation.
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,  # only the training data is shuffled
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [15]:
# Pull a single batch from the training loader and look at the tensor shapes
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 73]),
 'token_type_ids': torch.Size([8, 73]),
 'attention_mask': torch.Size([8, 73])}

In [18]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint as a sequence classifier with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Sanity check: run one forward pass on the sampled batch and print the loss and the
# shape of the logits (no backward pass or optimizer step happens here).
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.7298, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [20]:
# The AdamW class exported by transformers is deprecated in favor of PyTorch's own
# implementation and is missing from newer transformers releases, so import it from torch.
from torch.optim import AdamW

# eps and weight_decay are passed explicitly to keep the defaults the transformers
# implementation used (torch defaults to eps=1e-8, weight_decay=0.01).
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



In [26]:
from transformers import get_scheduler

num_epochs = 3
# Total number of training steps: one step per batch, for every epoch
num_training_steps = len(train_dataloader) * num_epochs
# get_scheduler creates the learning-rate scheduler, i.e. the mechanism that controls how
# the learning rate changes during training according to the chosen strategy
# (linear decay, cosine decay, ...). Linear decay is used here.
# Warmup steps: during warmup the learning rate grows from 0 up to the initial learning rate.
# With num_warmup_steps=0 there is no warmup phase, so training starts at the initial learning rate.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)
print(num_training_steps)

1377


In [22]:
# "cuda" denotes a GPU. Use it when a CUDA device is available, otherwise stay on the CPU,
# and move the model onto the selected device for training or inference.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cpu')

In [None]:
# Train the model; tqdm provides the progress bar.
from tqdm.auto import tqdm

# The progress bar is sized by the total number of training steps
progress_bar = tqdm(range(num_training_steps))

# Put the model in training mode: some layers (e.g. dropout) behave differently
# during training and evaluation.
model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Model and data must be on the same device, so move every tensor of the batch there
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # Forward pass, then read the loss from the model output
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass: gradients of the loss with respect to the model parameters
        loss.backward()
        # Update the weights from the gradients, then advance the learning-rate schedule
        optimizer.step()
        lr_scheduler.step()
        # Clear the gradients, otherwise they would accumulate across steps
        optimizer.zero_grad()
        # Advance the progress bar by one step
        progress_bar.update()


In [None]:
# Evaluate the model on the validation set.
import evaluate

metric = evaluate.load("glue", "mrpc")
# Put the model in evaluation mode
model.eval()
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # Evaluation needs no backpropagation, so run inference without tracking gradients
    with torch.no_grad():
        outputs = model(**batch)

    # logits are the raw, unnormalized prediction scores; the index of the largest
    # score along the last axis is the predicted class
    predictions = outputs.logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.8431372549019608, 'f1': 0.8907849829351535}

In [None]:
# With a few adjustments to the earlier code, the Accelerate library enables distributed
# training on multiple GPUs or TPUs.
# The loop is wrapped in `training_function()` because the following cell passes that name
# to `notebook_launcher`; running this cell only defines the function, it does not train.

from accelerate import Accelerator  # new
from torch.optim import AdamW  # replaces the deprecated transformers.AdamW
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, get_scheduler


def training_function():
    """Fine-tune a freshly loaded sequence classifier using Accelerate.

    Reads the notebook-level `checkpoint`, `train_dataloader` and `eval_dataloader`,
    trains for 3 epochs with AdamW (lr=3e-5) and a linear learning-rate decay
    without warmup. Returns nothing.
    """
    # The Accelerator is created inside the function so that it is set up by the launcher.
    accelerator = Accelerator()  # new

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    # eps and weight_decay keep the defaults of the transformers AdamW used previously
    # (torch defaults to eps=1e-8, weight_decay=0.01).
    optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

    # The manual device handling from the earlier loop was removed here:
    #   device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    #   model.to(device)
    # accelerator.prepare() takes over device placement for the objects passed to it.
    train_dl, eval_dl, model, optimizer = accelerator.prepare(  # new
        train_dataloader, eval_dataloader, model, optimizer  # new
    )

    num_epochs = 3
    # Computed from the prepared dataloader, whose length is what this loop iterates over
    num_training_steps = num_epochs * len(train_dl)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        # No batch.to(device) here, unlike the earlier manual loop
        for batch in train_dl:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)  # new: replaces loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

In [None]:
from accelerate import notebook_launcher

# notebook_launcher from the accelerate library starts the training function and
# explicitly runs multi-device training from within the Jupyter notebook environment.
# `training_function` must already be defined by an earlier cell; it is not defined here.
notebook_launcher(function=training_function)