https://huggingface.co/docs/accelerate

https://huggingface.co/docs/accelerate/quicktour

In [33]:
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
from accelerate import Accelerator
from torchmetrics.functional import accuracy
from tqdm import tqdm
from pathlib import Path

In [34]:
epochs = 5
batch_size = 100
in_features = 10
num_classes = 5
data_len = 10000

In [35]:
model = nn.Linear(in_features, num_classes)

In [36]:
optimizer = optim.AdamW(
    model.parameters(),
    lr=0.001,
    betas=(0.9, 0.999),
    weight_decay=1e-2,
)

In [37]:
lr_sche = optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=epochs, eta_min=0.001 * 100
)

In [38]:
loss_fn = nn.CrossEntropyLoss()

In [39]:
class Dataset(Dataset):
    def __init__(self) -> None:
        super().__init__()
        self.x = torch.randn(data_len, in_features)
        self.y = torch.randint(0, num_classes, (data_len,))

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.x)

In [40]:
train_datalaoder = DataLoader(
    dataset=Dataset(),
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
)
val_datalaoder = DataLoader(
    dataset=Dataset(),
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)

# Accelerator

In [41]:
output_dir = Path("accelerator_checkpoint")
output_dir

WindowsPath('accelerator_checkpoint')

In [42]:
accelerator = Accelerator()

In [43]:
accelerator.device

device(type='cuda')

In [44]:
# accelerator可以帮助获取state
accelerator.get_state_dict(model, unwrap=True)

OrderedDict([('weight',
              tensor([[-0.0582, -0.1446, -0.2358, -0.1155,  0.2035, -0.2924, -0.2298,  0.0446,
                        0.0732, -0.3009],
                      [ 0.1364, -0.2627, -0.2545, -0.2182,  0.2711,  0.2517,  0.2459,  0.0006,
                        0.1983, -0.2905],
                      [ 0.1668,  0.2241, -0.2459,  0.1288,  0.0057,  0.1226,  0.1944,  0.1659,
                       -0.1588,  0.1786],
                      [ 0.1793, -0.1016,  0.3151,  0.0429, -0.0895,  0.0295, -0.1313, -0.0883,
                        0.1461, -0.1601],
                      [-0.3099, -0.1955,  0.0575, -0.0787,  0.3042,  0.2530, -0.0462,  0.2949,
                       -0.0340,  0.0074]])),
             ('bias', tensor([ 0.1206, -0.0559, -0.1660, -0.1939,  0.0456]))])

In [45]:
model.state_dict()

OrderedDict([('weight',
              tensor([[-0.0582, -0.1446, -0.2358, -0.1155,  0.2035, -0.2924, -0.2298,  0.0446,
                        0.0732, -0.3009],
                      [ 0.1364, -0.2627, -0.2545, -0.2182,  0.2711,  0.2517,  0.2459,  0.0006,
                        0.1983, -0.2905],
                      [ 0.1668,  0.2241, -0.2459,  0.1288,  0.0057,  0.1226,  0.1944,  0.1659,
                       -0.1588,  0.1786],
                      [ 0.1793, -0.1016,  0.3151,  0.0429, -0.0895,  0.0295, -0.1313, -0.0883,
                        0.1461, -0.1601],
                      [-0.3099, -0.1955,  0.0575, -0.0787,  0.3042,  0.2530, -0.0462,  0.2949,
                       -0.0340,  0.0074]])),
             ('bias', tensor([ 0.1206, -0.0559, -0.1660, -0.1939,  0.0456]))])

## is_main_process is_local_main_process

is_local_main_process 和 is_main_process 就跟Pytorch的分布式训练中的LOCAL_RANK和RANK的区别

> node
物理节点，就是一台机器，节点内部可以有多个GPU(一台机器有多卡)。

> rank & local_rank
>
> 用于表示进程的序号，用于进程间通信。每一个进程对应了一个rank。
>
> rank=0的进程就是master进程。
>
> local_rank： rank是指在整个分布式任务中进程的序号；local_rank是指在一台机器上(一个node上)进程的相对序号，例如机器一上有0,1,2,3,4,5,6,7，机器二上也有0,1,2,3,4,5,6,7。local_rank在node之间相互独立。
>
> 单机多卡时，rank就等于local_rank

> nnodes
>
> 物理节点数量

> node_rank
>
> 物理节点的序号

> nproc_per_node
>
> 每个物理节点上面进程的数量。

> group
>
> 进程组。默认只有一个组

> world size 全局的并行数
>
> 全局（一个分布式任务）中，rank的数量。
>
> 每个node包含16个GPU，且nproc_per_node=8，nnodes=3，机器的node_rank=5，请问world_size是多少？
>
> 答案：world_size = 3*8 = 24


```yaml
# 一共有12个rank, nnodes=3, nproc_per_node=4,每个节点都对应一个node_rank

machine0:
    node_rank: 0
        GPU0:
            rank: 0
            local_rank: 0
        GPU1:
            rank: 1
            local_rank: 1
        GPU2:
            rank: 2
            local_rank: 2
        GPU3:
            rank: 3
            local_rank: 3

machine1:
    node_rank: 1
        GPU0:
            rank: 4
            local_rank: 0
        GPU1:
            rank: 5
            local_rank: 1
        GPU2:
            rank: 6
            local_rank: 2
        GPU3:
            rank: 7
            local_rank: 3

machine2:
    node_rank: 2
        GPU0:
            rank: 8
            local_rank: 0
        GPU1:
            rank: 9
            local_rank: 1
        GPU2:
            rank: 10
            local_rank: 2
        GPU3:
            rank: 11
            local_rank: 3
```

In [46]:
print(accelerator.is_main_process)
print(accelerator.is_local_main_process)

True
True


## prepare

In [47]:
model, optimizer, train_datalaoder, val_datalaoder = accelerator.prepare(
    model, optimizer, train_datalaoder, val_datalaoder
)

## clip gradients

In [48]:
accelerator.clip_grad_norm_(parameters=model.parameters(), max_norm=1, norm_type=2)
torch.nn.utils.clip_grad.clip_grad_norm_(
    parameters=model.parameters(), max_norm=1, norm_type=2
)

tensor(0.)

In [49]:
# accelerator.clip_grad_value_(parameters=model.parameters(), clip_value=0.1)
# torch.nn.utils.clip_grad.clip_grad_value_(parameters=model.parameters(), clip_value=0.1)

## train loop

In [50]:
for epoch in range(1, epochs + 1):
    # train
    model.train()
    with tqdm(
        total=len(train_datalaoder),
        desc=f"{epoch}/{epochs}",
        disable=not accelerator.is_main_process,
    ) as pbar:
        all_predictions = []
        all_targets = []
        all_losses = []
        for x, y in train_datalaoder:
            optimizer.zero_grad()
            with accelerator.autocast():
                y_pred: torch.Tensor = model(x)
                loss: torch.Tensor = loss_fn(y_pred, y)
            accelerator.backward(loss)  # replace loss.backward()
            accelerator.clip_grad_norm_(  # 梯度裁剪
                parameters=model.parameters(),
                max_norm=1.0,
                norm_type=2,
            )
            optimizer.step()

            pbar.set_postfix({"train/loss": f"{loss.item():.4f}"})
            pbar.update(1)

            # 获取所有数据上的预测值和真实值,用来验证
            all_pred, all_tar, all_loss = accelerator.gather_for_metrics(
                (y_pred, y, loss)
            )
            all_predictions.append(all_pred)
            all_targets.append(all_tar)
            all_losses.append(all_loss)

        train_acc = accuracy(
            preds=torch.cat(all_predictions, dim=0),
            target=torch.cat(all_targets, dim=0),
            task="multiclass",
            num_classes=num_classes,
        )
        train_avg_loss = torch.mean(torch.tensor(all_losses))
        pbar.set_postfix({"train/acc": f"{train_acc.item():.4f}"})

    lr_sche.step()

    # val
    model.eval()
    with tqdm(
        total=len(val_datalaoder),
        desc=f"{epoch}/{epochs}",
        disable=not accelerator.is_main_process,
    ) as pbar:
        all_predictions = []
        all_targets = []
        all_losses = []
        for x, y in val_datalaoder:
            with torch.inference_mode():
                y_pred: torch.Tensor = model(x)
            loss: torch.Tensor = loss_fn(y_pred, y)

            pbar.set_postfix({"val/loss": f"{loss.item():.4f}"})
            pbar.update(1)

            # 获取所有数据上的预测值和真实值,用来验证
            all_pred, all_tar, all_loss = accelerator.gather_for_metrics(
                (y_pred, y, loss)
            )
            all_predictions.append(all_pred)
            all_targets.append(all_tar)
            all_losses.append(all_loss)

        val_acc = accuracy(
            preds=torch.cat(all_predictions, dim=0),
            target=torch.cat(all_targets, dim=0),
            task="multiclass",
            num_classes=num_classes,
        )
        val_avg_loss = torch.mean(torch.tensor(all_losses))
        pbar.set_postfix({"val/acc": f"{val_acc.item():.4f}"})

    # 本地主进程才保存
    if accelerator.is_local_main_process:
        # like torch.distributed.barrier, wait for all processes to enter this call.
        accelerator.wait_for_everyone()
        unwrapped_model: nn.Module = accelerator.unwrap_model(model)
        # save
        accelerator.save(
            obj={
                "model": unwrapped_model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "lr_sche": lr_sche.state_dict(),
            },
            f=output_dir / "accelerator.last.pth",
        )
accelerator.print("training finish")

1/5: 100%|██████████| 100/100 [00:00<00:00, 201.79it/s, train/acc=0.1976]
1/5:  69%|██████▉   | 69/100 [00:00<00:00, 600.00it/s, val/loss=1.6828]

1/5: 100%|██████████| 100/100 [00:00<00:00, 581.40it/s, val/acc=0.2001]
2/5: 100%|██████████| 100/100 [00:00<00:00, 318.47it/s, train/acc=0.1975]
2/5: 100%|██████████| 100/100 [00:00<00:00, 588.24it/s, val/acc=0.1966]
3/5: 100%|██████████| 100/100 [00:00<00:00, 350.18it/s, train/acc=0.1984]
3/5: 100%|██████████| 100/100 [00:00<00:00, 598.81it/s, val/acc=0.1958]
4/5: 100%|██████████| 100/100 [00:00<00:00, 346.81it/s, train/acc=0.2063]
4/5: 100%|██████████| 100/100 [00:00<00:00, 575.92it/s, val/acc=0.1910]
5/5: 100%|██████████| 100/100 [00:00<00:00, 337.69it/s, train/acc=0.1936]
5/5: 100%|██████████| 100/100 [00:00<00:00, 572.85it/s, val/acc=0.1925]

training finish





In [51]:
torch.load(output_dir / "accelerator.last.pth", map_location="cpu")

{'model': OrderedDict([('weight',
               tensor([[-0.0316, -0.0060, -0.0392,  0.1107,  0.0719,  0.0515,  0.0762,  0.0085,
                         0.0586, -0.1150],
                       [ 0.0977, -0.0298,  0.1236, -0.1378,  0.1329,  0.1308, -0.0111,  0.1061,
                         0.0507, -0.1544],
                       [ 0.0623, -0.1914, -0.3090, -0.1462,  0.0033, -0.1074, -0.0763, -0.0846,
                         0.0541, -0.1999],
                       [ 0.1505,  0.0085,  0.1900, -0.0033,  0.1027,  0.0931, -0.1493,  0.2024,
                         0.0396, -0.0359],
                       [-0.2538, -0.0443, -0.1529, -0.0206,  0.2259, -0.0468,  0.1679,  0.1371,
                        -0.0441,  0.0632]])),
              ('bias',
               tensor([ 0.0975,  0.0359, -0.1707, -0.1255, -0.0479]))]),
 'optimizer': {'state': {0: {'step': tensor(500.),
    'exp_avg': tensor([[-9.7995e-03,  5.6651e-03, -2.9284e-03,  3.8839e-03, -1.2792e-02,
              1.5966e-02, -7.905

## save_state和load_state配合使用

In [52]:
# 必须为dir
accelerator.save_state(output_dir=output_dir)

WindowsPath('accelerator_checkpoint')

In [53]:
# 必须为dir
accelerator.load_state(input_dir=output_dir)

In [54]:
torch.load(output_dir / "pytorch_model.bin")

OrderedDict([('weight',
              tensor([[-0.0316, -0.0060, -0.0392,  0.1107,  0.0719,  0.0515,  0.0762,  0.0085,
                        0.0586, -0.1150],
                      [ 0.0977, -0.0298,  0.1236, -0.1378,  0.1329,  0.1308, -0.0111,  0.1061,
                        0.0507, -0.1544],
                      [ 0.0623, -0.1914, -0.3090, -0.1462,  0.0033, -0.1074, -0.0763, -0.0846,
                        0.0541, -0.1999],
                      [ 0.1505,  0.0085,  0.1900, -0.0033,  0.1027,  0.0931, -0.1493,  0.2024,
                        0.0396, -0.0359],
                      [-0.2538, -0.0443, -0.1529, -0.0206,  0.2259, -0.0468,  0.1679,  0.1371,
                       -0.0441,  0.0632]], device='cuda:0')),
             ('bias',
              tensor([ 0.0975,  0.0359, -0.1707, -0.1255, -0.0479], device='cuda:0'))])

In [55]:
torch.load(output_dir / "optimizer.bin")

{'state': {0: {'step': tensor(500.),
   'exp_avg': tensor([[-9.7995e-03,  5.6651e-03, -2.9284e-03,  3.8839e-03, -1.2792e-02,
             1.5966e-02, -7.9051e-04, -2.7192e-03, -3.2121e-03,  1.2161e-03],
           [ 7.7052e-03, -1.4970e-02, -1.2600e-03, -1.4953e-03,  4.0345e-06,
             4.8517e-03, -2.3221e-04, -1.5730e-02, -5.3769e-03, -5.2220e-03],
           [ 5.4523e-03,  6.2075e-03, -2.1817e-03,  4.4775e-03,  6.1500e-03,
            -3.0440e-04, -2.4824e-03,  2.3290e-03,  4.3086e-03,  4.2023e-03],
           [-4.7727e-03, -5.3276e-03, -1.2861e-02,  4.9050e-03, -2.3012e-03,
            -1.1457e-02,  1.2899e-02,  5.3900e-03,  2.1217e-03,  6.1982e-03],
           [ 1.4147e-03,  8.4253e-03,  1.9231e-02, -1.1771e-02,  8.9392e-03,
            -9.0566e-03, -9.3943e-03,  1.0731e-02,  2.1588e-03, -6.3945e-03]],
          device='cuda:0'),
   'exp_avg_sq': tensor([[0.0007, 0.0007, 0.0007, 0.0007, 0.0008, 0.0011, 0.0007, 0.0007, 0.0007,
            0.0008],
           [0.0007, 0.0007, 0