本代码重点关注基于 Ulysses 序列并行（Sequence Parallel，SP）的训练与普通训练在实现上的区别。

In [22]:
from verl.utils.device import get_device_id, get_device_name, is_cuda_available, is_npu_available
# Ask verl's device helper for the device name; later cells move the model
# and the sample tensors onto this device.
device = get_device_name()
device

'cuda'

In [54]:
import torch
from torch import nn
from torch.utils.data import Dataset, DistributedSampler

class DummyDataset(Dataset):
    """Fixed-content dataset used to demo the loss computation.

    Every valid index yields the same sample: a dict of four float32 tensors
    of shape (2, 5) keyed ``input_ids``, ``attention_mask``, ``position_ids``
    and ``loss_mask``.

    Args:
        size: Number of samples reported by ``len()``.
    """

    def __init__(self, size=1000):
        self.size = size

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        """Return the fixed sample for ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``[-size, size)``. Plain
                iteration (``for x in dataset``) relies on this to stop,
                because the class defines no ``__iter__`` and Python falls
                back to calling ``__getitem__`` until IndexError is raised.
        """
        if not -self.size <= idx < self.size:
            raise IndexError(
                f"index {idx} is out of range for dataset of size {self.size}"
            )
        return {
            'input_ids': torch.tensor(
                [[20, 42, 1, 19, 85], [25, 33, 7, 68, 9]], dtype=torch.float32
            ),
            'attention_mask': torch.tensor(
                [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]], dtype=torch.float32
            ),
            'position_ids': torch.tensor(
                [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], dtype=torch.float32
            ),
            'loss_mask': torch.tensor(
                [[1, 1, 0, 1, 1], [0, 1, 0, 1, 1]], dtype=torch.float32
            ),
        }

class DummyModel(nn.Module):
    """Toy stand-in for a language model: a single linear layer producing logits.

    Args:
        in_features: Size of the last input dimension fed to the linear layer.
        vocab_size: Number of logits produced per input row.
    """

    def __init__(self, in_features=5, vocab_size=10):
        super().__init__()
        self.fc = nn.Linear(in_features, vocab_size)

    def forward(self, input_ids, attention_mask, position_ids):
        """Return logits for the combined, masked inputs.

        ``input_ids`` and ``position_ids`` are added element-wise, multiplied
        by ``attention_mask``, and the result is passed through the linear
        layer, which acts on the last dimension.
        """
        masked_inputs = (input_ids + position_ids) * attention_mask
        return self.fc(masked_inputs)

# Instantiate the toy model on the selected device.
model = DummyModel().to(device)
# reduction="none" keeps one loss value per element, so the loss mask can be
# applied to the individual values afterwards.
loss_fct = nn.CrossEntropyLoss(reduction="none")

# Every item of these datasets is the same fixed sample; only the reported
# length differs.
train_dataset = DummyDataset(size=1000)
val_dataset = DummyDataset(size=200)

In [55]:
from torch import nn

# Take the tensors of the first sample and move them to the target device.
input_ids = train_dataset[0]["input_ids"].to(device)
attention_mask = train_dataset[0]["attention_mask"].to(device)
position_ids = train_dataset[0]["position_ids"].to(device)
# Drop the first column and flatten to 1-D, giving one mask entry per label
# (the labels are built as input_ids[:, 1:]).
loss_mask = train_dataset[0]["loss_mask"][:, 1:].reshape(-1).to(device)
loss_mask

tensor([1., 0., 1., 1., 1., 0., 1., 1.], device='cuda:0')

In [56]:
# Next-token targets: every column of input_ids except the first.
labels = input_ids[:, 1:].contiguous()
labels

tensor([[42.,  1., 19., 85.],
        [33.,  7., 68.,  9.]], device='cuda:0')

In [None]:
# Forward pass through the dummy model.
logits = model(input_ids, attention_mask, position_ids)
logits  # not displayed: the recorded output only shows the final expression
logits.shape

torch.Size([2, 10])

In [62]:
# Drop the last entry along the second-to-last axis of the logits.
shift_logits = logits[..., :-1, :].contiguous()
# labels already excludes the first column of input_ids, so it is used as-is.
shift_labels = labels.contiguous()

vocab_size = 10
# Flatten to (N, vocab_size) logits and a 1-D label vector for the loss.
shift_logits = shift_logits.view(-1, vocab_size)
shift_labels = shift_labels.view(-1)
shift_logits.shape

torch.Size([1, 10])

In [None]:
shift_labels = shift_labels.to(shift_logits.device)
# NOTE(review): the recorded outputs show shift_logits with shape (1, 10)
# while labels holds 8 float values; with class-index targets
# nn.CrossEntropyLoss needs one integer label per logits row -- confirm this
# cell runs as written.
loss = loss_fct(shift_logits, shift_labels)
# Multiply the per-element losses by the loss mask, zeroing masked positions.
loss = loss * loss_mask.to(loss.device)

这里的 SP（序列并行，Sequence Parallel）部分不直接基于源代码，而是写一些 demo code 作为演示。

In [63]:
input_ids = [
    [1, 2, 3, 4, 0, 0, 0],  # 4 valid tokens, 3 padding
    [5, 6, 7, 0, 0, 0, 0],  # 3 valid tokens, 4 padding
]
attention_mask = [
    [1, 1, 1, 1, 0, 0, 0],
    [1, 1, 1, 0, 0, 0, 0],
]

# First, unpad
input_ids_rmpad = [[1, 2, 3, 4, 5, 6, 7]]  # (1, 7): only the 7 valid tokens are kept
indices = [0, 1, 2, 3, 7, 8, 9]  # positions of the valid tokens in the flattened (2, 7) batch

In [65]:
# Then split the sequence across multiple devices
input_ids_rmpad = [[1, 2, 3, 4, 5, 6, 7]]
pad_size = 1  # 7 tokens plus 1 pad split evenly over the 4 devices listed below

# Device 0: [1, 2]
# Device 1: [3, 4]
# Device 2: [5, 6]
# Device 3: [7, <pad>]

In [None]:
# Roll by -1: shift left by one along dim 1; the first element wraps to the end
input_ids_rmpad_rolled = torch.roll(torch.tensor(input_ids_rmpad), shifts=-1, dims=1)
input_ids_rmpad_rolled

tensor([[2, 3, 4, 5, 6, 7, 1]])