In [None]:
from typing import List, Dict, Sequence
import torch
from torch.nn.utils.rnn import pad_sequence
import transformers
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset
from dataclasses import dataclass

IGNORE_INDEX = -100
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = r"Qwen/Qwen2.5-0.5B"

model = AutoModelForCausalLM.from_pretrained(model_dir)
model = model.to("cuda:0")

tokenizer = AutoTokenizer.from_pretrained(model_dir, padding_side="right")

In [4]:
tokenizer.add_special_tokens({
    "pad_token": "[PAD]"
})

1

In [5]:
tokenizer.special_tokens_map

{'eos_token': '<|endoftext|>',
 'pad_token': '[PAD]',
 'additional_special_tokens': ['<|im_start|>',
  '<|im_end|>',
  '<|object_ref_start|>',
  '<|object_ref_end|>',
  '<|box_start|>',
  '<|box_end|>',
  '<|quad_start|>',
  '<|quad_end|>',
  '<|vision_start|>',
  '<|vision_end|>',
  '<|vision_pad|>',
  '<|image_pad|>',
  '<|video_pad|>']}

In [6]:
tokenizer.pad_token, tokenizer.pad_token_id

('[PAD]', 151665)

## 加载数据集

In [8]:
import json

with open("data.json", "r") as f:
    data = json.load(f)
data

[{'instruct': '请你给哪吒写一首诗：',
  'input': '哪吒降世，意气飞扬。\n逆天改命，破障冲霄。',
  'label': '红绫缠腕，风火踏浪。\n不屈不悔，笑傲苍茫。'},
 {'instruct': '请你给敖丙写一首诗：', 'input': '碧海生龙子，云中舞雪霜。', 'label': '恩仇难两忘，何处是家乡？'},
 {'instruct': '请你给殷夫人写一首诗：',
  'input': '十月怀胎盼子生，柔心铁骨两相承。',
  'label': '甘将慈爱护天地，不惧风雷不惧征。'},
 {'instruct': '请你给太乙真人写一首诗：', 'input': '仙风道骨，骑兽遨游。', 'label': '炉中炼术，指点神童。'},
 {'instruct': '请你给申公豹写一首诗：',
  'input': '阴谋藏心，步步为营。\n狂傲不羁，志向高冥。',
  'label': '欲翻天命，终难遂行。\n困局自招，悔恨难平。'}]

## 自定义数据集

我有一个问题，在data_collator中，attention_mask 能填充 0 吗？ 

In [None]:
class PreTrainDataset(Dataset):

    def __init__(self, data: List):
        super().__init__()
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx) -> List[Dict]:
        item = self.data[idx]
        text = item["instruct"] + item["input"] + item["label"] + tokenizer.eos_token
        text_token = tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        label = text_token["input_ids"].clone()

        instruct = item["instruct"] + item["input"]
        instruct_token = tokenizer(
            instruct,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        instruct_len = instruct_token["input_ids"].size(-1)

        label[:, :instruct_len] = -100
        text_token["labels"] = label

        return text_token


dataset = PreTrainDataset(data)
dataset[0]

{'input_ids': tensor([[112720,  89012,  99459, 122157,  61443, 108462, 100045,   5122,  99459,
         122157,  99457,  99244,   3837,  36589,  99180, 115449,   8997, 100531,
          35727,  22418,  50509,   3837,  99577,  99884,  99907, 109564,   1773,
          99425, 120827, 103073, 103610,   3837,  99208,  79599, 100875,  99964,
           8997,  16530, 102683,  16530, 103020,   3837,  48738, 102744, 102635,
         100619,   1773, 151643]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'label': tensor([[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          99425, 120827, 103073, 103610,   3837,  99208,  79599, 100875,  99964,
           8997,  16530, 1026

验证一下，label 设置为-100 的文本都是什么？ 

In [57]:
# 查看第一个原始数据
data[0]

{'instruct': '请你给哪吒写一首诗：',
 'input': '哪吒降世，意气飞扬。\n逆天改命，破障冲霄。',
 'label': '红绫缠腕，风火踏浪。\n不屈不悔，笑傲苍茫。'}

In [59]:
# 查看需要计算loss的文本
test_label = dataset[0][0]["label"]
test_label = test_label[test_label != -100]
tokenizer.decode(test_label)

'红绫缠腕，风火踏浪。\n不屈不悔，笑傲苍茫。<|endoftext|>'

In [None]:
# 查看label -100位置对应的input_ids的文本
test_input_ids = dataset[0][0]["input_ids"]
test_label = dataset[0][0]["labels"]
test_input_ids = test_input_ids[test_label == -100]
tokenizer.decode(test_input_ids)
# label -100 位置的都是用户的指令不参与 loss 计算 

'请你给哪吒写一首诗：哪吒降世，意气飞扬。\n逆天改命，破障冲霄。'

In [82]:
pad_sequence(
    [torch.randn(2), torch.randn(3), torch.randn(4)],
    batch_first=True,
    padding_value=0,
    padding_side="right",
)

tensor([[-0.3421,  0.4131,  0.0000,  0.0000],
        [-0.1345,  1.2843,  1.0892,  0.0000],
        [-0.0567, -0.6993, -0.9386,  1.1316]])

In [89]:
dataset[0][0]

{'input_ids': tensor([[112720,  89012,  99459, 122157,  61443, 108462, 100045,   5122,  99459,
         122157,  99457,  99244,   3837,  36589,  99180, 115449,   8997, 100531,
          35727,  22418,  50509,   3837,  99577,  99884,  99907, 109564,   1773,
          99425, 120827, 103073, 103610,   3837,  99208,  79599, 100875,  99964,
           8997,  16530, 102683,  16530, 103020,   3837,  48738, 102744, 102635,
         100619,   1773, 151643]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'label': tensor([[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          99425, 120827, 103073, 103610,   3837,  99208,  79599, 100875,  99964,
           8997,  16530, 1026

In [107]:
@dataclass
class DataCollatorForSFTDataset(object):
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, items: Sequence) -> Dict[str, torch.Tensor]:
        # pad_sequence 不支持多维tensor，进行维度压缩 squeeze
        # input_ids, attention_mask = [
        #     [item.squeeze(0) for item in tokens[k]]
        #     for k in ["input_ids", "attention_mask"]
        # ]

        input_ids = [item["input_ids"].squeeze(0) for item in items]
        attention_mask = [item["attention_mask"].squeeze(0) for item in items]
        label = [item["label"].squeeze(0) for item in items]

        input_ids = pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=tokenizer.pad_token_id,
            padding_side="right",
        )
        attention_mask = pad_sequence(
            attention_mask,
            batch_first=True,
            padding_value=0,
            padding_side="right",
        )
        label = pad_sequence(
            label,
            batch_first=True,
            padding_value=-100,
            padding_side="right",
        )

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": label,
        }

In [108]:
DataCollatorForSFTDataset(tokenizer=tokenizer)([dataset[0], dataset[1], dataset[2]])

{'input_ids': tensor([[112720,  89012,  99459, 122157,  61443, 108462, 100045,   5122,  99459,
          122157,  99457,  99244,   3837,  36589,  99180, 115449,   8997, 100531,
           35727,  22418,  50509,   3837,  99577,  99884,  99907, 109564,   1773,
           99425, 120827, 103073, 103610,   3837,  99208,  79599, 100875,  99964,
            8997,  16530, 102683,  16530, 103020,   3837,  48738, 102744, 102635,
          100619,   1773, 151643],
         [112720,  89012, 113735, 106980,  61443, 108462, 100045,   5122, 102461,
           55135,  21287,  99465,  44729,   3837,  99718,  15946, 100066, 100167,
          105401,   1773, 100697, 100956,  99349,  77540,  99980,   3837, 114216,
           20412, 105686,  11319, 151643, 151665, 151665, 151665, 151665, 151665,
          151665, 151665, 151665, 151665, 151665, 151665, 151665, 151665, 151665,
          151665, 151665, 151665],
         [112720,  89012, 106824, 105700,  61443, 108462, 100045,   5122, 113286,
           9970

In [102]:
from pprint import pprint

In [None]:
tokenizer.eos_token_id, tokenizer.pad_token_id

(151643, 151665)

In [103]:
data_collator = DataCollatorForSFTDataset(tokenizer=tokenizer)
prompt_tokenizer = data_collator([dataset[0], dataset[1]])
pprint(prompt_tokenizer)

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'input_ids': tensor([[112720,  89012,  99459, 122157,  61443, 108462, 100045,   5122,  99459,
         122157,  99457,  99244,   3837,  36589,  99180, 115449,   8997, 100531,
          35727,  22418,  50509,   3837,  99577,  99884,  99907, 109564,   1773,
          99425, 120827, 103073, 103610,   3837,  99208,  79599, 100875,  99964,
           8997,  16530, 102683,  16530, 103020,   3837,  48738, 102744, 102635,
         100619,   1773, 151643],
        [112720,  89012, 113735, 106980,  61443, 108462, 100045,   5122, 102461,
          55135,  21287,  99465,  44729,   3837,  99718,  15946, 100066, 100167,
         105401,   1773, 100697, 10095

## train

In [109]:
args = TrainingArguments(
    output_dir=r"C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output",
    num_train_epochs=10,
    per_device_train_batch_size=2,
    save_safetensors=True,
    logging_strategy="epoch",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [110]:
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=args,
    train_dataset=dataset,
    eval_dataset=None,
    data_collator=DataCollatorForSFTDataset(tokenizer=tokenizer),
)

10分12秒

In [111]:
train_result = trainer.train()

***** Running training *****
  Num examples = 5
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 30
  Number of trainable parameters = 494,032,768


Step,Training Loss
3,5.708
6,1.874
9,0.4857
12,0.4602
15,0.141
18,0.0244
21,0.0033
24,0.0007
27,0.0001
30,0.0001


Saving model checkpoint to C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output\checkpoint-30
Configuration saved in C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output\checkpoint-30\config.json
Configuration saved in C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output\checkpoint-30\generation_config.json
Model weights saved in C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output\checkpoint-30\model.safetensors
tokenizer config file saved in C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output\checkpoint-30\tokenizer_config.json
Special tokens file saved in C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output\checkpoint-30\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




In [112]:
train_result.metrics

{'train_runtime': 20.9744,
 'train_samples_per_second': 2.384,
 'train_steps_per_second': 1.43,
 'total_flos': 4670570361600.0,
 'train_loss': 0.8697530678648036,
 'epoch': 10.0}

In [113]:
trainer.save_state()
trainer.save_model(output_dir=args.output_dir)
tokenizer.save_pretrained(args.output_dir)

Saving model checkpoint to C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output
Configuration saved in C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output\config.json
Configuration saved in C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output\generation_config.json
Model weights saved in C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output\model.safetensors
tokenizer config file saved in C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output\tokenizer_config.json
Special tokens file saved in C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output\special_tokens_map.json
tokenizer config file saved in C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output\tokenizer_config.json
Special tokens file saved in C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output\special_tokens_map.json


('C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\SFT_output\\tokenizer_config.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\SFT_output\\special_tokens_map.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\SFT_output\\vocab.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\SFT_output\\merges.txt',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\SFT_output\\added_tokens.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\SFT_output\\tokenizer.json')