In [1]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Sequence

import torch
import transformers
from torch.utils.data import Dataset
from transformers import Trainer, TrainingArguments

# Sentinel label value; the collator defined later in this notebook writes the
# same -100 at padded positions so they are excluded from the loss.
IGNORE_INDEX = -100
# Use the first CUDA GPU when PyTorch reports one, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint identifier handed to both from_pretrained calls below.
model_dir = r"Qwen/Qwen2.5-0.5B"

# Load the causal-LM weights and place them on the device selected in the
# setup cell. Using `device` (rather than a hard-coded "cuda:0") keeps the
# notebook runnable on machines without a CUDA GPU.
model = AutoModelForCausalLM.from_pretrained(model_dir)
model = model.to(device)

# padding_side="right" appends pad tokens after the real tokens of each row.
tokenizer = AutoTokenizer.from_pretrained(model_dir, padding_side="right")

In [3]:
# Register a dedicated "[PAD]" token so padding has its own id, separate
# from the EOS token.
num_added_tokens = tokenizer.add_special_tokens({
    "pad_token": "[PAD]"
})

# A newly added token id must be addressable by the model's input embedding
# table. Resize only when the tokenizer has outgrown it, so checkpoints whose
# table already has spare rows are left untouched.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# Keep the number of added tokens as the cell's displayed result.
num_added_tokens

1

In [4]:
# Show the tokenizer's special-token mapping after registering "[PAD]".
tokenizer.special_tokens_map

{'eos_token': '<|endoftext|>',
 'pad_token': '[PAD]',
 'additional_special_tokens': ['<|im_start|>',
  '<|im_end|>',
  '<|object_ref_start|>',
  '<|object_ref_end|>',
  '<|box_start|>',
  '<|box_end|>',
  '<|quad_start|>',
  '<|quad_end|>',
  '<|vision_start|>',
  '<|vision_end|>',
  '<|vision_pad|>',
  '<|image_pad|>',
  '<|video_pad|>']}

In [5]:
# Display the pad token string together with its vocabulary id.
tokenizer.pad_token, tokenizer.pad_token_id

('[PAD]', 151665)

## 加载数据集

In [None]:
import json

# Read the training records from data.json in the current working directory.
# NOTE(review): no `encoding` is passed, so the platform default is used to
# decode the file; if the file contains raw non-ASCII text, confirm its
# encoding and pass it explicitly to make the load portable.
with open("data.json", "r") as f:
    data = json.load(f)
# Echo the parsed records as the cell output.
data

[{'instruct': '请你给哪吒写一首诗：',
  'input': '哪吒降世，意气飞扬。\n逆天改命，破障冲霄。',
  'label': '红绫缠腕，风火踏浪。\n不屈不悔，笑傲苍茫。'},
 {'instruct': '请你给敖丙写一首诗：', 'input': '碧海生龙子，云中舞雪霜。', 'label': '恩仇难两忘，何处是家乡？'},
 {'instruct': '请你给殷夫人写一首诗：',
  'input': '十月怀胎盼子生，柔心铁骨两相承。',
  'label': '甘将慈爱护天地，不惧风雷不惧征。'},
 {'instruct': '请你给太乙真人写一首诗：', 'input': '仙风道骨，骑兽遨游。', 'label': '炉中炼术，指点神童。'},
 {'instruct': '请你给申公豹写一首诗：',
  'input': '阴谋藏心，步步为营。\n狂傲不羁，志向高冥。',
  'label': '欲翻天命，终难遂行。\n困局自招，悔恨难平。'}]

## 自定义数据集

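问题：在 data_collator 中，填充（padding）位置的 attention_mask 可以是 0 吗？

可以。分词器使用 `padding="longest"` 时，会自动把填充位置的 attention_mask 置为 0（见下方 collator 的输出），模型因此不会关注这些位置；同时应把这些位置的 labels 设为 -100，使其不参与 loss 计算。

先尝试推荐的写法：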

In [7]:
class PreTrainDataset(Dataset):
    """Map-style dataset that yields one raw training string per record.

    Every record is a mapping with the keys ``"instruct"``, ``"input"`` and
    ``"label"``. The three values are concatenated in that order and an
    end-of-sequence marker is appended; tokenization is left to the collator.

    Args:
        data: Indexable collection of records.
        eos_token: Text appended to every sample. When ``None`` (the
            default), the notebook-level ``tokenizer.eos_token`` is looked up
            each time a sample is accessed, so existing one-argument calls
            behave as before.
    """

    def __init__(self, data: List, eos_token: Optional[str] = None):
        super().__init__()
        self.data = data
        self.eos_token = eos_token

    def __len__(self) -> int:
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx) -> str:
        """Return the concatenated text of record ``idx`` followed by EOS."""
        item = self.data[idx]
        # Fall back to the global tokenizer only when no marker was injected.
        eos = self.eos_token if self.eos_token is not None else tokenizer.eos_token
        return item["instruct"] + item["input"] + item["label"] + eos
    
# Wrap the loaded records and preview the first concatenated sample.
dataset = PreTrainDataset(data)
dataset[0]

'请你给哪吒写一首诗：哪吒降世，意气飞扬。\n逆天改命，破障冲霄。红绫缠腕，风火踏浪。\n不屈不悔，笑傲苍茫。<|endoftext|>'

In [8]:
# Tokenize a single sample to inspect its input_ids and attention_mask.
tokenizer(dataset[0])

{'input_ids': [112720, 89012, 99459, 122157, 61443, 108462, 100045, 5122, 99459, 122157, 99457, 99244, 3837, 36589, 99180, 115449, 8997, 100531, 35727, 22418, 50509, 3837, 99577, 99884, 99907, 109564, 1773, 99425, 120827, 103073, 103610, 3837, 99208, 79599, 100875, 99964, 8997, 16530, 102683, 16530, 103020, 3837, 48738, 102744, 102635, 100619, 1773, 151643], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
@dataclass
class DataCollatorForPretrainDataset(object):
    """Turn a list of raw text samples into a padded causal-LM batch.

    Attributes:
        tokenizer: Tokenizer called on the list of texts. It must expose
            ``model_max_length`` and return a mapping that contains
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        ignore_index: Label value written at padded positions so they do not
            contribute to the loss. Defaults to -100.
    """

    # Quoted so the annotation is not evaluated when the class is created.
    tokenizer: "transformers.PreTrainedTokenizer"
    ignore_index: int = -100

    def __call__(self, items: Sequence[str]) -> Dict[str, torch.Tensor]:
        """Tokenize ``items`` together and attach ``labels``.

        Args:
            items: Text samples as produced by the dataset.

        Returns:
            The tokenizer output with an added ``"labels"`` entry: a copy of
            ``input_ids`` in which every padded position is replaced by
            ``ignore_index``.
        """
        texts = list(items)

        # Use the collator's own tokenizer (not a notebook global) and pad
        # every row to the longest sample in this batch.
        batch = self.tokenizer(
            texts,
            return_tensors="pt",
            padding="longest",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
        )

        # Do not compute loss on padding. Padded positions are identified via
        # the attention mask rather than by comparing against pad_token_id,
        # so real tokens are never masked even if the pad token shares its id
        # with another token such as EOS.
        padded_positions = batch["attention_mask"].eq(0)
        batch["labels"] = batch["input_ids"].masked_fill(padded_positions, self.ignore_index)
        return batch

In [10]:
from pprint import pprint

In [12]:
# Compare the EOS id with the pad id.
(tokenizer.eos_token_id, tokenizer.pad_token_id)

(151643, 151665)

In [13]:
# Smoke-test the collator on the first two samples and pretty-print the batch.
data_collator = DataCollatorForPretrainDataset(tokenizer=tokenizer)
# NOTE(review): despite its name, `prompt_tokenizer` holds the collated batch
# returned by the collator, not a tokenizer.
prompt_tokenizer = data_collator([dataset[0], dataset[1]])
pprint(prompt_tokenizer)

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'input_ids': tensor([[112720,  89012,  99459, 122157,  61443, 108462, 100045,   5122,  99459,
         122157,  99457,  99244,   3837,  36589,  99180, 115449,   8997, 100531,
          35727,  22418,  50509,   3837,  99577,  99884,  99907, 109564,   1773,
          99425, 120827, 103073, 103610,   3837,  99208,  79599, 100875,  99964,
           8997,  16530, 102683,  16530, 103020,   3837,  48738, 102744, 102635,
         100619,   1773, 151643],
        [112720,  89012, 113735, 106980,  61443, 108462, 100045,   5122, 102461,
          55135,  21287,  99465,  44729,   3837,  99718,  15946, 100066, 100167,
         105401,   1773, 100697, 10095

## train

In [15]:
# Training configuration for the run.
# NOTE(review): output_dir is an absolute path on one specific Windows
# machine; consider a relative or configurable location so the notebook can
# be re-run elsewhere.
args = TrainingArguments(
    output_dir=r"C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\CLM_output",
    num_train_epochs=10,
    per_device_train_batch_size=2,
    save_safetensors=True,
    logging_strategy="epoch",  # log the training loss once per epoch
)

In [16]:
# Assemble the Trainer: raw-text dataset plus the custom collator that
# tokenizes, pads and builds labels per batch. No evaluation set is used.
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=args,
    train_dataset=dataset,
    eval_dataset=None,
    data_collator=DataCollatorForPretrainDataset(tokenizer=tokenizer),
)

10分12秒

In [17]:
# Run training and keep the returned result object for inspection below.
train_result = trainer.train()

Step,Training Loss
3,3.9102
6,1.344
9,0.4426
12,0.2315
15,0.1244
18,0.0877
21,0.071
24,0.0766
27,0.0535
30,0.0499


In [18]:
# Show the metrics recorded on the training result.
train_result.metrics

{'train_runtime': 8.5468,
 'train_samples_per_second': 5.85,
 'train_steps_per_second': 3.51,
 'total_flos': 4670570361600.0,
 'train_loss': 0.6391463776429495,
 'epoch': 10.0}

In [19]:
# Persist the trainer state, the fine-tuned model and the tokenizer (which now
# includes the added "[PAD]" token) to the configured output directory.
trainer.save_state()
trainer.save_model(output_dir=args.output_dir)
# Last expression: its return value (the written file paths) is displayed.
tokenizer.save_pretrained(args.output_dir)

('C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\CLM_output\\tokenizer_config.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\CLM_output\\special_tokens_map.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\CLM_output\\vocab.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\CLM_output\\merges.txt',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\CLM_output\\added_tokens.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\CLM_output\\tokenizer.json')