In [1]:
from typing import List, Dict, Sequence
import torch
import transformers
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset
from dataclasses import dataclass, field

# Sentinel written into `labels` by the collator at padding and instruction
# positions. -100 is the default `ignore_index` of PyTorch's cross-entropy loss,
# so those positions are presumably excluded from the training loss.
IGNORE_INDEX = -100

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub id (or local directory) of the base checkpoint to fine-tune.
model_dir = r"Qwen/Qwen2.5-0.5B"

# Use the first GPU when one is visible; otherwise stay on CPU so the notebook
# still runs on machines without CUDA instead of raising in `.to()`.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(model_dir)
model = model.to(device)

# Pad on the right so every sequence's real tokens start at column 0.
tokenizer = AutoTokenizer.from_pretrained(model_dir, padding_side="right")

In [5]:
# Inspect the pad token the loaded tokenizer starts with, before it is replaced.
tokenizer.pad_token, tokenizer.pad_token_id

('<|endoftext|>', 151643)

In [7]:
# Register a dedicated padding token so that padding is distinct from the EOS
# token the model has to learn to emit.
num_added = tokenizer.add_special_tokens({
    "pad_token": "[pad]"
})

# A newly added token can receive an id beyond the model's embedding table.
# Grow the table only in that case; this is a no-op when spare rows already exist.
embedding_rows = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_rows:
    model.resize_token_embeddings(len(tokenizer))

# Keep the cell's displayed value: the number of tokens that were added.
num_added

1

In [8]:
# Confirm the pad token and its id after registering the new "[pad]" token.
tokenizer.pad_token, tokenizer.pad_token_id

('[pad]', 151665)

In [9]:
# List every special token currently registered on the tokenizer.
tokenizer.special_tokens_map

{'eos_token': '<|endoftext|>',
 'pad_token': '[pad]',
 'additional_special_tokens': ['<|im_start|>',
  '<|im_end|>',
  '<|object_ref_start|>',
  '<|object_ref_end|>',
  '<|box_start|>',
  '<|box_end|>',
  '<|quad_start|>',
  '<|quad_end|>',
  '<|vision_start|>',
  '<|vision_end|>',
  '<|vision_pad|>',
  '<|image_pad|>',
  '<|video_pad|>']}

In [10]:
# Show the pad and EOS tokens side by side to check that they differ.
tokenizer.pad_token, tokenizer.eos_token

('[pad]', '<|endoftext|>')

In [11]:
# EOS token that `build_prompt` appends to every training target.
tokenizer.eos_token

'<|endoftext|>'

In [12]:
import pandas as pd

# Spreadsheet holding the training samples; the rows are read below through
# their "name", "text" and "label" columns.
file = "data.xlsx"
df = pd.read_excel(file)

def build_prompt(name, text, label, eos_token=None):
    """Build one raw-text training example for the poem-writing task.

    Args:
        name: Subject of the poem, interpolated into the instruction.
        text: Opening lines of the poem, appended to the instruction.
        label: Continuation the model should produce; formatted as a string.
        eos_token: End-of-sequence marker appended to the target. When omitted,
            ``tokenizer.eos_token`` of the module-level tokenizer is used, which
            is the original behaviour.

    Returns:
        A dict of two strings. ``"input_ids"`` is the instruction alone and
        ``"labels"`` is the instruction followed by the continuation and the
        EOS marker. Despite the key names, both values are untokenized text.
    """
    if eos_token is None:
        # Fall back to the notebook's tokenizer so existing three-argument calls
        # behave exactly as before.
        eos_token = tokenizer.eos_token
    instruct = f"请你给{name}写一首诗：{text}"
    target = f"{label}"
    return {"input_ids": instruct, "labels": instruct + target + eos_token}


# Turn every spreadsheet row into a raw-text example, then preview the first one.
data = [
    build_prompt(record["name"], record["text"], record["label"])
    for _, record in df.iterrows()
]
data[0]

{'input_ids': '请你给哪吒写一首诗：哪吒降世，意气飞扬。\n逆天改命，破障冲霄。',
 'labels': '请你给哪吒写一首诗：哪吒降世，意气飞扬。\n逆天改命，破障冲霄。红绫缠腕，风火踏浪。\n不屈不悔，笑傲苍茫。<|endoftext|>'}

## 自定义数据集

我有一个问题，在data_collator中，attention_mask 能填充 0 吗？ 

先尝试好的写法

In [13]:
class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    A thin map-style wrapper around an in-memory list of examples. Items are
    returned exactly as stored, with no tokenization and no copying.
    """

    def __init__(self, data: List[Dict[str, str]]):
        """Keep a reference to ``data``.

        Args:
            data: Examples to serve, one per index.
        """
        super().__init__()
        self.data = data

    def __len__(self) -> int:
        """Return the number of stored examples."""
        return len(self.data)

    def __getitem__(self, idx) -> Dict[str, str]:
        """Return the single example stored at position ``idx``."""
        return self.data[idx]

In [14]:
# Wrap the list of raw-text examples so it can be passed to the Trainer.
dataset = SupervisedDataset(data)

In [16]:
# Preview one item: it is still plain text; tokenization happens in the collator.
dataset[0]

{'input_ids': '请你给哪吒写一首诗：哪吒降世，意气飞扬。\n逆天改命，破障冲霄。',
 'labels': '请你给哪吒写一首诗：哪吒降世，意气飞扬。\n逆天改命，破障冲霄。红绫缠腕，风火踏浪。\n不屈不悔，笑傲苍茫。<|endoftext|>'}

In [17]:
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate raw-text examples into a padded batch for supervised fine-tuning.

    Every item is a dict of two strings: ``"input_ids"`` holds the instruction
    text alone and ``"labels"`` holds the instruction followed by the target
    text. The full text is tokenized into the model inputs. ``labels`` is a copy
    of those ids in which padding and instruction positions are replaced by
    ``IGNORE_INDEX``.
    """

    # Tokenizer used for both the full text and the instruction-only text.
    tokenizer: transformers.PreTrainedTokenizer

    def _encode(self, texts: Sequence[str]):
        """Tokenize ``texts`` into one tensor batch padded to its longest entry."""
        return self.tokenizer(
            texts,
            return_tensors="pt",
            padding="longest",
            max_length=self.tokenizer.model_max_length,  # for trunc
            truncation=True,
        )

    def __call__(self, items: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Tokenize, pad and label-mask a list of examples.

        Args:
            items: Examples with string fields ``"input_ids"`` (instruction) and
                ``"labels"`` (instruction + target).

        Returns:
            The tokenizer output for the full texts (``input_ids`` and
            ``attention_mask``) with an added ``labels`` tensor of the same
            shape as ``input_ids``.
        """
        full_texts = [item["labels"] for item in items]
        instruct_texts = [item["input_ids"] for item in items]

        # Use the tokenizer this collator was constructed with, not a global one.
        batch = self._encode(full_texts)
        instruct_batch = self._encode(instruct_texts)

        attention_mask = batch["attention_mask"]
        labels = batch["input_ids"].clone()

        # Number of real (non-padding) tokens in each instruction.
        # NOTE(review): assumes the instruction tokenizes to a prefix of the
        # full text's tokens (no merge across the boundary, no special tokens
        # appended by the tokenizer) - confirm for other tokenizers.
        instruct_lens = instruct_batch["attention_mask"].sum(dim=-1)

        # 1-based rank of every real token within its row. Ranking by attention
        # mask rather than by column index keeps the instruction mask correct
        # for left padding as well as right padding.
        token_rank = attention_mask.cumsum(dim=-1)
        is_instruction = token_rank.le(instruct_lens.unsqueeze(-1))

        # Detect padding from the attention mask instead of comparing ids with
        # `pad_token_id`, so a real EOS token stays a training target even when
        # the tokenizer reuses EOS as its pad token.
        is_padding = attention_mask.eq(0)

        labels[is_padding | is_instruction] = IGNORE_INDEX

        batch["labels"] = labels
        return batch

In [18]:
# Smoke-test the collator on two hand-picked examples before training.
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
# NOTE(review): despite its name, `prompt_tokenizer` holds the collated batch
# returned by the collator, not a tokenizer.
prompt_tokenizer = data_collator(
    [dataset[0], dataset[3]]
)

## train

In [None]:
# Training configuration passed to the Trainer below.
args = TrainingArguments(
    # NOTE(review): absolute, machine-specific Windows path; consider a
    # configurable directory relative to the notebook.
    output_dir=r"C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\SFT_output",
    num_train_epochs=20,
    per_device_train_batch_size=2,
    save_safetensors=True,
    # Report the training loss once per epoch.
    logging_strategy="epoch",
)

In [20]:
# Assemble the Trainer. No evaluation set is used, and a fresh collator instance
# is created here rather than reusing `data_collator` from the smoke-test cell.
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=args,
    train_dataset=dataset,
    eval_dataset=None,
    data_collator=DataCollatorForSupervisedDataset(tokenizer=tokenizer),
)

10分12秒

In [21]:
# Run fine-tuning; the returned object exposes `.metrics`, displayed further down.
train_result = trainer.train()

Step,Training Loss
3,5.7106
6,1.9132
9,0.5017
12,0.4654
15,0.1779
18,0.0281
21,0.0125
24,0.0116
27,0.012
30,0.0


In [22]:
# Persist the trainer state, then write the fine-tuned model into `args.output_dir`.
trainer.save_state()
trainer.save_model(output_dir=args.output_dir)

In [23]:
# Save the tokenizer (with the added "[pad]" token) into the same directory as
# the model.
tokenizer.save_pretrained(args.output_dir)

('C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\output\\tokenizer_config.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\output\\special_tokens_map.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\output\\vocab.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\output\\merges.txt',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\output\\added_tokens.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\output\\tokenizer.json')

In [None]:
# Optional: log and persist the metrics returned by `trainer.train()`.
# trainer.log_metrics("train", train_result.metrics)
# trainer.save_metrics("train", train_result.metrics)

# trainer.save_model(output_dir=args.output_dir)

In [24]:
# Display the summary metrics of the finished training run.
train_result.metrics

{'train_runtime': 11.7335,
 'train_samples_per_second': 8.523,
 'train_steps_per_second': 5.114,
 'total_flos': 9276719063040.0,
 'train_loss': 0.441707559516423,
 'epoch': 20.0}