In [1]:
from typing import List, Dict, Sequence
import torch
import transformers
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset
from dataclasses import dataclass, field

IGNORE_INDEX = -100

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = r"Qwen/Qwen2.5-0.5B"

model = AutoModelForCausalLM.from_pretrained(model_dir)
model = model.to("cuda:0")

tokenizer = AutoTokenizer.from_pretrained(model_dir, padding_side="right")

In [None]:
tokenizer.pad_token, tokenizer.pad_token_id

In [3]:
tokenizer.add_special_tokens({
    "pad_token": "[pad]"
})

1

In [None]:
tokenizer.pad_token, tokenizer.pad_token_id

In [None]:
tokenizer.special_tokens_map

In [4]:
import pandas as pd

file = "../data.xlsx"
df = pd.read_excel(file)

def build_prompt(name, text, label):
    instruct = f"请你给{name}写一首诗：{text}"
    label = f"{label}"
    return {"labels": instruct + label + tokenizer.eos_token}


data = []
for _, row in df.iterrows():
    data.append(build_prompt(row["name"], row["text"], row["label"]))
data[0]

{'labels': '请你给哪吒写一首诗：哪吒降世，意气飞扬。\n逆天改命，破障冲霄。红绫缠腕，风火踏浪。\n不屈不悔，笑傲苍茫。<|endoftext|>'}

## 自定义数据集

我有一个问题，在data_collator中，attention_mask 能填充 0 吗？ 

先尝试好的写法

In [5]:
class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(self, data: List):
        super().__init__()
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx) -> List[Dict]:
        return self.data[idx]

In [6]:
dataset = SupervisedDataset(data)

In [None]:
dataset[0]

In [7]:
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, items: Sequence[Dict]) -> Dict[str, torch.Tensor]:

        prompt = [item["labels"] for item in items]

        prompt_tokenizer = tokenizer(
            prompt,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,  # for trunc
            truncation=True,
        )

        labels = prompt_tokenizer["input_ids"].clone()
        prompt_tokenizer["labels"] = labels
        return prompt_tokenizer

In [None]:
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
prompt_tokenizer = data_collator(
    [dataset[0], dataset[3]]
)
prompt_tokenizer

## train

In [8]:
args = TrainingArguments(
    output_dir=r"C:\Users\1\Desktop\train_model_output\Qwen2.5-0.5B\CLM_output",
    num_train_epochs=20,
    per_device_train_batch_size=2,
    save_safetensors=True,
    logging_strategy="epoch",
)

In [9]:
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=args,
    train_dataset=dataset,
    eval_dataset=None,
    data_collator=DataCollatorForSupervisedDataset(tokenizer=tokenizer),
)

10分12秒

In [10]:
train_result = trainer.train()

Step,Training Loss
3,4.7712
6,1.6861
9,0.863
12,1.1061
15,0.4242
18,0.7519
21,0.589
24,0.4593
27,0.4149
30,0.2921


In [14]:
args.output_dir

'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\CLM_output'

In [15]:
trainer.save_state()
trainer.save_model(output_dir=args.output_dir)

In [16]:
tokenizer.save_pretrained(args.output_dir)

('C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\CLM_output\\tokenizer_config.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\CLM_output\\special_tokens_map.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\CLM_output\\vocab.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\CLM_output\\merges.txt',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\CLM_output\\added_tokens.json',
 'C:\\Users\\1\\Desktop\\train_model_output\\Qwen2.5-0.5B\\CLM_output\\tokenizer.json')

In [None]:
# trainer.log_metrics("train", metrics)
# trainer.save_metrics("train", metrics)

# trainer.save_model(output_dir=training_args.output_dir)

In [17]:
train_result.metrics

{'train_runtime': 11.6631,
 'train_samples_per_second': 8.574,
 'train_steps_per_second': 5.144,
 'total_flos': 9276719063040.0,
 'train_loss': 0.6067279808223247,
 'epoch': 20.0}