In [None]:
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -U --user datasets accelerate peft trl tensorboard bitsandbytes langchain sentencepiece transformers

In [1]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import torch as th
# import torch_npu as th_npu
import transformers

from pprint import pp
from datasets import (load_dataset, load_from_disk, Dataset)
from transformers import (AutoTokenizer, 
                          BitsAndBytesConfig,
                          AutoModel, 
                          AutoModelForCausalLM, 
                          AutoModelForSequenceClassification,
                          DataCollatorWithPadding, 
                          DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, 
                          DataCollatorForTokenClassification,
                          TrainingArguments, Trainer)
from peft import (LoraConfig, get_peft_model, PeftModel, TaskType, get_peft_model_state_dict)

In [2]:
# Pick the compute device: CUDA when PyTorch reports it as available, otherwise CPU.
device = th.device("cuda" if th.cuda.is_available() else "cpu")
# device = th.device("npu" if th.npu.is_available() else "cpu")
# NOTE(review): "devive_cnt" looks like a typo for "device_cnt"; the name is left
# unchanged because it is also part of the printed text below.
devive_cnt = th.cuda.device_count()
# devive_cnt = th.npu.device_count()
# Print the device and library versions so recorded results can be tied to an environment.
print(f"device = {device}; devive_cnt = {devive_cnt}")
print(f"torch version = {th.__version__}")
print(f"cuda version = {th.version.cuda}")
print(f"transformers version = {transformers.__version__}")

device = cuda; devive_cnt = 1
torch version = 2.5.1+cu121
cuda version = 12.1
transformers version = 4.49.0


In [3]:
# Filesystem layout used by the whole notebook.
# The defaults are the original machine-specific locations; set MLC_PATH_PROJECT /
# MLC_PATH_MODEL in the environment to run the notebook elsewhere without editing it.
path_project = os.environ.get("MLC_PATH_PROJECT", "C:/my_project/MyGit/Machine-Learning-Column/hugging_face")
# The data directory is a sibling of the project directory.
path_data = os.path.join(os.path.dirname(path_project), "data")
# Root directory holding local model checkpoints.
path_model = os.environ.get("MLC_PATH_MODEL", "F:/LLM")
# Training outputs and saved models are written under the model root.
path_output = os.path.join(path_model, "output")

## step-1: 数据源

In [4]:
# Parquet file to load, given relative to path_data.
filename = "alpaca/train-00000-of-00001-a09b74b3ef9c3b56.parquet"

In [5]:
# Load the local parquet file through the generic "parquet" loader.
# split="all" is used so the result can be sliced directly with .select() in the next cell.
dataset = load_dataset(
    path="parquet",
    data_files=os.path.join(path_data, filename),
    split="all"
)

In [6]:
# Keep only the first 2000 rows so the experiment runs quickly.
dataset = dataset.select(range(2000))  # pilot run on a small subset
# 80/20 train/test split; the fixed seed keeps the shuffled partition reproducible.
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=0)
# NOTE(review): these handles are taken before any later .map() on `dataset`,
# so they keep the un-mapped rows.
train_dataset, eval_dataset = dataset["train"], dataset["test"]

In [7]:
# Pretty-print one training record to show the available columns.
# Note: this is an instruction-tuning dataset; to use it for the pre-training
# experiment, only the "text" column needs to be used.
# (Written as a comment rather than a bare string literal, which the notebook
# would echo as an extra cell output.)
pp(train_dataset[2])

{'instruction': 'List 3 possible reasons why the given website is not '
                'performing as expected.',
 'input': 'A website for an e-commerce store',
 'output': '1. The website has a slow loading time. \n'
           '2. The website has a weak user interface and design. \n'
           '3. The website is lacking in SEO optimization.',
 'text': 'Below is an instruction that describes a task, paired with an input '
         'that provides further context. Write a response that appropriately '
         'completes the request.\n'
         '\n'
         '### Instruction:\n'
         'List 3 possible reasons why the given website is not performing as '
         'expected.\n'
         '\n'
         '### Input:\n'
         'A website for an e-commerce store\n'
         '\n'
         '### Response:\n'
         '1. The website has a slow loading time. \n'
         '2. The website has a weak user interface and design. \n'
         '3. The website is lacking in SEO optimization.'}


'\n注：这是一个指令微调数据集，要用作预训练可以只对 text 做预研\n'

## step-2: tokenizer

In [8]:
# Model identifier; it is joined onto path_model to locate the local checkpoint directory.
checkpoint = "Qwen/Qwen2.5-0.5B-Instruct"

In [9]:
# Load the tokenizer from the local checkpoint directory only (no download).
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True
)
# Register "<|im_start|>" as the BOS token. The call's return value is the cell
# output; the recorded run shows 0.
tokenizer.add_special_tokens({"bos_token": "<|im_start|>"})

0

In [10]:
# Show the special tokens and padding side that the tokenization and collation steps rely on.
pp(f"bos_token = {tokenizer.bos_token}")
pp(f"eos_token = {tokenizer.eos_token}")
pp(f"pad_token = {tokenizer.pad_token}")
pp(f"padding_side = {tokenizer.padding_side}")

'bos_token = <|im_start|>'
'eos_token = <|im_end|>'
'pad_token = <|endoftext|>'
'padding_side = right'


## step-3: 量化参数（可选）

In [11]:
# 4-bit NF4 quantization settings with bfloat16 compute and double quantization (QLoRA-style).
# NOTE(review): this object is only defined here; the model-loading cell has
# `quantization_config=config_bnb` commented out, so it is currently unused.
config_bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=th.bfloat16,
    bnb_4bit_use_double_quant=True
)  # QLoRA

## step-4: 载入基模

In [12]:
# Load the causal-LM base model from the local checkpoint directory in bfloat16.
# device_map="auto" leaves device placement to the library.
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=th.bfloat16,
    # attn_implementation="sdpa",  # flash_attention_2, sdpa
    # Uncomment to load the weights 4-bit quantized with config_bnb.
    # quantization_config=config_bnb,
)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [None]:
# List every parameter tensor of the base model: running index, qualified name,
# shape, dtype and the device it lives on.
for i, named_parameter in enumerate(base_model.named_parameters()):
    name, parm = named_parameter
    print(f"{i}  name: {name};  shape: {parm.shape};  dtype: {parm.dtype};  device: {parm.device}")

In [13]:
# Turn on gradient checkpointing directly on the model.
base_model.gradient_checkpointing_enable()
# Presumably needed so gradients can flow through the inputs when the base
# weights are frozen under LoRA — TODO confirm.
base_model.enable_input_require_grads()
# Disable the generation KV cache for training.
base_model.config.use_cache = False

# With more than one GPU, flag the model as model-parallel.
if th.cuda.device_count() > 1:
    base_model.is_parallelizable = True
    base_model.model_parallel = True

In [14]:
# Report how much GPU memory PyTorch currently holds after loading the model.
# memory_allocated(): bytes occupied by live tensors.
allocated_memory = th.cuda.memory_allocated()
# memory_reserved(): bytes held by the caching allocator. It replaces the
# deprecated torch.cuda.memory_cached(), which reported the same quantity.
cached_memory = th.cuda.memory_reserved()
# Both values are converted from bytes to GiB for display.
print(f"已分配的GPU内存：{allocated_memory / 1024**3:.2f}G, 已缓存的GPU内存：{cached_memory / 1024**3:.2f}G")

已分配的GPU内存：0.93G, 已缓存的GPU内存：0.97G


In [15]:
# Make sure the embedding matrix has a row for every token id the tokenizer can emit.
# The matrix is only ever grown here; if it is already large enough it is left alone.
embedding_size = base_model.get_input_embeddings().weight.shape[0]
tokenizer_size = len(tokenizer)
if embedding_size < tokenizer_size:
    base_model.resize_token_embeddings(tokenizer_size)

## step-5: 模型参数

In [19]:
# Central hyper-parameter dictionary.
# "rank", "lora_alpha", "lora_dropout" and "use_rslora" are read when the LoraConfig is built.
# NOTE(review): the remaining keys are not read by the TrainingArguments cell,
# which hard-codes its own values (e.g. 3 epochs there vs. "epochs": 2 here) — confirm
# which is intended.
config_model = {
    "rank": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "use_rslora": True,
    "epochs": 2,
    "batch_size": 8,
    "gradient_steps": 1,
    "learning_rate": 0.0001,
    "weight_decay": 0.0,
    "max_seq_length": 512
}

## step-6: LoRA参数（可选）

In [16]:
# Print the module tree to find the layer names usable as LoRA target_modules.
pp(base_model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [None]:
# LoRA: Low-Rank Adaptation of Large Language Models
# Other ways to spell target_modules (module names depend on the architecture):
# config_lora = LoraConfig(target_modules=["0"])
# config_lora = LoraConfig(target_modules=["query_key_value", "dense_4h_to_h"])
# config_lora = LoraConfig(target_modules=[".*\.1.*query_key_value"])
# config_lora = LoraConfig(target_modules=["query_key_value"], modules_to_save=["word_embeddings"])
#
# Hyper-parameters are read with [] instead of .get() so a missing or misspelled
# key fails immediately instead of silently passing None to LoraConfig.
#
# "lm_head" is intentionally NOT targeted:
#   * the recorded trainable-parameter count (8,798,208) equals exactly the seven
#     projections below over 24 layers, i.e. the recorded run did not adapt lm_head;
#   * this checkpoint appears to tie lm_head to the input embeddings, and adapting
#     a tied lm_head complicates merge_and_unload() — confirm against the model
#     config (tie_word_embeddings) before re-adding it.
config_lora = LoraConfig(
    r=config_model["rank"],
    lora_alpha=config_model["lora_alpha"],
    lora_dropout=config_model["lora_dropout"],
    use_rslora=config_model["use_rslora"],
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # self_attn
        "gate_proj", "up_proj", "down_proj",  # mlp
    ],
)

In [21]:
# Wrap the base model with the LoRA adapter described by config_lora.
lora_model = get_peft_model(model=base_model, peft_config=config_lora)

In [22]:
# print_trainable_parameters - 1
# The method prints the summary itself and returns None, so it must not be
# wrapped in print() (that echoed a stray "None" after the summary).
lora_model.print_trainable_parameters()

# print_trainable_parameters - 2 (manual equivalent, kept for reference)
# trainable_params = 0
# all_params = 0

# for param in lora_model.parameters():
#     if param.requires_grad:
#         trainable_params += param.numel()
#     all_params += param.numel()

# print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params / all_params:.4f}")

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497
None


In [None]:
# Display the adapter's state dict.
# NOTE(review): this echoes the whole mapping as cell output, which may be very long.
get_peft_model_state_dict(lora_model)

## step-7: 整理函数

In [23]:
# Data formatting for pre-training (PT); for SFT use tokenizer.apply_chat_template instead.
def apply_pretrain_template(sample):
    """Terminate ``sample["text"]`` with the tokenizer's EOS token.

    The function is idempotent: the EOS token is appended only when the text
    does not already end with it, so re-running the mapping cell on data that
    was already processed does not stack several EOS tokens.

    Args:
        sample: one dataset row (a mapping) with a string ``"text"`` field.

    Returns:
        The same ``sample`` object, with ``"text"`` updated in place.
    """
    eos = tokenizer.eos_token  # <|im_end|> for the tokenizer loaded in this notebook
    if not sample["text"].endswith(eos):
        sample["text"] += eos
    return sample

def tokenize_function(sample, max_length=128):
    """Tokenize a batch of texts into fixed-length causal-LM training features.

    Every row is truncated/padded to exactly ``max_length`` tokens. Padding to a
    fixed length (rather than to the longest text of the current ``map`` batch)
    guarantees that all rows of the dataset have the same length, so rows coming
    from different ``map`` batches can always be stacked together.

    Labels are a copy of ``input_ids`` in which padded positions (attention
    mask 0) are replaced by -100, the index ignored by the language-modeling
    loss, so padding never contributes to the loss regardless of the collator.

    Note: truncation cuts from the right, so an EOS token appended to a text
    longer than ``max_length`` is dropped.

    Args:
        sample: a batch from ``Dataset.map(..., batched=True)``; ``sample["text"]``
            is a list of strings.
        max_length: number of tokens per row (default 128, the original value).

    Returns:
        The tokenizer output with an additional ``"labels"`` entry.
    """
    inputs = tokenizer(
        text=sample["text"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(inputs["input_ids"], inputs["attention_mask"])
    ]
    return inputs

In [25]:
# Run apply_pretrain_template over every row of both splits.
# NOTE(review): this rebinds `dataset`, so re-running the cell maps the already-mapped data again.
dataset = dataset.map(apply_pretrain_template)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [26]:
# Tokenize both splits in batches and drop the original columns so only the
# features returned by tokenize_function remain.
dataset_t = dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [27]:
# Tokenized splits handed to the Trainer; the "test" split serves as the evaluation set.
train_dataset_t = dataset_t["train"]
test_dataset_t = dataset_t["test"]

## step-8: 模型训练

In [28]:
# Training configuration: log, evaluate and checkpoint once per epoch, then
# reload the checkpoint with the lowest eval_loss when training ends.
# NOTE(review): epochs / batch size / learning rate are hard-coded here and are
# not read from config_model (which lists "epochs": 2) — confirm which is intended.
args_train = TrainingArguments(
    output_dir=os.path.join(path_output, "model_pt"),
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    # Checkpointing is already enabled directly on the base model, so it is not requested again here.
    gradient_checkpointing=False,  # True, False
    optim="adamw_torch",
    learning_rate=0.0001,  # 0.00001
    weight_decay=0.0,
    logging_strategy="epoch",
    save_strategy="epoch",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy` argument.
    eval_strategy="epoch",
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    # The PEFT wrapper hides the base model's signature, so the Trainer cannot
    # infer the label column on its own; name it explicitly.
    label_names=["labels"],
)

In [29]:
# Batch collator for causal language modeling (mlm=False disables masked-LM masking).
collate_fn = DataCollatorForLanguageModeling(tokenizer, mlm=False) 
# Alternative collators for other task types:
# collate_fn = DataCollatorWithPadding(tokenizer)
# collate_fn = DataCollatorForSeq2Seq(tokenizer, padding=True)
# collate_fn = DataCollatorForTokenClassification(tokenizer)

In [30]:
# Assemble the Trainer around the LoRA-wrapped model and the tokenized splits.
trainer = Trainer(
    model=lora_model,
    # `processing_class` is the current name of the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    args=args_train,
    data_collator=collate_fn,
    train_dataset=train_dataset_t,
    eval_dataset=test_dataset_t,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [31]:
# Run training; the returned object is kept for later inspection.
training_result = trainer.train()

Epoch,Training Loss,Validation Loss
1,1.3299,1.254612
2,1.002,1.295628
3,0.7799,1.394593


## step-9: 模型评估

In [32]:
# Evaluate on the eval_dataset given to the Trainer and pretty-print the metrics.
evaluating_result = trainer.evaluate()
# NOTE(review): `dataset_test` is not defined anywhere in the visible notebook.
# testing_result = trainer.evaluate(dataset_test)
pp(evaluating_result)

{'eval_loss': 1.254611849784851,
 'eval_runtime': 14.112,
 'eval_samples_per_second': 28.345,
 'eval_steps_per_second': 3.543,
 'epoch': 3.0}


## step-10: 模型保存

In [None]:
pt_model = lora_model.merge_and_unload(adapter_names=None)

In [None]:
# 1 - 使用 Trainer 训练时保存整个训练模型(包含训练状态（模型权重、配置文件、优化器等）)
trainer.save_model(output_dir=os.path.join(path_output, "model_pt_1"))

# 2 - 通常用于非 Trainer 环境下保存模型(只保存模型权重、配置文件和分词器等)
pt_model.save_pretrained(save_directory=os.path.join(path_output, "model_pt_2"), max_shard_size="4GB")