In [None]:
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -U --user datasets accelerate peft trl tensorboard bitsandbytes langchain sentencepiece transformers

In [1]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import torch as th
# import torch_npu as th_npu
import transformers

from pprint import pp
from datasets import (load_dataset, load_from_disk, Dataset)
from transformers import (AutoTokenizer, 
                          BitsAndBytesConfig,
                          AutoModel, 
                          AutoModelForCausalLM, 
                          AutoModelForSequenceClassification,
                          DataCollatorWithPadding, 
                          DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, 
                          DataCollatorForTokenClassification,
                          TrainingArguments, Trainer)
from peft import (LoraConfig, get_peft_model, PeftModel, TaskType, get_peft_model_state_dict)
from trl import (SFTConfig, SFTTrainer)

In [2]:
# Select the compute device: CUDA when it is available, otherwise CPU.
# (Ascend NPU alternative: th.device("npu" if th.npu.is_available() else "cpu"))
if th.cuda.is_available():
    device = th.device("cuda")
else:
    device = th.device("cpu")
# Number of visible CUDA devices (NPU alternative: th.npu.device_count()).
devive_cnt = th.cuda.device_count()

# Report the runtime environment so the recorded output documents the run.
print(
    "\n".join(
        [
            f"device = {device}; devive_cnt = {devive_cnt}",
            f"torch version = {th.__version__}",
            f"cuda version = {th.version.cuda}",
            f"transformers version = {transformers.__version__}",
        ]
    )
)

device = cuda; devive_cnt = 1
torch version = 2.5.1+cu121
cuda version = 12.1
transformers version = 4.45.0


In [None]:
# Filesystem layout used by the notebook. Both roots can be overridden through
# environment variables so the notebook also runs on machines other than the
# author's; when a variable is unset the original hard-coded location is used.
path_project = os.environ.get(
    "HF_PROJECT_DIR", "C:/my_project/MyGit/Machine-Learning-Column/hugging_face"
)
# Datasets live in a "data" folder that is a sibling of the project folder.
path_data = os.path.join(os.path.dirname(path_project), "data")
# Root folder that holds the local model checkpoints.
path_model = os.environ.get("LLM_MODEL_DIR", "F:/LLM")
# Training checkpoints and exported models are written below the model root.
path_output = os.path.join(path_model, "output")

## step-1: 数据源

In [4]:
# Parquet shard of the Alpaca dataset; joined onto `path_data` when it is loaded.
filename = "alpaca/train-00000-of-00001-a09b74b3ef9c3b56.parquet"

In [5]:
# Read the local parquet shard through the generic "parquet" builder and ask
# for everything at once (split="all"); the following cell slices and splits it.
dataset = load_dataset(
    "parquet",
    split="all",
    data_files=os.path.join(path_data, filename),
)

In [6]:
# Pilot run: keep at most the first 2000 rows. Clamping to len(dataset) keeps
# the cell working on datasets with fewer than 2000 rows, where
# select(range(2000)) would fail with an out-of-range index.
dataset = dataset.select(range(min(2000, len(dataset))))
# 80/20 train/test split; the fixed seed makes the shuffle reproducible.
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=0)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

In [7]:
# Pretty-print one raw training record to inspect the available columns.
pp(train_dataset[2])

{'instruction': 'List 3 possible reasons why the given website is not '
                'performing as expected.',
 'input': 'A website for an e-commerce store',
 'output': '1. The website has a slow loading time. \n'
           '2. The website has a weak user interface and design. \n'
           '3. The website is lacking in SEO optimization.',
 'text': 'Below is an instruction that describes a task, paired with an input '
         'that provides further context. Write a response that appropriately '
         'completes the request.\n'
         '\n'
         '### Instruction:\n'
         'List 3 possible reasons why the given website is not performing as '
         'expected.\n'
         '\n'
         '### Input:\n'
         'A website for an e-commerce store\n'
         '\n'
         '### Response:\n'
         '1. The website has a slow loading time. \n'
         '2. The website has a weak user interface and design. \n'
         '3. The website is lacking in SEO optimization.'}


## step-2: tokenizer

In [8]:
# Model identifier; it is joined onto `path_model` to locate the local copy of
# both the tokenizer and the base model.
checkpoint = "Qwen/Qwen2.5-0.5B-Instruct"

In [9]:
# Load the tokenizer from the local copy under `path_model`; downloads are
# disabled (force_download=False, local_files_only=True).
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True
)
# Register "<|im_start|>" as the BOS token. The call is the last expression of
# the cell, so its return value is shown as the cell output (recorded as 0 —
# presumably the number of tokens newly added to the vocabulary; confirm).
tokenizer.add_special_tokens({"bos_token": "<|im_start|>"})

0

In [10]:
# Show the special tokens and the padding side the tokenizer is configured with.
pp(f"bos_token = {tokenizer.bos_token}")
pp(f"eos_token = {tokenizer.eos_token}")
pp(f"pad_token = {tokenizer.pad_token}")
pp(f"padding_side = {tokenizer.padding_side}")

'bos_token = <|im_start|>'
'eos_token = <|im_end|>'
'pad_token = <|endoftext|>'
'padding_side = right'


## step-3: 量化参数（可选）

In [11]:
# Optional 4-bit quantization settings: NF4 quant type, bfloat16 compute dtype,
# double quantization enabled.
# NOTE: this object is only referenced by the commented-out
# `quantization_config=config_bnb` argument in the model-loading cell, so it has
# no effect on the run as written.
config_bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=th.bfloat16,
    bnb_4bit_use_double_quant=True
)  # QLoRA

## step-4: 载入基模

In [12]:
# Load the causal-LM base model from the local checkpoint in bfloat16, letting
# device_map="auto" decide the device placement. Downloads are disabled.
# The attention implementation and the quantization config are left at their
# defaults (both arguments are commented out below).
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=th.bfloat16,
    # attn_implementation="sdpa",  # flash_attention_2, sdpa
    # quantization_config=config_bnb,
)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [None]:
# Dump every parameter tensor: running index, name, shape, dtype and device.
for idx, (name, param) in enumerate(base_model.named_parameters()):
    print(f"{idx}  name: {name};  shape: {param.shape};  dtype: {param.dtype};  device: {param.device}")

In [13]:
# Prepare the model for training: switch on gradient checkpointing directly on
# the model, call enable_input_require_grads(), and turn the generation cache
# off in the model config.
# NOTE(review): the SFTConfig cell passes gradient_checkpointing=False, which
# does not match the explicit enable here — confirm which one is intended.
base_model.gradient_checkpointing_enable()
base_model.enable_input_require_grads()
base_model.config.use_cache = False

# With more than one CUDA device, flag the model as parallelizable /
# model-parallel (presumably so the trainer does not wrap it in DataParallel
# again — TODO confirm).
if th.cuda.device_count() > 1:
    base_model.is_parallelizable = True
    base_model.model_parallel = True

In [14]:
# Snapshot of CUDA memory after loading the model: bytes currently held by
# tensors, and bytes reserved by the caching allocator.
allocated_memory = th.cuda.memory_allocated()
# torch.cuda.memory_cached() is deprecated; memory_reserved() is the current
# name for the same statistic. The variable name is kept unchanged.
cached_memory = th.cuda.memory_reserved()
print(f"已分配的GPU内存：{allocated_memory / 1024**3:.2f}G, 已缓存的GPU内存：{cached_memory / 1024**3:.2f}G")

已分配的GPU内存：0.93G, 已缓存的GPU内存：0.97G


In [15]:
# Grow the embedding matrix only when the tokenizer holds more tokens than the
# model has embedding rows; otherwise the model is left untouched.
tokenizer_size = len(tokenizer)
embedding_size = base_model.get_input_embeddings().weight.size(0)
if embedding_size < tokenizer_size:
    base_model.resize_token_embeddings(tokenizer_size)

## step-5: 模型参数

In [None]:
# Hyper-parameters collected in one place; the SFTConfig cell reads them by key.
model_config = dict(
    # LoRA-related settings
    rank=16,
    lora_alpha=32,
    lora_dropout=0.1,
    use_rslora=True,
    # Optimisation schedule
    epochs=3,
    batch_size=1,
    gradient_steps=4,
    learning_rate=2e-5,
    warmup_ratio=0.03,  # 3% of steps used for warmup
    lr_scheduler_type="cosine_with_min_lr",  # cosine decay with a floor
    lr_scheduler_kwargs=dict(min_lr=2e-6),
    weight_decay=0.0,
    # Sequence handling
    max_seq_length=512,
    packing=True,
)

In [None]:
# Training configuration for the SFT run; checkpoints go to <path_output>/model_fft.
# Logging, saving and evaluation all happen once per epoch, and the checkpoint
# with the best eval_loss is reloaded when training ends.
train_args = SFTConfig(
    output_dir=os.path.join(path_output, "model_fft"),
    num_train_epochs=model_config.get("epochs"),
    per_device_train_batch_size=model_config.get("batch_size"),
    per_device_eval_batch_size=model_config.get("batch_size"),
    gradient_accumulation_steps=model_config.get("gradient_steps"),
    gradient_checkpointing=False,  # True, False
    optim="adamw_torch",
    learning_rate=model_config.get("learning_rate"),
    # Declared in model_config but previously never forwarded; 0.0 is the fallback.
    weight_decay=model_config.get("weight_decay", 0.0),
    warmup_ratio=model_config.get("warmup_ratio"),  # warm-up
    lr_scheduler_type=model_config.get("lr_scheduler_type"),  # annealing schedule
    lr_scheduler_kwargs=model_config.get("lr_scheduler_kwargs"),  # annealing parameters
    logging_strategy="epoch",
    save_strategy="epoch",
    # `evaluation_strategy` is deprecated in transformers; `eval_strategy` is
    # the supported spelling of the same option.
    eval_strategy="epoch",
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    dataset_text_field="text",
    # max_seq_length=model_config.get("max_seq_length"),  # defaults to `1024`
    # packing=model_config.get("packing"),  # Whether to pack multiple sequences into a fixed-length format.
)

## step-6: LoRA参数

不涉及（本笔记做的是全参数微调，训练时未使用 LoRA 适配器）

## step-7: 整理函数

In [18]:
# System message placed at the start of every conversation, for both the
# training text and the inference prompt.
system_prompt = "You are a helpful assistant."

In [None]:
# Worked example: render the first training sample through the chat template.
user_prompt = "\n".join((train_dataset[0]["instruction"], train_dataset[0]["input"]))
assistant_prompt = train_dataset[0]["output"]

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
    {"role": "assistant", "content": assistant_prompt},
]

# Training text: the assistant turn is already part of `messages`, so no
# generation prompt is appended.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
# text += tokenizer.pad_token  # at first glance not needed
print(text)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Convert the given temperatures from Fahrenheit to Celsius.
80°F<|im_end|>
<|im_start|>assistant
26.67°C<|im_end|>



In [None]:
def apply_sft_template(sample):
    """Render one Alpaca-style record as a chat-template training string.

    Args:
        sample: mapping with string fields "instruction", "input" and "output".

    Returns:
        The same ``sample`` object, with its "text" field replaced by the
        system / user / assistant conversation rendered through
        ``tokenizer.apply_chat_template`` (no generation prompt appended).

    NOTE(review): instruction and input are always joined with a newline, so a
    record with an empty "input" yields a user turn that ends in a newline. The
    inference cell builds its prompt the same way; change both together.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "\n".join((sample["instruction"], sample["input"]))},
        {"role": "assistant", "content": sample["output"]},
    ]
    sample["text"] = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=False
    )
    return sample

In [22]:
# Apply the chat template to every record of both splits; this overwrites the
# `text` column that came with the raw data.
dataset = dataset.map(apply_sft_template)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [23]:
# Re-bind the splits so they carry the chat-template `text` column.
# NOTE(review): `eval_dataset` from the earlier split cell is not refreshed here
# and still refers to the test split as it was before the template was applied.
train_dataset = dataset["train"]
test_dataset = dataset["test"]

## step-8: 模型训练

In [25]:
# Batch collator; mlm=False selects the causal-LM (non-masked) mode.
collate_fn = DataCollatorForLanguageModeling(tokenizer, mlm=False) 
# Alternatives left for reference:
# collate_fn = DataCollatorWithPadding(tokenizer)
# collate_fn = DataCollatorForSeq2Seq(tokenizer, padding=True)
# collate_fn = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Wire the model, tokenizer, training arguments, collator and both splits into
# the SFT trainer. No compute_metrics is supplied; the recorded evaluate()
# output accordingly contains the loss and runtime statistics only.
trainer = SFTTrainer(
    model=base_model,
    tokenizer=tokenizer,
    args=train_args,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # compute_metrics=compute_metrics
)

Converting train dataset to ChatML:   0%|          | 0/1600 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1600 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1600 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1600 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/400 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

In [None]:
# Run the training loop configured by `train_args` and keep the returned result.
training_result = trainer.train()

## step-9: 模型评估

In [26]:
# Evaluate on the eval dataset the trainer was built with and pretty-print the metrics.
evaluating_result = trainer.evaluate()
# testing_result = trainer.evaluate(eval_dataset=test_dataset)
pp(evaluating_result)

{'eval_loss': 1.3469866514205933,
 'eval_runtime': 10.6115,
 'eval_samples_per_second': 18.847,
 'eval_steps_per_second': 18.847}


## step-10: 模型保存

In [None]:
# 1 - Export through the Trainer. Trainer.save_model() writes what
#     from_pretrained() needs to reload the model. Optimizer and scheduler state
#     belong to the per-epoch checkpoints under `output_dir`, not to this export.
trainer.save_model(output_dir=os.path.join(path_output, "model_fft_1"))

# 2 - Export without the Trainer. The model's save_pretrained() writes weights
#     and config only, so the tokenizer is saved into the same folder explicitly
#     to make the directory loadable on its own.
base_model.save_pretrained(save_directory=os.path.join(path_output, "model_fft_2"), max_shard_size="4GB")
tokenizer.save_pretrained(save_directory=os.path.join(path_output, "model_fft_2"))

## step-11: 模型推理

In [36]:
# Build an inference conversation from the first training sample: system and
# user turns only, leaving the assistant turn for the model to generate.
user_prompt = train_dataset[0]["instruction"] + "\n" + train_dataset[0]["input"]

messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

In [37]:
# 方法-1
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True  # inference
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
# model_inputs = tokenizer([text1, text2], return_tensors="pt", padding=True, truncation=True).to(device)

In [None]:
# 方法-2
# model_inputs = tokenizer.apply_chat_template(
#     messages,
#     tokenize=True,
#     add_generation_prompt=True,
#     return_tensors="pt",
#     return_dict=True
# ).to(device)

In [49]:
# Sampling settings handed to generate().
# Earlier candidates kept for reference: top_p=0.9, temperature=0.1 (also tried
# 0.5, 0.35, 0.01) and repetition_penalty=1.5 (disabled).
gen_kwargs = dict(
    max_new_tokens=1024,  # sized after the longest answer among the training samples
    top_p=0.5,
    temperature=0.5,
    do_sample=True,
)

In [52]:
# inference 模板
t0 = pd.Timestamp.now()
base_model.eval()
with th.inference_mode():
    complete_ids = base_model.generate(
        input_ids=model_inputs.input_ids,  # 针对 tokenizer.padding_side
        attention_mask=model_inputs.attention_mask,  # 针对 tokenizer.padding_side
        **gen_kwargs
    )
    # also OK
    # complete_ids = model_sft.generate(
    #     **model_inputs,  # 针对 tokenizer.padding_side
    #     **gen_kwargs
    # )
t1 = pd.Timestamp.now()
print(t1 - t0)

input_ids = model_inputs.input_ids
generated_ids = [O[len(I): ] for (I, O) in zip(input_ids, complete_ids)]
response = tokenizer.batch_decode(sequences=generated_ids, skip_special_tokens=True)[0]
print(response)

0 days 00:00:00.721776
The total surface area of a cube is 150 cm².


In [40]:
# Reference answer of the same sample, shown for comparison with the generated response.
train_dataset[0]["output"]

'Total surface area of the cube: 150 cm^2'