In [1]:
# !pip install -U datasets accelerate peft trl tensorboard bitsandbytes langchain sentencepiece transformers
# !pip install transformers==4.37.2 --user
# !pip install tiktoken einops transformers_stream_generator

In [1]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import torch as th
from torch.utils.tensorboard import SummaryWriter
from datasets import (load_dataset, load_from_disk, Dataset)
from transformers import (AutoTokenizer, AutoModel, AutoModelForCausalLM, BitsAndBytesConfig,
                          TrainingArguments, DataCollatorWithPadding, DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, DataCollatorForTokenClassification)
from transformers.integrations import TensorBoardCallback
from peft import (LoraConfig, get_peft_model, PeftModel, TaskType, get_peft_model_state_dict)
from trl import SFTTrainer

In [2]:
device = th.device("cuda" if th.cuda.is_available() else "cpu")
devive_cnt = th.cuda.device_count()
print(f"device = {device}; devive_cnt = {devive_cnt}")
print(th.__version__)
print(th.version.cuda)

device = cuda; devive_cnt = 1
2.0.1+cu118
11.8


In [3]:
if th.cuda.is_bf16_supported():
    th.set_default_tensor_type(th.cuda.BFloat16Tensor)
else:
    th.set_default_tensor_type(th.cuda.HalfTensor)

In [4]:
path_project = "C:/my_project/MyGit/Machine-Learning-Column/hugging_face"
path_data = os.path.join(os.path.dirname(path_project), "data")
path_model = os.path.join(os.path.dirname(path_project), "model")
path_output = os.path.join(os.path.dirname(path_project), "output")

## step-1: 载入数据源

In [10]:
filename = "tatsu-lab/alpaca/train-00000-of-00001-a09b74b3ef9c3b56.parquet"

In [8]:
dataset = load_dataset(
    path="parquet",
    data_files=os.path.join(path_data, filename),
    split="all"
)

In [9]:
dataset = dataset.select(range(2000))
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=0) 
dataset_train, dataset_test = dataset["train"], dataset["test"]

## step-2: tokenizer

In [5]:
checkpoint = "Qwen1.5-4B-Chat"

In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    trust_remote_code=True
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
# tokenizer.pad_token
# tokenizer.eos_token
# tokenizer.pad_token = tokenizer.eos_token  # 半精度训练时需要
# tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
# tokenizer.padding_side = "right"  # llama2

In [7]:
tokenizer.padding_side

'right'

## step-3: 配置量化参数

In [8]:
config_bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=th.bfloat16,
    bnb_4bit_use_double_quant=True
)

## step-4: 载入基础/任务大模型

In [9]:
model_base = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    device_map="auto",
    torch_dtype=th.bfloat16,  # "auto"
    quantization_config=config_bnb
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [32]:
# 注：glm3 没有了 lm_head，有一个 output_layer，这个时候可能会分配到两个 device，导致计算 loss 的时候报错
# if th.cuda.device_count() > 1:
# 	model_base.hf_device_map["transformer.output_layer"] = model_base.hf_device_map["transformer.embedding"]  # 1 <- 0
# 	dct_device_map = model_base.hf_device_map
 
# 	model_base.cpu()
# 	del model_base
# 	th.cuda.empty_cache()

In [34]:
# re-load
# model_base = AutoModelForCausalLM.from_pretrained(
#     pretrained_model_name_or_path=path_model,
#     cache_dir=path_model,
#     force_download=False,
#     local_files_only=True,
#     trust_remote_code=True,
#     device_map=dct_device_map,
#     torch_dtype=th.float16,
#     # quantization_config=config_bnb
# )

Loading checkpoint shards: 100%|██████████| 7/7 [02:49<00:00, 24.28s/it]


In [15]:
# note: use gradient checkpointing to save memory at the expense of slower backward pass.
# if TrainingArguments(gradient_checkpointing=True)
model_base.gradient_checkpointing_enable()
# note: Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping the model weights fixed. 
# if TrainingArguments(gradient_checkpointing=True)
model_base.enable_input_require_grads()
model_base.config.use_cache = False

if th.cuda.device_count() > 1:
    model_base.is_parallelizable = True
    model_base.model_parallel = True

In [None]:
for i, (name, parm) in enumerate(model_base.named_parameters()):
    print(f"{i}  name: {name};  shape: {parm.shape};  dtype: {parm.dtype};  device: {parm.device}")

In [16]:
print(model_base.dtype)

torch.bfloat16


In [17]:
allocated_memory = th.cuda.memory_allocated()
cached_memory = th.cuda.memory_cached()
print(f"已分配的GPU内存：{allocated_memory / 1024**3:.2f}G, 已缓存的GPU内存：{cached_memory / 1024**3:.2f}G")

已分配的GPU内存：8.01G, 已缓存的GPU内存：8.40G


In [18]:
# check embedding_size
tokenizer_size = len(tokenizer)
embedding_size = model_base.get_input_embeddings().weight.shape[0]
if tokenizer_size > embedding_size:
    model_base.resize_token_embeddings(tokenizer_size)

## step-5: 配置模型参数

In [19]:
config_model = {
    "rank": 64,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "use_rslora": True,
    "epochs": 5,
    "batch_size": 4,
    "gradient_steps": 1,
    "learning_rate": 0.00005,
    "weight_decay": 0.01,
    "max_seq_lenght": 512
}

## step-6: 配置LoRA模型

In [21]:
# LoRA: Low-Rank Adaptation of Large Language Models
# config_lora = LoraConfig(target_modules=["0"])
# config_lora = LoraConfig(target_modules=["query_key_value", "dense_4h_to_h"])
# config_lora = LoraConfig(target_modules=[".*\.1.*query_key_value"])
# config_lora = LoraConfig(target_modules=["query_key_value"], modules_to_save=["word_embeddings"])
config_lora = LoraConfig(
    r=config_model.get("rank"),
    lora_alpha=config_model.get("lora_alpha"),
    lora_dropout=config_model.get("lora_dropout"),
    use_rslora=config_model.get("use_rslora"),
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)

In [22]:
model_lora = get_peft_model(model=model_base, peft_config=config_lora)

In [23]:
# print_trainable_parameters - 1
print(model_lora.print_trainable_parameters())

# print_trainable_parameters - 2
# trainable_params = 0
# all_params = 0

# for param in model_lora.parameters():
#     if param.requires_grad:
#         trainable_params += param.numel()
#     all_params += param.numel()

# print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params / all_params:.4f}")

trainable params: 52,428,800 || all params: 4,002,798,080 || trainable%: 1.309803766069559
None


In [None]:
get_peft_model_state_dict(model_lora)

## step-7: 模型训练

In [45]:
args_train = TrainingArguments(
    output_dir=os.path.join(path_output, "model_sft"),
    num_train_epochs=config_model.get("epochs"),
    per_device_train_batch_size=config_model.get("batch_size"),
    per_device_eval_batch_size=config_model.get("batch_size"),
    gradient_accumulation_steps=config_model.get("gradient_steps"),
    gradient_checkpointing=True, 
    optim="adamw_torch",
    learning_rate=config_model.get("learning_rate"),
    weight_decay=config_model.get("weight_decay"),
    logging_strategy="epoch",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True
)

In [46]:
collate_fn = DataCollatorForLanguageModeling(tokenizer, mlm=False) 
# collate_fn = DataCollatorWithPadding(tokenizer)
# collate_fn = DataCollatorForSeq2Seq(tokenizer, padding=True)
# collate_fn = DataCollatorForTokenClassification(tokenizer)

In [47]:
trainer = SFTTrainer(
    model=model_lora,
    tokenizer=tokenizer,
    args=args_train,
    peft_config=config_lora,
    data_collator=collate_fn,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    dataset_text_field="text", 
    packing=True,
    max_seq_length=config_model.get("max_seq_length"),
    # compute_metrics=compute_metrics
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 180 examples [00:01, 149.07 examples/s]
Generating train split: 45 examples [00:00, 147.36 examples/s]


In [None]:
res_train = trainer.train()

## step-8: 模型评估

In [29]:
res_eval = trainer.evaluate()
# res_eval = trainer.evaluate(dataset_train)
# res_eval = trainer.evaluate(dataset_test)
res_eval

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.78125,
 'eval_runtime': 11.3416,
 'eval_samples_per_second': 0.353,
 'eval_steps_per_second': 0.088,
 'epoch': 3.0}

## step-9: 模型保存

In [30]:
trainer.save_model(output_dir=os.path.join(path_model, "model_sft"))

## step-10: 模型加载

In [None]:
print(th.cuda.memory_summary())

In [50]:
allocated_memory = th.cuda.memory_allocated()
cached_memory = th.cuda.memory_cached()
print(f"已分配的GPU内存：{allocated_memory / 1024**3:.2f}G, 已缓存的GPU内存：{cached_memory / 1024**3:.2f}G")

已分配的GPU内存：5.97G, 已缓存的GPU内存：6.26G


In [45]:
# 释放不再使用的GPU内存
model_base.cpu()
del model_base
th.cuda.empty_cache()

In [None]:
# reload model_base
model_base = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=th.bfloat16,
    quantization_config=config_bnb
)

In [10]:
# load model_sft
model_sft = PeftModel.from_pretrained(
    model=model_base,
    model_id=os.path.join(path_model, "model_sft"),
    is_trainable=False
)
model_sft = model_sft.merge_and_unload()  # W + BA, speed up, but errors when use 8-bit

In [11]:
# save merged model to local
model_sft.save_pretrained(save_directory=path_model)

In [None]:
# save merged model to hf
model_sft.save_pretrained_merged("model_sft", tokenizer, save_method="merged_16bit")  # merged_16bit, merged_4bit, lora
model_sft.push_to_hub_merged("hf/model_sft", tokenizer, save_method="merged_16bit", token=your_token)

In [None]:
# save merged model as gguf
model_sft.save_pretrained_gguf("model_sft", tokenizer, quantization_method="f16")  # f16, q4_k_m
model_sft.push_to_hub_gguf("hf/model_sft", tokenizer, quantization_method="f16", token=your_token)

## step-11: 模型推理

In [10]:
content_usr = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Design a database to record employee salaries.

### Response:
"""

In [11]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": content_usr}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

In [26]:
model_inputs = tokenizer([text], return_tensors="pt").to(device)
# model_inputs = tokenizer([text1, text2], return_tensors="pt", padding=True, truncation=True).to(device)

In [None]:
# inference
max_new_tokens = 128  # 取训练样本答案的最长值
top_p = 0.9
temperature = 0.1  # 0.5，0.35，0.1，0.01
# repetition_penalty = 1.5

t0 = pd.Timestamp.now()
model_sft.eval()
with th.inference_mode():
    complete_ids = model_sft.generate(
        input_ids=model_inputs.input_ids,  # 针对 tokenizer.padding_side
        attention_mask=model_inputs.attention_mask,  # 针对 tokenizer.padding_side
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        temperature=temperature
    )
    # also OK
    # complete_ids = model_sft.generate(
    #     max_new_tokens=max_new_tokens,
    #     top_p=top_p,
    #     temperature=temperature,
    #     **model_inputs  # 针对 tokenizer.padding_side
    # )
t1 = pd.Timestamp.now()
print(t1 - t0)

input_ids = model_inputs.input_ids
generated_ids = [O[len(I): ] for (I, O) in zip(input_ids, complete_ids)]
response = tokenizer.batch_decode(sequences=generated_ids, skip_special_tokens=True)[0]
print(response)