In [1]:
# !pip install -U datasets accelerate peft trl tensorboard bitsandbytes langchain sentencepiece transformers
# !pip install transformers==4.37.2 --user
# !pip install tiktoken einops transformers_stream_generator

In [2]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import torch as th
from torch.utils.tensorboard import SummaryWriter
from datasets import (load_dataset, load_from_disk, Dataset)
from transformers import (AutoTokenizer, AutoModel, AutoModelForCausalLM, BitsAndBytesConfig,
                          TrainingArguments, DataCollatorWithPadding, DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, DataCollatorForTokenClassification)
from transformers.integrations import TensorBoardCallback
from peft import (LoraConfig, get_peft_model, PeftModel, TaskType, get_peft_model_state_dict)
from trl import SFTTrainer

In [3]:
device = th.device("cuda" if th.cuda.is_available() else "cpu")
devive_cnt = th.cuda.device_count()
print(f"device = {device}; devive_cnt = {devive_cnt}")
print(th.__version__)
print(th.version.cuda)

device = cuda; devive_cnt = 1
2.0.1+cu118
11.8


In [4]:
# if th.cuda.is_bf16_supported():
#     th.set_default_tensor_type(th.cuda.BFloat16Tensor)
# else:
#     th.set_default_tensor_type(th.cuda.HalfTensor)

In [5]:
path_project = "C:/my_project/MyGit/Machine-Learning-Column/hugging_face"
path_data = os.path.join(os.path.dirname(path_project), "data")
path_model = os.path.join(os.path.dirname(path_project), "model")
path_output = os.path.join(os.path.dirname(path_project), "output")

## step-1: 载入数据源

In [6]:
filename = "tatsu-lab/alpaca/train-00000-of-00001-a09b74b3ef9c3b56.parquet"

In [7]:
dataset = load_dataset(
    path="parquet",
    data_files=os.path.join(path_data, filename),
    split="all"
)

In [8]:
dataset = dataset.select(range(200))
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=0) 
dataset_train, dataset_test = dataset["train"], dataset["test"]

## step-2: tokenizer

In [9]:
checkpoint = "Qwen1.5-4B-Chat"

In [10]:
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
print(tokenizer.pad_token)
print(tokenizer.eos_token)
print(tokenizer.padding_side)

<|endoftext|>
<|im_end|>
right


In [12]:
# tokenizer.pad_token = tokenizer.eos_token  # 半精度训练时需要
# tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

## step-3: 配置量化参数

In [13]:
config_bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=th.bfloat16,
    bnb_4bit_use_double_quant=True
)  # QLoRA

## step-4: 载入基础/任务大模型

In [14]:
model_base = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    device_map="auto",
    torch_dtype=th.bfloat16,  # "auto", th.bfloat16
    quantization_config=(config_bnb if config_bnb else None)
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
for i, (name, parm) in enumerate(model_base.named_parameters()):
    print(f"{i}  name: {name};  shape: {parm.shape};  dtype: {parm.dtype};  device: {parm.device}")

0  name: model.embed_tokens.weight;  shape: torch.Size([151936, 2560]);  dtype: torch.bfloat16;  device: cuda:0
1  name: model.layers.0.self_attn.q_proj.weight;  shape: torch.Size([3276800, 1]);  dtype: torch.uint8;  device: cuda:0
2  name: model.layers.0.self_attn.q_proj.bias;  shape: torch.Size([2560]);  dtype: torch.bfloat16;  device: cuda:0
3  name: model.layers.0.self_attn.k_proj.weight;  shape: torch.Size([3276800, 1]);  dtype: torch.uint8;  device: cuda:0
4  name: model.layers.0.self_attn.k_proj.bias;  shape: torch.Size([2560]);  dtype: torch.bfloat16;  device: cuda:0
5  name: model.layers.0.self_attn.v_proj.weight;  shape: torch.Size([3276800, 1]);  dtype: torch.uint8;  device: cuda:0
6  name: model.layers.0.self_attn.v_proj.bias;  shape: torch.Size([2560]);  dtype: torch.bfloat16;  device: cuda:0
7  name: model.layers.0.self_attn.o_proj.weight;  shape: torch.Size([3276800, 1]);  dtype: torch.uint8;  device: cuda:0
8  name: model.layers.0.mlp.gate_proj.weight;  shape: torch.Siz

In [16]:
# 注：glm3 没有了 lm_head，有一个 output_layer，这个时候可能会分配到两个 device，导致计算 loss 的时候报错
# if th.cuda.device_count() > 1:
# 	model_base.hf_device_map["transformer.output_layer"] = model_base.hf_device_map["transformer.embedding"]  # 1 <- 0
# 	dct_device_map = model_base.hf_device_map
 
# 	model_base.cpu()
# 	del model_base
# 	th.cuda.empty_cache()

In [17]:
# re-load
# model_base = AutoModelForCausalLM.from_pretrained(
#     pretrained_model_name_or_path=path_model,
#     cache_dir=path_model,
#     force_download=False,
#     local_files_only=True,
#     trust_remote_code=True,
#     device_map=dct_device_map,
#     torch_dtype=th.float16,
#     # quantization_config=config_bnb
# )

In [18]:
# note: use gradient checkpointing to save memory at the expense of slower backward pass.
# if TrainingArguments(gradient_checkpointing=True)
model_base.gradient_checkpointing_enable()
# note: Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping the model weights fixed. 
# if TrainingArguments(gradient_checkpointing=True)
model_base.enable_input_require_grads()
model_base.config.use_cache = False

if th.cuda.device_count() > 1:
    model_base.is_parallelizable = True
    model_base.model_parallel = True

In [19]:
print(model_base.dtype)

torch.bfloat16


In [20]:
allocated_memory = th.cuda.memory_allocated()
cached_memory = th.cuda.memory_cached()
print(f"已分配的GPU内存：{allocated_memory / 1024**3:.2f}G, 已缓存的GPU内存：{cached_memory / 1024**3:.2f}G")

已分配的GPU内存：3.61G, 已缓存的GPU内存：3.85G


In [21]:
# check embedding_size
tokenizer_size = len(tokenizer)
embedding_size = model_base.get_input_embeddings().weight.shape[0]
if tokenizer_size > embedding_size:
    model_base.resize_token_embeddings(tokenizer_size)

## step-5: 配置模型参数

In [22]:
config_model = {
    "rank": 8,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "use_rslora": True,
    "epochs": 2,
    "batch_size": 4,
    "gradient_steps": 1,
    "learning_rate": 0.00005,
    "weight_decay": 0.01,
    "max_seq_lenght": 512
}

## step-6: 配置LoRA模型

In [23]:
model_base

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 2560)
    (layers): ModuleList(
      (0-39): 40 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (o_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear4bit(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm

In [24]:
# LoRA: Low-Rank Adaptation of Large Language Models
# config_lora = LoraConfig(target_modules=["0"])
# config_lora = LoraConfig(target_modules=["query_key_value", "dense_4h_to_h"])
# config_lora = LoraConfig(target_modules=[".*\.1.*query_key_value"])
# config_lora = LoraConfig(target_modules=["query_key_value"], modules_to_save=["word_embeddings"])
config_lora = LoraConfig(
    r=config_model.get("rank"),
    lora_alpha=config_model.get("lora_alpha"),
    lora_dropout=config_model.get("lora_dropout"),
    use_rslora=config_model.get("use_rslora"),
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", 
                    # "gate_proj", "up_proj", "down_proj",
                    # "lm_head"
                    ]
)

In [25]:
model_lora = get_peft_model(model=model_base, peft_config=config_lora)

In [27]:
# print_trainable_parameters - 1
# print(model_lora.print_trainable_parameters())

# print_trainable_parameters - 2
trainable_params = 0
all_params = 0

for param in model_lora.parameters():
    if param.requires_grad:
        trainable_params += param.numel()
    all_params += param.numel()

print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params / all_params:.4f}")

trainable params: 6553600 || all params: 2370951680 || trainable%: 0.2764


In [28]:
get_peft_model_state_dict(model_lora)

{'base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight': tensor([[-0.0074,  0.0054,  0.0045,  ...,  0.0008, -0.0104, -0.0091],
         [ 0.0059,  0.0129,  0.0136,  ..., -0.0137,  0.0157,  0.0183],
         [ 0.0138, -0.0016,  0.0058,  ..., -0.0149, -0.0082, -0.0069],
         ...,
         [-0.0046, -0.0159,  0.0037,  ..., -0.0007,  0.0109,  0.0016],
         [ 0.0070, -0.0074,  0.0131,  ...,  0.0006, -0.0087,  0.0071],
         [-0.0118, -0.0109,  0.0006,  ...,  0.0073,  0.0127, -0.0052]],
        device='cuda:0'),
 'base_model.model.model.layers.0.self_attn.q_proj.lora_B.weight': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0'),
 'base_model.model.model.layers.0.self_attn.k_proj.lora_A.weight': tensor([[ 1.8852e-02, -1.1636e-02,  1.1881e-03,  ..., -

## step-7: 模型训练

In [29]:
args_train = TrainingArguments(
    output_dir=os.path.join(path_output, "model_sft"),
    num_train_epochs=config_model.get("epochs"),
    per_device_train_batch_size=config_model.get("batch_size"),
    per_device_eval_batch_size=config_model.get("batch_size"),
    gradient_accumulation_steps=config_model.get("gradient_steps"),
    gradient_checkpointing=True, 
    optim="adamw_torch",
    learning_rate=config_model.get("learning_rate"),
    weight_decay=config_model.get("weight_decay"),
    logging_strategy="epoch",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True
)

In [30]:
collate_fn = DataCollatorForLanguageModeling(tokenizer, mlm=False) 
# collate_fn = DataCollatorWithPadding(tokenizer)
# collate_fn = DataCollatorForSeq2Seq(tokenizer, padding=True)
# collate_fn = DataCollatorForTokenClassification(tokenizer)

In [31]:
trainer = SFTTrainer(
    model=model_lora,
    tokenizer=tokenizer,
    args=args_train,
    peft_config=config_lora,
    data_collator=collate_fn,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    dataset_text_field="text", 
    packing=True,
    max_seq_length=config_model.get("max_seq_length"),
    # compute_metrics=compute_metrics
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [32]:
res_train = trainer.train()

  0%|          | 0/8 [00:00<?, ?it/s]

{'loss': 1.4522, 'learning_rate': 2.5e-05, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.179024338722229, 'eval_runtime': 6.7116, 'eval_samples_per_second': 0.447, 'eval_steps_per_second': 0.149, 'epoch': 1.0}
{'loss': 1.3085, 'learning_rate': 0.0, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.1574249267578125, 'eval_runtime': 6.736, 'eval_samples_per_second': 0.445, 'eval_steps_per_second': 0.148, 'epoch': 2.0}
{'train_runtime': 202.8191, 'train_samples_per_second': 0.148, 'train_steps_per_second': 0.039, 'train_loss': 1.3803266286849976, 'epoch': 2.0}


## step-8: 模型评估

In [49]:
res_eval = trainer.evaluate()
# res_eval = trainer.evaluate(dataset_train)
# res_eval = trainer.evaluate(dataset_test)
print(res_eval)

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.1574249267578125, 'eval_runtime': 6.8337, 'eval_samples_per_second': 0.439, 'eval_steps_per_second': 0.146, 'epoch': 2.0}


## step-9: 模型保存

In [50]:
trainer.save_model(output_dir=os.path.join(path_model, "model_sft"))

## step-10: 模型加载

In [51]:
print(th.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   3785 MiB |  11811 MiB |   3465 GiB |   3461 GiB |
|       from large pool |   3660 MiB |  11686 MiB |   3461 GiB |   3458 GiB |
|       from small pool |    124 MiB |    199 MiB |      3 GiB |      3 GiB |
|---------------------------------------------------------------------------|
| Active memory         |   3785 MiB |  11811 MiB |   3465 GiB |   3461 GiB |
|       from large pool |   3660 MiB |  11686 MiB |   3461 GiB |   3458 GiB |
|       from small pool |    124 MiB |    199 MiB |      3 GiB |      3 GiB |
|---------------------------------------------------------------

In [36]:
# allocated_memory = th.cuda.memory_allocated()
# cached_memory = th.cuda.memory_cached()
# print(f"已分配的GPU内存：{allocated_memory / 1024**3:.2f}G, 已缓存的GPU内存：{cached_memory / 1024**3:.2f}G")

In [52]:
# 释放不再使用的GPU内存
model_base.cpu()
del model_base
th.cuda.empty_cache()

In [53]:
# reload model_base
model_base = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=th.bfloat16,
    quantization_config=config_bnb
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [54]:
# load model_sft
model_sft = PeftModel.from_pretrained(
    model=model_base,
    model_id=os.path.join(path_model, "model_sft"),
    is_trainable=False
)
model_sft = model_sft.merge_and_unload()  # W + BA, speed up, but errors when use 8-bit

In [55]:
# save merged model to local
model_sft.save_pretrained(save_directory=os.path.join(path_model, "model_sft_merged"), max_shard_size="4GB")

In [41]:
# save merged model to hf
# model_sft.save_pretrained_merged("model_sft", tokenizer, save_method="merged_16bit")  # merged_16bit, merged_4bit, lora
# model_sft.push_to_hub_merged("hf/model_sft", tokenizer, save_method="merged_16bit", token=your_token)

In [42]:
# save merged model as gguf
# model_sft.save_pretrained_gguf("model_sft", tokenizer, quantization_method="f16")  # f16, q4_k_m
# model_sft.push_to_hub_gguf("hf/model_sft", tokenizer, quantization_method="f16", token=your_token)

## step-11: 模型推理

In [56]:
content_usr = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Design a database to record employee salaries.

### Response:
"""

In [57]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": content_usr}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

In [58]:
model_inputs = tokenizer([text], return_tensors="pt").to(device)
# model_inputs = tokenizer([text1, text2], return_tensors="pt", padding=True, truncation=True).to(device)

In [59]:
# inference
max_new_tokens = 128  # 取训练样本答案的最长值
top_p = 0.9
temperature = 0.1  # 0.5，0.35，0.1，0.01
# repetition_penalty = 1.5

t0 = pd.Timestamp.now()
model_sft.eval()
with th.inference_mode():
    complete_ids = model_sft.generate(
        input_ids=model_inputs.input_ids,  # 针对 tokenizer.padding_side
        attention_mask=model_inputs.attention_mask,  # 针对 tokenizer.padding_side
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        temperature=temperature
    )
    # also OK
    # complete_ids = model_sft.generate(
    #     max_new_tokens=max_new_tokens,
    #     top_p=top_p,
    #     temperature=temperature,
    #     **model_inputs  # 针对 tokenizer.padding_side
    # )
t1 = pd.Timestamp.now()
print(t1 - t0)

input_ids = model_inputs.input_ids
generated_ids = [O[len(I): ] for (I, O) in zip(input_ids, complete_ids)]
response = tokenizer.batch_decode(sequences=generated_ids, skip_special_tokens=True)[0]
print(response)

0 days 00:00:12.542673
To design a database to record employee salaries, we need to consider the following entities and their attributes:

1. Employee: This entity will have the following attributes:
   - ID (primary key)
   - First Name
   - Last Name
   - Date of Birth
   - Gender
   - Position
   - Department
   - Salary

2. Salary: This entity will have the following attributes:
   - ID (primary key)
   - Employee ID (foreign key referencing Employee table)
   - Amount
   - Date

We can create two tables for this database: Employee and Salary. The Employee table will have one


## step-12: ollama模型转化

- 准备好微调好的模型文件夹（其中模型文件为safetensors格式，其余文件与基模型文件夹对齐）
- 在ollama中查看基模型的Modelfile格式（如：ollama show qwen2:7b --modelfile）
- 在微调文件夹中创建一致的Modelfile文件
- 执行转化与量化指令（如：ollama create --quantize Q4_K_M -f Modelfile Qwen1.5-4B-Chat-SFT-Q4_K_M）

In [None]:
# ollama show qwen2:7b --modelfile
'''
# Modelfile generated by "ollama show"
# To build a new Modelfile based on this, replace FROM with:
# FROM qwen2:7b

FROM F:\LLM\ollama\blobs\sha256-43f7a214e5329f672bb05404cfba1913cbb70fdaa1a17497224e1925046b0ed5
TEMPLATE "{{ if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}{{ if .Prompt }}<|im_start|>user
{{ .Prompt }}<|im_end|>
{{ end }}<|im_start|>assistant
{{ .Response }}<|im_end|>
"
PARAMETER stop <|im_start|>
PARAMETER stop <|im_end|>
'''

In [None]:
# ollama create --quantize Q4_K_M -f Modelfile Qwen1.5-4B-Chat-SFT-Q4_K_M
'''
transferring model data
unpacking model metadata
Error: Models based on 'Qwen2ForCausalLM' are not yet supported
'''