In [1]:
# bitsandbytes: 专为量化设计的库，重点在于减少大模型（尤其是在GPU上）的内存占用
# peft: 用于将LoRA适配器集成到大语言模型（LLMs）中
# trl: 该库包含一个SFT（监督微调）类，用于辅助微调模型
# accelerate和xformers: 这些库用于提高模型的推理速度，从而优化其性能
# wandb: 该工具作为一个监控平台，用于跟踪和观察训练过程
# datasets: 与Hugging Face一起使用，该库便于加载数据集

import torch 
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    TextStreamer,
    Trainer
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import os, wandb

In [2]:
torch.cuda.is_available()

True

In [3]:
torch.cuda.device_count()

1

# 1. 加载模型和Tokenizer

In [4]:
# 预训练模型
model_name = "/home/leon/projects/models/Meta-Llama-3-8B/"
# 数据集名称
dataset_name = "scooterman/guanaco-llama3-1k"

In [5]:
# 加载预训练模型和tokenizer

# 量化配置
# https://huggingface.co/docs/transformers/v4.43.3/en/main_classes/quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # 模型将以4位量化格式加载
    bnb_4bit_quant_type="nf4", # 指定4位量化的类型为 nf4
    bnb_4bit_compute_dtype=torch.float16, # 计算数据类型
    bnb_4bit_use_double_quant=False, # 表示不使用双重量化
) # QLoRA中的原理

# 模型加载
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"":0} # 将模型加载到设备0（通常是第一个GPU）
)

# tokenizer 加载
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True # 在生成序列时会自动添加结束标记


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# 加载数据集
dataset = load_dataset(dataset_name, split="train")
dataset["text"][0]

'<|start_header_id|>user<|end_header_id|>{{Me gradué hace poco de la carrera de medicina ¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo?}}<|eot_id|><|start_header_id|>assistant<|end_header_id|>{{Esto vale tanto para médicos como para cualquier otra profesión tras finalizar los estudios aniversarios y mi consejo sería preguntar a cuántas personas haya conocido mejor. En este caso, mi primera opción sería hablar con otros profesionales médicos, echar currículos en hospitales y cualquier centro de salud. En paralelo, trabajaría por mejorar mi marca personal como médico mediante un blog o formas digitales de comunicación como los vídeos. Y, para mejorar las posibilidades de encontrar trabajo, también participaría en congresos y encuentros para conseguir más contactos. Y, además de todo lo anterior, seguiría estudiando para presentarme a las oposiciones y ejercer la medicina en el sector público de mi país.}}<|eot_id|>'

# 2. wandb 配置

In [7]:
# 监控
# 需要在WandB官网注册账号
wandb.login(key="ded693cc7edc0388564a53cb198473dc9a10e543")

[34m[1mwandb[0m: Currently logged in as: [33mub313leon[0m ([33mleon-2003ub313[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/leon/.netrc


True

In [8]:
run = wandb.init(
    project="test_fine_tuning",
    job_type="training"
)

In [9]:
# 计算训练参数量
def print_trainable_parameters(model):
    trainable_params=0
    all_param=0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
        print(f"训练参数量: {trainable_params} || 总的参数量: {all_param} || 训练参数量占比%: {100*(trainable_params/all_param):.2f}")

# 3. LoRA与训练超参配置

In [10]:
# LoRA config
peft_config = LoraConfig(
    r = 8,
    lora_alpha=16, # 小技巧，把a值设置成rank值的两倍
    # scaling = alpha / r # LoRA权重的值越大，影响就越大
    # weight += (lora_B @ lora_A)*scaling
    lora_dropout = 0.05,
    bias = "none",
    task_type = "CAUSAL_LM",
    # ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","embed_tokens","lm_head"]
    target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj"]
)

In [27]:
# 训练超参
training_arguments = TrainingArguments(
    output_dir="/home/leon/projects/models/autodl-tmp/",
    num_train_epochs=5,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2, # 梯度累积步数为2，即每2步更新一次梯度，有肋于在显存有限的情况下使用较大的有效批次大小。
    optim="paged_adamw_8bit",
    save_steps=50, # 每100步保存一次模型
    logging_steps=30,
    learning_rate=2e-4,
    weight_decay=0.001, # 权重衰减系数，用于L2正则化，帮助防止过拟合。
    fp16=False,
    bf16=False,
    max_grad_norm=0.3, # 最大梯度范数，用于梯度裁剪，防止梯度爆炸，
    max_steps=-1, # 最大训练步数－1，表示没有限制，
    warmup_ratio=0.3, # 预热阶段的比例，在训练开始时，学习率会逐渐升高，预热比例为0.3表示前30％的训练步骤用于预热。
    group_by_length=True, # 按序列长度分组，以提高训练效率，
    lr_scheduler_type="linear", # 表示使用线性学习率调度。
    report_to="wandb", # tensorboard
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# 4. 模型微调

In [28]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    dataset_text_field="text",
    args=training_arguments,
    packing=False
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [29]:
# 开始训练
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.82 GiB. GPU 

In [30]:
model = get_peft_model(model,peft_config)

# 计算可训练数量
print_trainable_parameters(model)

训练参数量: 0 || 总的参数量: 525336576 || 训练参数量占比%: 0.00
训练参数量: 0 || 总的参数量: 533725184 || 训练参数量占比%: 0.00
训练参数量: 32768 || 总的参数量: 533757952 || 训练参数量占比%: 0.01
训练参数量: 65536 || 总的参数量: 533790720 || 训练参数量占比%: 0.01
训练参数量: 65536 || 总的参数量: 535887872 || 训练参数量占比%: 0.01
训练参数量: 98304 || 总的参数量: 535920640 || 训练参数量占比%: 0.02
训练参数量: 106496 || 总的参数量: 535928832 || 训练参数量占比%: 0.02
训练参数量: 106496 || 总的参数量: 538025984 || 训练参数量占比%: 0.02
训练参数量: 139264 || 总的参数量: 538058752 || 训练参数量占比%: 0.03
训练参数量: 147456 || 总的参数量: 538066944 || 训练参数量占比%: 0.03
训练参数量: 147456 || 总的参数量: 546455552 || 训练参数量占比%: 0.03
训练参数量: 180224 || 总的参数量: 546488320 || 训练参数量占比%: 0.03
训练参数量: 212992 || 总的参数量: 546521088 || 训练参数量占比%: 0.04
训练参数量: 212992 || 总的参数量: 575881216 || 训练参数量占比%: 0.04
训练参数量: 245760 || 总的参数量: 575913984 || 训练参数量占比%: 0.04
训练参数量: 360448 || 总的参数量: 576028672 || 训练参数量占比%: 0.06
训练参数量: 360448 || 总的参数量: 605388800 || 训练参数量占比%: 0.06
训练参数量: 393216 || 总的参数量: 605421568 || 训练参数量占比%: 0.06
训练参数量: 507904 || 总的参数量: 605536256 || 训练参数量占比%: 0.08
训练参数量: 507904 || 总的参数量: 63

# 5. 保存模型

In [31]:
# 保存微调模型

trainer.model.save_pretrained("/home/leon/projects/models/autodl-tmp/")
wandb.finish()
model.config.use_cache = True
model.eval()



VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): Linear4b

# 6. 模型推理

In [32]:
# base模型测试

def stream(user_input):
    device = "cuda:0"
    system_prompt = "Below is an instruction that describes a task. Write a response that appropriately comple"
    B_INST, E_INST = "### Instruction:\n", "### Response:\n"
    prompt = f"{system_prompt}{B_INST}{user_input.strip()}\n\n{E_INST}"
    inputs = tokenizer([prompt], return_tensors="pt").to(device)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=128) # 这个model是原始模型，不是微调后的模型

In [35]:
stream("安倍是谁")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


安倍晋三,日本首相。

### Explanation:
The sentence “安倍晋三,日本首相。” is a simple sentence. It is a statement that tells who the person is. The sentence contains one independent clause, which is the main idea of the sentence. The subject is 安倍晋三, and the predicate is 日本首相. The subject and the predicate are separated by a comma.


# 7. 模型合并

In [36]:
# 合并base model 与 lora model
# https://huggingface.co/docs/trl/main/en/use_model#use-adapters-peft

base_model = AutoModelForCausalLM.from_pretrained(
    model_name, low_cpu_mem_usage=True,
    return_dict=True,torch_dtype=torch.float16,
    device_map={"": 0}
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1002.00 MiB. GPU 

In [37]:
new_model = PeftModel.from_pretrained(base_model, "/home/leon/projects/models/autodl-tmp/")

NameError: name 'base_model' is not defined

In [38]:
# 模型合并
merged_model = new_model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

def stream(user_input):
    device = "cuda:0"
    system_prompt = "Below is an instruction that describes a task. Write a response that appropriately comple"
    B_INST, E_INST = "### Instruction:\n", "### Response:\n"
    prompt = f"{system_prompt}{B_INST}{user_input.strip()}\n\n{E_INST}"
    inputs = tokenizer([prompt], return_tensors="pt").to(device)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    _ = merged_model.generate(**inputs, streamer=streamer, max_new_tokens=128,num_return_sequences=1)

NameError: name 'new_model' is not defined