# 模型微调

## 1. 导入相关库

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig

## 2. 加载数据

In [None]:
# Load the instruction data from JSON into a pandas DataFrame
df = pd.read_json('./dataset/merged_story.json')
# Alternative dataset file (disabled):
# df = pd.read_json('./data/story/呼啸山庄.json')
# Wrap the DataFrame as a Hugging Face Dataset
ds = Dataset.from_pandas(df)

In [None]:
print(ds[:3])  # Preview the first three records

In [None]:
print(len(ds)) # Total number of fine-tuning instruction records

## 3. 加载Tokenizer

In [None]:
# Local directory from which both the tokenizer and the base model are loaded
model_path = './Qwen/Qwen2-1___5B-Instruct'

# Load the tokenizer stored alongside the model
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
# Display the tokenizer as the cell output
tokenizer

## 4. 指令集构造
不同的模型使用不同的对话模板，因此指令集需要按照所用模型的格式来构造。

### *Prompt Template*
`Qwen1.5` 和 `Qwen2` 采用的`Prompt Template`格式如下：

```text
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
你是谁？<|im_end|>
<|im_start|>assistant
我是一个有用的助手。<|im_end|>
```

In [None]:
def process_func(example):
    """Turn one instruction record into causal-LM training features.

    The prompt (system turn, user turn and the assistant header) and the
    response are tokenized separately. Prompt positions are labelled -100,
    so only the response and the trailing end marker carry real labels.

    Args:
        example: A dataset record providing the ``instruction``, ``input``
            and ``output`` fields.

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels`` lists,
        each truncated to at most 2048 entries.
    """
    # Upper bound on tokens per sample; anything longer is cut at the tail.
    max_length = 2048

    user_turn = example['instruction'] + example['input']
    prompt = tokenizer(
        f"<|im_start|>system\n你是一个熟读各类小说的专家，请你根据要求写一段800字左右的小说。<|im_end|>\n<|im_start|>user\n{user_turn}<|im_end|>\n<|im_start|>assistant\n",
        add_special_tokens=False,  # do not let the tokenizer prepend its own special tokens
    )
    answer = tokenizer(f"{example['output']}", add_special_tokens=False)

    prompt_ids = prompt["input_ids"]
    answer_ids = answer["input_ids"]

    # NOTE(review): the sequence is closed with pad_token_id, whereas the chat
    # template closes a turn with <|im_end|>; confirm this is the intended end
    # marker for this tokenizer.
    full_ids = prompt_ids + answer_ids + [tokenizer.pad_token_id]
    # The end marker must be attended to, so its mask entry is 1.
    full_mask = prompt["attention_mask"] + answer["attention_mask"] + [1]
    # -100 marks the prompt positions as excluded from the training labels.
    full_labels = [-100] * len(prompt_ids) + answer_ids + [tokenizer.pad_token_id]

    # Slicing is a no-op for samples that already fit within max_length.
    return {
        "input_ids": full_ids[:max_length],
        "attention_mask": full_mask[:max_length],
        "labels": full_labels[:max_length],
    }

In [None]:
# Apply process_func to every record and drop the original columns
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
# Display the tokenized dataset as the cell output
tokenized_id

In [None]:
# Decode the first sample's input_ids back to text to inspect the assembled prompt
print(tokenizer.decode(tokenized_id[0]['input_ids']))

In [None]:
# Decode the labels of the second sample, skipping the -100 placeholder entries
print(tokenizer.decode([token for token in tokenized_id[1]["labels"] if token != -100]))

## 5. 加载模型
### 加载半精度模型

模型以半精度形式加载，如果你的显卡比较新的话，可以用`torch.bfloat16`形式加载。对于自定义的模型一定要指定`trust_remote_code`参数为`True`。

In [None]:
import torch

# Load the base model from model_path with bfloat16 weights
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto",torch_dtype=torch.bfloat16)
# Display the model structure as the cell output
model

In [None]:
model.enable_input_require_grads() # Must be called when gradient checkpointing is enabled

In [None]:
# Check the dtype of the loaded model
model.dtype

## 6. 配置模型参数

查看模型对应的target_modules
```
from peft.utils import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING

target_modules = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING['chatglm']
```

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter configuration for causal language modelling
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, # training mode
    r=8, # LoRA rank
    lora_alpha=32, # LoRA alpha; see the LoRA method description for its role
    lora_dropout=0.1# dropout ratio
)
# Display the configuration as the cell output
config

## 7. 创建peft model

In [None]:
# Wrap the base model according to the LoRA configuration
model = get_peft_model(model, config)
# Display the configuration as the cell output
config

In [None]:
# Report the trainable parameters of the wrapped model
model.print_trainable_parameters()

## 8. 自定义 TrainingArguments 参数

In [None]:
# Output directory for this training run
lora_path = "./output/Qwen2-1_5B-Instruct_novel_all"

# Training hyper-parameters passed to the Trainer
args = TrainingArguments(
    output_dir=lora_path,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=1,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True
)

## 9. 模型训练

In [None]:
# Build the trainer from the LoRA-wrapped model, the tokenized dataset and a padding collator
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

In [None]:
# Start fine-tuning
trainer.train()

## 10. 模型保存

In [None]:
# Same path as the training output directory; the final model goes into its "final" subfolder
lora_path = "./output/Qwen2-1_5B-Instruct_novel_all"
trainer.save_model(lora_path + "/final")

In [None]:
# Save the tokenizer next to the saved model
tokenizer.save_pretrained(lora_path + "/final")


# Copy the tokenizer file into the merged model directory (disabled shell command)
# !cp ./Qwen/Qwen2-1___5B-Instruct/tokenizer.json ./merged_model/

## 11. 模型合并

In [None]:
import os
# Create the output directory for the merged model if it does not exist yet
os.makedirs('./merged_model', exist_ok=True)

In [None]:
# Merge the model and store the result

new_model_directory = "./merged_model"
merged_model = model.merge_and_unload()
# Save the weights in safetensors format, with each weight file capped at 2GB (2048MB)
merged_model.save_pretrained(new_model_directory, max_shard_size="2048MB", safe_serialization=True)