# 常规SFT的微调样例代码

## 环境配置

系统为`Ubuntu 20.04`
CUDA为12.4，驱动是550.54.15

### 安装Conda
```bash
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh
source ~/.bashrc
```

### 创建虚拟环境、添加一些镜像列表
```bash
# conda环境的镜像
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
conda config --add channels https://mirrors.bfsu.edu.cn/anaconda/pkgs/free/
conda config --add channels https://mirrors.bfsu.edu.cn/anaconda/pkgs/main/
conda config --add channels https://mirrors.bfsu.edu.cn/anaconda/cloud/conda-forge/
# pip镜像（以下每条命令都会覆盖上一条的设置，实际只需任选其一执行）
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
pip config set global.index-url http://mirrors.cloud.tencent.com/pypi/simple
pip config set global.index-url http://pypi.douban.com/simple/
pip config set global.index-url https://mirrors.163.com/pypi/simple/
# 创建虚拟环境
conda create -n <环境名称> python=3.12
# 激活你创建的环境
conda activate <环境名称>
```

### 安装运行库
```bash
# pytorch安装
pip install --upgrade torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124 --trusted-host download.pytorch.org
# 其他的依赖安装（peft、datasets 在下文代码中被直接导入，这里显式列出）
# 注意：下文加载模型时使用了 attn_implementation="flash_attention_2"，需要另行安装 flash-attn
pip install --upgrade torch==2.6.0 transformers peft datasets BitsandBytes accelerate nltk numpy pandas tensorboardX evaluate scikit-learn sentence-transformers tiktoken deepspeed SentencePiece unsloth qwen-vl-utils[decord] nvitop trl -i https://mirrors.aliyun.com/pypi/simple/
# 如果用jupyter notebook 需要安装以下两个（<环境名称> 即上文创建的虚拟环境）
conda install -n <环境名称> ipykernel --update-deps --force-reinstall
conda install -n <环境名称> IProgress
```

## 配置GPU

In [None]:
import os, re, argparse,json
os.environ["CUDA_VISIBLE_DEVICES"] = "6" # Replace this with the index of the GPU you want to use

## 引入包体

In [None]:
import torch
from tqdm.auto import tqdm
import numpy as np
from transformers import (
    TrainingArguments,
    LogitsProcessorList,
    InfNanRemoveLogitsProcessor,
    PreTrainedTokenizer,
    PreTrainedTokenizerBase,
    PreTrainedTokenizerFast,
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    PreTrainedModel,
    PretrainedConfig,
    BitsAndBytesConfig,
    GenerationConfig,
    Seq2SeqTrainingArguments,
    Trainer,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from transformers.trainer import TRAINER_STATE_NAME
from transformers.utils.versions import require_version
from transformers.trainer_utils import SaveStrategy
from transformers.tokenization_utils import PaddingStrategy
from peft import PeftModel, LoraConfig, get_peft_model, TaskType
from peft.utils import (
    CONFIG_NAME,
    SAFETENSORS_WEIGHTS_NAME,
    TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
)

## 加载模型模板

In [None]:
from datasets import Dataset, IterableDataset
from dataclasses import dataclass
# from 模板 import get_template_and_model_path
# from public_apis.file_rw import *
from typing import Union, Optional, List, Dict, Any, Literal, Sequence, Tuple

RANDOM_SEED = 42
IGNORE_INDEX=-100# Constant used as a label value to mask tokens that should not contribute gradients during training

# Prompt templates keyed by model family; every template carries a `{user_query}`
# placeholder that is filled in with `str.format(user_query=...)`.
TEMPLATE_DICT={
    'llama2':"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. \n{user_query}\nAssistant:  ",
    'llama3':"<|start_header_id|>system<|end_header_id|>\nYou are a helpful assistant. <|eot_id|>\n\n<|start_header_id|>user<|end_header_id|>\n{user_query}<|eot_id|>\n\n<|start_header_id|>assistant<|end_header_id|>\n",
    'qwen2.5':"<|im_start|>system\nYou are a helpful assistant. <|im_end|>\n<|im_start|>user\n{user_query}<|im_end|>\n<|im_start|>assistant\n",
    'gpt2':"{user_query}\n"
}

# Root directory that the model paths below are joined onto.
MODEL_BASE_PATH='/data'

# MODEL_BASE_PATH='/data/lzy/models/LLM'

# Maps a short model alias to the model directory under MODEL_BASE_PATH.
MODEL_NAME_DICT={
    'llama2-7b':os.path.join(MODEL_BASE_PATH,'llama','llama-2-7b-hf'),
    'llama2-13b':os.path.join(MODEL_BASE_PATH,'llama','llama-2-13b-hf'),
    'llama3.1-8b-i':os.path.join(MODEL_BASE_PATH,'llama','llama3.1-8b-instruct'),
    'llama3.1-8b':os.path.join(MODEL_BASE_PATH,'llama','llama3.1-8b'),
    'llama3.2-1b-i':os.path.join(MODEL_BASE_PATH,'llama','llama3.2-1B-Instruct'),
    'llama3.2-3b-i':os.path.join(MODEL_BASE_PATH,'llama','llama3.2-3B-Instruct'),
    'qwen2.5-0.5b-i':os.path.join(MODEL_BASE_PATH,'Qwen','Qwen2.5-0.5B-Instruct'),
    'qwen2.5-1.5b-i':os.path.join(MODEL_BASE_PATH,'Qwen','Qwen2.5-1.5B-Instruct'),
    'qwen2.5-3b-i':os.path.join(MODEL_BASE_PATH,'Qwen','Qwen2.5-3B-Instruct'),
    'qwen2.5-7b-i':os.path.join(MODEL_BASE_PATH,'Qwen','Qwen2.5-7B-Instruct'),
    'qwen2.5-14b-i':os.path.join(MODEL_BASE_PATH,'Qwen','Qwen2.5-14B-Instruct'),
    'qwen2.5-coder-0.5b-i':os.path.join(MODEL_BASE_PATH,'Qwen','Qwen2.5-Coder-0.5B-Instruct'),
    'qwen2.5-coder-1.5b-i':os.path.join(MODEL_BASE_PATH,'Qwen','Qwen2.5-Coder-1.5B-Instruct'),
    'qwen2.5-coder-3b-i':os.path.join(MODEL_BASE_PATH,'Qwen','Qwen2.5-Coder-3B-Instruct'),
    'qwen2.5-coder-7b-i':os.path.join(MODEL_BASE_PATH,'Qwen','Qwen2.5-Coder-7B-Instruct'),
    'qwen2.5-coder-14b-i':os.path.join(MODEL_BASE_PATH,'Qwen','Qwen2.5-Coder-14B-Instruct'),
    # 'gpt2l':os.path.join(MODEL_BASE_PATH,'openai-community','gpt2-large'),
    # 'gpt2':os.path.join(MODEL_BASE_PATH,'openai-community','gpt2')
}
# Resolve the prompt template and model path for a model alias
def get_template_and_model_path(**arguments):
    """Look up the model path and prompt template for a model alias.

    Keyword Args:
        model_name: Alias to resolve; it is converted with ``str`` and
            defaults to ``'llama3.1-8b-i'`` when omitted.

    Returns:
        A ``(model_name, model_name_or_path, template)`` tuple, where the path
        comes from ``MODEL_NAME_DICT`` and the template from ``TEMPLATE_DICT``.

    Raises:
        KeyError: If the alias is not a key of ``MODEL_NAME_DICT``.
    """
    alias = str(arguments.get('model_name', 'llama3.1-8b-i'))
    # Resolve the path first so an unknown alias fails before template selection.
    resolved_path = MODEL_NAME_DICT[alias]
    # The template family is chosen by alias prefix; anything else uses llama3.
    family = next(
        (prefix for prefix in ('qwen2.5', 'llama2', 'gpt2') if alias.startswith(prefix)),
        'llama3',
    )
    return alias, resolved_path, TEMPLATE_DICT[family]
# Custom exception type, mainly a foolproofing aid that simplifies error handling
class CustomDatasetTokenizerError(Exception):
    """Raised when a dataset record matches none of the supported key layouts.

    Args:
        message: Optional replacement text. When omitted, the standard message
            listing the accepted key combinations is used.

    Attributes:
        message: The text returned by ``str(error)``.
    """

    def __init__(self, message: Optional[str] = None):
        if message is None:
            message = "数据集元素的输入键值必须是以下组合：\n\t\"prompt\",\"query\",\"response\"\n\t\"instruction\",\"input\",\"output\",\n\t\"prompt\",\"completion\""
        self.message = message
        # Initialise the base class so `args` and `repr()` carry the message;
        # accepting `message` keeps `cls(*args)` reconstruction (pickling) valid.
        super().__init__(message)

    def __str__(self):
        return self.message
# `prompt_template` must contain a placeholder named `user_query`.
# This function serves hand-rolled fine-tuning (then trained directly with the
# `transformers` `Trainer` or `Seq2SeqTrainer`): it turns one record loaded from
# JSON straight into a tokenized training sample.
def process_tokens_tokenizer_functions(example: dict[str, str],tokenizer: Union[PreTrainedTokenizer,PreTrainedTokenizerBase,PreTrainedTokenizerFast,AutoTokenizer],prompt_template: str,train_mode: Literal["sft", "pt"] = "sft"):
    """Tokenize one dataset record into ``input_ids`` / ``attention_mask`` / ``labels``.

    The record must expose one of these key combinations (checked in order):
    ``prompt``+``query``+``response``, ``instruction``+``input``+``output``,
    or ``prompt``+``completion``.

    Args:
        example: The raw record.
        tokenizer: Object providing ``encode(text)`` and ``eos_token_id``.
        prompt_template: Template formatted with ``user_query=...``.
        train_mode: ``"sft"`` masks the prompt tokens in ``labels`` with
            ``IGNORE_INDEX``; any other value leaves every label in place.

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels`` (all the
        same length), plus ``qid`` when the record has a ``qid`` or ``ID`` key.

    Raises:
        CustomDatasetTokenizerError: If no supported key combination is present.
    """
    keys = example.keys()
    if "prompt" in keys and "query" in keys and "response" in keys:
        user_query = "{}\n{}\n".format(example["prompt"], example["query"])
        completion = example["response"]
    elif "instruction" in keys and "input" in keys and "output" in keys:
        user_query = "{}\n{}\n".format(example["instruction"], example["input"])
        completion = example["output"]
    elif "prompt" in keys and "completion" in keys:
        user_query = "{}\n".format(example["prompt"])
        completion = example["completion"]
    else:
        raise CustomDatasetTokenizerError()
    prompt = prompt_template.format(user_query=user_query)

    # Encode prompt + completion as a single sequence.
    input_id = tokenizer.encode("{}{}".format(prompt, completion))
    # Training samples always end with the end-of-sequence token; the emptiness
    # check avoids an IndexError when the tokenizer returns no tokens.
    if not input_id or input_id[-1] != tokenizer.eos_token_id:
        input_id.append(tokenizer.eos_token_id)

    label = input_id.copy()
    # SFT only trains on the completion part; "pt" trains on every token.
    if train_mode == "sft":
        # Clamp the mask length: slice-assigning more items than the slice holds
        # would lengthen `label` and break its alignment with `input_id`.
        masked = min(len(tokenizer.encode(prompt)), len(label))
        label[:masked] = [IGNORE_INDEX] * masked

    model_inputs = {
        "input_ids": input_id,
        "attention_mask": [1] * len(input_id),
        "labels": label,
    }
    # Carry the record identifier along when one is available.
    if "qid" in keys:
        model_inputs["qid"] = example["qid"]
    elif "ID" in keys:
        model_inputs["qid"] = example["ID"]
    return model_inputs
# Convert a record directly into the text layout `trl` accepts (plain text is
# enough; the remaining processing is left to the downstream tooling)
def process_dataset_functions(example: dict[str, str]):
    """Build the ``prompt`` / ``completion`` / ``text`` / ``ground_truth`` fields of a record.

    Supported key combinations, tried in this order:
    ``prompt``+``query``+``response``, ``instruction``+``input``+``output``,
    ``prompt``+``completion``.

    Args:
        example: The raw record.

    Returns:
        A dict with the keys ``prompt``, ``completion``, ``text`` and
        ``ground_truth`` (the latter equal to ``completion``).

    Raises:
        CustomDatasetTokenizerError: If no supported key combination is present.
    """
    available = example.keys()
    # (keys that make up the prompt, key that holds the answer), by priority.
    layouts = (
        (("prompt", "query"), "response"),
        (("instruction", "input"), "output"),
        (("prompt",), "completion"),
    )
    for source_keys, answer_key in layouts:
        if all(key in available for key in source_keys) and answer_key in available:
            # Each prompt part is followed by a newline.
            user_text = "".join("{}\n".format(example[key]) for key in source_keys)
            answer = example[answer_key]
            break
    else:
        raise CustomDatasetTokenizerError()
    full_text = "### Instruction:\n {}\n ### Response:\n {}\n".format(user_text, answer)
    return {
        "prompt": user_text,
        "completion": answer,
        "text": full_text,
        "ground_truth": answer,
    }

# Regular expressions needed to locate checkpoints
# Matches checkpoint directory names such as `checkpoint-1000`.
CHECKPOINT_FOLD = re.compile(r"(?:checkpoint\-\d+)")
# Matches saved weight files such as `adapter_model.safetensors`, `pytorch_model.bin` or `model-1.bin`.
FINALMODEL_NAME = re.compile(r"(?:(?:(?:adapter|pytorch)_)?model(?:\-\d+)?\.(?:safetensors|bin))")
# Helper for checkpoint discovery
def checkout_format(string: str, pattern: re.Pattern):
    """Tell whether `pattern` yields exactly one non-empty match equal to `string`.

    Args:
        string: The name to test.
        pattern: Compiled regular expression applied with ``findall``.

    Returns:
        ``True`` only when ``findall`` produces a single non-empty result and
        that result equals the whole of `string`; ``False`` otherwise.
    """
    hits = list(filter(None, pattern.findall(string)))
    return len(hits) == 1 and hits[0] == string
# Helper for checkpoint discovery
def checkpoint_sort_func(checkpoint_dirname: str):
    """Return the integer found in the last non-blank ``-``-separated part of the name.

    Args:
        checkpoint_dirname: Name such as ``checkpoint-1000``.

    Returns:
        The last non-blank segment converted with ``int``.
    """
    stripped = (segment.strip() for segment in checkpoint_dirname.split("-"))
    segments = [segment for segment in stripped if segment]
    return int(segments[-1])
# Helper for checkpoint discovery
def findout_checkpoint(path: str):
    """Find the directory that holds the most recent saved weights under `path`.

    `path` itself is returned when it directly contains a file matching
    ``FINALMODEL_NAME``. Otherwise its ``checkpoint-<n>`` entries are searched
    recursively, highest ``<n>`` first.

    Args:
        path: Directory to search.

    Returns:
        The path of the directory containing weight files, or ``None`` when
        `path` is not an existing directory or no weights are found.
    """
    # Nothing has been saved yet (e.g. first run), or this entry is not a
    # directory: report "not found" instead of letting os.listdir raise.
    if not os.path.isdir(path):
        return None
    items = os.listdir(path)
    if any(checkout_format(name, FINALMODEL_NAME) for name in items):
        return path
    check_dirs = [name for name in items if checkout_format(name, CHECKPOINT_FOLD)]
    if not check_dirs:
        return None
    # Try the newest checkpoint (largest step number) first.
    check_dirs.sort(key=checkpoint_sort_func, reverse=True)
    for name in check_dirs:
        result = findout_checkpoint(os.path.join(path, name))
        if result is not None:
            return result
    return None
# 检查并创建路径
def _check_create_dirs(file_path: str):
    """
    # 代码解释
    这段代码的功能是检查给定文件路径的父目录是否存在，如果不存在则创建该目录。具体逻辑如下：
    1. 使用 `os.path.dirname` 获取文件路径的父目录路径。
    2. 如果父目录路径不为空，则进一步检查该路径是否存在。
    3. 如果路径不存在，则调用 `os.makedirs` 创建该目录。

    # 控制流图
    ```mermaid
    flowchart TD
        A[开始] --> B[获取文件路径的父目录]
        B --> C{父目录是否为空}
        C -->|是| D[结束]
        C -->|否| E{父目录是否存在}
        E -->|否| F[创建父目录]
        F --> G[结束]
        E -->|是| G[结束]
    ```
    """
    # 获取文件路径的父目录
    p = os.path.dirname(file_path)
    # 检查父目录是否为空
    if p != '':
        # 检查父目录是否存在
        if not os.path.exists(p):
            # 如果父目录不存在，则创建该目录
            os.makedirs(p)
    pass
def load_from_json(file_path: str) -> Union[None, Dict, List]:
    """Read a UTF-8 JSON file and return the parsed content.

    Args:
        file_path: Path of the JSON file.

    Returns:
        The parsed JSON data, or ``None`` when `file_path` does not exist.
    """
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    return None
def save_to_json(data: Union[Dict, List], file_path: str):
    """Serialise `data` as JSON and write it to `file_path` (UTF-8).

    The parent directory is created through ``_check_create_dirs`` when needed.
    Output is written with ``ensure_ascii=False`` and an indent of 4.

    Args:
        data: Dict or list to save.
        file_path: Destination file path.

    Raises:
        TypeError: If `data` is not JSON-serialisable. The destination file is
            left untouched in that case.
    """
    # Serialise before opening the file: opening in "w" mode truncates it, so a
    # serialisation failure afterwards would wipe out an existing file.
    payload = json.dumps(data, ensure_ascii=False, indent=4)
    _check_create_dirs(file_path)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(payload)

## 参数、模型准备

In [None]:
# Resolve the model alias into (alias, model path, prompt template).
model_name, model_name_or_path, template = get_template_and_model_path(
    model_name="llama3.1-8b-i"
)

# Output directory passed to the training arguments; the log directory is placed under it.
output_dir = "results/normal"

# General settings, including the LoRA hyper-parameters read when the LoraConfig is built.
config_dicts = dict(
    ignore_pad_token_for_loss=True,
    lora_config_dicts=dict(
        lora_rank=8, lora_alpha=32, lora_dropout=0.1, additional_target=None
    ),
)



# Training settings; `data_path` is a placeholder that must be replaced with the
# path of the training JSON file.
training_config_dict = dict(
    data_path="<你的训练文件地址>",
    output_dir=output_dir,
    overwrite_output_dir=False,
    do_train=True,
    lr_scheduler_type="cosine",
    learning_rate=1e-4,
    num_train_epochs=20,
    save_steps=1000,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    fp16=True,
    bf16=False,
    logging_dir=os.path.join(output_dir, "logs"),
    logging_steps=1000,
    ddp_find_unused_parameters=False,
    resume_from_checkpoint=False,
    warmup_ratio=0,
    gradient_checkpointing=False,
)
# Note: evaluation is not implemented here
eval_config_dict = dict(
    data_path="<你的评估文件地址>",
    max_new_tokens=2048,
)


# Generation settings: each value is taken from `eval_config_dict` when the key
# is present there, otherwise the default written here is used.
generating_args = {
    "val_size": eval_config_dict.get("val_size", 0),
    "streaming": eval_config_dict.get("streaming", False),
    "buffer_size": eval_config_dict.get("buffer_size", 16384),
    "do_sample": eval_config_dict.get("do_sample", True),
    "temperature": eval_config_dict.get("temperature", 1),
    "top_p": eval_config_dict.get("top_p", 0.7),
    "top_k": eval_config_dict.get("top_k", 50),
    "num_beams": eval_config_dict.get(
        "num_beams", 8
    ),  # `None` during training, greater than 0 for inference
    "max_new_tokens": eval_config_dict.get("max_new_tokens", 1024),
    "repetition_penalty": eval_config_dict.get("repetition_penalty", 1.0),
    "length_penalty": eval_config_dict.get("length_penalty", 1.0),
    # "num_beam_groups":4,
    # "diversity_penalty":0.2
}

# Build the LoRA configuration from the hyper-parameters stored in `config_dicts`.
lora_config_dicts = config_dicts["lora_config_dicts"]
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=lora_config_dicts["lora_rank"],
    lora_alpha=lora_config_dicts["lora_alpha"],
    lora_dropout=lora_config_dicts["lora_dropout"],
    # LoRA is attached to the four attention projection modules named here.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    modules_to_save=lora_config_dicts.get("additional_target", None),
)
# The prompt template selected for the chosen model alias.
prompt_template = template

model_path = model_name_or_path
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Use the end-of-sequence token for padding as well.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cuda",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    # `token` replaces the deprecated `use_auth_token` argument.
    token=True,
)
# The generation cache is switched off for training.
model.config.use_cache = False
# Wrap the base model with a LoRA adapter registered under the name "default".
model = get_peft_model(model, lora_config, "default")
# Look for previously saved adapter weights. On a first run the output directory
# does not exist yet, so skip the search instead of listing a missing directory.
loadpath = findout_checkpoint(output_dir) if os.path.isdir(output_dir) else None
if loadpath is not None:
    print("从`{}`加载lora头".format(loadpath))
    model.load_adapter(loadpath, "default")

## 训练

输入数据集中每条样本的键必须符合以下三种组合之一（按优先级从高到低匹配）：

1. 同时包含以下键（第一优先级）： `prompt`,`query`,`response`
2. 同时包含以下键（第二优先级）： `instruction`,`input`,`output`
3. 同时包含以下键（第三优先级）： `prompt`,`completion`

In [None]:
# Load the raw training records. `load_from_json` returns None for a missing
# file, so fail here with an explicit error rather than a TypeError from tqdm.
raw_records = load_from_json(training_config_dict["data_path"])
if raw_records is None:
    raise FileNotFoundError(
        "Training data file not found: {}".format(training_config_dict["data_path"])
    )
# "sft" masks prompt tokens in the labels; the `stage` key may override it.
train_mode = training_config_dict.get('stage', 'sft')
train_dataset = Dataset.from_list(
    [
        process_tokens_tokenizer_functions(record, tokenizer, prompt_template, train_mode)
        for record in tqdm(raw_records)
    ]
)
# Show the first decoded sample as a sanity check of the prompt layout.
print(tokenizer.decode(train_dataset[0]['input_ids']))
tokenizer.padding_side = "right"

# `ignore_pad_token_for_loss` is declared in `config_dicts`; a value placed in
# `training_config_dict` still takes precedence for backward compatibility.
ignore_pad_token_for_loss = training_config_dict.get(
    "ignore_pad_token_for_loss",
    config_dicts.get("ignore_pad_token_for_loss", True),
)
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=(IGNORE_INDEX if ignore_pad_token_for_loss else tokenizer.pad_token_id),
)
# Built from the generation settings; it is not consumed by the training run below.
generation_config = GenerationConfig(**generating_args)

# Batch size: honour the key actually used in `training_config_dict`
# (`per_device_train_batch_size`), falling back to the legacy `batch_size` key.
train_batch_size = training_config_dict.get(
    "per_device_train_batch_size", training_config_dict.get("batch_size", 4)
)
eval_batch_size = training_config_dict.get("per_device_eval_batch_size", train_batch_size)

# Mixed precision: an explicit `compute_dtype` wins; otherwise the `fp16` /
# `bf16` flags from the config are used (fp16 by default when bf16 is off).
compute_dtype = training_config_dict.get("compute_dtype")
if compute_dtype is not None:
    use_fp16 = compute_dtype == torch.float16
    use_bf16 = compute_dtype == torch.bfloat16
else:
    use_bf16 = bool(training_config_dict.get("bf16", False))
    use_fp16 = bool(training_config_dict.get("fp16", not use_bf16))

training_args = Seq2SeqTrainingArguments(
    output_dir=training_config_dict["output_dir"],
    overwrite_output_dir=training_config_dict.get("overwrite_output_dir", False),
    do_train=True,
    lr_scheduler_type=training_config_dict.get("lr_scheduler_type", "cosine"),
    learning_rate=training_config_dict.get("learning_rate", 1e-4),
    num_train_epochs=training_config_dict.get("num_train_epochs", 20),
    save_steps=training_config_dict.get("save_steps", 1000),
    per_device_train_batch_size=train_batch_size,
    gradient_accumulation_steps=training_config_dict.get("gradient_accumulation_steps", 4),
    per_device_eval_batch_size=eval_batch_size,
    fp16=use_fp16,
    bf16=use_bf16,
    logging_dir=training_config_dict.get("logging_dir", None),
    logging_steps=training_config_dict.get("logging_steps", 1000),
    ddp_find_unused_parameters=training_config_dict.get("ddp_find_unused_parameters", False),
    resume_from_checkpoint=training_config_dict.get("resume_from_checkpoint", False),
    warmup_ratio=training_config_dict.get("warmup_ratio", 0),
    gradient_checkpointing=training_config_dict.get("gradient_checkpointing", False),
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    callbacks=None,
    train_dataset=train_dataset,
)
trainer.train()
trainer.save_model()
