## 导入必要的包

In [1]:
from datasets import Dataset, load_dataset
import pandas as pd
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, BitsAndBytesConfig, GenerationConfig
import torch

In [2]:
# Record the library versions this notebook was run with.
print("torch version: ", torch.__version__)
print("transformers version: ", transformers.__version__)

torch version:  2.2.1+cu121
transformers version:  4.39.0


In [3]:
# Locations of the fine-tuning data, the base checkpoint and the training output directory.
data_path = "../data/huanhuan.json"
pretrained_model_name_or_path = "../models/internlm2-chat-1_8b"
work_dir = "../work_dirs/internlm2_chat_1_8b_qlora_huanhuan_e3_hf"

# System message written at the start of every training sample.
system_prompt = "现在你要扮演皇帝身边的女人--甄嬛"

# Upper bound on tokens per sample. The tokenizer may split one Chinese
# character into several tokens, so the limit is kept generous to keep
# samples intact.
MAX_LENGTH = 512

## 载入数据

In [4]:
# Read the JSON training data into a pandas DataFrame and display it.
df = pd.read_json(data_path)
df

Unnamed: 0,instruction,input,output
0,小姐，别的秀女都在求中选，唯有咱们小姐想被撂牌子，菩萨一定记得真真儿的——,,嘘——都说许愿说破是不灵的。
1,这个温太医啊，也是古怪，谁不知太医不得皇命不能为皇族以外的人请脉诊病，他倒好，十天半月便往咱...,,你们俩话太多了，我该和温太医要一剂药，好好治治你们。
2,嬛妹妹，刚刚我去府上请脉，听甄伯母说你来这里进香了。,,出来走走，也是散心。
3,嬛妹妹，我虽是一介御医，俸禄微薄，可是我保证会一生一世对你好，疼爱你，保护你，永远事事以你为...,,实初哥哥这么说，就枉顾我们一直以来的兄妹情谊了，嬛儿没有哥哥，一直把你当作自己的亲哥哥一样看...
4,实初虽然唐突了妹妹，却是真心实意地希望妹妹不要去应选，这不仅仅是因为我心里一直把妹妹当成……...,,我们两家是世交，昔年恩义不过是父亲随手之劳，不必挂怀。
...,...,...,...
3724,你好,,皇上好，我是甄嬛，家父是大理寺少卿甄远道。
3725,你是谁？,,我是甄嬛，家父是大理寺少卿甄远道。
3726,你好,,皇上好，我是甄嬛，家父是大理寺少卿甄远道。
3727,你是谁？,,我是甄嬛，家父是大理寺少卿甄远道。


In [5]:
# Wrap the DataFrame in a `datasets.Dataset`; this is the dataset that is tokenized later.
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 3729
})

In [6]:
# Look at the first record (keys: instruction / input / output).
ds[0]

{'instruction': '小姐，别的秀女都在求中选，唯有咱们小姐想被撂牌子，菩萨一定记得真真儿的——',
 'input': '',
 'output': '嘘——都说许愿说破是不灵的。'}

In [7]:
# Alternative loader: Dataset.from_json reads the JSON file directly.
# Load it once and reuse the result instead of parsing the file for every print.
json_ds = Dataset.from_json(data_path)
print(json_ds)
print(json_ds[0])

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 3729
})
{'input': '', 'output': '嘘——都说许愿说破是不灵的。', 'instruction': '小姐，别的秀女都在求中选，唯有咱们小姐想被撂牌子，菩萨一定记得真真儿的——'}


In [8]:
# Alternative loader: load_dataset returns a DatasetDict whose data sits under the 'train' split.
# Load it once and reuse the result instead of parsing the file for every print.
json_ds_dict = load_dataset('json', data_files=data_path)
print(json_ds_dict)
print(json_ds_dict['train'][0])

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 3729
    })
})
{'input': '', 'output': '嘘——都说许愿说破是不灵的。', 'instruction': '小姐，别的秀女都在求中选，唯有咱们小姐想被撂牌子，菩萨一定记得真真儿的——'}


## 处理数据集

In [9]:
# Load the tokenizer that ships with the base checkpoint.
# use_fast=False requests the non-fast tokenizer implementation;
# trust_remote_code=True is passed because the checkpoint provides its own tokenizer class.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, use_fast=False, trust_remote_code=True)
tokenizer

InternLM2Tokenizer(name_or_path='../models/internlm2-chat-1_8b', vocab_size=92544, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|action_start|>', '<|action_end|>', '<|interpreter|>', '<|plugin|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	92538: AddedToken("<|plugin|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	92539: AddedToken("<|interpreter|>", rstrip=False, lstrip=False, single_word=False, norm

In [10]:
# Ids of all special tokens known to the tokenizer.
tokenizer.all_special_ids

[1, 2, 0, 92543, 92542, 92541, 92540, 92539, 92538]

In [11]:
# Text form of all special tokens known to the tokenizer.
tokenizer.all_special_tokens

['<s>',
 '</s>',
 '<unk>',
 '<|im_start|>',
 '<|im_end|>',
 '<|action_start|>',
 '<|action_end|>',
 '<|interpreter|>',
 '<|plugin|>']

In [12]:
# Ids of the BOS, EOS and PAD tokens (shown as a tuple).
tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id

(1, 2, 2)

In [13]:
# Encode the "[UNUSED_TOKEN_*]" spellings without auto-added special tokens.
# NOTE(review): these look like older names for <|im_start|>/<|im_end|> — confirm against the tokenizer config.
tokenizer.encode("<s>[UNUSED_TOKEN_146][UNUSED_TOKEN_145]", add_special_tokens=False)

[1, 92543, 92542]

In [14]:
# Decode the three ids back to text, keeping special tokens visible.
tokenizer.decode([1, 92543, 92542], skip_special_tokens=False)

' <s><|im_start|><|im_end|>'

In [15]:
# Encode the chat-markup spellings directly, without auto-added special tokens.
tokenizer.encode("<s><|im_start|><|im_end|>", add_special_tokens=False)

[1, 92543, 92542]

In [16]:
# https://github.com/InternLM/xtuner/blob/main/xtuner/utils/templates.py#L24
# InternLM2 chat template: system block, user/assistant instruction block,
# the suffix that closes an assistant turn, and the stop words for generation.
internlm2_chat = {
    "SYSTEM": "<|im_start|>system\n{system}<|im_end|>\n",
    "INSTRUCTION": "<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n",
    "SUFFIX": "<|im_end|>",
    "SUFFIX_AS_EOS": True,
    "SEP": "\n",
    "STOP_WORDS": ["<|im_end|>"],
}

In [17]:
# https://huggingface.co/internlm/internlm2-chat-1_8b/blob/main/modeling_internlm2.py#L1136
def build_inputs(tokenizer, query: str, history: "list[tuple[str, str]] | None" = None, meta_instruction=""):
    """Build an InternLM2 chat prompt and tokenize it.

    Args:
        tokenizer: Tokenizer object. It must expose ``add_bos_token`` and
            ``bos_token`` and be callable as ``tokenizer([prompt], return_tensors="pt")``.
        query: The current user message.
        history: Earlier turns as ``(user_message, assistant_reply)`` pairs,
            oldest first. ``None`` (the default) or an empty sequence means
            there is no history.
        meta_instruction: Optional system message; the system block is left
            out when this is empty.

    Returns:
        A tuple ``(prompt, encoded)``: the prompt string and whatever the
        tokenizer call returns for ``[prompt]``.
    """
    # When the tokenizer adds BOS itself, do not also write it into the text.
    prompt = "" if tokenizer.add_bos_token else tokenizer.bos_token

    if meta_instruction:
        prompt += f"<|im_start|>system\n{meta_instruction}<|im_end|>\n"

    # `history or ()` avoids a shared mutable default and tolerates an explicit None.
    for record in history or ():
        prompt += (
            f"<|im_start|>user\n{record[0]}<|im_end|>\n"
            f"<|im_start|>assistant\n{record[1]}<|im_end|>\n"
        )

    # The prompt ends with an open assistant turn for the model to complete.
    prompt += f"<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"
    return prompt, tokenizer([prompt], return_tensors="pt")

In [18]:
# build_inputs takes the tokenizer as its first argument and returns (prompt, encoded);
# pass the tokenizer explicitly and display only the prompt string.
build_inputs(tokenizer, '你好', meta_instruction='我是系统')[0]

'<s><|im_start|>system\n我是系统<|im_end|>\n<|im_start|>user\n你好<|im_end|>\n<|im_start|>assistant\n'

In [25]:
def process_func(example):
    """Turn one instruction/input/output record into causal-LM training features.

    Reads the notebook-level ``tokenizer``, ``system_prompt`` and ``MAX_LENGTH``.

    Args:
        example: Mapping with the keys ``instruction`` and ``output`` and an
            optional ``input``, e.g.
            ``{'instruction': '...', 'input': '', 'output': '...'}``.

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels``. The three
        lists have equal length, each cut to at most ``MAX_LENGTH`` entries.
        Prompt positions in ``labels`` are set to -100; the response tokens
        and the trailing EOS keep their ids.
    """
    # Records may carry extra context in 'input'; append it to the instruction
    # so it is not silently dropped. An empty or missing 'input' changes nothing.
    user_text = example["instruction"] + (example.get("input") or "")

    # The tokenizer would add <s> by default; here it is written into the text
    # by hand, so add_special_tokens=False keeps it from being added twice.
    prompt_text = (
        f"<s><|im_start|>system\n{system_prompt}<|im_end|>\n"
        f"<|im_start|>user\n{user_text}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    instruction = tokenizer(prompt_text, add_special_tokens=False)
    response = tokenizer(f"{example['output']}<|im_end|>\n", add_special_tokens=False)

    eos_id = tokenizer.eos_token_id  # id of </s>
    input_ids = instruction["input_ids"] + response["input_ids"] + [eos_id]
    # The EOS token is attended to as well, so its mask entry is 1.
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    # Mask the prompt with -100; keep the response and EOS as targets.
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [eos_id]

    # Cut over-long samples; slicing leaves shorter ones unchanged.
    return {
        "input_ids": input_ids[:MAX_LENGTH],
        "attention_mask": attention_mask[:MAX_LENGTH],
        "labels": labels[:MAX_LENGTH],
    }

In [26]:
# Tokenize every record with process_func.
# remove_columns: the listed (original) columns are removed after map.
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
tokenized_id

Map:   0%|          | 0/3729 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3729
})

In [27]:
# Inspect the first tokenized sample: its keys, then each feature list.
print(tokenized_id[0].keys())
print(tokenized_id[0]['input_ids'])
print(tokenized_id[0]['attention_mask'])
print(tokenized_id[0]['labels'])

dict_keys(['input_ids', 'attention_mask', 'labels'])
[1, 92543, 9081, 364, 68293, 69538, 71156, 70621, 73488, 68943, 444, 63840, 65094, 92542, 364, 92543, 1008, 364, 73752, 60353, 69616, 61261, 60553, 69088, 60763, 60366, 60459, 60353, 79402, 72435, 73752, 60475, 60549, 66820, 76005, 60353, 79668, 68323, 68808, 60591, 60591, 74246, 92395, 92542, 364, 92543, 525, 11353, 364, 64389, 92395, 72598, 90192, 60423, 61198, 69153, 61255, 60354, 60355, 92542, 364, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 64389, 92395, 72598, 90192, 60423, 61198, 69153, 61255, 

In [27]:
# Decode the full token sequence of the first sample to check the chat markup.
print(tokenizer.decode(tokenized_id[0]['input_ids']))

 <s><|im_start|> system
现在你要扮演皇帝身边的女人--甄嬛<|im_end|> 
<|im_start|> user
小姐，别的秀女都在求中选，唯有咱们小姐想被撂牌子，菩萨一定记得真真儿的——<|im_end|> 
<|im_start|> assistant
嘘——都说许愿说破是不灵的。<|im_end|> 
</s>


In [28]:
# Decode only the positions whose label is not the -100 mask value.
tokenizer.decode([token_id for token_id in tokenized_id[0]["labels"] if token_id != -100])

' 嘘——都说许愿说破是不灵的。<|im_end|> \n</s>'

## 创建模型

In [None]:
# bitsandbytes quantization settings used when loading the base model.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load the model in 4-bit precision
    load_in_8bit=False,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch.float16,   # compute dtype for the 4-bit layers: half precision
    bnb_4bit_quant_type='nf4',              # 4-bit quantization type; nf4 = 4-bit NormalFloat
    bnb_4bit_use_double_quant=True,         # enable double quantization. NOTE(review): presumably nested quantization of the quantization constants, not "double precision" — confirm in the library docs
)
quantization_config

In [None]:
# Load the base model with the quantization settings defined above.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map='auto',
    low_cpu_mem_usage=True,             # low-CPU-memory loading; per the original note this must be True when device_map is used
    quantization_config=quantization_config,
)
model.enable_input_require_grads()      # per the original note: must be called when gradient checkpointing is enabled
model

In [None]:
# Show the device and dtype reported by the loaded model.
print(f"model.device: {model.device}, model.dtype: {model.dtype}")

## LoRA

In [None]:
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training, load_peft_weights

In [None]:
# Prepare the quantized model for k-bit training, see
# https://huggingface.co/docs/peft/developer_guides/quantization
# Note: this rebinds `model`, so re-running the cell applies the preparation again.
model = prepare_model_for_kbit_training(model)
model

In [None]:
# LoRA adapter configuration.
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,   # training mode
    r=64,                   # LoRA rank
    # Names of the modules that receive LoRA adapters.
    # NOTE(review): presumably the attention (wqkv, wo) and MLP (w1, w2, w3) projections — verify against the printed model.
    target_modules=['wqkv', 'wo', 'w1', 'w2', 'w3'],
    lora_alpha=16,          # LoRA alpha; see the LoRA method description for its exact role
    lora_dropout=0.1,       # dropout ratio
    bias='none'
)
config

In [None]:
# Wrap the prepared model with the LoRA adapters described by `config`.
# NOTE(review): the cell displays `config`, not `model` — confirm this is intended.
model = get_peft_model(model, config)
config

In [None]:
# Report how many parameters are trainable after adding the adapters.
model.print_trainable_parameters()

## 配置训练参数

In [None]:
# Training hyper-parameters; checkpoints are written to `work_dir`.
args = TrainingArguments(
    output_dir=work_dir,
    optim="paged_adamw_32bit",
    learning_rate=1e-5,
    gradient_checkpointing=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4*4=16 samples per optimizer step on each device
    logging_steps=10,
    save_strategy="epoch",  # epoch or steps
    save_steps=1,           # intended: save once per epoch. NOTE(review): save_steps presumably only applies with save_strategy="steps" — confirm
    save_total_limit=3,
    save_on_each_node=True,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    bf16 = False,   # training precision: bf16 off, fp16 on
    fp16 = True,
)

In [None]:
# Build the Trainer from the LoRA-wrapped model, the arguments above and the tokenized dataset.
# padding=True asks the collator to pad the samples of a batch.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

In [None]:
# Start fine-tuning.
trainer.train()