> 参考资料：https://github.com/InternLM/InternLM/blob/main/README-zh-Hans.md

# 导入环境

In [1]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


In [2]:
# 将JSON文件转换为CSV文件
df = pd.read_json('./dataset/huanhuan.json')
ds = Dataset.from_pandas(df)

In [3]:
ds[:3]

{'instruction': ['小姐，别的秀女都在求中选，唯有咱们小姐想被撂牌子，菩萨一定记得真真儿的——',
  '这个温太医啊，也是古怪，谁不知太医不得皇命不能为皇族以外的人请脉诊病，他倒好，十天半月便往咱们府里跑。',
  '嬛妹妹，刚刚我去府上请脉，听甄伯母说你来这里进香了。'],
 'input': ['', '', ''],
 'output': ['嘘——都说许愿说破是不灵的。', '你们俩话太多了，我该和温太医要一剂药，好好治治你们。', '出来走走，也是散心。']}

# 处理数据集

In [4]:
tokenizer = AutoTokenizer.from_pretrained('/root/autodl-fs/Shanghai_AI_Laboratory/internlm-chat-7b', use_fast=False, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer

InternLMTokenizer(name_or_path='/root/autodl-fs/Shanghai_AI_Laboratory/internlm-chat-7b', vocab_size=103168, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False)

In [5]:
print(tokenizer.eos_token_id)

2


In [6]:
print(tokenizer.encode(ds[0]['instruction']))

[1, 72901, 98899, 68765, 61239, 60540, 68237, 60746, 60363, 60451, 98899, 78551, 71584, 72901, 60467, 60536, 66299, 75154, 98899, 78817, 67472, 67957, 60577, 60577, 73395, 98672, 98672]


In [7]:
text = "现在你要扮演皇帝身边的女人--甄嬛"
inputs = tokenizer(text, return_tensors="pt")
print(f"inputs:{inputs}")
# input_ids = inputs["input_ids"].to("cuda")
input_ids = inputs["input_ids"]

inputs:{'input_ids': tensor([[    1, 67442, 68687, 70305, 69770, 72637, 68092,   444, 63761, 64878]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [8]:
print(input_ids)

tensor([[    1, 67442, 68687, 70305, 69770, 72637, 68092,   444, 63761, 64878]])


In [9]:
# 需要注意MAX_LENGTH一定要根据数据集的文本长度来设置。这里设置为128，因为数据集中多为短文本
def process_func(example):
    MAX_LENGTH = 128
    input_ids, labels = [], []
    instruction = tokenizer.encode(text="\n".join(["<|system|>", "现在你要扮演皇帝身边的女人--甄嬛", "<|user|>"+
                                    example["instruction"] + example["input"] + "<|assistant|>"]).strip() + "\n",
                                    add_special_tokens=False, truncation=True, max_length=MAX_LENGTH)
    response = tokenizer.encode(text=example["output"], add_special_tokens=False, truncation=True, max_length=MAX_LENGTH)
    input_ids = instruction + response + [tokenizer.eos_token_id]
    labels = [tokenizer.pad_token_id] * len(instruction) + response + [tokenizer.eos_token_id]
    pad_len = MAX_LENGTH - len(input_ids)
    input_ids += [tokenizer.pad_token_id] * pad_len
    labels += [tokenizer.pad_token_id] * pad_len
    labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]

    return {
        "input_ids": input_ids,
        "labels": labels
    }

In [10]:
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
tokenized_id

Map:   0%|          | 0/3729 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 3729
})

In [11]:
tokenizer.decode(tokenized_id[0]['input_ids'])

' <|system|>\n现在你要扮演皇帝身边的女人--甄嬛\n<|user|>小姐，别的秀女都在求中选，唯有咱们小姐想被撂牌子，菩萨一定记得真真儿的——<|assistant|>\n嘘——都说许愿说破是不灵的。</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>'

In [12]:
tokenizer.decode(tokenized_id[0]['input_ids'])

' <|system|>\n现在你要扮演皇帝身边的女人--甄嬛\n<|user|>小姐，别的秀女都在求中选，唯有咱们小姐想被撂牌子，菩萨一定记得真真儿的——<|assistant|>\n嘘——都说许愿说破是不灵的。</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>'

In [13]:
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[1]["labels"])))

'你们俩话太多了，我该和温太医要一剂药，好好治治你们。'

# 创建模型

In [14]:
import torch

model = AutoModelForCausalLM.from_pretrained('/root/autodl-fs/Shanghai_AI_Laboratory/internlm-chat-7b', trust_remote_code=True, torch_dtype=torch.half, device_map="auto")
model

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

InternLMForCausalLM(
  (model): InternLMModel(
    (embed_tokens): Embedding(103168, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x InternLMDecoderLayer(
        (self_attn): InternLMAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=True)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=True)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=True)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=True)
          (rotary_emb): InternLMDynamicNTKScalingRotaryEmbedding()
        )
        (mlp): InternLMMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): InternLMRMSNorm()
        (post_attention_layernorm): InternLMRMSNorm(

In [15]:
model.enable_input_require_grads() # 开启梯度检查点时，要执行该方法

In [16]:
model.dtype

torch.float16

In [17]:
for name, param in model.named_parameters():
    print(name)

model.embed_tokens.weight
model.layers.0.self_attn.q_proj.weight
model.layers.0.self_attn.q_proj.bias
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.k_proj.bias
model.layers.0.self_attn.v_proj.weight
model.layers.0.self_attn.v_proj.bias
model.layers.0.self_attn.o_proj.weight
model.layers.0.self_attn.o_proj.bias
model.layers.0.mlp.gate_proj.weight
model.layers.0.mlp.down_proj.weight
model.layers.0.mlp.up_proj.weight
model.layers.0.input_layernorm.weight
model.layers.0.post_attention_layernorm.weight
model.layers.1.self_attn.q_proj.weight
model.layers.1.self_attn.q_proj.bias
model.layers.1.self_attn.k_proj.weight
model.layers.1.self_attn.k_proj.bias
model.layers.1.self_attn.v_proj.weight
model.layers.1.self_attn.v_proj.bias
model.layers.1.self_attn.o_proj.weight
model.layers.1.self_attn.o_proj.bias
model.layers.1.mlp.gate_proj.weight
model.layers.1.mlp.down_proj.weight
model.layers.1.mlp.up_proj.weight
model.layers.1.input_layernorm.weight
model.layers.1.post_attention_l

# Lora 微调

1. target_modules也可以传入正则项,比如以h.1结尾的query_key_value：".*\.1.*query_key_value"  
2. modules_to_save指定的是除了拆成lora的模块，其他的模块可以完整的指定训练。

In [18]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["q_proj","v_proj"],
    inference_mode=False, # 训练模式
    r=8, # Lora 秩
    lora_alpha=32, # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.1# Dropout 比例
)
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'q_proj', 'v_proj'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={})

In [19]:
model = get_peft_model(model, config)
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='/root/autodl-fs/Shanghai_AI_Laboratory/internlm-chat-7b', revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'q_proj', 'v_proj'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={})

In [20]:
model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 7,326,142,464 || trainable%: 0.05725119352524783


# 配置训练参数

In [24]:
args = TrainingArguments(
    output_dir="./output/InternLM",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    logging_steps=10, # 输出步数
    num_train_epochs=1,
    gradient_checkpointing=True,
    save_steps=50, # 保存步数
    learning_rate=1e-4, # 学习率
    save_on_each_node=True
)

In [25]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [26]:
trainer.train()

Step,Training Loss
10,3.2002
20,3.1562
30,3.2613
40,3.2396
50,3.3165
60,3.1928
70,3.214
80,3.2571
90,3.3024
100,3.1729


TrainOutput(global_step=233, training_loss=3.232807470493562, metrics={'train_runtime': 422.8568, 'train_samples_per_second': 8.819, 'train_steps_per_second': 0.551, 'total_flos': 2.03544749850624e+16, 'train_loss': 3.232807470493562, 'epoch': 1.0})

In [None]:
model.eval()
# ipt = tokenizer("<|im_start|>system\n现在你要扮演皇帝身边的女人--甄嬛.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n".format( "介绍一下自己是谁", "").strip() + "\nAssistant: ", return_tensors="pt").to(model.device)
# tokenizer.decode(model.generate(**ipt, max_length=512, do_sample=True, eos_token_id=tokenizer.eos_token_id, temperature=0.1)[0], skip_special_tokens=True)

In [None]:
response, history = model.chat(tokenizer, "介绍一下自己是谁", history=[])
response

In [None]:
model = model.cuda()
# ipt = tokenizer("<|system|>\n现在你要扮演皇帝身边的女人--甄嬛\n<|user|>\n {}\n{}".format("你是谁？", "").strip() + "<|assistant|>\n", return_tensors="pt").to(model.device)
# tokenizer.decode(model.generate(**ipt, max_length=128, do_sample=True)[0], skip_special_tokens=True)

In [None]:
model = model.eval()
response, history = model.chat(tokenizer, "你好", history=[])
print(response)
assert len(response) != 0
response, history = model.chat(tokenizer, "请提供三个管理时间的建议。", history=history)
print(response)
assert len(response) != 0

In [None]:
ipt = tokenizer("<|Bot|>\n现在你要扮演皇帝身边的女人--甄嬛\n<|User|>\n {}\n{}".format("你是谁？", "").strip() + "<|assistant|>\n", return_tensors="pt").to(model.device)
tokenizer.decode(model.generate(**ipt, max_length=128, do_sample=True)[0], skip_special_tokens=True)

In [None]:
response, history = model.chat(tokenizer, "你好", history=[])
response