In [26]:
import mindnlp
import mindspore
from mindnlp import core
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
from peft import LoraConfig, TaskType, get_peft_model, PeftModel

In [27]:
import pandas as pd
from datasets import Dataset
df = pd.read_json('./lora_training_data.json')
ds = Dataset.from_pandas(df)
ds[:3]

{'instruction': ['请回答以下问题：',
  '请基于以下对话历史回答问题：\n用户: 你知道北京石刻艺术博物馆吗？\n助手: 知道，是第一座陈列北京地区石刻文物的专题性博物馆。',
  '请基于以下对话历史回答问题：\n用户: 你知道北京石刻艺术博物馆吗？\n助手: 知道，是第一座陈列北京地区石刻文物的专题性博物馆。\n用户: 都啥时候开放呢？\n助手: 9:00-16:00（周一闭馆）。'],
 'input': ['你知道北京石刻艺术博物馆吗？', '都啥时候开放呢？', '地址呢？'],
 'output': ['知道，是第一座陈列北京地区石刻文物的专题性博物馆。',
  '9:00-16:00（周一闭馆）。',
  '北京市海淀区五塔寺村24号（近动物园西北门）。']}

In [28]:
# 实例化tokenizer
tokenizer = AutoTokenizer.from_pretrained('deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', use_fast=False, trust_remote_code=True)
tokenizer

LlamaTokenizerFast(name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', vocab_size=151643, model_max_length=16384, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<｜begin▁of▁sentence｜>', 'eos_token': '<｜end▁of▁sentence｜>', 'pad_token': '<｜end▁of▁sentence｜>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<｜end▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<｜User｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151645: AddedToken("<｜Assistant｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151646: AddedToken("<｜begin▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151647: AddedToken("<|EOT|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151648: AddedToken("<think>", rstrip=False

In [29]:
# 数据预处理
def process_func(example):
    MAX_LENGTH = 8092    # Llama分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"<|im_start|>system\n现在你是专业的北京旅游导游<|im_end|>\n<|im_start|>user\n{example['instruction'] + example['input']}<|im_end|>\n<|im_start|>assistant\n", add_special_tokens=False)  # add_special_tokens 不在开头加 special_tokens
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # 因为eos token咱们也是要关注的所以 补充为1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [30]:
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
tokenized_id

Map: 100%|██████████| 9280/9280 [00:12<00:00, 722.45 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 9280
})

In [31]:
# 分词器解码
tokenizer.decode(tokenized_id[0]['input_ids'])

'<|im_start|>system\n现在你是专业的北京旅游导游<|im_end|>\n<|im_start|>user\n请回答以下问题：你知道北京石刻艺术博物馆吗？<|im_end|>\n<|im_start|>assistant\n知道，是第一座陈列北京地区石刻文物的专题性博物馆。<｜end▁of▁sentence｜>'

In [32]:
# 加载基础模型
model = AutoModelForCausalLM.from_pretrained('deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', ms_dtype=mindspore.bfloat16, device_map=0)

# 开启梯度检查点时，要执行该方法
model.enable_input_require_grads()

In [44]:
# 微调前推理
# host to device
model = model.npu()

prompt = "你现在是专业的北京导游"
contents= "北京恭王府好不好？"
inputs = tokenizer.apply_chat_template([{"role": "system", "content": prompt},{"role": "user", "content": contents}],
                                       add_generation_prompt=True,
                                       tokenize=True,
                                       return_tensors="ms",
                                       return_dict=True
                                       ).to('cuda')


gen_kwargs = {"max_length": 8192, "do_sample": True, "top_k": 1}
with core.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


北京恭王府是北京著名的一片 greenery，恭王府是英国传教士王文庆在顺平陵修建的北京的第16座城关大殿后，作为顺平陵和延庆府衙之间的一块地，恭王府的建筑风格古朴典雅，布局有序，有古建筑特色。


In [34]:
# 配置LoRA
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, # 训练模式
    r=8, # Lora 秩
    lora_alpha=32, # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.1# Dropout 比例
)
config

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'gate_proj', 'q_proj', 'k_proj', 'down_proj', 'o_proj', 'up_proj', 'v_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, use_qalora=False, qalora_group_size=16, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False, target_parameters=None)

In [35]:
print("Model without LoRA:\n",model)
# 根据上述的lora配置，为模型添加lora部分
model = get_peft_model(model, config)
print('='*50)
print("Model with LoRA:\n",model)
# 输出打印需要训练的参数比例
model.print_trainable_parameters()

Model without LoRA:
 Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear (1536 -> 1536)
          (k_proj): Linear (1536 -> 256)
          (v_proj): Linear (1536 -> 256)
          (o_proj): Linear (1536 -> 1536)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear (1536 -> 8960)
          (up_proj): Linear (1536 -> 8960)
          (down_proj): Linear (8960 -> 1536)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbedding()
  )
  (lm_head): Linear (1536 -> 151936)
)
Model with LoRA:
 PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): E

In [36]:
# 定义训练超参数
args = TrainingArguments(
    output_dir="./output_1.5bf/Qwen2.5_instruct_lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=5,
    logging_steps=10,
    num_train_epochs=5,
    save_steps=100, 
    learning_rate=1e-4,
    save_on_each_node=True,
)

In [None]:
args.resume_from_checkpoint = True
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

trainer.train()

Detected kernel version 4.19.90, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,1.378
20,1.3692
30,1.4
40,1.4493
50,1.4041
60,1.423
70,1.4615
80,1.4614
90,1.4235
100,1.2934


TrainOutput(global_step=2320, training_loss=0.9467645869172853, metrics={'train_runtime': 9279.135, 'train_samples_per_second': 5.0, 'train_steps_per_second': 0.25, 'total_flos': 1.500668663402373e+17, 'train_loss': 0.9467645869172853, 'epoch': 5.0})

In [43]:
mode_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'
lora_path = './output_1.5bf/Qwen2.5_instruct_lora/checkpoint-1300' # 这里改称你的 lora 输出对应 checkpoint 地址

# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)

# 加载模型
model = AutoModelForCausalLM.from_pretrained(mode_path, ms_dtype=mindspore.bfloat16, trust_remote_code=True).eval()

# 加载lora权重
model = PeftModel.from_pretrained(model, model_id=lora_path)

# host to device
model = model.npu()
prompt =  "恭王府附近有什么景点？"
inputs = tokenizer.apply_chat_template([{"role": "system", "content": "现在你是专业的北京导游"},{"role": "user", "content": prompt}],
                                       add_generation_prompt=True,
                                       tokenize=True,
                                       return_tensors="ms",
                                       return_dict=True
                                       ).to('cuda')


gen_kwargs = {"max_length": 8192, "do_sample": True, "top_k":3}
with core.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


北京故宫是全国家家博物馆，是全国家家文化宝库，也是中华民族文化的象征。北京故宫坐北朝南的，东、南、西、北四个门。故宫的外墙上有“崇文修道，历朝 recurrence of events”的 Title，
