In [None]:
# PDTB 句间关系分类 LoRA 微调实验
本实验使用 `MindSpore + mindnlp + LoRA` 在 `DeepSeek-R1-Distill-Qwen-1.5B` 模型上进行微调，任务目标是：
- 输入：一个句子（或对话内容）
- 输出：该句子属于哪一种 PDTB 分类，并解释原因。

In [1]:
import mindnlp
import mindspore
from mindnlp import core
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
from peft import LoraConfig, TaskType, get_peft_model, PeftModel

Modular Diffusers is currently an experimental feature under active development. The API is subject to breaking changes in future releases.


In [2]:
# Locations of the train / validation JSON files.
train_path = "/home/ma-user/work/data/train.json"
val_path = "/home/ma-user/work/data/val.json"

# Read each split into a DataFrame, then wrap it as a `datasets.Dataset`
# so it can be tokenized with `.map(...)` further down.
df_train, df_val = (pd.read_json(path) for path in (train_path, val_path))
ds_train, ds_val = (Dataset.from_pandas(frame) for frame in (df_train, df_val))

# Peek at the first three training records (shown as this cell's output).
ds_train[:3]

{'content': ['他的有没有什么不足之处？我觉得他可以就是加一些他自己的感受，因为他如果光只说那些一系列的动作，就感觉很空白，没有什么情感在里面。',
  '星汉是什么？银河。',
  '对于花来说没有人欣赏是多么的悲惨，就像我们姑娘把自己打扮得花枝招展，却没有人欣赏一样是一种不幸'],
 'summary': ['扩展\n原因：前半句话提出问题，询问他的不足之处，后半句话则具体回答了我认为的他的不足之处，所以属于扩展关系。',
  '扩展\n原因：',
  '扩展\n原因：']}

In [3]:
# Instantiate the tokenizer that belongs to the base checkpoint.
# NOTE(review): the repr displayed for this cell reports a fast tokenizer
# (is_fast=True) even though use_fast=False is requested here -- confirm
# whether the flag is honoured in this environment.
tokenizer = AutoTokenizer.from_pretrained(
    'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
    trust_remote_code=True,
    use_fast=False,
)
tokenizer

LlamaTokenizerFast(name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', vocab_size=151643, model_max_length=16384, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<｜begin▁of▁sentence｜>', 'eos_token': '<｜end▁of▁sentence｜>', 'pad_token': '<｜end▁of▁sentence｜>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<｜end▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<｜User｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151645: AddedToken("<｜Assistant｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151646: AddedToken("<｜begin▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151647: AddedToken("<|EOT|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151648: AddedToken("<think>", rstrip=False

In [4]:
MAX_LENGTH = 384  # upper bound, in tokens, on prompt + response + end token


def process_func(example):
    """Turn one raw record into causal-LM training features.

    The prompt is built from ``example['content']`` (plus the optional
    ``example['input']``) and the target from ``example['summary']``.
    Missing or null fields are treated as empty strings, so a null
    ``summary`` is no longer formatted into the literal text "None" and a
    null ``content`` / ``input`` no longer raises ``TypeError``.

    Args:
        example: Mapping with optional ``content``, ``input`` and
            ``summary`` string fields.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` lists of
        equal length, each cut to at most ``MAX_LENGTH`` entries. Prompt
        positions in ``labels`` are set to -100 so that only the response
        and the end token carry a training target.
    """
    # `or ''` maps both a missing key and an explicit null to an empty string.
    content = example.get('content') or ''
    extra_input = example.get('input') or ''
    summary = example.get('summary') or ''
    user_text = content + extra_input

    # NOTE(review): the prompt uses ChatML-style <|im_start|>/<|im_end|>
    # markers, while the tokenizer repr shown in this notebook lists
    # <｜User｜>/<｜Assistant｜> among its added tokens -- confirm this template
    # is the intended one for this checkpoint.
    instruction = tokenizer(
        f"<|im_start|>system\n你是PDTB文本关系分析助手<|im_end|>\n"
        f"<|im_start|>user\n{user_text}<|im_end|>\n"
        f"<|im_start|>assistant\n",
        add_special_tokens=False
    )
    response = tokenizer(f"{summary}", add_special_tokens=False)

    # The sequence terminator is the end-of-sequence token. For the tokenizer
    # displayed in this notebook pad and EOS are the same token, so this
    # matches the previous output; fall back to the pad id if EOS is unset.
    end_token_id = tokenizer.eos_token_id
    if end_token_id is None:
        end_token_id = tokenizer.pad_token_id

    prompt_ids = instruction["input_ids"]
    answer_ids = response["input_ids"]

    input_ids = prompt_ids + answer_ids + [end_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    labels = [-100] * len(prompt_ids) + answer_ids + [end_token_id]

    # Truncate from the right; an over-long example loses its end token here.
    return {
        "input_ids": input_ids[:MAX_LENGTH],
        "attention_mask": attention_mask[:MAX_LENGTH],
        "labels": labels[:MAX_LENGTH],
    }

def _tokenize_split(split):
    """Apply `process_func` to every record and drop the split's raw columns."""
    return split.map(process_func, remove_columns=split.column_names)


tokenized_train = _tokenize_split(ds_train)
tokenized_val = _tokenize_split(ds_val)

# Decode the first training example to eyeball the assembled prompt + answer.
tokenizer.decode(tokenized_train[0]['input_ids'])

Map: 100%|██████████| 9198/9198 [00:06<00:00, 1330.83 examples/s]
Map: 100%|██████████| 1500/1500 [00:00<00:00, 1502.76 examples/s]


'<|im_start|>system\n你是PDTB文本关系分析助手<|im_end|>\n<|im_start|>user\n他的有没有什么不足之处？我觉得他可以就是加一些他自己的感受，因为他如果光只说那些一系列的动作，就感觉很空白，没有什么情感在里面。<|im_end|>\n<|im_start|>assistant\n扩展\n原因：前半句话提出问题，询问他的不足之处，后半句话则具体回答了我认为的他的不足之处，所以属于扩展关系。<｜end▁of▁sentence｜>'

In [5]:
# Load the base causal-LM checkpoint with bfloat16 weights.
model = AutoModelForCausalLM.from_pretrained(
    'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
    device_map=0,
    ms_dtype=mindspore.bfloat16,
)

# Per the Transformers docs this enables gradients on the input embeddings
# (useful when training adapters on top of frozen base weights). It does NOT
# by itself turn on gradient checkpointing.
model.enable_input_require_grads()

[MS_ALLOC_CONF]Runtime config:  enable_vmm:True  vmm_align_size:2MB


In [6]:
# Names of the sub-modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=lora_target_modules,
)

# This rebinds `model` to the PEFT-wrapped model, so the cell is not
# idempotent: re-run the model-loading cell before running this one again.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 9,232,384 || all params: 1,786,320,384 || trainable%: 0.5168


In [7]:
# Training hyper-parameters.
args = TrainingArguments(
    output_dir="./output",
    # 4 samples per device x 5 accumulation steps = 20 samples per optimizer step.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=5,
    num_train_epochs=3,
    learning_rate=3e-5,
    # Log every 10 steps and write a checkpoint every 100 steps.
    logging_steps=10,
    save_steps=100,
    save_on_each_node=True,
)


In [8]:
# padding=True: batches are padded by the collator (to the longest sequence
# in the batch, per the Transformers collator docs).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

# NOTE(review): only the training split is handed to the Trainer;
# `tokenized_val` is prepared earlier but not used here -- confirm whether
# evaluation during training is wanted.
trainer = Trainer(
    args=args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_train,
)

Detected kernel version 4.19.90, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:

# Run the fine-tuning loop. The call is left as the cell's last expression so
# that the returned TrainOutput (global step, loss, runtime metrics) is displayed.
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
10,4.5547
20,3.4098
30,2.5344
40,1.9698
50,1.7212
60,1.5305
70,1.4155
80,1.3218
90,1.2873
100,1.289


TrainOutput(global_step=1380, training_loss=0.9027012603870337, metrics={'train_runtime': 5893.6465, 'train_samples_per_second': 4.682, 'train_steps_per_second': 0.234, 'total_flos': 2.8579728914049024e+16, 'train_loss': 0.9027012603870337, 'epoch': 3.0})