In [1]:
from datasets import load_dataset,Dataset
from transformers import AutoTokenizer,AutoModelForCausalLM,Trainer, TrainingArguments,DataCollatorForSeq2Seq
import json

In [None]:
# Load the tokenizer identified by "qw_0.5_instruct"
# (presumably a local checkpoint directory — TODO confirm).
tokenizer = AutoTokenizer.from_pretrained("qw_0.5_instruct")

# System prompt text (Chinese). Roughly: "You are a logistics expert; please give
# professional answers to the user's questions."
INSTRUCTION = "你是一个物流专家，请根据用户的问题给出专业的回答"

def preprocess_dataset(input_file, tokenizer, instruction_text, max_length=512):
    """
    Build causal-LM fine-tuning samples from a JSON file of question/answer pairs.

    Every record is rendered as a ChatML prompt (system turn, user turn, and the
    opening of the assistant turn). The tokenized answer and one end-of-sequence
    token are appended after it. Prompt positions are set to -100 in ``labels``
    (the label value that Hugging Face / PyTorch cross-entropy ignores), so only
    the answer and the terminator are supervised.

    Args:
        input_file (str): Path to a UTF-8 JSON file whose top level is an array
            of objects with ``question`` and ``answer`` keys.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask``. Its ``eos_token_id`` is used as the terminator;
            ``pad_token_id`` is used only when no EOS id is defined.
        instruction_text (str): System prompt inserted into the system turn.
        max_length (int): Maximum number of tokens kept per sample; longer
            samples are cut off at the end.

    Returns:
        list[dict]: One dict per record with ``input_ids``, ``attention_mask``
        and ``labels`` lists of equal length.

    Raises:
        ValueError: If the tokenizer defines neither ``eos_token_id`` nor
            ``pad_token_id``.
    """
    # The answer must be closed with the end-of-sequence token so the model learns
    # where to stop. The padding token is only a fallback for tokenizers that do
    # not define an EOS id.
    end_token_id = getattr(tokenizer, "eos_token_id", None)
    if end_token_id is None:
        end_token_id = getattr(tokenizer, "pad_token_id", None)
    if end_token_id is None:
        raise ValueError(
            "tokenizer defines neither eos_token_id nor pad_token_id; "
            "cannot terminate training samples"
        )

    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)  # the JSON document is expected to be an array

    processed_data = []

    for example in data:
        # Special tokens are written out explicitly in the template, so the
        # tokenizer must not add its own.
        prompt = tokenizer(
            f"<|im_start|>system\n{instruction_text}<|im_end|>\n<|im_start|>user\n{example['question']}<|im_end|>\n<|im_start|>assistant\n",
            add_special_tokens=False
        )
        response = tokenizer(
            example['answer'],
            add_special_tokens=False
        )

        prompt_ids = prompt["input_ids"]
        response_ids = response["input_ids"]

        input_ids = prompt_ids + response_ids + [end_token_id]
        # The terminator is a real token to attend to, hence the trailing 1.
        attention_mask = prompt["attention_mask"] + response["attention_mask"] + [1]
        # Mask the prompt so only the answer and terminator are supervised.
        labels = [-100] * len(prompt_ids) + response_ids + [end_token_id]

        # Slicing is a no-op for samples that already fit within max_length.
        processed_data.append({
            "input_ids": input_ids[:max_length],
            "attention_mask": attention_mask[:max_length],
            "labels": labels[:max_length]
        })

    return processed_data

# Example usage
input_file = r'C:\Users\AN\Desktop\qw\train_datas.json'
# Reuse the module-level INSTRUCTION constant instead of repeating the literal,
# so the system prompt is defined in exactly one place.
instruction_text = INSTRUCTION
processed_dataset = preprocess_dataset(input_file, tokenizer, instruction_text)

# Print the first sample as a sanity check
print(processed_dataset[0])

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\AN\\Desktop\\qw\\train_datas1.json'

In [None]:
# Decode the token ids of the second processed sample back to text so the
# rendered prompt/answer can be inspected by eye.
tokenizer.decode(processed_dataset[1]["input_ids"])

In [None]:
from peft import LoraConfig,TaskType,get_peft_model

# LoRA adapter settings for causal-LM fine-tuning; adapters are attached to the
# modules named q_proj / k_proj / v_proj / o_proj.
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,  # training mode
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Load the base model and wrap it with the LoRA configuration in one step.
model = get_peft_model(
    AutoModelForCausalLM.from_pretrained("qw_0.5_instruct"),
    config,
)


In [None]:
# Training hyperparameters passed to the Trainer.
args = TrainingArguments(
    output_dir="lora_qw2.5",
    # Batching: 2 samples per device, gradients accumulated over 8 steps.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    # Schedule: 3 epochs, cosine decay with a 10% warmup ratio.
    num_train_epochs=3,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # Log every 10 steps.
    logging_steps=10,
)

In [None]:
# Assemble the trainer; the collator pads each batch dynamically (padding=True).
trainer = Trainer(
    args=args,
    model=model,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    train_dataset=processed_dataset,
)

In [None]:
# Start training with the trainer configured above.
trainer.train()