In [1]:
import json
from datasets import Dataset, DatasetDict
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer

## 数据处理和格式化

In [5]:
model_name = r"D:\competition\llm_info_extract\Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [2]:
# 数据处理函数
def process_function(example):
    """处理单条样本"""
    instruction = example["instruction"]
    input_text = example["input"]
    output_text = example["output"]
    
    # 构建Qwen2.5的对话格式
    messages = [
        {"role": "system", "content": "你是一个专业的命名实体识别助手。"},
        {"role": "user", "content": f"{instruction}\n\n{input_text}"},
        {"role": "assistant", "content": output_text}
    ]
    
    # 使用tokenizer的应用聊天模板方法
    text = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=False
    )
    
    return {"text": text}

# 加载示例数据
def load_sample_data():
    """加载示例数据"""
    samples = [
        {
            "instruction": "请从给定文本中识别出所有命名实体，并按照指定的实体类型进行分类。",
            "input": "文本：After renewing the cylinder assembly , it was tested around Brighton and Eastleigh using an LNER Dynamometer car , where good running was experienced at high costs in fuel and effort on the part of the fireman .\n\n可选的实体类型：organization、science、politics、location、event",
            "output": "实体识别结果：\n1. 实体：Brighton\n   粗类型：location\n   细类型：city\n2. 实体：Eastleigh\n   粗类型：location\n   细类型：town"
        },
        # 可以添加更多样本...
    ]
    return samples

def load_data(file_path):
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            sample = json.loads(line.strip())
            # 构造对话格式
            conversation = process_function(sample)
            data.append({"conversations": conversation})
    return Dataset.from_list(data)

In [6]:
train_data = load_data("stage1/ner_finetuning_train.json")
dev_data = load_data("stage1/ner_finetuning_dev.json")
predict_data = load_data("stage1/ner_finetuning_predict.json")


In [10]:
train_data[0]

{'conversations': {'text': '<|im_start|>system\n你是一个专业的命名实体识别助手。<|im_end|>\n<|im_start|>user\n请从给定文本中识别出所有命名实体，并按照指定的实体类型进行分类。\n\n文本：“截至9月末，深圳现金累计投放量同比出现负数。”近日，一位接近监管部门人士对本报记者称，“\n\n可选的实体类型：生物、职位、科学、组织机构、学历、位置<|im_end|>\n<|im_start|>assistant\n实体识别结果：\n1. 实体：记者\n   粗类型：职位\n   细类型：概念<|im_end|>\n'}}

## 模型和tokenizer加载

In [7]:
# 模型配置

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)

# 设置padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

`torch_dtype` is deprecated! Use `dtype` instead!


## LoRA配置

In [8]:
# LoRA配置
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,  # LoRA秩
    lora_alpha=32,  # LoRA alpha参数
    lora_dropout=0.1,  # LoRA dropout
    target_modules=[
        "q_proj",
        "k_proj", 
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # 针对Qwen2.5的模块
    bias="none",
)

# 应用LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


## 训练参数配置

In [9]:
# 训练参数
training_args = TrainingArguments(
    output_dir="./qwen2.5-7b-ner-lora",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=500,
    eval_steps=500,
    warmup_steps=100,
    logging_dir="./logs",
    report_to=["tensorboard"],
    save_total_limit=3,
    fp16=True,
    dataloader_pin_memory=False,
    remove_unused_columns=False,
)

## 训练脚本

In [13]:


# 创建训练器
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=dev_data,
    dataset_text_field="text",
    max_seq_length=2048,
    packing=False,  # 对于指令微调，建议关闭packing
)
# trainer = SFTTrainer(
#     model=model,
#     train_dataset=train_data,
#     eval_dataset=dev_data,
#     peft_config=lora_config,
#     dataset_text_field="conversations",  # 数据集中对话字段名称
#     max_seq_length=1024,  # 最大序列长度（根据数据调整）
#     tokenizer=tokenizer,
#     args=training_args,
#     packing=False,  # 不打包样本（对话数据通常不打包）
# )

# 开始训练
trainer.train()

# 保存模型
trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir)



TypeError: SFTTrainer.__init__() got an unexpected keyword argument 'tokenizer'

## 推理测试代码

In [None]:
def inference_test(model_path, input_text):
    """推理测试函数"""
    # 加载模型和tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )
    
    # 构建输入
    messages = [
        {"role": "system", "content": "你是一个专业的命名实体识别助手。"},
        {"role": "user", "content": input_text}
    ]
    
    text = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )
    
    # 生成
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )
    
    response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
    return response

# 测试示例
test_input = """请从给定文本中识别出所有命名实体，并按照指定的实体类型进行分类。

文本：Apple Inc. was founded by Steve Jobs in Cupertino, California.

可选的实体类型：organization、person、location"""

result = inference_test("./qwen2.5-7b-ner-lora", test_input)
print(result)