In [1]:
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from evaluate import load


# 设置环境变量优化性能
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# 模型和数据集配置
MODEL_ID = "Qwen/Qwen2.5-7B"
DATASET_PATH = "EvolInstruct-Code-80k"  # 本地数据集路径
MAX_LENGTH = 2048  # 序列最大长度
OUTPUT_DIR = "./my_model/qwen2.5-7b-code-finetuned"


In [2]:
ds = load_dataset('nickrosh/Evol-Instruct-Code-80k-v1')
ds

Using the latest cached version of the dataset since nickrosh/Evol-Instruct-Code-80k-v1 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\tassa\.cache\huggingface\datasets\nickrosh___evol-instruct-code-80k-v1\default\0.0.0\3ae930c20d5496e2c8386872d5628c45f6957db4 (last modified on Tue May 27 21:54:19 2025).


DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 78264
    })
})

In [3]:
# 加载tokenizer和模型
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # Qwen模型pad_token与eos_token相同

In [4]:
# 加载并预处理数据集
def load_and_process_data():
    # 加载数据集
    dataset = ds
    
    # 过滤过长样本（保留<2048 tokens）
    def filter_long_samples(example):
        tokens = tokenizer(example["instruction"] + example["output"], truncation=False)
        return len(tokens["input_ids"]) <= MAX_LENGTH
    
    filtered_dataset = dataset.filter(filter_long_samples)
    
    # 划分数据集（训练:验证:测试 = 8:1:1）
    train_val_dataset = filtered_dataset["train"].train_test_split(test_size=0.2, seed=42)
    val_test_dataset = train_val_dataset["test"].train_test_split(test_size=0.5, seed=42)
    
    return {
        "train": train_val_dataset["train"],
        "validation": val_test_dataset["train"],
        "test": val_test_dataset["test"],
    }

datasets = load_and_process_data()
datasets

{'train': Dataset({
     features: ['instruction', 'output'],
     num_rows: 62600
 }),
 'validation': Dataset({
     features: ['instruction', 'output'],
     num_rows: 7825
 }),
 'test': Dataset({
     features: ['instruction', 'output'],
     num_rows: 7826
 })}

# baseline评估

In [5]:
# 加载tokenizer和模型
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True,padding_side='left')
tokenizer.pad_token = tokenizer.eos_token  # Qwen模型pad_token与eos_token相同
os.environ["TOKENIZERS_PARALLELISM"] = "true"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16,  # 使用BF16进行评估，平衡速度和精度
    trust_remote_code=True,
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [6]:

# 格式化数据集
def format_dataset(example):
    instruction = example["instruction"]
    # Qwen模型的指令格式（只包含指令部分，让模型生成回复）
    formatted_text = f"[INST] {instruction} [/INST]"
    return {"text": formatted_text, "reference": example["output"]}

# 加载并处理验证数据
validation_dataset = datasets['test']
validation_dataset = validation_dataset.map(format_dataset)

Map:   0%|          | 0/7826 [00:00<?, ? examples/s]

In [None]:
sample_valid_set = validation_dataset.shuffle().select(range(100))

In [8]:
import os
import torch
import numpy as np
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from evaluate import load
from tqdm import tqdm

# 生成配置
generation_config = GenerationConfig(
    # temperature=0.1,  # 低温度，提高确定性
    max_new_tokens=1024,  # 最大生成长度
    # top_p=0.9,
    # top_k=40,
    num_beams=1,  # 使用贪婪解码（更快）
    # early_stopping=True,
    pad_token_id=tokenizer.eos_token_id,
)

# 初始化BLEU评估器
bleu = load("bleu")

In [None]:
# 批量评估函数
def evaluate_bleu(model, dataset, batch_size=4):
    model.eval()  # 设置为评估模式
    all_predictions = []
    all_references = []
    
    # 分批处理数据
    for i in tqdm(range(0, len(dataset), batch_size)):
        batch = dataset[i:i+batch_size]
        
        # 准备输入
        inputs = tokenizer(
            batch["text"], 
            return_tensors="pt", 
            padding=True, 
            truncation=True, 
            max_length=MAX_LENGTH
        ).to(model.device)
        
        # 生成回复
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                generation_config=generation_config,
            )
        
        # 解码生成的回复
        generated_texts = tokenizer.batch_decode(
            outputs[:, inputs.input_ids.shape[1]:],  # 只取生成的部分
            skip_special_tokens=True
        )
        
        # 收集预测和参考
        all_predictions.extend(generated_texts)
        all_references.extend([[ref] for ref in batch["reference"]])  # BLEU需要列表的列表
    
    # 计算BLEU分数（包括BLEU-1到BLEU-4）
    results = {}
    for n in range(1, 5):
        results[f"bleu-{n}"] = bleu.compute(
            predictions=all_predictions,
            references=all_references,
            max_order=n
        )["bleu"]
    
    return results

# 执行评估
print("开始在验证集上评估基线BLEU分数...")
bleu_results = evaluate_bleu(model, sample_valid_set)

# 打印结果
print("\n基线BLEU分数:")
for key, value in bleu_results.items():
    print(f"{key}: {value:.4f}")



print(f"\n基线分数已保存至: {os.path.join(OUTPUT_DIR, 'baseline_bleu_scores.txt')}")

开始在验证集上评估基线BLEU分数...


100%|██████████| 25/25 [1:41:52<00:00, 244.51s/it]



基线BLEU分数:
bleu-1: 0.5731
bleu-2: 0.4842
bleu-3: 0.4253
bleu-4: 0.3817


FileNotFoundError: [Errno 2] No such file or directory: './my_model/qwen2.5-7b-code-finetuned\\baseline_bleu_scores.txt'

# 微调模型

4位量化或者8位量化就是QLora形式，减少内存，但是会增大训练时间

In [5]:

# 格式化数据集（添加特殊token）
def format_dataset(example):
    instruction = example["instruction"]
    response = example["output"]
    # Qwen模型的指令格式
    formatted_text = f"[INST] {instruction} [/INST] {response}"
    return {"text": formatted_text}
# 编码数据集
def encode_dataset(examples):
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH, padding="max_length")

# 格式化和编码
for split in datasets:
    datasets[split] = datasets[split].map(format_dataset)
    datasets[split] = datasets[split].map(encode_dataset, batched=True)
    datasets[split] = datasets[split].remove_columns(["instruction", "output", "text"])



In [6]:

# !pip install bitsandbytes-cuda12x

# from transformers import BitsAndBytesConfig
# 量化配置（INT4）
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

# LoRA配置
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.1,
    task_type="CAUSAL_LM",
    inference_mode=False,
)

# 数据收集器
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 按照量化配置加载模型
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    # quantization_config=bnb_config, # QLORA使用
    torch_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,  # 使用BF16或FP16
    device_map={"": 0},
    trust_remote_code=True,
    use_cache=False
)
model.enable_input_require_grads()
# 准备模型进行k-bit训练
# model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# 检查模型的梯度状态
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Parameter {name} has requires_grad=True")
    else:
        print(f"Parameter {name} has requires_grad=False")

In [None]:
train_sets = datasets['train'].shuffle().select(range(1000))
valid_sets = datasets['validation'].shuffle().select(range(100))

# !pip install tensorboard
model.train()
# 训练参数配置
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,  # 等效batch_size=16
    # fp16=True,
    bf16=True,  # 使用BF16进行训练
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=1000,
    logging_strategy="steps",
    logging_steps=100,
    save_total_limit=3,
    load_best_model_at_end=True,
    report_to="tensorboard",
    dataloader_num_workers=4,  # 并行数据加载
    gradient_checkpointing=True,  # 节省显存
    remove_unused_columns=False,  # 保留所有列用于评估
)


# BLEU评估函数
def compute_metrics(eval_preds):
    bleu = load("bleu")
    predictions, labels = eval_preds
    # 将预测和标签转换为文本
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # 标签中的-100需要替换为padding token id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # 计算BLEU-4
    results = bleu.compute(predictions=decoded_preds, references=decoded_labels, max_order=4)
    return {"bleu-4": results["bleu"]}

# 创建Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_sets,
    eval_dataset=valid_sets,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 训练模型
trainer.train()




  0%|          | 0/186 [00:00<?, ?it/s]

In [None]:

# 保存LoRA权重（仅需约60-100MB）
model.save_pretrained(os.path.join(OUTPUT_DIR, "final_lora_weights"))


In [None]:

# 在测试集上评估
# test_results = trainer.evaluate(datasets["test"])
test_results = trainer.evaluate(sample_valid_set)
print(f"Test Results: {test_results}")