In [None]:
!pip install bitsandbytes

In [None]:
# Import the c2net platform helpers
from c2net.context import prepare, upload_output # type: ignore
# Initialise the platform context; per the original note this step brings the
# dataset and the pretrained model into the container
c2net_context = prepare()
# Read the code, dataset, pretrained-model and output paths from the context
code_path = c2net_context.code_path
# NOTE: dataset_path is not referenced anywhere else in the visible script
dataset_path = c2net_context.dataset_path
pretrain_model_path = c2net_context.pretrain_model_path
you_should_save_here = c2net_context.output_path

# ====== LoRA fine-tuning and merging of the Qwen2.5-7B model ======
import os
# Expose GPUs 0 and 1 to this process
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# 1. 准备数据
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch

# ===== Change 1: load the dataset via code_path =====
# The dataset lives under the code path rather than the dataset path.
# NOTE(review): the file name passed to os.path.join is an empty string, so
# train_csv_path resolves to code_path itself (with a trailing separator), not
# to a CSV file — the training CSV's file name needs to be filled in here.
train_csv_path = os.path.join(code_path, '')
print(f"正在从 {train_csv_path} 加载训练数据")
train = pd.read_csv(train_csv_path)
# Replace a missing Misconception with the literal 'NA' so that it does not
# turn the concatenated target below into NaN
train.Misconception = train.Misconception.fillna('NA')
# Combined class name of the form "<Category>:<Misconception>"
train['target'] = train.Category + ":" + train.Misconception

# Build the label encoder: each distinct target string becomes an integer id
le = LabelEncoder()
train['label'] = le.fit_transform(train['target'])
# Number of distinct target classes; used later as the classifier's num_labels
n_classes = len(le.classes_)

# Feature engineering: add an `is_correct` flag to every training row.
#
# Select the rows whose Category has 'True' before the first underscore; the
# script treats these as the correct-answer rows (hence the name `correct`).
# The vectorized `.str` accessor replaces a row-wise DataFrame.apply, which
# built a full row object per record just to split one string column.
idx = train.Category.str.split('_').str[0] == 'True'
correct = train.loc[idx].copy()
# For each (question, answer) pair, count how many of the selected rows chose it
correct['c'] = correct.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')
# Keep, per question, the answer with the highest count
correct = correct.sort_values('c', ascending=False)
correct = correct.drop_duplicates(['QuestionId'])
correct = correct[['QuestionId', 'MC_Answer']]
correct['is_correct'] = 1

# Left-join the flag back onto the full training set; `correct` holds one row
# per QuestionId, so the join does not duplicate training rows. Rows whose
# (question, answer) pair is not in `correct` get is_correct = 0.
train = train.merge(correct, on=['QuestionId', 'MC_Answer'], how='left')
train.is_correct = train.is_correct.fillna(0)

# 定义输入格式
def format_input(row):
    """Render one training row as the text prompt fed to the classifier.

    Args:
        row: Mapping-like record providing 'QuestionText', 'MC_Answer',
            'StudentExplanation' and a truthy/falsy 'is_correct' value.

    Returns:
        A four-line string: question, answer, a Yes/No correctness line and
        the student's explanation.
    """
    if row['is_correct']:
        verdict = "Yes"
    else:
        verdict = "No"
    prompt = (
        f"Question: {row['QuestionText']}\n"
        f"Answer: {row['MC_Answer']}\n"
        f"Correct? {verdict}\n"
        f"Student Explanation: {row['StudentExplanation']}"
    )
    return prompt

# Build the prompt text for every row
train['text'] = train.apply(format_input, axis=1)

# Split into training and validation sets (80/20, fixed seed for reproducibility)
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)

# Convert to Hugging Face datasets. The split leaves each frame with a
# shuffled, non-contiguous index; preserve_index=False keeps that index from
# being stored as an extra `__index_level_0__` column in the datasets.
train_ds = Dataset.from_pandas(train_df[['text', 'label']], preserve_index=False)
val_ds = Dataset.from_pandas(val_df[['text', 'label']], preserve_index=False)

# 2. 加载模型和tokenizer
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel
)
import bitsandbytes

# ===== Change 2: load the pretrained model from pretrain_model_path =====
model_name = pretrain_model_path
print(f"正在从 {model_name} 加载预训练模型")

# 4-bit quantization settings: NF4 quantization type, float16 compute dtype,
# double quantization disabled
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model (quantized via bnb_config) as a sequence classifier
# with one output per target class
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=n_classes,
    quantization_config=bnb_config,
    device_map="auto",
    ignore_mismatched_sizes=True
)
# Keep the model's pad token id in sync with the tokenizer
model.config.pad_token_id = tokenizer.pad_token_id

# Prepare the quantized model for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA hyper-parameters
lora_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=32,  # scaling factor
    target_modules=["q_proj", "v_proj"],  # target modules (Qwen2.5 attention layers)
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"  # sequence classification task
)

# Wrap the model with the LoRA adapters
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # print the trainable parameter count

# 3. 微调模型
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens.

    Args:
        examples: Batch mapping that contains a 'text' entry.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(examples["text"], **encode_options)

# Tokenize both datasets in batches
train_ds = train_ds.map(tokenize_function, batched=True)
val_ds = val_ds.map(tokenize_function, batched=True)

# Expose only the listed columns, as torch tensors
columns = ['input_ids', 'attention_mask', 'label']
train_ds.set_format(type='torch', columns=columns)
val_ds.set_format(type='torch', columns=columns)

# ===== Change 3: point the output directories at the platform's output path =====
lora_output_dir = os.path.join(you_should_save_here, "qwen_lora")
merged_output_dir = os.path.join(you_should_save_here, "qwen_merged")

# Create the output directories
os.makedirs(lora_output_dir, exist_ok=True)
os.makedirs(merged_output_dir, exist_ok=True)

# Training hyper-parameters. Evaluation and checkpointing both run once per
# epoch, and the checkpoint with the lowest loss is reloaded at the end.
training_args = TrainingArguments(
    output_dir=lora_output_dir,  # LoRA weights are saved to the designated output directory
    num_train_epochs=3,
    per_device_train_batch_size=16,  # can be increased as appropriate
    per_device_eval_batch_size=16,
    learning_rate=1e-4,  # LoRA usually uses a higher learning rate
    optim="paged_adamw_8bit",  # use the paged optimizer
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir=os.path.join(you_should_save_here, "logs"),
    report_to="none",
    fp16=True,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

# 自定义MAP@3指标
def compute_map3(eval_pred):
    """Compute mean average precision at 3 (MAP@3) for single-label predictions.

    Each example scores 1, 1/2 or 1/3 when its true label is ranked first,
    second or third by the logits, and 0 otherwise; the scores are averaged.

    Args:
        eval_pred: Pair ``(logits, labels)`` where ``logits`` is a 2-D array of
            per-class scores (one row per example) and ``labels`` is a 1-D
            array of integer class ids.

    Returns:
        ``{"map@3": value}`` with ``value`` a Python float in [0, 1].
        An empty evaluation set yields 0.0 instead of dividing by zero.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    if labels.size == 0:
        return {"map@3": 0.0}

    # Softmax is strictly increasing, so ranking the raw logits gives the same
    # order as ranking the probabilities; no conversion is needed.
    top3 = np.argsort(-logits, axis=1)[:, :3]
    match = top3 == labels[:, None]

    # Reciprocal-rank weights 1, 1/2, 1/3. The indices within a row are
    # distinct, so at most one position per row can match the label.
    weights = 1.0 / np.arange(1, top3.shape[1] + 1)
    map3 = float((match * weights).sum())
    return {"map@3": map3 / len(labels)}

# Create the Trainer; compute_map3 is reported as an extra evaluation metric
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_map3,
)

# Fine-tune the model
trainer.train()

# Save the LoRA weights together with the tokenizer
trainer.model.save_pretrained(lora_output_dir)
tokenizer.save_pretrained(lora_output_dir)

# 4. Merge the LoRA weights into the original model
print("\n===== 开始合并LoRA权重 =====")

# Free GPU memory held by the training model. Run the garbage collector
# before emptying the CUDA cache so that objects kept alive only by reference
# cycles are released first.
import gc

del model
del trainer
gc.collect()
torch.cuda.empty_cache()

# Reload the original model without quantization (float16) so the adapter
# can be merged into real weights
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=n_classes,
    torch_dtype=torch.float16,
    device_map="auto",
    ignore_mismatched_sizes=True
)
# Mirror the pad token id that was set on the training model; without it the
# saved merged checkpoint's config can lack a pad token id.
base_model.config.pad_token_id = tokenizer.pad_token_id

# Load the trained LoRA adapter on top of the base model
lora_model = PeftModel.from_pretrained(
    base_model,
    lora_output_dir,
    torch_dtype=torch.float16
)

# Fold the adapter into the base weights and drop the PEFT wrapper
merged_model = lora_model.merge_and_unload()

# Save the full merged model together with the tokenizer
merged_model.save_pretrained(merged_output_dir)
tokenizer.save_pretrained(merged_output_dir)
print(f"模型已保存至: {merged_output_dir}")

# ===== Required step: confirm the results were written to the designated output directory =====
print("\n===== 验证输出目录 =====")
print(f"输出目录内容: {os.listdir(you_should_save_here)}")
print(f"LoRA目录内容: {os.listdir(lora_output_dir)}")
print(f"合并模型目录内容: {os.listdir(merged_output_dir)}")

# Send the results back to the platform; per the original note, only training
# tasks are able to upload results
upload_output()