# Qwen - test on kaggle


In [10]:
# -*- coding: utf-8 -*-
"""
训练 + 评测（LoRA / 4bit / 单卡）
- 基座: Qwen/Qwen2.5-1.5B-Instruct
- 训练集: 两数据集合并 (mbti_sample_with_all_views.json + mbti_sample_with_all_views_pandora.json)
- Eval/Test: 只在指定的数据集上评测 (默认 Pandora)
- 输出: 指标 + 混淆矩阵 + ROC(micro/macro) + LoRA 适配器权重
"""
import os, json, random
from typing import Dict, Any, List
import numpy as np
import torch
import torch.nn.functional as F

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.preprocessing import label_binarize

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    Trainer, TrainingArguments,
    set_seed,
)
from peft import (
    LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
)

# ================== 配置 ==================
BASE_MODEL   = "Qwen/Qwen2.5-1.5B-Instruct"
DATA_A       = "mbti_sample_with_all_views.json"          # 旧数据
DATA_B       = "mbti_sample_with_all_views_pandora.json"  # Pandora
EVAL_ON      = "A"  # 只在哪个集上做 eval/test: "A" or "B"
OUTPUT_DIR   = "qwen-test-on-pandora"                      # 输出目录（含 lora）
RESUME_ADAPTER_DIR = None  # 若已有 LoRA 断点，可填入目录；否则置为 None

MAX_LEN      = 320
USE_4BIT     = True
SEED         = 42
NUM_LABELS   = 16

# LoRA 超参（可按需微调）
LORA_R       = 16
LORA_ALPHA   = 32
LORA_DROPOUT = 0.05
# Qwen2.5 常用目标模块
LORA_TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

# 训练超参（按你的显存情况调整）
BATCH_SIZE_PER_DEVICE_TRAIN = 8
BATCH_SIZE_PER_DEVICE_EVAL  = 8
GR_ACCUM_STEPS              = 1
EPOCHS                      = 3
LR                          = 2e-4
WARMUP_RATIO                = 0.05
LOGGING_STEPS               = 20
SAVE_STEPS                  = 500
EVAL_STEPS                  = 500

MBTI_16 = [
    "INTJ","INTP","ENTJ","ENTP","INFJ","INFP","ENFJ","ENFP",
    "ISTJ","ISFJ","ESTJ","ESFJ","ISTP","ISFP","ESTP","ESFP"
]
MBTI2ID = {t:i for i,t in enumerate(MBTI_16)}

# 与训练一致的多视角 token 预算
BUDGET = {"posts_cleaned": 192, "semantic_view": 64, "sentiment_view": 32, "linguistic_view": 24}

HF_TOKEN = os.getenv("HF_TOKEN")
HF_KW = {"token": HF_TOKEN} if HF_TOKEN else {}

# ================== 工具函数 ==================
def mbti_to_4d(m: str):
    m = m.upper()
    return (
        0 if m[0]=="I" else 1,
        0 if m[1]=="S" else 1,
        0 if m[2]=="F" else 1,
        0 if m[3]=="P" else 1,
    )

def truncate_to_budget(tok: AutoTokenizer, text: str, budget: int) -> str:
    enc = tok(text or "", add_special_tokens=False)
    ids = enc["input_ids"][: budget]
    return tok.decode(ids)

def build_input(item: Dict[str, Any], tok: AutoTokenizer) -> str:
    p_raw = item.get("posts_cleaned", item.get("posts","")) or item.get("text","") or ""
    sem   = item.get("semantic_view","")  or ""
    sen   = item.get("sentiment_view","") or ""
    lin   = item.get("linguistic_view","") or ""

    p   = truncate_to_budget(tok, p_raw, BUDGET["posts_cleaned"])
    sem = truncate_to_budget(tok, sem,   BUDGET["semantic_view"])
    sen = truncate_to_budget(tok, sen,   BUDGET["sentiment_view"])
    lin = truncate_to_budget(tok, lin,   BUDGET["linguistic_view"])

    return (
        f"[POSTS]\n{p}\n[SEMANTIC]\n{sem}\n[SENTIMENT]\n{sen}\n[LINGUISTIC]\n{lin}\n"
        f"[TASK] Predict MBTI type among {', '.join(MBTI_16)}."
    )

def load_rows(path: str):
    with open(path, "r", encoding="utf-8") as f:
        rows = json.load(f)
    clean = []
    for r in rows:
        t = (r.get("type") or r.get("label") or "").upper().strip()
        if t in MBTI2ID:
            r["type"] = t
            clean.append(r)
    return clean

class MBTIDataset(torch.utils.data.Dataset):
    def __init__(self, rows, tokenizer, max_len=512):
        self.rows = rows
        self.tok  = tokenizer
        self.max_len = max_len
    def __len__(self): return len(self.rows)
    def __getitem__(self, idx):
        it   = self.rows[idx]
        text = build_input(it, self.tok)
        y    = MBTI2ID[it["type"]]
        enc  = self.tok(text, truncation=True, max_length=self.max_len)
        return {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"], "labels": y}

def compute_metrics(eval_pred):
    preds, labels = (eval_pred if isinstance(eval_pred, tuple)
                     else (eval_pred.predictions, eval_pred.label_ids))
    if isinstance(preds, (list, tuple)): preds = preds[0]
    preds = np.asarray(preds); labels = np.asarray(labels)
    pred_ids = preds.argmax(-1)
    acc16 = float((pred_ids == labels).mean())

    pred_types = [MBTI_16[i] for i in pred_ids]
    true_types = [MBTI_16[i] for i in labels]
    c_ei=c_ns=c_tf=c_jp=c_all=0
    for pt, tt in zip(pred_types, true_types):
        pei,pns,ptf,pjp = mbti_to_4d(pt)
        tei,tns,ttf,tjp = mbti_to_4d(tt)
        c_ei += (pei==tei); c_ns += (pns==tns); c_tf += (ptf==ttf); c_jp += (pjp==tjp)
        c_all+= (pei==tei and pns==tns and ptf==ttf and pjp==tjp)
    n = len(labels)
    return {"acc_16": acc16, "acc_ei": c_ei/n, "acc_ns": c_ns/n, "acc_tf": c_tf/n, "acc_jp": c_jp/n, "acc_4D": c_all/n}

def plot_confusion_and_roc(y_true, y_prob, class_names, out_dir, tag="eval"):
    os.makedirs(out_dir, exist_ok=True)
    y_pred = np.argmax(y_prob, axis=-1)
    cm = confusion_matrix(y_true, y_pred, labels=list(range(len(class_names))))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    fig_cm, ax_cm = plt.subplots(figsize=(8, 8), dpi=150)
    disp.plot(ax=ax_cm, xticks_rotation=45, cmap="Blues", colorbar=False)
    ax_cm.set_title(f"Confusion Matrix ({tag})")
    fig_cm.tight_layout()
    fig_cm.savefig(os.path.join(out_dir, f"{tag}_confusion_matrix.png"))
    plt.close(fig_cm)

    # ROC：跳过评测集中没有正样本的类
    Y_true_bin = label_binarize(y_true, classes=list(range(len(class_names))))
    fpr, tpr, roc_auc = {}, {}, {}
    valid = []
    for i in range(len(class_names)):
        if Y_true_bin[:, i].sum() == 0:
            continue
        fpr[i], tpr[i], _ = roc_curve(Y_true_bin[:, i], y_prob[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
        valid.append(i)
    if len(valid) >= 2:
        fpr["micro"], tpr["micro"], _ = roc_curve(
            Y_true_bin[:, valid].ravel(), y_prob[:, valid].ravel()
        )
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
        all_fpr = np.unique(np.concatenate([fpr[i] for i in valid]))
        mean_tpr = np.zeros_like(all_fpr)
        for i in valid:
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
        mean_tpr /= len(valid)
        fpr["macro"] = all_fpr; tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        fig_roc, ax_roc = plt.subplots(figsize=(7, 7), dpi=150)
        ax_roc.plot(fpr["micro"], tpr["micro"],
                    label=f"micro-average ROC (AUC = {roc_auc['micro']:.3f})", linewidth=2)
        ax_roc.plot(fpr["macro"], tpr["macro"],
                    label=f"macro-average ROC (AUC = {roc_auc['macro']:.3f})", linewidth=2)
        ax_roc.plot([0, 1], [0, 1], "k--", linewidth=1)
        ax_roc.set_xlim([0.0, 1.0]); ax_roc.set_ylim([0.0, 1.05])
        ax_roc.set_xlabel("False Positive Rate"); ax_roc.set_ylabel("True Positive Rate")
        ax_roc.set_title(f"Multiclass ROC ({tag})")
        ax_roc.legend(loc="lower right")
        fig_roc.tight_layout()
        fig_roc.savefig(os.path.join(out_dir, f"{tag}_roc_micro_macro.png"))
        plt.close(fig_roc)

# ================== 主流程 ==================
def main():
    # 环境 & 种子
    os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
    os.environ["BITSANDBYTES_NOWELCOME"] = "1"
    torch.cuda.set_device(0)
    set_seed(SEED)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL, use_fast=True, trust_remote_code=True, **HF_KW
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # 量化
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=USE_4BIT,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    ) if USE_4BIT else None

    # 分类头：num_labels=16
    base_cfg = AutoConfig.from_pretrained(BASE_MODEL, trust_remote_code=True, **HF_KW)
    base_cfg.num_labels = NUM_LABELS

    # 基座
    base = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL,
        config=base_cfg,
        device_map={"": "cuda:0"},
        quantization_config=quant_cfg,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        **HF_KW,
    )

    # ========= LoRA：新训或续训 =========
    if RESUME_ADAPTER_DIR:
        # 从已训练的 LoRA 继续
        model = PeftModel.from_pretrained(base, RESUME_ADAPTER_DIR, is_trainable=True)
    else:
        # 新建 LoRA
        base = prepare_model_for_kbit_training(base)  # 4bit 可训练准备
        lora_cfg = LoraConfig(
            r=LORA_R,
            lora_alpha=LORA_ALPHA,
            target_modules=LORA_TARGET_MODULES,
            lora_dropout=LORA_DROPOUT,
            bias="none",
            task_type="SEQ_CLS",
        )
        model = get_peft_model(base, lora_cfg)

    model.config.use_cache = False
    model.config.pad_token_id = tokenizer.pad_token_id
    model.print_trainable_parameters()

    # ========= 数据 =========
    rows_A = load_rows(DATA_A)
    rows_B = load_rows(DATA_B)

    # 训练集 = A ∪ B
    train_rows: List[Dict[str, Any]] = rows_A + rows_B
    random.Random(SEED).shuffle(train_rows)

    # eval/test 只用指定一个数据集（默认 B=Pandora）
    if EVAL_ON.upper() == "A":
        eval_rows = rows_A
        eval_tag  = "A_eval"
    else:
        eval_rows = rows_B
        eval_tag  = "B_eval"

    # （可选）从 eval_rows 再划一个 test 子集；这里简单按 80/20 切
    cut = int(0.8 * len(eval_rows)) if len(eval_rows) > 5 else len(eval_rows)
    test_rows = eval_rows[cut:]
    eval_rows = eval_rows[:cut] if cut > 0 else eval_rows

    # 构建数据集
    train_ds = MBTIDataset(train_rows, tokenizer, max_len=MAX_LEN)
    eval_ds  = MBTIDataset(eval_rows,  tokenizer, max_len=MAX_LEN)
    test_ds  = MBTIDataset(test_rows,  tokenizer, max_len=MAX_LEN) if test_rows else None
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

    # ========= 训练参数 =========
    from transformers import TrainingArguments

    common_kwargs = dict(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BATCH_SIZE_PER_DEVICE_TRAIN,
        per_device_eval_batch_size=BATCH_SIZE_PER_DEVICE_EVAL,
        gradient_accumulation_steps=GR_ACCUM_STEPS,
        learning_rate=LR,
        num_train_epochs=EPOCHS,
        warmup_ratio=WARMUP_RATIO,
        logging_steps=LOGGING_STEPS,
        eval_steps=EVAL_STEPS,
        save_steps=SAVE_STEPS,
        save_total_limit=2,
        lr_scheduler_type="cosine",
        report_to="none",
        fp16=False, bf16=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_acc_16",
        greater_is_better=True,
        # 这两个在新老版本都存在，显式写上更稳
        logging_strategy="steps",
        save_strategy="steps",
    )

    # 依次尝试新/旧/远古命名，保证不同版本都能跑
    try:
        args = TrainingArguments(eval_strategy="steps", **common_kwargs)
    except TypeError:
        try:
            args = TrainingArguments(evaluation_strategy="steps", **common_kwargs)
        except TypeError:
            # 超老版本（3.x）兜底
            args = TrainingArguments(evaluate_during_training=True, **common_kwargs)


    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )

    # ========= 训练 =========
    trainer.train()

    # 保存 LoRA（适配器）
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    try:
        model.save_pretrained(os.path.join(OUTPUT_DIR, "lora_adapter"))
    except Exception as e:
        print("Save adapter failed:", e)

    # ========= Eval（在指定集）=========
    eval_output = trainer.predict(eval_ds)
    logits = eval_output.predictions
    if isinstance(logits, (list, tuple)):
        logits = logits[0]
    probs = F.softmax(torch.tensor(logits, dtype=torch.float32), dim=-1).cpu().numpy()
    y_true = eval_output.label_ids
    plot_confusion_and_roc(y_true, probs, MBTI_16, OUTPUT_DIR, tag=f"{eval_tag}")

    metrics = trainer.evaluate(eval_dataset=eval_ds)
    print("\n=== Eval on chosen dataset ===")
    for k, v in metrics.items():
        try:
            print(f"{k}: {float(v):.4f}")
        except Exception:
            print(k, v)

    # ========= Test（同一数据集的 hold-out 部分）=========
    if test_ds and len(test_ds) > 0:
        test_output = trainer.predict(test_ds)
        logits = test_output.predictions
        if isinstance(logits, (list, tuple)):
            logits = logits[0]
        probs = F.softmax(torch.tensor(logits, dtype=torch.float32), dim=-1).cpu().numpy()
        y_true = test_output.label_ids
        plot_confusion_and_roc(y_true, probs, MBTI_16, OUTPUT_DIR, tag=f"{eval_tag}_test")

        # 简单整体准确率
        pred_ids = probs.argmax(-1)
        acc = float((pred_ids == y_true).mean())
        print(f"\n=== Test accuracy on chosen dataset: {acc:.4f}")

    # ========= 示例推理 =========
    model.eval()
    sample = (rows_B[0] if EVAL_ON.upper()=="B" else rows_A[0]) if (rows_A and rows_B) else (train_rows[0])
    text = build_input(sample, tokenizer)
    batch = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LEN)
    batch = {k: v.to("cuda:0") for k, v in batch.items()}
    with torch.no_grad():
        out = model(**batch).logits
        pred_id = int(torch.argmax(out, dim=-1))
        print("样例原标签:", sample["type"], "| 预测:", MBTI_16[pred_id])

if __name__ == "__main__":
    main()


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 18,489,344 || all params: 1,562,228,224 || trainable%: 1.1835


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Acc 16,Acc Ei,Acc Ns,Acc Tf,Acc Jp,Acc 4d
500,2.2856,1.825324,0.446024,0.786593,0.716327,0.737668,0.785675,0.446024
1000,1.9074,0.707887,0.805657,0.915813,0.939688,0.913756,0.88775,0.805657
1500,1.691,0.994345,0.732305,0.898439,0.934435,0.868797,0.852489,0.732305
2000,1.8413,0.575598,0.810358,0.938439,0.960808,0.923453,0.888411,0.810358
2500,1.5904,0.645246,0.806575,0.915813,0.95034,0.90685,0.889734,0.806575
3000,1.5582,0.553937,0.843269,0.938512,0.958457,0.928007,0.910854,0.843269
3500,1.5765,0.514555,0.848191,0.939394,0.962645,0.932158,0.917833,0.848191
4000,1.5154,0.513039,0.850101,0.941965,0.958457,0.934068,0.912103,0.850101
4500,1.6423,0.4894,0.853186,0.942736,0.962314,0.932672,0.920037,0.853186
5000,1.4736,0.496077,0.85157,0.943691,0.950303,0.933701,0.919229,0.85157


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*a


=== Eval on chosen dataset ===
eval_loss: 0.1203
eval_acc_16: 0.9662
eval_acc_ei: 0.9870
eval_acc_ns: 0.9908
eval_acc_tf: 0.9872
eval_acc_jp: 0.9805
eval_acc_4D: 0.9662
eval_runtime: 532.5159
eval_samples_per_second: 51.1250
eval_steps_per_second: 6.3920
epoch: 3.0000

=== Test accuracy on chosen dataset: 1.0000
样例原标签: INFJ | 预测: INFJ


In [1]:
# -*- coding: utf-8 -*-
"""
训练 + 评测（LoRA / 4bit / 单卡）
- 基座: Qwen/Qwen2.5-1.5B-Instruct
- 训练集: A ∪ B_train（分层8:2后B的80%）
- Eval/Test: eval 在 B_train，test 在 B_test（B的20%）
- 输出: 指标 + 混淆矩阵 + ROC(micro/macro) + LoRA 适配器权重
"""
import os, json, random
from typing import Dict, Any, List
from collections import defaultdict
import numpy as np
import torch
import torch.nn.functional as F

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.preprocessing import label_binarize

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    Trainer, TrainingArguments,
    set_seed,
)
from peft import (
    LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
)

# ================== 配置 ==================
BASE_MODEL   = "Qwen/Qwen2.5-1.5B-Instruct"
DATA_A       = "mbti_sample_with_all_views.json"          # 旧数据（A）
DATA_B       = "mbti_sample_with_all_views_pandora.json"  # Pandora（B）
EVAL_ON      = "A"  # 只在哪个集上做 eval/test: "A" or "B"
OUTPUT_DIR   = "qwen-test-on-pandora_new"                     # 输出目录（含 LoRA）
RESUME_ADAPTER_DIR = None  # 若已有 LoRA 断点，可填入目录；否则置为 None


MAX_LEN      = 320
USE_4BIT     = True
SEED         = 42
NUM_LABELS   = 16

# LoRA 超参（可按需微调）
LORA_R       = 16
LORA_ALPHA   = 32
LORA_DROPOUT = 0.05
# Qwen2.5 常用目标模块
LORA_TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

# 训练超参（按你的显存情况调整）
BATCH_SIZE_PER_DEVICE_TRAIN = 8
BATCH_SIZE_PER_DEVICE_EVAL  = 8
GR_ACCUM_STEPS              = 1
EPOCHS                      = 3
LR                          = 2e-4
WARMUP_RATIO                = 0.05
LOGGING_STEPS               = 20
SAVE_STEPS                  = 500
EVAL_STEPS                  = 500

MBTI_16 = [
    "INTJ","INTP","ENTJ","ENTP","INFJ","INFP","ENFJ","ENFP",
    "ISTJ","ISFJ","ESTJ","ESFJ","ISTP","ISFP","ESTP","ESFP"
]
MBTI2ID = {t:i for i,t in enumerate(MBTI_16)}

# 与训练一致的多视角 token 预算
BUDGET = {"posts_cleaned": 192, "semantic_view": 64, "sentiment_view": 32, "linguistic_view": 24}

HF_TOKEN = os.getenv("HF_TOKEN")
HF_KW = {"token": HF_TOKEN} if HF_TOKEN else {}

# ================== 工具函数 ==================
def mbti_to_4d(m: str):
    m = m.upper()
    return (
        0 if m[0]=="I" else 1,
        0 if m[1]=="S" else 1,
        0 if m[2]=="F" else 1,
        0 if m[3]=="P" else 1,
    )

def truncate_to_budget(tok: AutoTokenizer, text: str, budget: int) -> str:
    enc = tok(text or "", add_special_tokens=False)
    ids = enc["input_ids"][: budget]
    return tok.decode(ids)

def build_input(item: Dict[str, Any], tok: AutoTokenizer) -> str:
    p_raw = item.get("posts_cleaned", item.get("posts","")) or item.get("text","") or ""
    sem   = item.get("semantic_view","")  or ""
    sen   = item.get("sentiment_view","") or ""
    lin   = item.get("linguistic_view","") or ""

    p   = truncate_to_budget(tok, p_raw, BUDGET["posts_cleaned"])
    sem = truncate_to_budget(tok, sem,   BUDGET["semantic_view"])
    sen = truncate_to_budget(tok, sen,   BUDGET["sentiment_view"])
    lin = truncate_to_budget(tok, lin,   BUDGET["linguistic_view"])

    return (
        f"[POSTS]\n{p}\n[SEMANTIC]\n{sem}\n[SENTIMENT]\n{sen}\n[LINGUISTIC]\n{lin}\n"
        f"[TASK] Predict MBTI type among {', '.join(MBTI_16)}."
    )

def load_rows(path: str):
    with open(path, "r", encoding="utf-8") as f:
        rows = json.load(f)
    clean = []
    for r in rows:
        t = (r.get("type") or r.get("label") or "").upper().strip()
        if t in MBTI2ID:
            r["type"] = t
            clean.append(r)
    return clean

def stratified_split_by_type(rows, ratio=0.8, seed=42):
    """按 16 类型分层切分 rows -> (train_part, test_part)"""
    buckets = defaultdict(list)
    for r in rows:
        buckets[r["type"]].append(r)

    rng = random.Random(seed)
    train, test = [], []
    for t, lst in buckets.items():
        rng.shuffle(lst)
        n = len(lst)
        if n <= 1:
            train.extend(lst)              # 极小类：全进训练
            continue
        cut = int(round(n * ratio))
        cut = min(max(1, cut), n - 1)      # 保证两边都有样本
        train.extend(lst[:cut])
        test.extend(lst[cut:])
    return train, test

class MBTIDataset(torch.utils.data.Dataset):
    def __init__(self, rows, tokenizer, max_len=512):
        self.rows = rows
        self.tok  = tokenizer
        self.max_len = max_len
    def __len__(self): return len(self.rows)
    def __getitem__(self, idx):
        it   = self.rows[idx]
        text = build_input(it, self.tok)
        y    = MBTI2ID[it["type"]]
        enc  = self.tok(text, truncation=True, max_length=self.max_len)
        return {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"], "labels": y}

def compute_metrics(eval_pred):
    preds, labels = (eval_pred if isinstance(eval_pred, tuple)
                     else (eval_pred.predictions, eval_pred.label_ids))
    if isinstance(preds, (list, tuple)): preds = preds[0]
    preds = np.asarray(preds); labels = np.asarray(labels)
    pred_ids = preds.argmax(-1)
    acc16 = float((pred_ids == labels).mean())

    pred_types = [MBTI_16[i] for i in pred_ids]
    true_types = [MBTI_16[i] for i in labels]
    c_ei=c_ns=c_tf=c_jp=c_all=0
    for pt, tt in zip(pred_types, true_types):
        pei,pns,ptf,pjp = mbti_to_4d(pt)
        tei,tns,ttf,tjp = mbti_to_4d(tt)
        c_ei += (pei==tei); c_ns += (pns==tns); c_tf += (ptf==ttf); c_jp += (pjp==tjp)
        c_all+= (pei==tei and pns==tns and ptf==ttf and pjp==tjp)
    n = len(labels)
    return {"acc_16": acc16, "acc_ei": c_ei/n, "acc_ns": c_ns/n, "acc_tf": c_tf/n, "acc_jp": c_jp/n, "acc_4D": c_all/n}

def plot_confusion_and_roc(y_true, y_prob, class_names, out_dir, tag="eval"):
    os.makedirs(out_dir, exist_ok=True)
    y_pred = np.argmax(y_prob, axis=-1)
    cm = confusion_matrix(y_true, y_pred, labels=list(range(len(class_names))))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    fig_cm, ax_cm = plt.subplots(figsize=(8, 8), dpi=150)
    disp.plot(ax=ax_cm, xticks_rotation=45, cmap="Blues", colorbar=False)
    ax_cm.set_title(f"Confusion Matrix ({tag})")
    fig_cm.tight_layout()
    fig_cm.savefig(os.path.join(out_dir, f"{tag}_confusion_matrix.png"))
    plt.close(fig_cm)

    # ROC：跳过评测集中没有正样本的类
    Y_true_bin = label_binarize(y_true, classes=list(range(len(class_names))))
    fpr, tpr, roc_auc = {}, {}, {}
    valid = []
    for i in range(len(class_names)):
        if Y_true_bin[:, i].sum() == 0:
            continue
        fpr[i], tpr[i], _ = roc_curve(Y_true_bin[:, i], y_prob[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
        valid.append(i)
    if len(valid) >= 2:
        fpr["micro"], tpr["micro"], _ = roc_curve(
            Y_true_bin[:, valid].ravel(), y_prob[:, valid].ravel()
        )
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
        all_fpr = np.unique(np.concatenate([fpr[i] for i in valid]))
        mean_tpr = np.zeros_like(all_fpr)
        for i in valid:
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
        mean_tpr /= len(valid)
        fpr["macro"] = all_fpr; tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        fig_roc, ax_roc = plt.subplots(figsize=(7, 7), dpi=150)
        ax_roc.plot(fpr["micro"], tpr["micro"],
                    label=f"micro-average ROC (AUC = {roc_auc['micro']:.3f})", linewidth=2)
        ax_roc.plot(fpr["macro"], tpr["macro"],
                    label=f"macro-average ROC (AUC = {roc_auc['macro']:.3f})", linewidth=2)
        ax_roc.plot([0, 1], [0, 1], "k--", linewidth=1)
        ax_roc.set_xlim([0.0, 1.0]); ax_roc.set_ylim([0.0, 1.05])
        ax_roc.set_xlabel("False Positive Rate"); ax_roc.set_ylabel("True Positive Rate")
        ax_roc.set_title(f"Multiclass ROC ({tag})")
        ax_roc.legend(loc="lower right")
        fig_roc.tight_layout()
        fig_roc.savefig(os.path.join(out_dir, f"{tag}_roc_micro_macro.png"))
        plt.close(fig_roc)

# ================== 主流程 ==================
def main():
    # 环境 & 种子
    os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
    os.environ["BITSANDBYTES_NOWELCOME"] = "1"
    torch.cuda.set_device(0)
    set_seed(SEED)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL, use_fast=True, trust_remote_code=True, **HF_KW
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # 量化
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=USE_4BIT,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    ) if USE_4BIT else None

    # 分类头：num_labels=16
    base_cfg = AutoConfig.from_pretrained(BASE_MODEL, trust_remote_code=True, **HF_KW)
    base_cfg.num_labels = NUM_LABELS

    # 基座
    base = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL,
        config=base_cfg,
        device_map={"": "cuda:0"},
        quantization_config=quant_cfg,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        **HF_KW,
    )

    # ========= LoRA：新训或续训 =========
    if RESUME_ADAPTER_DIR:
        model = PeftModel.from_pretrained(base, RESUME_ADAPTER_DIR, is_trainable=True)
    else:
        base = prepare_model_for_kbit_training(base)  # 4bit 可训练准备
        lora_cfg = LoraConfig(
            r=LORA_R,
            lora_alpha=LORA_ALPHA,
            target_modules=LORA_TARGET_MODULES,
            lora_dropout=LORA_DROPOUT,
            bias="none",
            task_type="SEQ_CLS",
        )
        model = get_peft_model(base, lora_cfg)

    model.config.use_cache = False
    model.config.pad_token_id = tokenizer.pad_token_id
    model.print_trainable_parameters()

    # ========= 数据（分层切分 + 落盘，按 EVAL_ON 选择 A 或 B）=========
    rows_A = load_rows(DATA_A)
    rows_B = load_rows(DATA_B)

    def stratified_split_by_type(rows, ratio=0.8, seed=SEED):
        from collections import defaultdict
        rng = random.Random(seed)
        buckets = defaultdict(list)
        for r in rows:
            buckets[r["type"]].append(r)
        train, test = [], []
        for t, lst in buckets.items():
            rng.shuffle(lst)
            n = len(lst)
            if n <= 1:
                train.extend(lst)
                continue
            cut = int(round(n * ratio))
            cut = min(max(1, cut), n - 1)  # 保证两边都有样本
            train.extend(lst[:cut])
            test.extend(lst[cut:])
        return train, test

    if EVAL_ON.upper() == "A":
        chosen_all   = rows_A     # 只对 A 做分层切分并用于 eval/test
        other_all    = rows_B     # B 整集全部进训练
        split_tag    = "A"
    else:
        chosen_all   = rows_B     # 只对 B 做分层切分并用于 eval/test
        other_all    = rows_A     # A 整集全部进训练
        split_tag    = "B"

    split_dir = os.path.join(OUTPUT_DIR, f"splits_stratified_{split_tag}")
    os.makedirs(split_dir, exist_ok=True)
    train_path = os.path.join(split_dir, f"{split_tag}_train_80.json")
    test_path  = os.path.join(split_dir, f"{split_tag}_test_20.json")

    if os.path.exists(train_path) and os.path.exists(test_path):
        chosen_train = json.load(open(train_path, "r", encoding="utf-8"))
        chosen_test  = json.load(open(test_path,  "r", encoding="utf-8"))
        print(f"[Split-{split_tag}] Loaded existing: train={len(chosen_train)}, test={len(chosen_test)}")
    else:
        chosen_train, chosen_test = stratified_split_by_type(chosen_all, ratio=0.8, seed=SEED)
        json.dump(chosen_train, open(train_path, "w", encoding="utf-8"), ensure_ascii=False, indent=2)
        json.dump(chosen_test,  open(test_path,  "w", encoding="utf-8"), ensure_ascii=False, indent=2)
        print(f"[Split-{split_tag}] Saved: train={len(chosen_train)}, test={len(chosen_test)}")

    # 训练集 = 未选中整集 + 选中集的 80%
    train_rows: List[Dict[str, Any]] = other_all + chosen_train
    random.Random(SEED).shuffle(train_rows)

    # Eval/Test 只用选中的那个集（保持与你预期一致）
    eval_rows = chosen_train
    test_rows = chosen_test
    eval_tag  = f"{split_tag}_eval_stratified"

    # 构建数据集
    train_ds = MBTIDataset(train_rows, tokenizer, max_len=MAX_LEN)
    eval_ds  = MBTIDataset(eval_rows,  tokenizer, max_len=MAX_LEN)
    test_ds  = MBTIDataset(test_rows,  tokenizer, max_len=MAX_LEN) if test_rows else None
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

    # ========= 训练参数 =========
    common_kwargs = dict(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BATCH_SIZE_PER_DEVICE_TRAIN,
        per_device_eval_batch_size=BATCH_SIZE_PER_DEVICE_EVAL,
        gradient_accumulation_steps=GR_ACCUM_STEPS,
        learning_rate=LR,
        num_train_epochs=EPOCHS,
        warmup_ratio=WARMUP_RATIO,
        logging_steps=LOGGING_STEPS,
        eval_steps=EVAL_STEPS,
        save_steps=SAVE_STEPS,
        save_total_limit=2,
        lr_scheduler_type="cosine",
        report_to="none",
        fp16=False, bf16=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_acc_16",
        greater_is_better=True,
        logging_strategy="steps",
        save_strategy="steps",
    )
    try:
        args = TrainingArguments(eval_strategy="steps", **common_kwargs)
    except TypeError:
        try:
            args = TrainingArguments(evaluation_strategy="steps", **common_kwargs)
        except TypeError:
            args = TrainingArguments(evaluate_during_training=True, **common_kwargs)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )

    # ========= 训练 =========
    trainer.train()

    # 保存 LoRA（适配器）
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    try:
        model.save_pretrained(os.path.join(OUTPUT_DIR, "lora_adapter"))
    except Exception as e:
        print("Save adapter failed:", e)

    # ========= Eval（B_train）=========
    eval_output = trainer.predict(eval_ds)
    logits = eval_output.predictions
    if isinstance(logits, (list, tuple)):
        logits = logits[0]
    probs = F.softmax(torch.tensor(logits, dtype=torch.float32), dim=-1).cpu().numpy()
    y_true = eval_output.label_ids
    plot_confusion_and_roc(y_true, probs, MBTI_16, OUTPUT_DIR, tag=f"{eval_tag}")

    metrics = trainer.evaluate(eval_dataset=eval_ds)
    print("\n=== Eval on chosen dataset ===")
    for k, v in metrics.items():
        try:
            print(f"{k}: {float(v):.4f}")
        except Exception:
            print(k, v)

    # ========= Test（B_test）=========
    if test_ds and len(test_ds) > 0:
        test_output = trainer.predict(test_ds)
        logits = test_output.predictions
        if isinstance(logits, (list, tuple)):
            logits = logits[0]
        probs = F.softmax(torch.tensor(logits, dtype=torch.float32), dim=-1).cpu().numpy()
        y_true = test_output.label_ids
        plot_confusion_and_roc(y_true, probs, MBTI_16, OUTPUT_DIR, tag=f"{eval_tag}_test")

        pred_ids = probs.argmax(-1)
        acc = float((pred_ids == y_true).mean())
        print(f"\n=== Test accuracy on chosen dataset: {acc:.4f}")

    # ========= 示例推理 =========
    model.eval()
    sample = (rows_B_all[0] if rows_B_all else rows_A[0])
    text = build_input(sample, tokenizer)
    batch = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LEN)
    batch = {k: v.to("cuda:0") for k, v in batch.items()}
    with torch.no_grad():
        out = model(**batch).logits
        pred_id = int(torch.argmax(out, dim=-1))
        print("样例原标签:", sample["type"], "| 预测:", MBTI_16[pred_id])

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 18,489,344 || all params: 1,562,228,224 || trainable%: 1.1835
[Split-A] Saved: train=27226, test=6806


  trainer = Trainer(
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Acc 16,Acc Ei,Acc Ns,Acc Tf,Acc Jp,Acc 4d
500,2.1746,0.981938,0.741387,0.91071,0.93194,0.873063,0.859693,0.741387
1000,2.042,0.596887,0.828326,0.914787,0.931352,0.929002,0.910527,0.828326
1500,1.971,0.632225,0.829685,0.935319,0.923602,0.929259,0.904907,0.829685
2000,1.9686,0.546939,0.835415,0.933666,0.942702,0.937449,0.905201,0.835415
2500,1.7423,0.48531,0.858334,0.944281,0.944795,0.93947,0.922611,0.858334
3000,1.5415,0.587856,0.830566,0.912033,0.94913,0.948799,0.922905,0.830566
3500,1.7349,0.455448,0.865643,0.944281,0.952876,0.944612,0.924704,0.865643
4000,1.5818,0.439966,0.867112,0.944648,0.958606,0.945383,0.923235,0.867112
4500,1.7252,0.427858,0.870675,0.953794,0.968413,0.947366,0.924668,0.870675
5000,1.7417,0.410553,0.879307,0.952545,0.969992,0.946705,0.929846,0.879307


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*a


=== Eval on chosen dataset ===
eval_loss: 0.0890
eval_acc_16: 0.9743
eval_acc_ei: 0.9908
eval_acc_ns: 0.9938
eval_acc_tf: 0.9896
eval_acc_jp: 0.9848
eval_acc_4D: 0.9743
eval_runtime: 542.2758
eval_samples_per_second: 50.2070
eval_steps_per_second: 6.2770
epoch: 3.0000

=== Test accuracy on chosen dataset: 0.8892


NameError: name 'rows_B_all' is not defined