In [None]:
# -*- coding: utf-8 -*-
"""
将 mbti_sample_with_all_views.json 分层切分为 8:1:1 的 train/val/test
随机种子固定为 42，确保可复现
"""
import json
from pathlib import Path
from sklearn.model_selection import train_test_split

# The 16 MBTI type labels accepted by this script.
MBTI_16 = [
    "INTJ","INTP","ENTJ","ENTP","INFJ","INFP","ENFJ","ENFP",
    "ISTJ","ISFJ","ESTJ","ESFJ","ISTP","ISFP","ESTP","ESFP"
]
# Label -> index in MBTI_16; load_rows uses membership in this dict to keep valid rows.
MBTI2ID = {t: i for i, t in enumerate(MBTI_16)}

def load_rows(path: Path):
    """Read the JSON file at ``path`` and return the rows that carry a known MBTI label.

    A row is kept only when it is a dict whose "type" value is a key of MBTI2ID.

    Raises:
        ValueError: when no row passes the filter.
    """
    with path.open("r", encoding="utf-8") as handle:
        loaded = json.load(handle)

    valid = []
    for record in loaded:
        if not isinstance(record, dict):
            continue
        if record.get("type") in MBTI2ID:
            valid.append(record)

    if len(valid) == 0:
        raise ValueError("输入文件中没有合法样本（缺少 'type' 或不在 16 类里）。")
    return valid

def save_json(rows, path: Path):
    """Write ``rows`` to ``path`` as UTF-8 JSON (2-space indent, non-ASCII kept as-is).

    Missing parent directories of ``path`` are created first.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    with open(path, mode="w", encoding="utf-8") as sink:
        json.dump(rows, sink, indent=2, ensure_ascii=False)

def main(
    input_file="mbti_sample_with_all_views.json",
    outdir=".",
    seed=42
):
    """Split ``input_file`` into stratified train/val/test JSON files inside ``outdir``.

    Both splits are stratified on each row's "type" and use ``seed`` as random_state.
    The resulting files are train.json, val.json and test.json; one summary line is
    printed per file.
    """
    source_path = Path(input_file)
    out = Path(outdir)

    all_rows = load_rows(source_path)
    all_labels = [row["type"] for row in all_rows]

    # Stage 1: hold out 10% of everything as TEST (stratified).
    trainval_rows, test_rows = train_test_split(
        all_rows, test_size=0.10, random_state=seed, stratify=all_labels
    )

    # Stage 2: carve VAL out of the remaining 90%; 0.1 / 0.9 of it is 10% of the total.
    trainval_labels = [row["type"] for row in trainval_rows]
    train_rows, val_rows = train_test_split(
        trainval_rows,
        test_size=0.1111111111,
        random_state=seed,
        stratify=trainval_labels,
    )

    splits = (
        ("train.json", train_rows),
        ("val.json", val_rows),
        ("test.json", test_rows),
    )
    for filename, subset in splits:
        save_json(subset, out / filename)
    for filename, subset in splits:
        print(f"✅ 已保存：{out / filename}（{len(subset)} 条）")

if __name__ == "__main__":
    main()


✅ 已保存：train.json（27224 条）
✅ 已保存：val.json（3404 条）
✅ 已保存：test.json（3404 条）


In [1]:
# -*- coding: utf-8 -*-
"""
Qwen2.5-1.5B-Instruct + QLoRA(4bit) + LoRA
读取 train/val/test.json 进行训练与评测（VAL & TEST）
Transformers==4.55
"""
import os, json
from typing import Dict, Any

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

import numpy as np
import torch
import torch.nn.functional as F

from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    roc_curve, auc
)
from sklearn.preprocessing import label_binarize

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    Trainer, TrainingArguments,
    set_seed,
)

# Environment switches, set before the model is loaded.
os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
os.environ["BITSANDBYTES_NOWELCOME"] = "1"

# ============ Configuration ============
MODEL_NAME   = "Qwen/Qwen2.5-1.5B-Instruct"
# max_length used by MBTIDataset and by the sample inference in main().
MAX_LEN      = 440
# Per-field token budgets applied by build_input through truncate_to_budget.
# NOTE(review): the budgets already add up to MAX_LEN (320+64+32+24 = 440), so the section
# tags and the [TASK] line added by build_input presumably get cut by the final
# truncation to MAX_LEN — confirm this is intended.
BUDGET = {"posts_cleaned": 320, "semantic_view": 64, "sentiment_view": 32, "linguistic_view": 24}


# Training hyper-parameters consumed by TrainingArguments in main().
SEED         = 42
EPOCHS       = 4
LR           = 2e-4
BSZ_TRN      = 8
BSZ_EVAL     = 4
GRAD_ACCUM   = 1
WARMUP_RATIO = 0.06
WEIGHT_DECAY = 0.01
# Checkpoints, figures and the saved model all go to this directory.
OUTPUT_DIR   = "mbti_lora_qwen1.5b-split_kaggle_ckpt_new"

# 4-bit quantization switch and LoRA settings passed to BitsAndBytesConfig / LoraConfig.
USE_4BIT     = True
LORA_R       = 16
LORA_ALPHA   = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

# Optional Hugging Face token from the environment; HF_KW stays empty when it is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_KW = {"token": HF_TOKEN} if HF_TOKEN else {}

# The 16 MBTI labels; a label's index in this list is its class id.
MBTI_16 = [
    "INTJ","INTP","ENTJ","ENTP","INFJ","INFP","ENFJ","ENFP",
    "ISTJ","ISFJ","ESTJ","ESFJ","ISTP","ISFP","ESTP","ESFP"
]
MBTI2ID = {t: i for i, t in enumerate(MBTI_16)}

# ============ 基础函数 ============
def load_rows(path: str):
    """Load a JSON file and return only the dict rows whose "type" is a key of MBTI2ID.

    Raises:
        ValueError: when the file contains no such row.
    """
    with open(path, "r", encoding="utf-8") as fh:
        payload = json.load(fh)

    def _is_valid(row):
        return isinstance(row, dict) and row.get("type") in MBTI2ID

    kept = list(filter(_is_valid, payload))
    if kept:
        return kept
    raise ValueError(f"{path} 中没有合法样本。")

def mbti_to_4d(m: str):
    """Encode a 4-letter MBTI string as a tuple of four 0/1 flags.

    Position by position the letters "I", "S", "F" and "P" map to 0; any other
    character at that position maps to 1.
    """
    ei = int(m[0] != "I")
    ns = int(m[1] != "S")
    tf = int(m[2] != "F")
    jp = int(m[3] != "P")
    return (ei, ns, tf, jp)

def truncate_to_budget(tok: AutoTokenizer, text: str, budget: int) -> str:
    """Tokenize ``text`` without special tokens, keep ``ids[:budget]`` and decode them back.

    A falsy ``text`` (None or "") is treated as the empty string.
    """
    source = text or ""
    token_ids = tok(source, add_special_tokens=False)["input_ids"]
    kept_ids = token_ids[:budget]
    return tok.decode(kept_ids)

def build_input(item: Dict[str, Any], tok: AutoTokenizer) -> str:
    """Render one row as the tagged prompt text fed to the classifier.

    Each of the four text fields is cut to its BUDGET entry with truncate_to_budget.
    "posts_cleaned" falls back to "posts" only when the "posts_cleaned" key is absent.
    """
    posts_fallback = item.get("posts", "")
    raw_sections = (
        ("posts_cleaned", item.get("posts_cleaned", posts_fallback)),
        ("semantic_view", item.get("semantic_view", "")),
        ("sentiment_view", item.get("sentiment_view", "")),
        ("linguistic_view", item.get("linguistic_view", "")),
    )
    trimmed = [
        truncate_to_budget(tok, value or "", BUDGET[key])
        for key, value in raw_sections
    ]
    template = (
        "[POSTS]\n{}\n[SEMANTIC]\n{}\n[SENTIMENT]\n{}\n[LINGUISTIC]\n{}\n"
        "[TASK] Predict MBTI type among {}."
    )
    return template.format(*trimmed, ", ".join(MBTI_16))

class MBTIDataset(torch.utils.data.Dataset):
    """Map-style dataset that builds and tokenizes the prompt of one row per lookup."""

    def __init__(self, rows, tokenizer, max_len=512):
        self.rows = rows
        self.tok = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, idx):
        """Return input_ids, attention_mask and the integer label for row ``idx``."""
        row = self.rows[idx]
        prompt = build_input(row, self.tok)
        label_id = MBTI2ID[row["type"]]
        encoded = self.tok(prompt, truncation=True, max_length=self.max_len)
        return {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
            "labels": label_id,
        }

def compute_metrics(eval_pred):
    """Compute 16-way accuracy plus per-axis and all-axes accuracy from logits and labels.

    ``eval_pred`` is either a ``(predictions, labels)`` tuple or an object exposing
    ``predictions`` and ``label_ids``. When predictions is a list/tuple its first
    element is used. Returns a dict with keys acc_16, acc_ei, acc_ns, acc_tf,
    acc_jp and acc_4D.
    """
    if isinstance(eval_pred, tuple):
        logits, gold = eval_pred
    else:
        logits, gold = eval_pred.predictions, eval_pred.label_ids
    if isinstance(logits, (list, tuple)):
        logits = logits[0]
    logits = logits if isinstance(logits, np.ndarray) else np.asarray(logits)
    gold = gold if isinstance(gold, np.ndarray) else np.asarray(gold)

    guess = logits.argmax(-1)
    acc16 = float((guess == gold).mean())

    pred_axes = [mbti_to_4d(MBTI_16[i]) for i in guess]
    true_axes = [mbti_to_4d(MBTI_16[i]) for i in gold]

    # hits[k] counts samples whose k-th axis (E/I, N/S, T/F, J/P) matches.
    hits = [0, 0, 0, 0]
    exact = 0
    for p_axes, t_axes in zip(pred_axes, true_axes):
        same = [p == t for p, t in zip(p_axes, t_axes)]
        for k, ok in enumerate(same):
            hits[k] += ok
        exact += all(same)

    n = len(gold)
    return {
        "acc_16": acc16,
        "acc_ei": hits[0] / n,
        "acc_ns": hits[1] / n,
        "acc_tf": hits[2] / n,
        "acc_jp": hits[3] / n,
        "acc_4D": exact / n,
    }

def plot_confusion_and_roc(y_true, y_prob, class_names, out_dir, suffix=""):
    """Save a confusion matrix and a micro/macro ROC figure as PNG files in ``out_dir``.

    Args:
        y_true: true class ids, one per sample.
        y_prob: per-class scores, indexed as ``y_prob[:, i]`` for class id ``i``.
        class_names: class labels; class id ``i`` is displayed as ``class_names[i]``.
        out_dir: output directory, created when missing.
        suffix: text appended to both figure titles and both file names.

    Classes that have no positive or no negative sample in ``y_true`` are left out of
    the macro average: ``roc_curve`` returns NaN rates for them, which would otherwise
    turn the whole macro curve and its AUC into NaN.
    """
    os.makedirs(out_dir, exist_ok=True)
    class_ids = list(range(len(class_names)))
    y_pred = np.argmax(y_prob, axis=-1)

    # ----- confusion matrix -----
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    fig_cm, ax_cm = plt.subplots(figsize=(8, 8), dpi=150)
    disp.plot(ax=ax_cm, xticks_rotation=45, cmap="Blues", colorbar=False)
    ax_cm.set_title(f"Confusion Matrix{suffix}")
    fig_cm.tight_layout()
    fig_cm.savefig(os.path.join(out_dir, f"confusion_matrix{suffix}.png"))
    plt.close(fig_cm)

    # ----- one-vs-rest ROC -----
    Y_true_bin = label_binarize(y_true, classes=class_ids)
    n_samples = Y_true_bin.shape[0]
    positives = np.asarray(Y_true_bin.sum(axis=0)).ravel()
    # Only classes with at least one positive and one negative have a defined curve.
    scorable = [i for i in class_ids if 0 < positives[i] < n_samples]

    fpr = {}; tpr = {}; roc_auc = {}
    for i in scorable:
        fpr[i], tpr[i], _ = roc_curve(Y_true_bin[:, i], y_prob[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro average: pool every (sample, class) decision into one binary problem.
    fpr["micro"], tpr["micro"], _ = roc_curve(Y_true_bin.ravel(), y_prob.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro average: mean of the per-class curves interpolated on a shared FPR grid.
    if scorable:
        all_fpr = np.unique(np.concatenate([fpr[i] for i in scorable]))
        mean_tpr = np.zeros_like(all_fpr)
        for i in scorable:
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
        mean_tpr /= len(scorable)
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    fig_roc, ax_roc = plt.subplots(figsize=(7, 7), dpi=150)
    ax_roc.plot(fpr["micro"], tpr["micro"],
                label=f"micro-average ROC (AUC = {roc_auc['micro']:.3f})", linewidth=2)
    if "macro" in roc_auc:
        ax_roc.plot(fpr["macro"], tpr["macro"],
                    label=f"macro-average ROC (AUC = {roc_auc['macro']:.3f})", linewidth=2)
    ax_roc.plot([0, 1], [0, 1], "k--", linewidth=1)
    ax_roc.set_xlim([0.0, 1.0]); ax_roc.set_ylim([0.0, 1.05])
    ax_roc.set_xlabel("False Positive Rate"); ax_roc.set_ylabel("True Positive Rate")
    ax_roc.set_title(f"Multiclass ROC (micro & macro){suffix}")
    ax_roc.legend(loc="lower right")
    fig_roc.tight_layout()
    fig_roc.savefig(os.path.join(out_dir, f"roc_micro_macro{suffix}.png"))
    plt.close(fig_roc)

def main():
    """Fine-tune the 16-way MBTI classifier with 4-bit quantization + LoRA and report VAL/TEST.

    Steps: load the train/val/test JSON files, load MODEL_NAME as a sequence
    classifier on cuda:0, wrap it with LoRA adapters, train with the Hugging Face
    Trainer (best checkpoint chosen on eval_acc_4D), write confusion-matrix and ROC
    figures for VAL and TEST into OUTPUT_DIR, print the metrics, save the model with
    ``trainer.save_model`` and run one sample prediction.

    Raises:
        RuntimeError: when no CUDA device is available.
    """
    import warnings

    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is required: the model and all batches are pinned to cuda:0.")
    torch.cuda.set_device(0)
    set_seed(SEED)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # Load the pre-split files.
    # NOTE(review): TEST is read from "test对应的原始数据.json", not from the test.json that the
    # split step writes — confirm this is the intended held-out set.
    train_rows = load_rows("train.json")
    val_rows   = load_rows("val.json")
    test_rows  = load_rows("test对应的原始数据.json")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, **HF_KW)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=USE_4BIT,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    ) if USE_4BIT else None

    # Whole model on a single GPU.
    model_kwargs = dict(
        num_labels=16,
        quantization_config=quant_cfg,
        device_map={"": "cuda:0"},
        low_cpu_mem_usage=True,
        **HF_KW,
    )
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **model_kwargs)
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.use_cache = False

    from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)
    try:
        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
    except Exception as exc:
        # Best effort: training continues without checkpointing, but the failure is reported.
        warnings.warn(f"gradient checkpointing could not be enabled: {exc!r}")
    peft_cfg = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=LORA_R, lora_alpha=LORA_ALPHA, lora_dropout=LORA_DROPOUT,
        target_modules=TARGET_MODULES, bias="none"
    )
    model = get_peft_model(model, peft_cfg)
    model = model.to("cuda:0")

    # After the explicit move, replace model.to with a no-op so later .to(...) calls
    # on this object leave it untouched.
    def _noop_to(self, *args, **kwargs): return self
    model.to = _noop_to.__get__(model, type(model))

    train_ds = MBTIDataset(train_rows, tokenizer, max_len=MAX_LEN)
    val_ds   = MBTIDataset(val_rows,   tokenizer, max_len=MAX_LEN)
    test_ds  = MBTIDataset(test_rows,  tokenizer, max_len=MAX_LEN)
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BSZ_TRN,
        per_device_eval_batch_size=BSZ_EVAL,
        gradient_accumulation_steps=GRAD_ACCUM,
        num_train_epochs=EPOCHS,
        learning_rate=LR,
        warmup_ratio=WARMUP_RATIO,
        weight_decay=WEIGHT_DECAY,
        lr_scheduler_type="linear",
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        logging_steps=50,
        bf16=False, fp16=False,
        report_to="none",
        load_best_model_at_end=True,
        metric_for_best_model="eval_acc_4D",
        greater_is_better=True,
        optim="paged_adamw_8bit",
        eval_accumulation_steps=12,
        gradient_checkpointing=False,
    )

    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        processing_class=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )

    # Train.
    trainer.train()

    # VAL predictions and figures.
    val_output = trainer.predict(val_ds)
    val_logits = val_output.predictions[0] if isinstance(val_output.predictions, (list, tuple)) else val_output.predictions
    val_probs  = F.softmax(torch.tensor(val_logits, dtype=torch.float32), dim=-1).cpu().numpy()
    val_y_true = val_output.label_ids
    plot_confusion_and_roc(val_y_true, val_probs, MBTI_16, OUTPUT_DIR, suffix="")
    print(f"Saved: {os.path.join(OUTPUT_DIR, 'confusion_matrix.png')}")
    print(f"Saved: {os.path.join(OUTPUT_DIR, 'roc_micro_macro.png')}")

    # VAL metrics.
    eval_metrics = trainer.evaluate(eval_dataset=val_ds)
    print("\n=== Final Eval (on VAL) ===")
    for k, v in eval_metrics.items():
        try:
            print(f"{k}: {float(v):.4f}")
        except (TypeError, ValueError):
            # Non-numeric metric values are printed as they are.
            print(k, v)

    # TEST predictions and figures (report only).
    test_output = trainer.predict(test_ds)
    test_logits = test_output.predictions[0] if isinstance(test_output.predictions, (list, tuple)) else test_output.predictions
    test_probs  = F.softmax(torch.tensor(test_logits, dtype=torch.float32), dim=-1).cpu().numpy()
    test_y_true = test_output.label_ids
    plot_confusion_and_roc(test_y_true, test_probs, MBTI_16, OUTPUT_DIR, suffix="_test")
    print(f"Saved: {os.path.join(OUTPUT_DIR, 'confusion_matrix_test.png')}")
    print(f"Saved: {os.path.join(OUTPUT_DIR, 'roc_micro_macro_test.png')}")

    # TEST metrics: same computation as on VAL, through the shared helper.
    test_metrics = compute_metrics((test_logits, test_y_true))
    print("\n=== Final Test (held-out TEST) ===")
    print(f"acc_16: {test_metrics['acc_16']:.4f}")
    print(
        f"acc_ei: {test_metrics['acc_ei']:.4f}  acc_ns: {test_metrics['acc_ns']:.4f}  "
        f"acc_tf: {test_metrics['acc_tf']:.4f}  acc_jp: {test_metrics['acc_jp']:.4f}  "
        f"acc_4D: {test_metrics['acc_4D']:.4f}"
    )

    # Save the trained model.
    trainer.save_model(OUTPUT_DIR)
    print(f"\n✅ LoRA adapter saved to: {OUTPUT_DIR}")

    # Inference example on the first TEST row.
    model.eval()
    sample = test_rows[0]
    text = build_input(sample, tokenizer)
    batch = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LEN)
    batch = {k: v.to("cuda:0") for k, v in batch.items()}
    with torch.no_grad():
        logits = model(**batch).logits
        pred_id = int(torch.argmax(logits, dim=-1))
        pred_mbti = MBTI_16[pred_id]
    print("\n[Inference on TEST sample]")
    print("原标签:", sample["type"], " | 预测:", pred_mbti)

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc 16,Acc Ei,Acc Ns,Acc Tf,Acc Jp,Acc 4d
1,0.3867,0.383969,0.895417,0.954465,0.977086,0.953878,0.94389,0.895417
2,0.3792,0.354813,0.896298,0.956816,0.977086,0.95329,0.943302,0.896298
3,0.1472,0.580361,0.900411,0.962103,0.977086,0.959459,0.944477,0.900411
4,0.0271,0.882224,0.897767,0.956522,0.972973,0.958578,0.945358,0.897767


Saved: mbti_lora_qwen1.5b-split_kaggle_ckpt_new/confusion_matrix.png
Saved: mbti_lora_qwen1.5b-split_kaggle_ckpt_new/roc_micro_macro.png



=== Final Eval (on VAL) ===
eval_loss: 0.5804
eval_acc_16: 0.9004
eval_acc_ei: 0.9621
eval_acc_ns: 0.9771
eval_acc_tf: 0.9595
eval_acc_jp: 0.9445
eval_acc_4D: 0.9004
eval_runtime: 91.6887
eval_samples_per_second: 37.1260
eval_steps_per_second: 9.2810
epoch: 4.0000
Saved: mbti_lora_qwen1.5b-split_kaggle_ckpt_new/confusion_matrix_test.png
Saved: mbti_lora_qwen1.5b-split_kaggle_ckpt_new/roc_micro_macro_test.png

=== Final Test (held-out TEST) ===
acc_16: 0.6589
acc_ei: 0.8570  acc_ns: 0.9109  acc_tf: 0.8593  acc_jp: 0.8042  acc_4D: 0.6589

✅ LoRA adapter saved to: mbti_lora_qwen1.5b-split_kaggle_ckpt_new

[Inference on TEST sample]
原标签: INFP  | 预测: INFP


In [1]:
# -*- coding: utf-8 -*-
"""
DeepSeek-R1-Distill-Qwen-1.5B + QLoRA(4bit) + LoRA
MBTI 16类分类（含4D严格准确率）
Transformers==4.55
GPU-only（绝不使用CPU/磁盘offload）
"""
import matplotlib
matplotlib.use("Agg")  # 服务器/无显示环境
import matplotlib.pyplot as plt

from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    roc_curve, auc
)
from sklearn.preprocessing import label_binarize
import torch.nn.functional as F
import os

import os, json
from typing import Dict, Any

# 禁用 accelerate 混精，关 bnb 欢迎语
os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
os.environ["BITSANDBYTES_NOWELCOME"] = "1"

import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    Trainer, TrainingArguments,
    set_seed,
)

# 建议先关 flash-attn（环境确认后再打开）
# try:
#     from transformers import set_attn_implementation
#     set_attn_implementation("flash_attention_2")
# except Exception:
#     pass

# ============ Configuration ============
MODEL_NAME   = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
FILE_PATH    = "mbti_sample_with_all_views.json"

# Sequence length and per-field token budgets (original note: sized for a 24GB GPU).
# With the values below one step covers at most MAX_LEN * BSZ_TRN * GRAD_ACCUM
# = 320 * 8 * 1 = 2560 tokens per device.
# NOTE(review): the budgets sum to 312, leaving 8 tokens under MAX_LEN for the section
# tags and the [TASK] line that build_input adds — presumably the tail is truncated; confirm.
MAX_LEN      = 320
BUDGET = {"posts_cleaned": 192, "semantic_view": 64, "sentiment_view": 32, "linguistic_view": 24}

SEED         = 42
EPOCHS       = 4
LR           = 2e-4
BSZ_TRN      = 8           # per-device
BSZ_EVAL     = 4
GRAD_ACCUM   = 1            # no accumulation: 320 * 8 * 1 = 2560 tokens per step
WARMUP_RATIO = 0.06
WEIGHT_DECAY = 0.01
OUTPUT_DIR   = "mbti_lora_deepseek-1b_ckpt"

# 4-bit quantization switch and LoRA settings
USE_4BIT     = True
LORA_R       = 16
LORA_ALPHA   = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

# Optional Hugging Face token from the environment; HF_KW stays empty when it is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_KW = {"token": HF_TOKEN} if HF_TOKEN else {}

# ============ MBTI helpers ============
# The 16 MBTI labels; a label's index in this list is its class id.
MBTI_16 = [
    "INTJ","INTP","ENTJ","ENTP","INFJ","INFP","ENFJ","ENFP",
    "ISTJ","ISFJ","ESTJ","ESFJ","ISTP","ISFP","ESTP","ESFP"
]
MBTI2ID = {t:i for i,t in enumerate(MBTI_16)}

def mbti_to_4d(m: str):
    """Turn an MBTI string into four binary flags, one per letter position.

    A position yields 0 when it holds "I", "S", "F" or "P" respectively, else 1.
    """
    return tuple(int(m[pos] != zero_letter) for pos, zero_letter in enumerate("ISFP"))

def load_rows(path: str):
    """Load a JSON file and return only the dict rows whose "type" is a key of MBTI2ID.

    Non-dict entries are skipped instead of failing on ``.get``.

    Raises:
        ValueError: when the file contains no valid row, so the problem is reported
            here rather than later inside the train/validation split.
    """
    with open(path, "r", encoding="utf-8") as f:
        rows = json.load(f)
    rows = [r for r in rows if isinstance(r, dict) and r.get("type") in MBTI2ID]
    if not rows:
        raise ValueError(f"{path} 中没有合法样本。")
    return rows

def truncate_to_budget(tok: AutoTokenizer, text: str, budget: int) -> str:
    """Encode ``text`` (or "" when falsy) without special tokens and decode ``ids[:budget]``."""
    return tok.decode(tok(text or "", add_special_tokens=False)["input_ids"][:budget])

def build_input(item: Dict[str, Any], tok: AutoTokenizer) -> str:
    """Assemble the tagged prompt for one row.

    The posts text comes from "posts_cleaned" (or "posts" when that key is absent);
    every field is cut to its BUDGET entry before being placed under its tag.
    """
    raw_posts = item.get("posts_cleaned", item.get("posts", "")) or ""
    raw_semantic = item.get("semantic_view", "") or ""
    raw_sentiment = item.get("sentiment_view", "") or ""
    raw_linguistic = item.get("linguistic_view", "") or ""

    posts = truncate_to_budget(tok, raw_posts, BUDGET["posts_cleaned"])
    semantic = truncate_to_budget(tok, raw_semantic, BUDGET["semantic_view"])
    sentiment = truncate_to_budget(tok, raw_sentiment, BUDGET["sentiment_view"])
    linguistic = truncate_to_budget(tok, raw_linguistic, BUDGET["linguistic_view"])

    head = f"[POSTS]\n{posts}\n[SEMANTIC]\n{semantic}\n"
    middle = f"[SENTIMENT]\n{sentiment}\n[LINGUISTIC]\n{linguistic}\n"
    task = f"[TASK] Predict MBTI type among {', '.join(MBTI_16)}."
    return head + middle + task

# ============ Dataset ============
class MBTIDataset(torch.utils.data.Dataset):
    """Dataset over MBTI rows; each lookup builds the prompt and tokenizes it."""

    def __init__(self, rows, tokenizer, max_len=512):
        self.rows, self.tok, self.max_len = rows, tokenizer, max_len

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, idx):
        """Return a dict with input_ids, attention_mask and labels for row ``idx``."""
        record = self.rows[idx]
        rendered = build_input(record, self.tok)
        target = MBTI2ID[record["type"]]
        tokens = self.tok(rendered, truncation=True, max_length=self.max_len)
        sample = {name: tokens[name] for name in ("input_ids", "attention_mask")}
        sample["labels"] = target
        return sample

# ============ 指标 ============
def compute_metrics(eval_pred):
    """Return 16-way accuracy and the four per-axis / all-axes accuracies.

    Accepts a ``(predictions, labels)`` tuple or an object with ``predictions`` and
    ``label_ids``; a list/tuple of predictions is reduced to its first element.
    """
    if isinstance(eval_pred, tuple):
        scores, targets = eval_pred
    else:
        scores, targets = eval_pred.predictions, eval_pred.label_ids
    if isinstance(scores, (list, tuple)):
        scores = scores[0]
    if not isinstance(scores, np.ndarray):
        scores = np.asarray(scores)
    if not isinstance(targets, np.ndarray):
        targets = np.asarray(targets)

    chosen = scores.argmax(-1)
    acc16 = float((chosen == targets).mean())

    predicted_names = [MBTI_16[i] for i in chosen]
    expected_names = [MBTI_16[i] for i in targets]

    # Running counts of agreement per axis and on all four axes at once.
    tally = {"ei": 0, "ns": 0, "tf": 0, "jp": 0, "all": 0}
    for predicted, expected in zip(predicted_names, expected_names):
        p_ei, p_ns, p_tf, p_jp = mbti_to_4d(predicted)
        e_ei, e_ns, e_tf, e_jp = mbti_to_4d(expected)
        ok_ei, ok_ns = p_ei == e_ei, p_ns == e_ns
        ok_tf, ok_jp = p_tf == e_tf, p_jp == e_jp
        tally["ei"] += ok_ei
        tally["ns"] += ok_ns
        tally["tf"] += ok_tf
        tally["jp"] += ok_jp
        tally["all"] += (ok_ei and ok_ns and ok_tf and ok_jp)

    n = len(targets)
    return {
        "acc_16": acc16,
        "acc_ei": tally["ei"] / n,
        "acc_ns": tally["ns"] / n,
        "acc_tf": tally["tf"] / n,
        "acc_jp": tally["jp"] / n,
        "acc_4D": tally["all"] / n,
    }
def plot_confusion_and_roc(y_true, y_prob, class_names, out_dir, suffix=""):
    """Save a confusion matrix and a micro/macro ROC figure as PNG files in ``out_dir``.

    Args:
        y_true: true class ids, shape (N,).
        y_prob: softmax probabilities, shape (N, C).
        class_names: list of C class labels.
        out_dir: output directory, created when missing.
        suffix: optional text appended to both figure titles and both file names;
            the default keeps the file names confusion_matrix.png and
            roc_micro_macro.png.

    Classes that have no positive or no negative sample in ``y_true`` are left out of
    the macro average: ``roc_curve`` returns NaN rates for them, which would otherwise
    turn the whole macro curve and its AUC into NaN.
    """
    os.makedirs(out_dir, exist_ok=True)
    class_ids = list(range(len(class_names)))
    y_pred = np.argmax(y_prob, axis=-1)

    # ===== Confusion matrix =====
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    fig_cm, ax_cm = plt.subplots(figsize=(8, 8), dpi=150)
    disp.plot(ax=ax_cm, xticks_rotation=45, cmap="Blues", colorbar=False)
    ax_cm.set_title(f"Confusion Matrix{suffix}")
    fig_cm.tight_layout()
    fig_cm.savefig(os.path.join(out_dir, f"confusion_matrix{suffix}.png"))
    plt.close(fig_cm)

    # ===== Multiclass ROC (micro/macro) =====
    # One-vs-rest binarization of y_true.
    Y_true_bin = label_binarize(y_true, classes=class_ids)  # (N, C)
    n_samples = Y_true_bin.shape[0]
    positives = np.asarray(Y_true_bin.sum(axis=0)).ravel()
    # Only classes with at least one positive and one negative have a defined curve.
    scorable = [i for i in class_ids if 0 < positives[i] < n_samples]

    fpr = dict(); tpr = dict(); roc_auc = dict()
    for i in scorable:
        fpr[i], tpr[i], _ = roc_curve(Y_true_bin[:, i], y_prob[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # micro: pool every (sample, class) decision into one binary problem
    fpr["micro"], tpr["micro"], _ = roc_curve(Y_true_bin.ravel(), y_prob.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # macro: per-class curves interpolated on a shared FPR grid and averaged;
    # the reported AUC is the area under that averaged curve.
    if scorable:
        all_fpr = np.unique(np.concatenate([fpr[i] for i in scorable]))
        mean_tpr = np.zeros_like(all_fpr)
        for i in scorable:
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
        mean_tpr /= len(scorable)
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Only the micro/macro curves are drawn, for readability.
    fig_roc, ax_roc = plt.subplots(figsize=(7, 7), dpi=150)
    ax_roc.plot(fpr["micro"], tpr["micro"],
                label=f"micro-average ROC (AUC = {roc_auc['micro']:.3f})", linewidth=2)
    if "macro" in roc_auc:
        ax_roc.plot(fpr["macro"], tpr["macro"],
                    label=f"macro-average ROC (AUC = {roc_auc['macro']:.3f})", linewidth=2)
    ax_roc.plot([0, 1], [0, 1], "k--", linewidth=1)
    ax_roc.set_xlim([0.0, 1.0]); ax_roc.set_ylim([0.0, 1.05])
    ax_roc.set_xlabel("False Positive Rate"); ax_roc.set_ylabel("True Positive Rate")
    ax_roc.set_title(f"Multiclass ROC (micro & macro){suffix}")
    ax_roc.legend(loc="lower right")
    fig_roc.tight_layout()
    fig_roc.savefig(os.path.join(out_dir, f"roc_micro_macro{suffix}.png"))
    plt.close(fig_roc)

# ============ 主流程 ============
def main():
    """Fine-tune the 16-way MBTI classifier with 4-bit quantization + LoRA and evaluate on VAL.

    Steps: load FILE_PATH, make a stratified 90/10 train/validation split, load
    MODEL_NAME as a sequence classifier on cuda:0, wrap it with LoRA adapters, train
    with the Hugging Face Trainer (best checkpoint chosen on eval_acc_4D), write the
    confusion-matrix and ROC figures into OUTPUT_DIR, print the validation metrics,
    save the model with ``trainer.save_model`` and run one sample prediction.

    Raises:
        RuntimeError: when no CUDA device is available.
    """
    import warnings

    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is required: the model and all batches are pinned to cuda:0.")
    torch.cuda.set_device(0)  # select the GPU explicitly
    set_seed(SEED)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    rows = load_rows(FILE_PATH)
    train_rows, val_rows = train_test_split(
        rows, test_size=0.1, random_state=SEED, stratify=[r["type"] for r in rows]
    )

    # tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, **HF_KW)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # 4-bit quantization config (GPU only)
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=USE_4BIT,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    ) if USE_4BIT else None

    # Load the model onto the GPU (no automatic offload)
    model_kwargs = dict(
        num_labels=16,
        quantization_config=quant_cfg,
        device_map={"": "cuda:0"},    # force the whole model onto a single GPU
        low_cpu_mem_usage=True,
        **HF_KW,
    )
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **model_kwargs)
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.use_cache = False

    # The vocabulary is deliberately not resized (original note: avoids a large
    # unquantized embedding and device misplacement).
    # model.resize_token_embeddings(len(tokenizer))

    # LoRA: prepare the quantized model first, then move everything to the GPU
    from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)
    try:
        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
    except Exception as exc:
        # Best effort: training continues without checkpointing, but the failure is reported.
        warnings.warn(f"gradient checkpointing could not be enabled: {exc!r}")
    peft_cfg = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=LORA_R, lora_alpha=LORA_ALPHA, lora_dropout=LORA_DROPOUT,
        target_modules=TARGET_MODULES, bias="none"
    )
    model = get_peft_model(model, peft_cfg)

    # Move to the GPU (device only, dtype unchanged)
    model = model.to("cuda:0")

    # Replace model.to with a no-op from here on, to guard against accidental dtype casts
    def _noop_to(self, *args, **kwargs): return self
    model.to = _noop_to.__get__(model, type(model))

    # Datasets & collator
    train_ds = MBTIDataset(train_rows, tokenizer, max_len=MAX_LEN)
    val_ds   = MBTIDataset(val_rows,   tokenizer, max_len=MAX_LEN)
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

    # Training arguments; batch sizes and accumulation come from the config constants
    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BSZ_TRN,
        per_device_eval_batch_size=BSZ_EVAL,
        gradient_accumulation_steps=GRAD_ACCUM,
        num_train_epochs=EPOCHS,
        learning_rate=LR,
        warmup_ratio=WARMUP_RATIO,
        weight_decay=WEIGHT_DECAY,
        lr_scheduler_type="linear",
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        logging_steps=50,
        bf16=False, fp16=False,
        report_to="none",
        load_best_model_at_end=True,
        metric_for_best_model="eval_acc_4D",
        greater_is_better=True,

        optim="paged_adamw_8bit",
        eval_accumulation_steps=12,
        gradient_checkpointing=False,
    )

    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        processing_class=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    # After training, get the logits predicted on the validation set
    pred_output = trainer.predict(val_ds)
    logits = pred_output.predictions
    if isinstance(logits, (list, tuple)):
        logits = logits[0]
    # Probabilities via softmax
    probs = F.softmax(torch.tensor(logits, dtype=torch.float32), dim=-1).cpu().numpy()
    y_true = pred_output.label_ids

    # Draw and save the figures
    plot_confusion_and_roc(
        y_true=y_true,
        y_prob=probs,
        class_names=MBTI_16,
        out_dir=OUTPUT_DIR
    )
    print(f"Saved: {os.path.join(OUTPUT_DIR, 'confusion_matrix.png')}")
    print(f"Saved: {os.path.join(OUTPUT_DIR, 'roc_micro_macro.png')}")

    eval_metrics = trainer.evaluate()
    print("\n=== Final Eval ===")
    for k, v in eval_metrics.items():
        try:
            print(f"{k}: {float(v):.4f}")
        except (TypeError, ValueError):
            # Non-numeric metric values are printed as they are.
            print(k, v)

    trainer.save_model(OUTPUT_DIR)
    print(f"\n✅ LoRA adapter saved to: {OUTPUT_DIR}")

    # Inference example on the first validation row (inputs moved to the same device)
    model.eval()
    sample = val_rows[0]
    text = build_input(sample, tokenizer)
    batch = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LEN)
    batch = {k: v.to("cuda:0") for k, v in batch.items()}
    with torch.no_grad():
        logits = model(**batch).logits
        pred_id = int(torch.argmax(logits, dim=-1))
        pred_mbti = MBTI_16[pred_id]
    print("\n原标签:", sample["type"], " | 预测:", pred_mbti)

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc 16,Acc Ei,Acc Ns,Acc Tf,Acc Jp,Acc 4d
1,0.3805,0.423923,0.871328,0.952409,0.969154,0.945358,0.920094,0.871328
2,0.3565,0.384398,0.888954,0.960047,0.975029,0.952409,0.933901,0.888954
3,0.1203,0.584234,0.882491,0.952115,0.970623,0.948884,0.932726,0.882491
4,0.015,0.841067,0.88396,0.952996,0.970623,0.950646,0.932726,0.88396


Saved: mbti_lora_deepseek-1b_ckpt/confusion_matrix.png
Saved: mbti_lora_deepseek-1b_ckpt/roc_micro_macro.png



=== Final Eval ===
eval_loss: 0.3844
eval_acc_16: 0.8890
eval_acc_ei: 0.9600
eval_acc_ns: 0.9750
eval_acc_tf: 0.9524
eval_acc_jp: 0.9339
eval_acc_4D: 0.8890
eval_runtime: 65.9047
eval_samples_per_second: 51.6500
eval_steps_per_second: 12.9130
epoch: 4.0000

✅ LoRA adapter saved to: mbti_lora_deepseek-1b_ckpt

原标签: ENTP  | 预测: ENTP


In [11]:
# -*- coding: utf-8 -*- 
"""
Qwen-1.5B-Instruct（Qwen2.5） + QLoRA(4bit) + LoRA
MBTI 16类分类（含4D严格准确率）
Transformers==4.55
GPU-only（绝不使用CPU/磁盘offload）
"""
import matplotlib
matplotlib.use("Agg")  # 服务器/无显示环境
import matplotlib.pyplot as plt

from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    roc_curve, auc
)
from sklearn.preprocessing import label_binarize
import torch.nn.functional as F
import os

import os, json
from typing import Dict, Any

# 禁用 accelerate 混精，关 bnb 欢迎语
os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
os.environ["BITSANDBYTES_NOWELCOME"] = "1"

import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    Trainer, TrainingArguments,
    set_seed,
)

# 建议先关 flash-attn（环境确认后再打开）
# try:
#     from transformers import set_attn_implementation
#     set_attn_implementation("flash_attention_2")
# except Exception:
#     pass

# ============ Configuration ============
# Model: Qwen 1.5B Instruct (Qwen2.5)
MODEL_NAME   = "Qwen/Qwen2.5-1.5B-Instruct"
# File name of the new dataset
FILE_PATH    = "mbti_sample_with_all_views_pandora.json"

# Sequence length and per-field token budgets (original note: sized for a 24GB GPU).
# With the values below one step covers at most MAX_LEN * BSZ_TRN * GRAD_ACCUM
# = 320 * 8 * 1 = 2560 tokens per device.
# NOTE(review): the budgets sum to 312, leaving 8 tokens under MAX_LEN for the section
# tags and the [TASK] line that build_input adds — presumably the tail is truncated; confirm.
MAX_LEN      = 320
BUDGET = {"posts_cleaned": 192, "semantic_view": 64, "sentiment_view": 32, "linguistic_view": 24}

SEED         = 42
EPOCHS       = 4
LR           = 2e-4
BSZ_TRN      = 8           # per-device
BSZ_EVAL     = 4
GRAD_ACCUM   = 1           # no accumulation: 320 * 8 * 1 = 2560 tokens per step
WARMUP_RATIO = 0.06
WEIGHT_DECAY = 0.01
# Output directory for this run
OUTPUT_DIR   = "mbti_lora_qwen2.5-1.5b_pandora_ckpt"

# 4-bit quantization switch and LoRA settings
USE_4BIT     = True
LORA_R       = 16
LORA_ALPHA   = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

# Optional Hugging Face token from the environment; HF_KW stays empty when it is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_KW = {"token": HF_TOKEN} if HF_TOKEN else {}

# ============ MBTI helpers ============
# The 16 MBTI labels; a label's index in this list is its class id.
MBTI_16 = [
    "INTJ","INTP","ENTJ","ENTP","INFJ","INFP","ENFJ","ENFP",
    "ISTJ","ISFJ","ESTJ","ESFJ","ISTP","ISFP","ESTP","ESFP"
]
MBTI2ID = {t:i for i,t in enumerate(MBTI_16)}

def mbti_to_4d(m: str):
    """Map the four letters of an MBTI string to a tuple of 0/1 values.

    The letters "I", "S", "F", "P" at positions 0..3 give 0; anything else gives 1.
    """
    zero_letters = ("I", "S", "F", "P")
    bits = []
    for pos in range(4):
        bits.append(0 if m[pos] == zero_letters[pos] else 1)
    return tuple(bits)

def load_rows(path: str):
    """Load a JSON file and return only the dict rows whose "type" is a key of MBTI2ID.

    Non-dict entries are skipped instead of failing on ``.get``.

    Raises:
        ValueError: when the file contains no valid row, so the problem is reported
            at load time rather than by a later consumer of the rows.
    """
    with open(path, "r", encoding="utf-8") as f:
        rows = json.load(f)
    rows = [r for r in rows if isinstance(r, dict) and r.get("type") in MBTI2ID]
    if not rows:
        raise ValueError(f"{path} 中没有合法样本。")
    return rows

def truncate_to_budget(tok: AutoTokenizer, text: str, budget: int) -> str:
    """Return the decoded text of the token ids ``[:budget]`` of ``text``.

    Tokenization is done without special tokens; a falsy ``text`` counts as "".
    """
    if not text:
        text = ""
    encoding = tok(text, add_special_tokens=False)
    head_ids = encoding["input_ids"][:budget]
    decoded = tok.decode(head_ids)
    return decoded

def build_input(item: Dict[str, Any], tok: AutoTokenizer) -> str:
    """Assemble the multi-view classification prompt for one sample.

    Each view is truncated to its entry in BUDGET and placed under its own
    section marker; a final [TASK] line lists the 16 candidate types.
    ``posts`` is used only when the ``posts_cleaned`` key is absent.
    """
    posts_raw = item.get("posts_cleaned", item.get("posts","")) or ""
    posts = truncate_to_budget(tok, posts_raw, BUDGET["posts_cleaned"])

    # Remaining views share the same "missing or empty -> empty string" handling.
    views = {}
    for field in ("semantic_view", "sentiment_view", "linguistic_view"):
        views[field] = truncate_to_budget(tok, item.get(field, "") or "", BUDGET[field])

    prompt_parts = [
        f"[POSTS]\n{posts}\n",
        f"[SEMANTIC]\n{views['semantic_view']}\n",
        f"[SENTIMENT]\n{views['sentiment_view']}\n",
        f"[LINGUISTIC]\n{views['linguistic_view']}\n",
        f"[TASK] Predict MBTI type among {', '.join(MBTI_16)}.",
    ]
    return "".join(prompt_parts)

# ============ Dataset ============
class MBTIDataset(torch.utils.data.Dataset):
    """Map-style dataset that builds and tokenizes the prompt lazily per item.

    Each item is a dict with ``input_ids``, ``attention_mask`` and an integer
    ``labels`` entry taken from MBTI2ID. No padding is applied here.
    """

    def __init__(self, rows, tokenizer, max_len=512):
        self.rows, self.tok, self.max_len = rows, tokenizer, max_len

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, idx):
        record = self.rows[idx]
        prompt = build_input(record, self.tok)
        label_id = MBTI2ID[record["type"]]
        # Truncate to max_len; padding is left to the collator.
        encoded = self.tok(prompt, truncation=True, max_length=self.max_len)
        features = {key: encoded[key] for key in ("input_ids", "attention_mask")}
        features["labels"] = label_id
        return features

# ============ 指标 ============
def compute_metrics(eval_pred):
    """Compute 16-way accuracy plus per-axis and all-axes MBTI accuracy.

    Accepts either a ``(predictions, labels)`` tuple or an object exposing
    ``predictions`` and ``label_ids``. When predictions arrive as a
    list/tuple, the first element is taken as the logits.

    Returns a dict with keys acc_16, acc_ei, acc_ns, acc_tf, acc_jp, acc_4D.
    Since the 16 labels map one-to-one onto the four binary axes, acc_4D
    equals acc_16 by construction.
    """
    if isinstance(eval_pred, tuple):
        preds, labels = eval_pred
    else:
        preds = eval_pred.predictions
        labels = eval_pred.label_ids
    if isinstance(preds, (list, tuple)):
        preds = preds[0]
    if not isinstance(preds, np.ndarray):
        preds = np.asarray(preds)
    if not isinstance(labels, np.ndarray):
        labels = np.asarray(labels)

    pred_ids = preds.argmax(-1)
    acc16 = float((pred_ids == labels).mean())

    pred_types = [MBTI_16[i] for i in pred_ids]
    true_types = [MBTI_16[i] for i in labels]

    # axis_hits[k] counts samples whose k-th axis (E/I, N/S, T/F, J/P) is right.
    axis_hits = [0, 0, 0, 0]
    exact_hits = 0
    for pred_type, true_type in zip(pred_types, true_types):
        pred_axes = mbti_to_4d(pred_type)
        true_axes = mbti_to_4d(true_type)
        agree = [p == t for p, t in zip(pred_axes, true_axes)]
        for axis, ok in enumerate(agree):
            axis_hits[axis] += ok
        exact_hits += all(agree)

    n = len(labels)
    return {
        "acc_16": acc16,
        "acc_ei": axis_hits[0]/n,
        "acc_ns": axis_hits[1]/n,
        "acc_tf": axis_hits[2]/n,
        "acc_jp": axis_hits[3]/n,
        "acc_4D": exact_hits/n,
    }

def plot_confusion_and_roc(y_true, y_prob, class_names, out_dir):
    """Save a confusion-matrix figure and a micro/macro ROC figure into ``out_dir``.

    Args:
        y_true: integer class ids, shape (N,).
        y_prob: per-class probabilities (softmax output), shape (N, C).
        class_names: list of C class names; index i names class id i.
        out_dir: directory for the PNG files (created if missing).

    ``confusion_matrix.png`` is always written. ``roc_micro_macro.png`` is
    written when at least two classes have positive samples in ``y_true``.
    Classes without positives are left out of the ROC computation, because
    their true-positive rate is undefined and would turn the macro curve
    into NaN.
    """
    os.makedirs(out_dir, exist_ok=True)
    class_ids = list(range(len(class_names)))
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    y_pred = np.argmax(y_prob, axis=-1)

    # ===== Confusion matrix =====
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    fig_cm, ax_cm = plt.subplots(figsize=(8, 8), dpi=150)
    disp.plot(ax=ax_cm, xticks_rotation=45, cmap="Blues", colorbar=False)
    ax_cm.set_title("Confusion Matrix")
    fig_cm.tight_layout()
    fig_cm.savefig(os.path.join(out_dir, "confusion_matrix.png"))
    plt.close(fig_cm)

    # ===== Multiclass ROC (micro / macro) =====
    # One-vs-rest binarization of the labels: (N, C).
    Y_true_bin = label_binarize(y_true, classes=class_ids)
    fpr, tpr, roc_auc = {}, {}, {}
    valid = []  # class ids that have at least one positive sample
    for i in class_ids:
        if Y_true_bin[:, i].sum() == 0:
            continue
        fpr[i], tpr[i], _ = roc_curve(Y_true_bin[:, i], y_prob[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
        valid.append(i)

    if len(valid) < 2:
        print("ROC plot skipped: fewer than two classes have positive samples.")
        return

    # micro: pool every (sample, class) decision of the valid classes.
    fpr["micro"], tpr["micro"], _ = roc_curve(
        Y_true_bin[:, valid].ravel(), y_prob[:, valid].ravel()
    )
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # macro: average the per-class TPR curves on a shared FPR grid, then
    # integrate the averaged curve (not the mean of the per-class AUCs).
    all_fpr = np.unique(np.concatenate([fpr[i] for i in valid]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in valid:
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= len(valid)
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Only micro/macro are drawn to keep the figure readable.
    fig_roc, ax_roc = plt.subplots(figsize=(7, 7), dpi=150)
    ax_roc.plot(fpr["micro"], tpr["micro"],
                label=f"micro-average ROC (AUC = {roc_auc['micro']:.3f})", linewidth=2)
    ax_roc.plot(fpr["macro"], tpr["macro"],
                label=f"macro-average ROC (AUC = {roc_auc['macro']:.3f})", linewidth=2)
    ax_roc.plot([0, 1], [0, 1], "k--", linewidth=1)
    ax_roc.set_xlim([0.0, 1.0]); ax_roc.set_ylim([0.0, 1.05])
    ax_roc.set_xlabel("False Positive Rate"); ax_roc.set_ylabel("True Positive Rate")
    ax_roc.set_title("Multiclass ROC (micro & macro)")
    ax_roc.legend(loc="lower right")
    fig_roc.tight_layout()
    fig_roc.savefig(os.path.join(out_dir, "roc_micro_macro.png"))
    plt.close(fig_roc)

# ============ 主流程 ============
def main():
    """Fine-tune the sequence classifier with (Q)LoRA and report validation results.

    Steps: load rows from FILE_PATH and make a stratified 90/10 train/val
    split; load the tokenizer and the (optionally 4-bit quantized) base model
    on cuda:0; attach LoRA adapters; train with the HF Trainer, keeping the
    checkpoint with the best eval_acc_4D; save confusion-matrix and ROC
    figures for the validation split; print the final metrics; save the
    adapter to OUTPUT_DIR; run one sample prediction.

    NOTE(review): the validation split is used both for checkpoint selection
    and for the reported metrics/figures, so these numbers are not a
    held-out test estimate.
    """
    torch.cuda.set_device(0)  # select the GPU explicitly (GPU-only script)
    set_seed(SEED)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    rows = load_rows(FILE_PATH)
    # train_test_split comes from the module-level import.
    train_rows, val_rows = train_test_split(
        rows, test_size=0.1, random_state=SEED, stratify=[r["type"] for r in rows]
    )

    # Tokenizer (trust_remote_code is recommended for Qwen)
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME, use_fast=True, trust_remote_code=True, **HF_KW
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # QLoRA 4-bit quantization (GPU-only)
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=USE_4BIT,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    ) if USE_4BIT else None

    # Load the whole model onto one GPU (no automatic offloading)
    model_kwargs = dict(
        num_labels=len(MBTI_16),      # one logit per MBTI type
        quantization_config=quant_cfg,
        device_map={"": "cuda:0"},    # force the full model onto a single card
        low_cpu_mem_usage=True,
        trust_remote_code=True,       # recommended for Qwen
        **HF_KW,
    )
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **model_kwargs)
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.use_cache = False

    # Do not resize the vocabulary (avoids a large unquantized embedding and device misplacement)
    # model.resize_token_embeddings(len(tokenizer))

    # LoRA (prepare first, then move everything to the GPU)
    from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)
    try:
        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
    except Exception as exc:
        # Best effort: training still works without checkpointing, but say so
        # instead of silently using more memory.
        print(f"[warn] gradient checkpointing not enabled: {exc!r}")
    peft_cfg = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=LORA_R, lora_alpha=LORA_ALPHA, lora_dropout=LORA_DROPOUT,
        target_modules=TARGET_MODULES, bias="none"
    )
    model = get_peft_model(model, peft_cfg)

    # Place everything on the GPU (device only, dtype untouched)
    model = model.to("cuda:0")

    # (Optional) from here on make .to a no-op so a later call cannot cast the dtype by accident
    def _noop_to(self, *args, **kwargs): return self
    model.to = _noop_to.__get__(model, type(model))

    # Data & collator
    train_ds = MBTIDataset(train_rows, tokenizer, max_len=MAX_LEN)
    val_ds   = MBTIDataset(val_rows,   tokenizer, max_len=MAX_LEN)
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

    # Training arguments (GPU-only friendly)
    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BSZ_TRN,
        per_device_eval_batch_size=BSZ_EVAL,
        gradient_accumulation_steps=GRAD_ACCUM,
        num_train_epochs=EPOCHS,
        learning_rate=LR,
        warmup_ratio=WARMUP_RATIO,
        weight_decay=WEIGHT_DECAY,
        lr_scheduler_type="linear",
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        logging_steps=50,
        bf16=False, fp16=False,
        report_to="none",
        load_best_model_at_end=True,
        metric_for_best_model="eval_acc_4D",
        greater_is_better=True,

        optim="paged_adamw_8bit",
        eval_accumulation_steps=12,
        gradient_checkpointing=False,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        # `tokenizer=` is deprecated in Trainer.__init__; `processing_class` replaces it.
        processing_class=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    # After training, collect the validation logits
    pred_output = trainer.predict(val_ds)
    logits = pred_output.predictions
    if isinstance(logits, (list, tuple)):
        logits = logits[0]
    # Probabilities via softmax
    probs = F.softmax(torch.tensor(logits, dtype=torch.float32), dim=-1).cpu().numpy()
    y_true = pred_output.label_ids

    # Draw and save the figures
    plot_confusion_and_roc(
        y_true=y_true,
        y_prob=probs,
        class_names=MBTI_16,
        out_dir=OUTPUT_DIR
    )
    print(f"Saved: {os.path.join(OUTPUT_DIR, 'confusion_matrix.png')}")
    print(f"Saved: {os.path.join(OUTPUT_DIR, 'roc_micro_macro.png')}")

    eval_metrics = trainer.evaluate()
    print("\n=== Final Eval ===")
    for k, v in eval_metrics.items():
        try:
            print(f"{k}: {float(v):.4f}")
        except Exception:
            # Non-numeric metric values are printed as-is.
            print(k, v)

    trainer.save_model(OUTPUT_DIR)
    print(f"\n✅ LoRA adapter saved to: {OUTPUT_DIR}")

    # Inference example (inputs moved to the same device as the model)
    model.eval()
    sample = val_rows[0]
    text = build_input(sample, tokenizer)
    batch = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LEN)
    batch = {k: v.to("cuda:0") for k, v in batch.items()}
    with torch.no_grad():
        logits = model(**batch).logits
        pred_id = int(torch.argmax(logits, dim=-1))
        pred_mbti = MBTI_16[pred_id]
    print("\n原标签:", sample["type"], " | 预测:", pred_mbti)

if __name__ == "__main__":
    main()


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc 16,Acc Ei,Acc Ns,Acc Tf,Acc Jp,Acc 4d
1,2.1357,2.125176,0.325208,0.659375,0.670833,0.689583,0.653125,0.325208
2,1.7021,1.925803,0.41,0.703125,0.72125,0.733542,0.680417,0.41
3,0.9662,2.273827,0.42375,0.720417,0.717292,0.742708,0.704375,0.42375
4,0.0974,5.073913,0.415833,0.713125,0.717708,0.737292,0.686458,0.415833


Saved: mbti_lora_qwen2.5-1.5b_pandora_ckpt/confusion_matrix.png
Saved: mbti_lora_qwen2.5-1.5b_pandora_ckpt/roc_micro_macro.png



=== Final Eval ===
eval_loss: 2.2738
eval_acc_16: 0.4238
eval_acc_ei: 0.7204
eval_acc_ns: 0.7173
eval_acc_tf: 0.7427
eval_acc_jp: 0.7044
eval_acc_4D: 0.4238
eval_runtime: 96.8746
eval_samples_per_second: 49.5490
eval_steps_per_second: 12.3870
epoch: 4.0000

✅ LoRA adapter saved to: mbti_lora_qwen2.5-1.5b_pandora_ckpt

原标签: INTJ  | 预测: INTJ


In [12]:
# -*- coding: utf-8 -*-
"""
Qwen-2.5-1.5B + LoRA 训练 MBTI 16类，同时统计4D严格准确率
（不做量化，避免 .to() / bitsandbytes 的兼容问题；适配 transformers 4.55+）
并按指定格式输出混淆矩阵和ROC两张图：
- confusion_matrix.png
- roc_micro_macro.png
"""

import os, json
from typing import Dict, Any, List

import numpy as np
import torch
from sklearn.model_selection import train_test_split

# ========= 新增：绘图相关依赖（保持你要的输出格式）=========
import matplotlib
matplotlib.use("Agg")  # 服务器/无显示环境
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.preprocessing import label_binarize
import torch.nn.functional as F
# ======================================================

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer, TrainingArguments,
    set_seed,
)

# ---------------- Basic configuration ----------------
MODEL_NAME = "Qwen/Qwen2.5-1.5B"              # "Qwen/Qwen2.5-1.5B-Instruct" also works
FILE_PATH  = "mbti_sample_with_all_views_pandora.json"

# NOTE(review): the per-view budgets below sum to exactly MAX_LEN (768), leaving no room for the
# section markers and the [TASK] line that build_input adds; the tail of the prompt is presumably
# truncated whenever every view fills its budget — confirm this is intended.
MAX_LEN = 768
BUDGET = {
    "posts_cleaned": 384,
    "semantic_view": 128,
    "sentiment_view": 128,
    "linguistic_view": 128,
}

SEED         = 42
EPOCHS       = 4
LR           = 2e-4
BSZ_TRN      = 4           # lower this further if GPU memory is tight
BSZ_EVAL     = 8
GRAD_ACCUM   = 2
WARMUP_RATIO = 0.06
WEIGHT_DECAY = 0.01
OUTPUT_DIR   = "mbti_lora_qwen2.5-1.5b_pandora_new_ckpt"   # output directory for this run (renamed)

# ---------------- MBTI helpers ----------------
# The order of MBTI_16 defines the class ids: MBTI2ID maps each type to its index in this list.
MBTI_16 = [
    "INTJ","INTP","ENTJ","ENTP",
    "INFJ","INFP","ENFJ","ENFP",
    "ISTJ","ISFJ","ESTJ","ESFJ",
    "ISTP","ISFP","ESTP","ESFP",
]
MBTI2ID = {t:i for i,t in enumerate(MBTI_16)}

def mbti_to_4d(m: str):
    """Turn a 4-letter MBTI string into four 0/1 flags.

    Axis order is I/E, S/N, F/T, P/J. The first letter of each pair gives 0,
    anything else gives 1. No case folding is done. Raises IndexError when
    ``m`` has fewer than four characters.
    """
    ei, sn, ft, pj = m[0], m[1], m[2], m[3]
    return (
        int(ei != "I"),
        int(sn != "S"),
        int(ft != "F"),
        int(pj != "P"),
    )

def truncate_to_budget(tok: "AutoTokenizer", text: str, budget: int) -> str:
    """Return ``text`` cut down to its first ``budget`` tokens.

    The text (or an empty string when ``text`` is falsy) is tokenized without
    special tokens, the id list is sliced to ``budget`` entries, and the
    slice is decoded back to a string.
    """
    token_ids = tok(text or "", add_special_tokens=False)["input_ids"]
    kept_ids = token_ids[:budget]
    return tok.decode(kept_ids)

def build_input(item: Dict[str, Any], tok: "AutoTokenizer") -> str:
    """Assemble the multi-view classification prompt for one sample.

    Each view is truncated to its entry in BUDGET and placed under its own
    section marker; a final [TASK] line lists the 16 candidate types.
    ``posts`` is used only when the ``posts_cleaned`` key is absent.
    """
    posts_raw = item.get("posts_cleaned", item.get("posts","")) or ""
    posts = truncate_to_budget(tok, posts_raw, BUDGET["posts_cleaned"])

    # Remaining views share the same "missing or empty -> empty string" handling.
    trimmed = {}
    for view in ("semantic_view", "sentiment_view", "linguistic_view"):
        trimmed[view] = truncate_to_budget(tok, item.get(view, "") or "", BUDGET[view])

    lines = [
        f"[POSTS]\n{posts}\n",
        f"[SEMANTIC]\n{trimmed['semantic_view']}\n",
        f"[SENTIMENT]\n{trimmed['sentiment_view']}\n",
        f"[LINGUISTIC]\n{trimmed['linguistic_view']}\n",
        f"[TASK] Predict MBTI type among {', '.join(MBTI_16)}.",
    ]
    return "".join(lines)

def load_rows(path: str) -> List[Dict[str, Any]]:
    """Load the JSON dataset at ``path`` and keep only rows with a valid MBTI label.

    Args:
        path: path to a UTF-8 JSON file holding a list of sample records.

    Returns:
        The records that are dicts and whose ``"type"`` value is one of the
        keys of MBTI2ID, in file order.

    Entries that are not dicts, or whose ``"type"`` is not a string, are
    skipped instead of raising (``.get`` on a non-dict, or a membership test
    with an unhashable value, would otherwise abort the whole load).
    """
    with open(path, "r", encoding="utf-8") as f:
        rows = json.load(f)
    return [
        r for r in rows
        if isinstance(r, dict)
        and isinstance(r.get("type"), str)
        and r["type"] in MBTI2ID
    ]

# ---------------- Dataset ----------------
class MBTIDataset(torch.utils.data.Dataset):
    """Map-style dataset that builds and tokenizes the prompt lazily per item.

    Each item is a dict with ``input_ids``, ``attention_mask`` and an integer
    ``labels`` entry taken from MBTI2ID. No padding is applied here.
    """

    def __init__(self, rows, tokenizer, max_len=512):
        self.rows, self.tok, self.max_len = rows, tokenizer, max_len

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, idx):
        sample = self.rows[idx]
        prompt = build_input(sample, self.tok)
        target = MBTI2ID[sample["type"]]
        # Truncate to max_len; padding is left to the collator.
        tokens = self.tok(prompt, truncation=True, max_length=self.max_len)
        example = {name: tokens[name] for name in ("input_ids", "attention_mask")}
        example["labels"] = target
        return example

# ---------------- 指标（沿用旧逻辑） ----------------
def compute_metrics(eval_pred):
    """Compute 16-way accuracy plus per-axis and all-axes MBTI accuracy.

    Accepts an object exposing ``predictions`` / ``label_ids`` or a plain
    ``(predictions, labels)`` pair. A tuple of predictions is reduced to its
    first element, and torch tensors are converted to numpy arrays.

    Returns a dict with keys acc_16, acc_ei, acc_ns, acc_tf, acc_jp, acc_4D.
    Since the 16 labels map one-to-one onto the four binary axes, acc_4D
    equals acc_16 by construction.
    """
    if hasattr(eval_pred, "predictions"):
        preds, labels = eval_pred.predictions, eval_pred.label_ids
    else:
        preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]
    if isinstance(preds, torch.Tensor):
        preds = preds.detach().cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()

    preds = preds.argmax(-1)
    acc16 = float((preds == labels).mean())

    pred_types = [MBTI_16[i] for i in preds]
    true_types = [MBTI_16[i] for i in labels]

    # axis_hits[k] counts samples whose k-th axis (E/I, N/S, T/F, J/P) is right.
    axis_hits = [0, 0, 0, 0]
    exact_hits = 0
    for pred_type, true_type in zip(pred_types, true_types):
        pred_axes = mbti_to_4d(pred_type)
        true_axes = mbti_to_4d(true_type)
        agree = [p == t for p, t in zip(pred_axes, true_axes)]
        for axis, ok in enumerate(agree):
            axis_hits[axis] += ok
        exact_hits += all(agree)

    n = len(labels)
    return {
        "acc_16": acc16,
        "acc_ei": axis_hits[0]/n,
        "acc_ns": axis_hits[1]/n,
        "acc_tf": axis_hits[2]/n,
        "acc_jp": axis_hits[3]/n,
        "acc_4D": exact_hits/n,
    }

# ---------------- 绘图函数（保持你要的格式/文件名） ----------------
def plot_confusion_and_roc(y_true, y_prob, class_names, out_dir):
    """Save a confusion-matrix figure and a micro/macro ROC figure into ``out_dir``.

    Args:
        y_true: integer class ids, one per sample.
        y_prob: per-class scores, indexed as ``y_prob[:, class_id]``.
        class_names: list of class names; index i names class id i.
        out_dir: directory for the PNG files (created if missing).

    ``confusion_matrix.png`` is always written. ``roc_micro_macro.png`` is
    written when at least two classes have positive samples in ``y_true``.
    Classes without positives are left out of the ROC computation, because
    their true-positive rate is undefined and would turn the macro curve
    into NaN.
    """
    os.makedirs(out_dir, exist_ok=True)
    class_ids = list(range(len(class_names)))
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    y_pred = np.argmax(y_prob, axis=-1)

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    fig_cm, ax_cm = plt.subplots(figsize=(8, 8), dpi=150)
    disp.plot(ax=ax_cm, xticks_rotation=45, cmap="Blues", colorbar=False)
    ax_cm.set_title("Confusion Matrix")
    fig_cm.tight_layout()
    fig_cm.savefig(os.path.join(out_dir, "confusion_matrix.png"))
    plt.close(fig_cm)

    # ROC (micro / macro) on one-vs-rest binarized labels
    Y_true_bin = label_binarize(y_true, classes=class_ids)
    fpr = {}; tpr = {}; roc_auc = {}
    valid = []  # class ids that have at least one positive sample
    for i in class_ids:
        if Y_true_bin[:, i].sum() == 0:
            continue
        fpr[i], tpr[i], _ = roc_curve(Y_true_bin[:, i], y_prob[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
        valid.append(i)

    if len(valid) < 2:
        print("ROC plot skipped: fewer than two classes have positive samples.")
        return

    # micro: pool every (sample, class) decision of the valid classes.
    fpr["micro"], tpr["micro"], _ = roc_curve(
        Y_true_bin[:, valid].ravel(), y_prob[:, valid].ravel()
    )
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # macro: average the per-class TPR curves on a shared FPR grid, then
    # integrate the averaged curve.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in valid]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in valid:
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= len(valid)
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    fig_roc, ax_roc = plt.subplots(figsize=(7, 7), dpi=150)
    ax_roc.plot(fpr["micro"], tpr["micro"],
                label=f"micro-average ROC (AUC = {roc_auc['micro']:.3f})", linewidth=2)
    ax_roc.plot(fpr["macro"], tpr["macro"],
                label=f"macro-average ROC (AUC = {roc_auc['macro']:.3f})", linewidth=2)
    ax_roc.plot([0, 1], [0, 1], "k--", linewidth=1)
    ax_roc.set_xlim([0.0, 1.0]); ax_roc.set_ylim([0.0, 1.05])
    ax_roc.set_xlabel("False Positive Rate"); ax_roc.set_ylabel("True Positive Rate")
    ax_roc.set_title("Multiclass ROC (micro & macro)")
    ax_roc.legend(loc="lower right")
    fig_roc.tight_layout()
    fig_roc.savefig(os.path.join(out_dir, "roc_micro_macro.png"))
    plt.close(fig_roc)

# ---------------- 训练主流程 ----------------
def main():
    """Fine-tune the sequence classifier with LoRA (no quantization) and report validation results.

    Steps: load rows from FILE_PATH and make a stratified 90/10 train/val
    split; load the tokenizer and base model; attach LoRA adapters; train
    with the HF Trainer, keeping the checkpoint with the best eval_acc_4D;
    save confusion-matrix and ROC figures for the validation split; print
    the final metrics; save the adapter to OUTPUT_DIR; run one sample
    prediction.

    NOTE(review): the validation split is used both for checkpoint selection
    and for the reported metrics/figures, so these numbers are not a
    held-out test estimate.
    """
    set_seed(SEED)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    rows = load_rows(FILE_PATH)
    train_rows, val_rows = train_test_split(
        rows, test_size=0.1, random_state=SEED, stratify=[r["type"] for r in rows]
    )

    # Tokenizer (trust_remote_code added to be on the safe side)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Model (not quantized) + classification head
    # NOTE(review): weights load in bfloat16 whenever CUDA is available, while the bf16 training
    # flag below additionally requires compute capability >= 8 — confirm the mismatch on older
    # GPUs is intended.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(MBTI_16),   # one logit per MBTI type
        torch_dtype=(torch.bfloat16 if torch.cuda.is_available() else None),
        device_map="auto",
        trust_remote_code=True,
    )
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.use_cache = False
    model.resize_token_embeddings(len(tokenizer))
    if hasattr(model, "gradient_checkpointing_enable"):
        model.gradient_checkpointing_enable()

    # LoRA only (no k-bit preparation)
    from peft import LoraConfig, TaskType, get_peft_model
    peft_cfg = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=16, lora_alpha=32, lora_dropout=0.05,
        target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
        bias="none"
    )
    model = get_peft_model(model, peft_cfg)
    model.print_trainable_parameters()

    train_ds = MBTIDataset(train_rows, tokenizer, max_len=MAX_LEN)
    val_ds   = MBTIDataset(val_rows,   tokenizer, max_len=MAX_LEN)
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

    # Recent transformers releases use `eval_strategy`
    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BSZ_TRN,
        per_device_eval_batch_size=BSZ_EVAL,
        gradient_accumulation_steps=GRAD_ACCUM,
        num_train_epochs=EPOCHS,
        learning_rate=LR,
        warmup_ratio=WARMUP_RATIO,
        weight_decay=WEIGHT_DECAY,
        logging_steps=50,
        save_total_limit=2,
        report_to="none",
        load_best_model_at_end=True,
        metric_for_best_model="eval_acc_4D",
        greater_is_better=True,
        eval_strategy="epoch",
        save_strategy="epoch",
        bf16=(torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8),
        fp16=False,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        # `tokenizer=` is deprecated in Trainer.__init__; `processing_class` replaces it.
        processing_class=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # ====== Build and save the figures (same file names as before) ======
    pred_output = trainer.predict(val_ds)
    logits = pred_output.predictions
    if isinstance(logits, (list, tuple)):
        logits = logits[0]
    probs = F.softmax(torch.tensor(logits, dtype=torch.float32), dim=-1).cpu().numpy()
    y_true = pred_output.label_ids

    plot_confusion_and_roc(
        y_true=y_true,
        y_prob=probs,
        class_names=MBTI_16,
        out_dir=OUTPUT_DIR
    )
    print(f"Saved: {os.path.join(OUTPUT_DIR, 'confusion_matrix.png')}")
    print(f"Saved: {os.path.join(OUTPUT_DIR, 'roc_micro_macro.png')}")

    eval_metrics = trainer.evaluate()
    print("\n=== Final Eval ===")
    for k, v in eval_metrics.items():
        try:
            print(f"{k}: {float(v):.4f}")
        except Exception:
            # Non-numeric metric values are printed as-is.
            print(k, v)

    trainer.save_model(OUTPUT_DIR)
    print(f"\n✅ LoRA adapter saved to: {OUTPUT_DIR}")

    # Simple inference example
    model.eval()
    sample = val_rows[0]
    text = build_input(sample, tokenizer)
    batch = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LEN)
    batch = {k: v.to(model.device) for k, v in batch.items()}
    with torch.no_grad():
        logits = model(**batch).logits
        pred_id = int(torch.argmax(logits, dim=-1))
        pred_mbti = MBTI_16[pred_id]
    print("\n原标签:", sample["type"], "| 预测:", pred_mbti)

if __name__ == "__main__":
    main()


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 18,489,344 || all params: 1,561,811,968 || trainable%: 1.1838


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc 16,Acc Ei,Acc Ns,Acc Tf,Acc Jp,Acc 4d
1,3.9868,1.984458,0.374167,0.678125,0.69875,0.712292,0.685,0.374167
2,3.0299,1.729987,0.463958,0.737917,0.750417,0.75375,0.717917,0.463958
3,1.4719,2.178578,0.479583,0.74,0.752083,0.767292,0.734167,0.479583
4,0.1857,3.985021,0.483958,0.747292,0.76,0.760833,0.736042,0.483958




Saved: mbti_lora_qwen2.5-1.5b_pandora_new_ckpt/confusion_matrix.png
Saved: mbti_lora_qwen2.5-1.5b_pandora_new_ckpt/roc_micro_macro.png



=== Final Eval ===
eval_loss: 3.9850
eval_acc_16: 0.4840
eval_acc_ei: 0.7473
eval_acc_ns: 0.7600
eval_acc_tf: 0.7608
eval_acc_jp: 0.7360
eval_acc_4D: 0.4840
eval_runtime: 100.1548
eval_samples_per_second: 47.9260
eval_steps_per_second: 5.9910
epoch: 4.0000





✅ LoRA adapter saved to: mbti_lora_qwen2.5-1.5b_pandora_new_ckpt

原标签: INTJ | 预测: ISTJ


In [2]:
import json, os
ADAPTER_DIR = "mbti_lora_qwen1.5b_ckpt"   # LoRA checkpoint directory to inspect
# Read the adapter config through a context manager so the file handle is closed
# right away, and decode explicitly as UTF-8 rather than the platform default.
with open(os.path.join(ADAPTER_DIR, "adapter_config.json"), "r", encoding="utf-8") as f:
    cfg = json.load(f)
# These two fields say which base model and task head the adapter was trained for.
print("base_model_name_or_path =", cfg.get("base_model_name_or_path"))
print("task_type =", cfg.get("task_type"))


base_model_name_or_path = Qwen/Qwen2.5-1.5B-Instruct
task_type = SEQ_CLS


In [5]:
# -*- coding: utf-8 -*-
"""
评测脚本（最终版，无 AutoPeft）
- 基座: Qwen/Qwen2.5-1.5B-Instruct
- LoRA: mbti_lora_qwen1.5b_ckpt
- 数据: mbti_sample_with_all_views_pandora.json
- 输出: 指标 + 混淆矩阵 + ROC(micro/macro)
- 纯评测（不训练），GPU-only，4bit 量化
"""

import os, json
from typing import Dict, Any
import numpy as np
import torch
import torch.nn.functional as F

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.preprocessing import label_binarize

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    Trainer, TrainingArguments,
    set_seed,
)
from peft import PeftModel  # 只用 PeftModel，不用 AutoPeft

# ================== Configuration ==================
BASE_MODEL   = "Qwen/Qwen2.5-1.5B-Instruct"                # must match base_model_name_or_path in adapter_config.json
ADAPTER_DIR  = "mbti_lora_qwen1.5b_ckpt"                   # LoRA adapter directory
FILE_PATH    = "mbti_sample_with_all_views_pandora.json"   # new dataset (Pandora)
OUTPUT_DIR   = "eval_on_pandora_outputs"                   # directory for evaluation outputs

MAX_LEN      = 320
USE_4BIT     = True
SEED         = 42
NUM_LABELS   = 16

# The order of MBTI_16 defines the class ids: MBTI2ID maps each type to its index in this list.
MBTI_16 = [
    "INTJ","INTP","ENTJ","ENTP","INFJ","INFP","ENFJ","ENFP",
    "ISTJ","ISFJ","ESTJ","ESFJ","ISTP","ISFP","ESTP","ESFP"
]
MBTI2ID = {t:i for i,t in enumerate(MBTI_16)}

# Per-view token budgets, meant to be the same as the ones used for training.
# NOTE(review): the budgets sum to 312 tokens, leaving only 8 of MAX_LEN for the section markers
# and the [TASK] line that build_input adds — confirm the resulting truncation is intended.
BUDGET = {"posts_cleaned": 192, "semantic_view": 64, "sentiment_view": 32, "linguistic_view": 24}

# Optional Hugging Face access token from the environment; forwarded to from_pretrained only when set.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_KW = {"token": HF_TOKEN} if HF_TOKEN else {}

# ================== 工具函数 ==================
def mbti_to_4d(m: str):
    """Encode a 4-letter MBTI string (any case) as a tuple of four 0/1 flags.

    After upper-casing, the letters I, S, F and P give 0 at their position
    and any other character gives 1. Raises IndexError when ``m`` has fewer
    than four characters.
    """
    letters = m.upper()
    flags = []
    for position, zero_letter in enumerate("ISFP"):
        flags.append(0 if letters[position] == zero_letter else 1)
    return tuple(flags)

def truncate_to_budget(tok: AutoTokenizer, text: str, budget: int) -> str:
    """Return ``text`` cut down to its first ``budget`` tokens.

    The text (or an empty string when ``text`` is falsy) is tokenized without
    special tokens, the id list is sliced to ``budget`` entries, and the
    slice is decoded back to a string.
    """
    all_ids = tok(text or "", add_special_tokens=False)["input_ids"]
    return tok.decode(all_ids[:budget])

def build_input(item: Dict[str, Any], tok: AutoTokenizer) -> str:
    """Assemble the multi-view classification prompt for one sample.

    The posts text comes from ``posts_cleaned`` (or ``posts`` when that key
    is absent), falling back to ``text`` when the result is empty. Every
    view is truncated to its entry in BUDGET and placed under its own
    section marker, followed by a [TASK] line listing the 16 types.
    """
    raw_views = {
        "posts_cleaned": item.get("posts_cleaned", item.get("posts","")) or item.get("text","") or "",
        "semantic_view": item.get("semantic_view","") or "",
        "sentiment_view": item.get("sentiment_view","") or "",
        "linguistic_view": item.get("linguistic_view","") or "",
    }
    # Dict order is insertion order, so truncation runs posts -> semantic -> sentiment -> linguistic.
    trimmed = {
        view: truncate_to_budget(tok, raw, BUDGET[view])
        for view, raw in raw_views.items()
    }

    pieces = [
        f"[POSTS]\n{trimmed['posts_cleaned']}\n",
        f"[SEMANTIC]\n{trimmed['semantic_view']}\n",
        f"[SENTIMENT]\n{trimmed['sentiment_view']}\n",
        f"[LINGUISTIC]\n{trimmed['linguistic_view']}\n",
        f"[TASK] Predict MBTI type among {', '.join(MBTI_16)}.",
    ]
    return "".join(pieces)

def load_rows(path: str):
    """Load the JSON dataset at ``path`` and keep rows with a valid MBTI label.

    The label is read from ``"type"``, or from ``"label"`` when ``"type"`` is
    missing or empty. It is upper-cased and stripped, and the normalized
    value is written back to ``row["type"]`` (the loaded row dict is modified
    in place).

    Args:
        path: path to a UTF-8 JSON file holding a list of sample records.

    Returns:
        The records whose normalized label is a key of MBTI2ID, in file order.

    Entries that are not dicts, or whose label is not a string, are skipped
    instead of raising AttributeError from ``.get`` / ``.upper()``.
    """
    with open(path, "r", encoding="utf-8") as f:
        rows = json.load(f)
    clean = []
    for r in rows:
        if not isinstance(r, dict):
            continue
        raw_label = r.get("type") or r.get("label") or ""
        if not isinstance(raw_label, str):
            continue
        t = raw_label.upper().strip()
        if t in MBTI2ID:
            r["type"] = t
            clean.append(r)
    return clean

class MBTIDataset(torch.utils.data.Dataset):
    """Map-style dataset that builds and tokenizes the prompt lazily per item.

    Each item is a dict with ``input_ids``, ``attention_mask`` and an integer
    ``labels`` entry taken from MBTI2ID. No padding is applied here.
    """

    def __init__(self, rows, tokenizer, max_len=512):
        self.rows, self.tok, self.max_len = rows, tokenizer, max_len

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, idx):
        entry = self.rows[idx]
        prompt = build_input(entry, self.tok)
        class_id = MBTI2ID[entry["type"]]
        # Truncate to max_len; padding is left to the collator.
        encoding = self.tok(prompt, truncation=True, max_length=self.max_len)
        item = {field: encoding[field] for field in ("input_ids", "attention_mask")}
        item["labels"] = class_id
        return item

def compute_metrics(eval_pred):
    """Compute 16-way accuracy plus per-axis and all-axes MBTI accuracy.

    Accepts either a ``(predictions, labels)`` tuple or an object exposing
    ``predictions`` and ``label_ids``. When predictions arrive as a
    list/tuple, the first element is taken as the logits.

    Returns a dict with keys acc_16, acc_ei, acc_ns, acc_tf, acc_jp, acc_4D.
    Since the 16 labels map one-to-one onto the four binary axes, acc_4D
    equals acc_16 by construction.
    """
    if isinstance(eval_pred, tuple):
        preds, labels = eval_pred
    else:
        preds, labels = eval_pred.predictions, eval_pred.label_ids
    if isinstance(preds, (list, tuple)):
        preds = preds[0]
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    pred_ids = preds.argmax(-1)
    acc16 = float((pred_ids == labels).mean())

    pred_types = [MBTI_16[i] for i in pred_ids]
    true_types = [MBTI_16[i] for i in labels]

    # axis_hits[k] counts samples whose k-th axis (E/I, N/S, T/F, J/P) is right.
    axis_hits = [0, 0, 0, 0]
    exact_hits = 0
    for pred_type, true_type in zip(pred_types, true_types):
        pred_axes = mbti_to_4d(pred_type)
        true_axes = mbti_to_4d(true_type)
        agree = [p == t for p, t in zip(pred_axes, true_axes)]
        for axis, ok in enumerate(agree):
            axis_hits[axis] += ok
        exact_hits += all(agree)

    n = len(labels)
    return {
        "acc_16": acc16,
        "acc_ei": axis_hits[0]/n,
        "acc_ns": axis_hits[1]/n,
        "acc_tf": axis_hits[2]/n,
        "acc_jp": axis_hits[3]/n,
        "acc_4D": exact_hits/n,
    }

def plot_confusion_and_roc(y_true, y_prob, class_names, out_dir):
    """Save the Pandora confusion-matrix and micro/macro ROC figures into ``out_dir``.

    ``pandora_confusion_matrix.png`` is always written.
    ``pandora_roc_micro_macro.png`` is written only when at least two classes
    have positive samples in ``y_true``; classes without positives are left
    out of the ROC computation.
    """
    os.makedirs(out_dir, exist_ok=True)
    class_ids = list(range(len(class_names)))

    # ----- confusion matrix -----
    predicted = np.argmax(y_prob, axis=-1)
    display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_true, predicted, labels=class_ids),
        display_labels=class_names,
    )
    fig_cm, ax_cm = plt.subplots(figsize=(8, 8), dpi=150)
    display.plot(ax=ax_cm, xticks_rotation=45, cmap="Blues", colorbar=False)
    ax_cm.set_title("Confusion Matrix (Pandora)")
    fig_cm.tight_layout()
    fig_cm.savefig(os.path.join(out_dir, "pandora_confusion_matrix.png"))
    plt.close(fig_cm)

    # ----- ROC: skip classes with no positive sample in the evaluation set -----
    onehot = label_binarize(y_true, classes=class_ids)
    fpr, tpr, roc_auc = {}, {}, {}
    present = []
    for cls in class_ids:
        if onehot[:, cls].sum() != 0:
            fpr[cls], tpr[cls], _ = roc_curve(onehot[:, cls], y_prob[:, cls])
            roc_auc[cls] = auc(fpr[cls], tpr[cls])
            present.append(cls)

    # Averaged curves need at least two usable classes; otherwise no ROC figure.
    if len(present) < 2:
        return

    # micro: pool every (sample, class) decision of the usable classes.
    fpr["micro"], tpr["micro"], _ = roc_curve(
        onehot[:, present].ravel(), y_prob[:, present].ravel()
    )
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # macro: average the per-class TPR curves on a shared FPR grid.
    fpr_grid = np.unique(np.concatenate([fpr[cls] for cls in present]))
    tpr_avg = np.zeros_like(fpr_grid)
    for cls in present:
        tpr_avg += np.interp(fpr_grid, fpr[cls], tpr[cls])
    tpr_avg /= len(present)
    fpr["macro"], tpr["macro"] = fpr_grid, tpr_avg
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    fig_roc, ax_roc = plt.subplots(figsize=(7, 7), dpi=150)
    ax_roc.plot(fpr["micro"], tpr["micro"],
                label=f"micro-average ROC (AUC = {roc_auc['micro']:.3f})", linewidth=2)
    ax_roc.plot(fpr["macro"], tpr["macro"],
                label=f"macro-average ROC (AUC = {roc_auc['macro']:.3f})", linewidth=2)
    ax_roc.plot([0, 1], [0, 1], "k--", linewidth=1)
    ax_roc.set_xlim([0.0, 1.0])
    ax_roc.set_ylim([0.0, 1.05])
    ax_roc.set_xlabel("False Positive Rate")
    ax_roc.set_ylabel("True Positive Rate")
    ax_roc.set_title("Multiclass ROC (Pandora)")
    ax_roc.legend(loc="lower right")
    fig_roc.tight_layout()
    fig_roc.savefig(os.path.join(out_dir, "pandora_roc_micro_macro.png"))
    plt.close(fig_roc)

# ================== 主流程 ==================
def main():
    """Evaluate a saved LoRA adapter on the rows in FILE_PATH (eval only, no training).

    Pipeline:
      1. Load the tokenizer and the (optionally 4-bit quantized) base model with
         a NUM_LABELS-way sequence-classification head, then attach the adapter
         stored in ADAPTER_DIR in inference mode.
      2. Run a single prediction pass over the dataset; the same pass provides
         both the logits (for plots) and the metrics from ``compute_metrics``.
      3. Write the confusion matrix / ROC plots to OUTPUT_DIR, print the
         metrics, and print one sample prediction.

    The previous version called ``trainer.predict`` and then
    ``trainer.evaluate`` on the same dataset, which ran inference twice.
    ``predict(..., metric_key_prefix="eval")`` yields the same ``eval_*``
    metric keys from one pass.
    """
    # GPU-only environment
    os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
    os.environ["BITSANDBYTES_NOWELCOME"] = "1"  # mind the spelling of this variable
    torch.cuda.set_device(0)
    set_seed(SEED)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # Tokenizer must match the base model
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL, use_fast=True, trust_remote_code=True, **HF_KW
    )
    if tokenizer.pad_token is None:
        # Fall back to EOS so that batches can be padded.
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Quantization config (None when 4-bit loading is disabled)
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=USE_4BIT,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,  # older GPUs may need torch.float16
    ) if USE_4BIT else None

    # Key step: build the base classification model with num_labels set first,
    # then wrap it with the LoRA adapter.
    base_cfg = AutoConfig.from_pretrained(BASE_MODEL, trust_remote_code=True, **HF_KW)
    base_cfg.num_labels = NUM_LABELS

    base = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL,
        config=base_cfg,                         # config carrying num_labels
        device_map={"": "cuda:0"},
        quantization_config=quant_cfg,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        **HF_KW,
    )

    model = PeftModel.from_pretrained(base, ADAPTER_DIR, is_trainable=False)
    model.config.use_cache = False
    model.config.pad_token_id = tokenizer.pad_token_id

    # Optional shape check; best-effort, but report why it was skipped
    # instead of swallowing the error silently.
    try:
        print("hidden_size =", model.base_model.model.config.hidden_size)
        print("score.weight shape =", tuple(model.base_model.model.score.weight.shape))
    except Exception as exc:
        print("shape check skipped:", exc)

    # Data
    rows = load_rows(FILE_PATH)
    eval_ds = MBTIDataset(rows, tokenizer, max_len=MAX_LEN)
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

    # Evaluation-only Trainer
    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_eval_batch_size=8,
        dataloader_drop_last=False,
        report_to="none",
        fp16=False, bf16=False,
    )
    trainer = Trainer(
        model=model,
        args=args,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )

    # Predict once -> probabilities -> plots. The "eval" prefix keeps the
    # metric names identical to what trainer.evaluate() would report.
    pred_output = trainer.predict(eval_ds, metric_key_prefix="eval")
    logits = pred_output.predictions
    if isinstance(logits, (list, tuple)):
        logits = logits[0]
    probs = F.softmax(torch.tensor(logits, dtype=torch.float32), dim=-1).cpu().numpy()
    y_true = pred_output.label_ids

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    plot_confusion_and_roc(y_true, probs, MBTI_16, OUTPUT_DIR)

    # Metrics (16-way accuracy plus the per-dimension accuracies), taken from
    # the prediction pass above rather than a second inference run.
    metrics = pred_output.metrics or {}
    print("\n=== Pandora Eval ===")
    for k, v in metrics.items():
        try:
            print(f"{k}: {float(v):.4f}")
        except Exception:
            # Non-numeric metric values are printed verbatim.
            print(k, v)

    # Single-sample demo
    model.eval()
    sample = rows[0]
    text = build_input(sample, tokenizer)
    batch = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LEN)
    batch = {k: v.to("cuda:0") for k, v in batch.items()}
    with torch.no_grad():
        out = model(**batch).logits
        pred_id = int(torch.argmax(out, dim=-1))
        print("样例原标签:", sample["type"], "| 预测:", MBTI_16[pred_id])

# Entry point: run the evaluation pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


hidden_size = 1536
score.weight shape = (16, 1536)


  trainer = Trainer(



=== Pandora Eval ===
eval_loss: 3.4766
eval_model_preparation_time: 0.0030
eval_acc_16: 0.1066
eval_acc_ei: 0.5264
eval_acc_ns: 0.5206
eval_acc_tf: 0.6357
eval_acc_jp: 0.5229
eval_acc_4D: 0.1066
eval_runtime: 807.8859
eval_samples_per_second: 59.4140
eval_steps_per_second: 7.4270
样例原标签: INTP | 预测: INFJ


# 融合训练 单个评测kaggle/pandora

In [None]:
# -*- coding: utf-8 -*-
"""
训练 + 评测（LoRA / 4bit / 单卡）
- 基座: Qwen/Qwen2.5-1.5B-Instruct
- 训练集: 两数据集合并 (mbti_sample_with_all_views.json + mbti_sample_with_all_views_pandora.json)
- Eval/Test: 只在指定的数据集上评测 (默认 Pandora)
- 输出: 指标 + 混淆矩阵 + ROC(micro/macro) + LoRA 适配器权重
"""
import os, json, random
from typing import Dict, Any, List
import numpy as np
import torch
import torch.nn.functional as F

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.preprocessing import label_binarize

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    Trainer, TrainingArguments,
    set_seed,
)
from peft import (
    LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
)

# ================== Configuration ==================
BASE_MODEL   = "Qwen/Qwen2.5-1.5B-Instruct"
DATA_A       = "mbti_sample_with_all_views.json"          # original (older) dataset
DATA_B       = "mbti_sample_with_all_views_pandora.json"  # Pandora
EVAL_ON      = "B"  # which dataset eval/test runs on: "A" or "B"
OUTPUT_DIR   = "qwen-test-on-pandora"                      # output directory (also holds the LoRA adapter)
RESUME_ADAPTER_DIR = None  # directory of an existing LoRA checkpoint to resume from; otherwise None

# MAX_LEN caps the tokenized prompt length; NUM_LABELS sizes the classification head.
MAX_LEN      = 320
USE_4BIT     = True
SEED         = 42
NUM_LABELS   = 16

# LoRA hyperparameters (tune as needed)
LORA_R       = 16
LORA_ALPHA   = 32
LORA_DROPOUT = 0.05
# Target modules commonly used for Qwen2.5
LORA_TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

# Training hyperparameters (adjust to the available GPU memory)
BATCH_SIZE_PER_DEVICE_TRAIN = 8
BATCH_SIZE_PER_DEVICE_EVAL  = 8
GR_ACCUM_STEPS              = 1
EPOCHS                      = 3
LR                          = 2e-4
WARMUP_RATIO                = 0.05
LOGGING_STEPS               = 20
SAVE_STEPS                  = 500
EVAL_STEPS                  = 500

# The 16 MBTI type labels; a label's list position is its class id.
MBTI_16 = [
    "INTJ","INTP","ENTJ","ENTP","INFJ","INFP","ENFJ","ENFP",
    "ISTJ","ISFJ","ESTJ","ESFJ","ISTP","ISFP","ESTP","ESFP"
]
MBTI2ID = {t:i for i,t in enumerate(MBTI_16)}

# Per-view token budgets, kept consistent with training (they sum to 312 tokens).
BUDGET = {"posts_cleaned": 192, "semantic_view": 64, "sentiment_view": 32, "linguistic_view": 24}

# Optional Hugging Face access token read from the environment; HF_KW is
# passed to from_pretrained calls and is empty when no token is set.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_KW = {"token": HF_TOKEN} if HF_TOKEN else {}

# ================== 工具函数 ==================
def mbti_to_4d(m: str):
    """Encode an MBTI type string as a 4-tuple of 0/1 flags.

    The input is upper-cased first. Each position is compared against its
    "zero" letter (I, S, F, P in that order): a match gives 0, any other
    character gives 1. Strings shorter than four characters raise IndexError.
    """
    letters = m.upper()
    zero_letters = "ISFP"  # position-wise letters that map to 0
    return tuple(int(letters[pos] != zero) for pos, zero in enumerate(zero_letters))

def truncate_to_budget(tok: AutoTokenizer, text: str, budget: int) -> str:
    """Clip ``text`` to at most ``budget`` tokens and return it as a string.

    The text (empty string when falsy) is tokenized without special tokens,
    the id list is cut to its first ``budget`` entries, and the result is
    decoded back to text with the same tokenizer.
    """
    source = text if text else ""
    token_ids = tok(source, add_special_tokens=False)["input_ids"]
    return tok.decode(token_ids[:budget])

def build_input(item: Dict[str, Any], tok: AutoTokenizer) -> str:
    """Assemble the multi-view classification prompt for one sample.

    The post text is taken from ``posts_cleaned`` (falling back to ``posts``
    when that key is absent), then from ``text`` if the result is falsy.
    Each of the four views is clipped to its entry in ``BUDGET`` and placed
    under its own bracketed tag, followed by the task instruction listing
    every label in ``MBTI_16``.
    """
    fallback_posts = item.get("posts", "")
    posts_text = item.get("posts_cleaned", fallback_posts) or item.get("text", "") or ""

    # View key -> raw text, in the order the views appear in the prompt.
    raw_views = {
        "posts_cleaned": posts_text,
        "semantic_view": item.get("semantic_view", "") or "",
        "sentiment_view": item.get("sentiment_view", "") or "",
        "linguistic_view": item.get("linguistic_view", "") or "",
    }
    p, sem, sen, lin = (
        truncate_to_budget(tok, view_text, BUDGET[view_key])
        for view_key, view_text in raw_views.items()
    )

    return (
        f"[POSTS]\n{p}\n[SEMANTIC]\n{sem}\n[SENTIMENT]\n{sen}\n[LINGUISTIC]\n{lin}\n"
        f"[TASK] Predict MBTI type among {', '.join(MBTI_16)}."
    )

def load_rows(path: str):
    """Load a JSON array of samples and keep only rows with a valid MBTI label.

    The label is read from ``type`` (or ``label`` when ``type`` is missing or
    falsy), upper-cased and stripped, and must be a key of ``MBTI2ID``.
    Accepted rows get their normalized label written back to ``row["type"]``
    (the row dict is modified in place) and are returned in file order.

    Entries that are not dicts, or whose label is not a string, are skipped
    instead of raising, matching the ``isinstance(r, dict)`` filter used by
    the split script at the top of the notebook.

    Returns:
        list of row dicts; empty when nothing in the file qualifies.
    """
    with open(path, "r", encoding="utf-8") as f:
        rows = json.load(f)
    clean = []
    for r in rows:
        if not isinstance(r, dict):
            continue  # malformed entry (string, number, null, ...)
        raw_label = r.get("type") or r.get("label") or ""
        if not isinstance(raw_label, str):
            continue  # e.g. a numeric label cannot be an MBTI type
        t = raw_label.upper().strip()
        if t in MBTI2ID:
            r["type"] = t
            clean.append(r)
    return clean

class MBTIDataset(torch.utils.data.Dataset):
    """Map-style dataset that builds and tokenizes one MBTI sample per access.

    Each item is a dict with ``input_ids``, ``attention_mask`` and the integer
    class id under ``labels``. Sequences are truncated to ``max_len`` but not
    padded here.
    """

    def __init__(self, rows, tokenizer, max_len=512):
        self.rows = rows
        self.tok = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, idx):
        row = self.rows[idx]
        prompt = build_input(row, self.tok)
        label_id = MBTI2ID[row["type"]]
        encoded = self.tok(prompt, truncation=True, max_length=self.max_len)
        return {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
            "labels": label_id,
        }

def compute_metrics(eval_pred):
    """Compute 16-way accuracy plus per-dimension MBTI accuracies.

    ``eval_pred`` is either a ``(predictions, label_ids)`` tuple or an object
    exposing ``.predictions`` and ``.label_ids``. When the predictions are a
    list/tuple, only the first element is used as the logits.

    Returns a dict with ``acc_16`` (exact type match), ``acc_ei``/``acc_ns``/
    ``acc_tf``/``acc_jp`` (one MBTI letter position each) and ``acc_4D``
    (all four positions match).
    """
    if isinstance(eval_pred, tuple):
        preds, labels = eval_pred
    else:
        preds, labels = eval_pred.predictions, eval_pred.label_ids
    if isinstance(preds, (list, tuple)):
        preds = preds[0]

    logits = np.asarray(preds)
    gold_ids = np.asarray(labels)
    guess_ids = logits.argmax(-1)
    acc16 = float((guess_ids == gold_ids).mean())

    # Translate class ids into 4-flag tuples once, then count matches.
    guess_dims = [mbti_to_4d(MBTI_16[i]) for i in guess_ids]
    gold_dims = [mbti_to_4d(MBTI_16[i]) for i in gold_ids]
    pairs = list(zip(guess_dims, gold_dims))

    total = len(gold_ids)
    per_dim_hits = [sum(g[d] == t[d] for g, t in pairs) for d in range(4)]
    full_hits = sum(g == t for g, t in pairs)

    return {
        "acc_16": acc16,
        "acc_ei": per_dim_hits[0] / total,
        "acc_ns": per_dim_hits[1] / total,
        "acc_tf": per_dim_hits[2] / total,
        "acc_jp": per_dim_hits[3] / total,
        "acc_4D": full_hits / total,
    }

def plot_confusion_and_roc(y_true, y_prob, class_names, out_dir, tag="eval"):
    """Save a confusion-matrix plot and a micro/macro ROC plot under ``out_dir``.

    Args:
        y_true: integer class ids of the evaluated samples.
        y_prob: per-sample class scores; the argmax over the last axis is the
            predicted class and column ``i`` is the score of class ``i``.
        class_names: display names, one per class id.
        out_dir: directory for the PNG files (created if missing).
        tag: prefix used in the plot titles and file names.

    The ROC figure is only written when at least two classes have positive
    samples in ``y_true``; classes without positives are left out of it.
    """
    os.makedirs(out_dir, exist_ok=True)
    class_ids = list(range(len(class_names)))

    # ---- confusion matrix ----
    predicted = np.argmax(y_prob, axis=-1)
    matrix = confusion_matrix(y_true, predicted, labels=class_ids)
    fig_cm, ax_cm = plt.subplots(figsize=(8, 8), dpi=150)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=class_names).plot(
        ax=ax_cm, xticks_rotation=45, cmap="Blues", colorbar=False
    )
    ax_cm.set_title(f"Confusion Matrix ({tag})")
    fig_cm.tight_layout()
    fig_cm.savefig(os.path.join(out_dir, f"{tag}_confusion_matrix.png"))
    plt.close(fig_cm)

    # ---- ROC: skip classes that have no positive sample in the eval set ----
    onehot = label_binarize(y_true, classes=class_ids)
    fpr, tpr, roc_auc = {}, {}, {}
    valid = []
    for cls in class_ids:
        if onehot[:, cls].sum() != 0:
            fpr[cls], tpr[cls], _ = roc_curve(onehot[:, cls], y_prob[:, cls])
            roc_auc[cls] = auc(fpr[cls], tpr[cls])
            valid.append(cls)

    # Averaged curves need at least two usable classes.
    if len(valid) < 2:
        return

    # Micro average: pool every (sample, class) decision of the usable classes.
    fpr["micro"], tpr["micro"], _ = roc_curve(
        onehot[:, valid].ravel(), y_prob[:, valid].ravel()
    )
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro average: interpolate each class curve on the union of FPR points
    # and take the unweighted mean of the TPR values.
    all_fpr = np.unique(np.concatenate([fpr[cls] for cls in valid]))
    tpr_total = sum(
        (np.interp(all_fpr, fpr[cls], tpr[cls]) for cls in valid),
        np.zeros_like(all_fpr),
    )
    fpr["macro"] = all_fpr
    tpr["macro"] = tpr_total / len(valid)
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    fig_roc, ax_roc = plt.subplots(figsize=(7, 7), dpi=150)
    ax_roc.plot(fpr["micro"], tpr["micro"],
                label=f"micro-average ROC (AUC = {roc_auc['micro']:.3f})", linewidth=2)
    ax_roc.plot(fpr["macro"], tpr["macro"],
                label=f"macro-average ROC (AUC = {roc_auc['macro']:.3f})", linewidth=2)
    ax_roc.plot([0, 1], [0, 1], "k--", linewidth=1)  # chance diagonal
    ax_roc.set_xlim([0.0, 1.0])
    ax_roc.set_ylim([0.0, 1.05])
    ax_roc.set_xlabel("False Positive Rate")
    ax_roc.set_ylabel("True Positive Rate")
    ax_roc.set_title(f"Multiclass ROC ({tag})")
    ax_roc.legend(loc="lower right")
    fig_roc.tight_layout()
    fig_roc.savefig(os.path.join(out_dir, f"{tag}_roc_micro_macro.png"))
    plt.close(fig_roc)

# ================== 主流程 ==================
def _predict_probs(trainer, dataset, metric_key_prefix):
    """Run one prediction pass; return (softmax probabilities, label ids, metrics)."""
    output = trainer.predict(dataset, metric_key_prefix=metric_key_prefix)
    logits = output.predictions
    if isinstance(logits, (list, tuple)):
        logits = logits[0]
    probs = F.softmax(torch.tensor(logits, dtype=torch.float32), dim=-1).cpu().numpy()
    return probs, output.label_ids, (output.metrics or {})


def main():
    """Train a LoRA adapter on A ∪ B, then evaluate on one chosen dataset.

    Pipeline:
      1. Load tokenizer and the (optionally 4-bit) base model with a
         NUM_LABELS-way classification head; prepare it for k-bit training
         and attach either a fresh LoRA adapter or the one in
         RESUME_ADAPTER_DIR.
      2. Build the training set from both datasets and an ordered 80/20
         eval/test split of the dataset selected by EVAL_ON.
      3. Train, save the adapter to ``OUTPUT_DIR/lora_adapter``, then write
         plots and print metrics for the eval and test parts, followed by a
         one-sample demo prediction.

    Raises:
        ValueError: when no valid rows are available for training or for the
            dataset selected by EVAL_ON.
    """
    # Environment & seed
    os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
    os.environ["BITSANDBYTES_NOWELCOME"] = "1"
    torch.cuda.set_device(0)
    set_seed(SEED)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL, use_fast=True, trust_remote_code=True, **HF_KW
    )
    if tokenizer.pad_token is None:
        # Fall back to EOS so that batches can be padded.
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Quantization (None when 4-bit loading is disabled)
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=USE_4BIT,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    ) if USE_4BIT else None

    # Classification head: num_labels = NUM_LABELS
    base_cfg = AutoConfig.from_pretrained(BASE_MODEL, trust_remote_code=True, **HF_KW)
    base_cfg.num_labels = NUM_LABELS

    # Base model
    base = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL,
        config=base_cfg,
        device_map={"": "cuda:0"},
        quantization_config=quant_cfg,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        **HF_KW,
    )

    # ========= LoRA: fresh adapter or resume =========
    # Prepare the base for k-bit training in BOTH paths. Previously only the
    # fresh-adapter path did this, so a resumed run trained on a base that
    # was set up differently from the run that produced the checkpoint.
    base = prepare_model_for_kbit_training(base)
    if RESUME_ADAPTER_DIR:
        # Continue training an existing LoRA adapter
        model = PeftModel.from_pretrained(base, RESUME_ADAPTER_DIR, is_trainable=True)
    else:
        # Create a new LoRA adapter
        lora_cfg = LoraConfig(
            r=LORA_R,
            lora_alpha=LORA_ALPHA,
            target_modules=LORA_TARGET_MODULES,
            lora_dropout=LORA_DROPOUT,
            bias="none",
            task_type="SEQ_CLS",
        )
        model = get_peft_model(base, lora_cfg)

    model.config.use_cache = False
    model.config.pad_token_id = tokenizer.pad_token_id
    model.print_trainable_parameters()

    # ========= Data =========
    rows_A = load_rows(DATA_A)
    rows_B = load_rows(DATA_B)

    # Training set = A ∪ B
    train_rows: List[Dict[str, Any]] = rows_A + rows_B
    if not train_rows:
        raise ValueError("No valid MBTI rows found in DATA_A / DATA_B; nothing to train on.")
    random.Random(SEED).shuffle(train_rows)

    # Eval/test use exactly one dataset (anything other than "A" means B).
    eval_on_a = EVAL_ON.upper() == "A"
    source_rows = rows_A if eval_on_a else rows_B
    eval_tag = "A_eval" if eval_on_a else "B_eval"
    if not source_rows:
        raise ValueError(f"No valid MBTI rows found for the evaluation dataset {EVAL_ON!r}.")

    # NOTE(review): train_rows contains all of A ∪ B, while the eval/test rows
    # below are slices of rows_A or rows_B, so every eval/test row is also
    # trained on. The reported metrics (and the checkpoint chosen by
    # load_best_model_at_end) therefore measure fit on seen data, not held-out
    # generalization. Left unchanged because it is this cell's documented
    # design; drop the eval/test slice from train_rows for an unbiased estimate.
    # The split is also taken in file order (no shuffle, no stratification).
    cut = int(0.8 * len(source_rows)) if len(source_rows) > 5 else len(source_rows)
    test_rows = source_rows[cut:]
    eval_rows = source_rows[:cut] if cut > 0 else source_rows

    # Datasets
    train_ds = MBTIDataset(train_rows, tokenizer, max_len=MAX_LEN)
    eval_ds  = MBTIDataset(eval_rows,  tokenizer, max_len=MAX_LEN)
    test_ds  = MBTIDataset(test_rows,  tokenizer, max_len=MAX_LEN) if test_rows else None
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

    # ========= Training arguments =========
    common_kwargs = dict(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BATCH_SIZE_PER_DEVICE_TRAIN,
        per_device_eval_batch_size=BATCH_SIZE_PER_DEVICE_EVAL,
        gradient_accumulation_steps=GR_ACCUM_STEPS,
        learning_rate=LR,
        num_train_epochs=EPOCHS,
        warmup_ratio=WARMUP_RATIO,
        logging_steps=LOGGING_STEPS,
        eval_steps=EVAL_STEPS,
        save_steps=SAVE_STEPS,
        save_total_limit=2,
        lr_scheduler_type="cosine",
        report_to="none",
        fp16=False, bf16=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_acc_16",
        greater_is_better=True,
        # Set explicitly so logging and saving both run on a step schedule.
        logging_strategy="steps",
        save_strategy="steps",
    )

    # Try the new / old / legacy argument names in turn so that different
    # transformers versions all work.
    try:
        args = TrainingArguments(eval_strategy="steps", **common_kwargs)
    except TypeError:
        try:
            args = TrainingArguments(evaluation_strategy="steps", **common_kwargs)
        except TypeError:
            # Fallback for very old (3.x) versions
            args = TrainingArguments(evaluate_during_training=True, **common_kwargs)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )

    # ========= Train =========
    trainer.train()

    # Save the LoRA adapter (best-effort: a failed save is reported and the
    # evaluation below still runs).
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    try:
        model.save_pretrained(os.path.join(OUTPUT_DIR, "lora_adapter"))
    except Exception as e:
        print("Save adapter failed:", e)

    # ========= Eval (on the chosen dataset) =========
    # One prediction pass supplies both the plots and the metrics; the "eval"
    # prefix keeps the metric names a trainer.evaluate() call would report,
    # without running inference over the eval set a second time.
    probs, y_true, metrics = _predict_probs(trainer, eval_ds, "eval")
    plot_confusion_and_roc(y_true, probs, MBTI_16, OUTPUT_DIR, tag=f"{eval_tag}")

    print("\n=== Eval on chosen dataset ===")
    for k, v in metrics.items():
        try:
            print(f"{k}: {float(v):.4f}")
        except Exception:
            # Non-numeric metric values are printed verbatim.
            print(k, v)

    # ========= Test (the remaining part of the same dataset) =========
    if test_ds and len(test_ds) > 0:
        probs, y_true, _ = _predict_probs(trainer, test_ds, "test")
        plot_confusion_and_roc(y_true, probs, MBTI_16, OUTPUT_DIR, tag=f"{eval_tag}_test")

        # Plain overall accuracy
        pred_ids = probs.argmax(-1)
        acc = float((pred_ids == y_true).mean())
        print(f"\n=== Test accuracy on chosen dataset: {acc:.4f}")

    # ========= Demo inference =========
    # Use the first row of the dataset that was actually evaluated, so the
    # demo always agrees with the EVAL_ON branch chosen above.
    model.eval()
    sample = source_rows[0]
    text = build_input(sample, tokenizer)
    batch = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LEN)
    batch = {k: v.to("cuda:0") for k, v in batch.items()}
    with torch.no_grad():
        out = model(**batch).logits
        pred_id = int(torch.argmax(out, dim=-1))
        print("样例原标签:", sample["type"], "| 预测:", MBTI_16[pred_id])

# Entry point: run training + evaluation when this cell/script is executed directly.
if __name__ == "__main__":
    main()


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 18,489,344 || all params: 1,562,228,224 || trainable%: 1.1835


  trainer = Trainer(
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


In [1]:
import os, json, random

# Seed for the shuffle-index export below.
SEED = 42
# Input datasets (A and the Pandora set B) and the directory the split files are written to.
DATA_A = "mbti_sample_with_all_views.json"
DATA_B = "mbti_sample_with_all_views_pandora.json"
OUT = "qwen-test-on-pandora/splits"

# Valid MBTI labels; a set here because it is only used for membership tests.
MBTI_16 = {"INTJ","INTP","ENTJ","ENTP","INFJ","INFP","ENFJ","ENFP",
           "ISTJ","ISFJ","ESTJ","ESFJ","ISTP","ISFP","ESTP","ESFP"}

def load_rows(path):
    """Load a JSON array of samples and keep only rows with a valid MBTI label.

    The label is read from ``type`` (or ``label`` when ``type`` is missing or
    falsy), upper-cased and stripped, and must be a member of ``MBTI_16``.
    Accepted rows get the normalized label written back to ``row["type"]``
    (in place). Rows are neither shuffled nor de-duplicated, so the original
    file order is preserved.

    Entries that are not dicts, or whose label is not a string, are skipped
    instead of raising. The file is opened with a context manager so the
    handle is closed deterministically.
    """
    with open(path, "r", encoding="utf-8") as fh:
        rows = json.load(fh)
    clean = []
    for r in rows:
        if not isinstance(r, dict):
            continue  # malformed entry (string, number, null, ...)
        raw_label = r.get("type") or r.get("label") or ""
        if not isinstance(raw_label, str):
            continue  # e.g. a numeric label cannot be an MBTI type
        t = raw_label.upper().strip()
        if t in MBTI_16:
            r["type"] = t
            clean.append(r)   # no shuffle, no de-duplication: keep file order
    return clean

# Export the row sets used by the training cell. Every output file is written
# through a context manager so it is flushed and closed before the next step
# (the previous json.dump(..., open(...)) calls never closed their handles).
os.makedirs(OUT, exist_ok=True)

rows_A = load_rows(DATA_A)
rows_B = load_rows(DATA_B)

# The set actually used for training: every sample of A ∪ B.
train_rows_all = rows_A + rows_B
with open(os.path.join(OUT, "train_used_in_training.json"), "w", encoding="utf-8") as fh:
    json.dump(train_rows_all, fh, ensure_ascii=False, indent=2)

# To reproduce the shuffled list order used in training, record the index
# permutation that the same seeded shuffle produces for a list of this length.
# NOTE(review): this reproduces only the order of the list handed to the
# dataset; the Trainer presumably applies its own batch sampling on top.
idx = list(range(len(train_rows_all)))
random.Random(SEED).shuffle(idx)
with open(os.path.join(OUT, "train_shuffle_index_seed42.json"), "w", encoding="utf-8") as fh:
    json.dump(idx, fh)

# Pandora eval/test, matching the training script: no shuffle, ordered 8:2 split.
cut = int(0.8 * len(rows_B)) if len(rows_B) > 5 else len(rows_B)
pandora_eval = rows_B[:cut]
pandora_test = rows_B[cut:]

with open(os.path.join(OUT, "pandora_eval_80.json"), "w", encoding="utf-8") as fh:
    json.dump(pandora_eval, fh, ensure_ascii=False, indent=2)
with open(os.path.join(OUT, "pandora_test_20.json"), "w", encoding="utf-8") as fh:
    json.dump(pandora_test, fh, ensure_ascii=False, indent=2)

print(f"Saved: train_used_in_training={len(train_rows_all)}, "
      f"pandora_eval_80={len(pandora_eval)}, pandora_test_20={len(pandora_test)}")


Saved: train_used_in_training=82032, pandora_eval_80=38400, pandora_test_20=9600
