In [None]:
import sys, subprocess, platform, torch, time
import os
import math
import json
import random
from pathlib import Path
from typing import Any, Dict, List

import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import precision_recall_curve

# ---------- 小网格搜索 ----------
def cfg_key(cfg: dict) -> str:
    return "|".join(f"{k}={cfg[k]}" for k in sorted(cfg))

TUNE_DIR = Path("outputs/tuning"); TUNE_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_CSV = TUNE_DIR / "search_results.csv"

# 搜索空间（先从少量高性价比超参开始；需要再扩可加）
SEARCH_SPACE = {
    "LR":            [1e-5],
    "WARMUP_RATIO":  [0.06],
    "WEIGHT_DECAY":  [0.01],
    "GRAD_ACCUM":    [8],
    "DROPOUT_RATE":  [0.1],
    "HEAD_WIDTH":    [1.0],
    "HEAD_LR_MULT":  [3.0],   # 新增
    "EPOCHS":        [20],
    "MAX_GRAD_NORM": [1.0],            # 想加裁剪就放开
    "NEG_WEIGHT": [1, 1.5, 2.0],
    "POOL": ["topk2"],
}

keys, values = zip(*SEARCH_SPACE.items())
rows = []
start_ts = int(time.time())

HYPER_COLS = sorted(SEARCH_SPACE.keys())
FIELDNAMES = (
    ["config"] + HYPER_COLS +
    ["val_auc", "val_f1", "val_acc", "test_auc", "test_f1", "test_acc", "ckpt", "run_dir"]
)

def pick_threshold(y_true, y_prob, target="f1", min_precision=None):
    P, R, T = precision_recall_curve(y_true, y_prob)
    # sklearn 返回的 T 长度比 P/R 少 1；下面索引会相应对齐
    if min_precision is not None:
        mask = P[:-1] >= min_precision
        if mask.any():
            idx = mask.argmax()
            return T[idx]
    if target == "f1":
        f1 = 2*P*R/(P+R+1e-12)
        idx = np.nanargmax(f1[:-1])
        return T[idx]
    return 0.5

class EarlyStopper:
    def __init__(self, patience=3, mode="max", min_delta=1e-4):
        self.best = None
        self.bad = 0
        self.patience = patience
        self.mode = mode
        self.min_delta = min_delta
    def step(self, value):
        if self.best is None:
            self.best = value
            return False
        improved = (value > self.best + self.min_delta) if self.mode=="max" else (value < self.best - self.min_delta)
        if improved:
            self.best = value
            self.bad = 0
        else:
            self.bad += 1
        return self.bad > self.patience
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModel,
)
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

from sklearn.metrics import (
    roc_auc_score, roc_curve, auc,
    precision_recall_curve,
    f1_score, accuracy_score, precision_score, recall_score,
    confusion_matrix, ConfusionMatrixDisplay
)

from tqdm import tqdm
import torch.nn.functional as F
from transformers import AutoConfig, AutoModel


# ---- Config ----
COMBINED_JSONL = Path(r"D:\learning\APS360\project\leetcode_github\combined_train\all.jsonl")
MODEL_NAME  = "microsoft/codebert-base"   # or "Salesforce/codet5-base"
OUT_DIR     = Path("outputs/code_model_nb")
MAX_LENGTH  = 512
CHUNK_STRIDE = 256  # 新增
BATCH_SIZE  = 8
EVAL_BATCH  = 8
EPOCHS      = 20
LR          = 3e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06
GRAD_ACCUM   = 1
NUM_WORKERS  = 2
SEED         = 42
USE_FP16     = True                       # mixed precision on CUDA if available
FORCE_CPU    = False                      # set True to force CPU

OUT_DIR.mkdir(parents=True, exist_ok=True)

def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() and not FORCE_CPU else "cpu")
print(f"[INFO] Device: {device}")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

def read_jsonl(path: Path) -> List[Dict[str, Any]]:
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for ln in f:
            ln = ln.strip()
            if not ln:
                continue
            rows.append(json.loads(ln))
    return rows

# 从 combined_train/all.jsonl 读取所有样本
all_rows = read_jsonl(COMBINED_JSONL)
print(f"[OK] Loaded combined jsonl: {len(all_rows)} samples")

# 按 label 做一次 stratified train/val/test 划分
from sklearn.model_selection import train_test_split

all_labels = [int(r.get("label", 0)) for r in all_rows]

# 先拆出 test 10%
train_val_rows, test_rows, train_val_y, test_y = train_test_split(
    all_rows, all_labels, test_size=0.10, random_state=42, stratify=all_labels
)

# 再从 train_val 里拆出 val 10%
train_rows, val_rows, train_y, val_y = train_test_split(
    train_val_rows, train_val_y, test_size=0.10, random_state=42, stratify=train_val_y
)

print(f"[OK] Split combined -> train={len(train_rows)} val={len(val_rows)} test={len(test_rows)}")

class CodeJsonlDataset(Dataset):
    """
    Each item returns:
      {
        "chunks": List[ Dict[input_ids, attention_mask] ],
        "label": float (0.0/1.0)
      }
    We first encode without special tokens, then slice token ids into
    blocks of (max_length - 2), and call `prepare_for_model` per block.
    """
    def __init__(self, data: List[Dict[str, Any]], tokenizer, max_length: int):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []
        for row in data:
            text = row.get("text", "")
            label = float(row.get("label", 0))
            chunks = self._tokenize_to_chunks(text)
            self.samples.append({"chunks": chunks, "label": label})

    def _tokenize_to_chunks(self, text: str):
        ids = self.tokenizer.encode(text, add_special_tokens=False, truncation=False)
        if len(ids) == 0:
            enc = self.tokenizer("", truncation=True, max_length=self.max_length, return_attention_mask=True)
            return [enc]

        block = self.max_length - 2
        chunks = []
        step = max(1, (block - CHUNK_STRIDE))
        for start in range(0, len(ids), step):
            seg = ids[start:start+block]
            enc = self.tokenizer.prepare_for_model(
                seg, truncation=True, max_length=self.max_length,
                add_special_tokens=True, return_attention_mask=True
            )
            chunks.append({"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"]})
        return chunks

    def __len__(self): return len(self.samples)
    def __getitem__(self, idx): return self.samples[idx]


class ChunkCollator:
    """
    Collate a batch of variable-length chunk lists.
    Returns:
      input_ids       : (num_chunks, L)
      attention_mask  : (num_chunks, L)
      group_bounds    : 1D tensor of length (B+1), cumulative chunk counts per sample
      labels          : (B,)
    """
    def __init__(self, tokenizer, pad_to_multiple_of: int = 8):
        self.tok = tokenizer
        self.pad_to_multiple_of = pad_to_multiple_of

    def __call__(self, batch):
        ids_list, att_list = [], []
        group_bounds = [0]
        labels = []

        for ex in batch:
            labels.append(ex["label"])
            for ch in ex["chunks"]:
                ids_list.append(torch.tensor(ch["input_ids"], dtype=torch.long))
                att_list.append(torch.tensor(ch["attention_mask"], dtype=torch.long))
            group_bounds.append(group_bounds[-1] + len(ex["chunks"]))

        input_ids = torch.nn.utils.rnn.pad_sequence(
            ids_list, batch_first=True, padding_value=self.tok.pad_token_id
        )
        attention = torch.nn.utils.rnn.pad_sequence(
            att_list, batch_first=True, padding_value=0
        )

        if self.pad_to_multiple_of:
            L = input_ids.size(1)
            pad_len = (self.pad_to_multiple_of - L % self.pad_to_multiple_of) % self.pad_to_multiple_of
            if pad_len > 0:
                pad_ids = torch.full((input_ids.size(0), pad_len), self.tok.pad_token_id, dtype=torch.long)
                pad_att = torch.zeros((attention.size(0), pad_len), dtype=torch.long)
                input_ids = torch.cat([input_ids, pad_ids], dim=1)
                attention = torch.cat([attention, pad_att], dim=1)

        labels = torch.tensor(labels, dtype=torch.float32)
        group_bounds = torch.tensor(group_bounds, dtype=torch.long)
        return {"input_ids": input_ids, "attention_mask": attention,
                "group_bounds": group_bounds, "labels": labels}

class CodeClassifier(nn.Module):
    def __init__(self, model_name, dropout_rate=0.1, hidden_mult=2, pos_weight=None, pool="logit_mean"):
        super().__init__()
        self.config  = AutoConfig.from_pretrained(model_name)
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden      = self.encoder.config.hidden_size
        mid         = int(hidden * hidden_mult)
        self.pool   = pool
        self.topk_k = 2
        self.att_vec = nn.Linear(hidden, 1)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(hidden, mid),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(mid, 1)
        )

        if pos_weight is not None:
            pw = torch.as_tensor(pos_weight).reshape(1)
            self.register_buffer("pos_weight", pw)   # 作为 buffer 跟随 device
            self.use_pos_weight = True
        else:
            # 仍然定义属性，避免 forward 里找不到
            self.register_buffer("pos_weight", torch.tensor([1.0]), persistent=False)  # 占位，不使用
            self.use_pos_weight = False

    def forward(self, input_ids, attention_mask, group_bounds, labels=None):
        # 1) 编码得到每个 token 的隐状态
        out  = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        last = out.last_hidden_state                         # (num_chunks, L, H)

        # 2) 对每个 chunk 做 masked mean，得到 chunk 级表征
        mask = attention_mask.unsqueeze(-1).type_as(last)    # (num_chunks, L, 1)
        chunk_repr = (last * mask).sum(1) / mask.sum(1).clamp(min=1)  # (num_chunks, H)

        # 3) 逐 chunk 过分类头，得到 chunk 级 logit
        chunk_logits = self.classifier(chunk_repr).squeeze(-1)        # (num_chunks,)

        # 4) 聚合 chunk → sample
        B = group_bounds.numel() - 1
        sample_logits = []
        for i in range(B):
            s, e = int(group_bounds[i]), int(group_bounds[i + 1])
            if e > s:
                if self.pool == "topk2":
                    # --- Top-k 概率平均（更鲁棒） ---
                    p = torch.sigmoid(chunk_logits[s:e])
                    k = getattr(self, "topk_k", 2)
                    k = min(k, e - s)
                    if k > 0:
                        topk = p.topk(k).values.mean()
                        topk = topk.clamp(1e-6, 1 - 1e-6)
                        logit = torch.log(topk / (1 - topk))
                    else:
                        logit = chunk_logits.new_zeros(())
                    sample_logits.append(logit)
                else:
                    # --- 原版 mean 聚合 ---
                    sample_logits.append(chunk_logits[s:e].mean())
            else:
                sample_logits.append(chunk_logits.new_zeros(()))

        logits = torch.stack(sample_logits, dim=0)                     # (B,)

        # 5) 组织输出；只有提供 labels 时才计算 loss
        result = {"logits": logits}
        if labels is not None:
            neg_w = cfg.get("NEG_WEIGHT", 1.5)
            w = torch.ones_like(labels, dtype=logits.dtype, device=logits.device)
            w[labels == 0] = neg_w
            result["loss"] = F.binary_cross_entropy_with_logits(
                logits, labels.float(), weight=w
            )
        return result

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token or tokenizer.sep_token

train_ds = CodeJsonlDataset(train_rows, tokenizer, MAX_LENGTH)
val_ds   = CodeJsonlDataset(val_rows, tokenizer, MAX_LENGTH)
test_ds  = CodeJsonlDataset(test_rows, tokenizer, MAX_LENGTH)

collator = ChunkCollator(tokenizer, pad_to_multiple_of=8)

'''train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS, collate_fn=collator)
val_loader   = DataLoader(val_ds, batch_size=EVAL_BATCH, shuffle=False,
                          num_workers=NUM_WORKERS, collate_fn=collator)
test_loader  = DataLoader(test_ds, batch_size=EVAL_BATCH, shuffle=False,
                          num_workers=NUM_WORKERS, collate_fn=collator)'''
PIN = (device.type == "cuda")

train_loader = DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=True,
    num_workers=0, pin_memory=PIN, collate_fn=collator, timeout=0
)

val_loader = DataLoader(
    val_ds, batch_size=EVAL_BATCH, shuffle=False,
    num_workers=0, pin_memory=PIN, collate_fn=collator, timeout=0
)

test_loader = DataLoader(
    test_ds, batch_size=EVAL_BATCH, shuffle=False,
    num_workers=0, pin_memory=PIN, collate_fn=collator, timeout=0
)


len(train_loader), len(val_loader), len(test_loader)

# == 统计训练集正负样本，计算 pos_weight ==
labels = [float(r["label"]) for r in train_rows]
N_pos = sum(labels)
N_neg = len(labels) - N_pos

torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("high")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# 避免频繁分配带来的碎片化（可选）
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Token indices sequence length is longer than the specified maximum sequence length for this model (594 > 512). Running this sequence through the model will result in indexing errors


[INFO] Device: cuda
[OK] Loaded combined jsonl: 4003 samples
[OK] Split combined -> train=3241 val=361 test=401


In [None]:
# %% [markdown]
# ## Train / Evaluate (with EarlyStopping + Grid Search)


import os, math, time, csv, itertools, random
from pathlib import Path
import numpy as np
import torch
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from safetensors.torch import save_file, load_file
from tqdm import tqdm
import torch.nn.functional as F

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


def evaluate(model, loader, device):
    model.eval()
    all_y, all_prob = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Eval", leave=False):
            ids   = batch["input_ids"].to(device)
            attn  = batch["attention_mask"].to(device)
            gb    = batch["group_bounds"].to(device)
            y     = batch["labels"].to(device)

            out = model(ids, attn, gb, labels=None)
            prob = torch.sigmoid(out["logits"]).cpu().numpy()
            all_prob.append(prob)
            all_y.append(y.cpu().numpy())

    y_true = np.concatenate(all_y)
    y_prob = np.concatenate(all_prob)
    y_pred = (y_prob >= 0.5).astype(int)

    try:
        auc_val = roc_auc_score(y_true, y_prob)
    except Exception:
        auc_val = float("nan")

    metrics = {
        "auc": float(auc_val),
        "f1": f1_score(y_true, y_pred),
        "acc": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "y_true": y_true,
        "y_prob": y_prob,
        "y_pred": y_pred,
        "cm": confusion_matrix(y_true, y_pred)
    }
    return metrics

def set_seed(seed=42):
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)

# ---------- 单次训练：按 cfg 训练到早停 ----------
def train_eval_once(cfg: dict, run_dir: Path, seed: int = 42):
    """
    cfg: {
      'LR': float,
      'WARMUP_RATIO': float,
      'WEIGHT_DECAY': float,
      'GRAD_ACCUM': int,
      'EPOCHS': int
    }
    复用全局的: MODEL_NAME, device, USE_FP16, train_loader, val_loader, test_loader
    """
    set_seed(seed)
    run_dir.mkdir(parents=True, exist_ok=True)

    # 1) Build model / optimizer / scheduler / scaler
    model = CodeClassifier(
        MODEL_NAME,
        dropout_rate=cfg["DROPOUT_RATE"],
        hidden_mult=cfg["HEAD_WIDTH"],
        pos_weight=None,
        pool=cfg.get("POOL", "logit_mean"),
        neg_weight=cfg.get("NEG_WEIGHT", 1.5),
    ).to(device)

    # ↓↓↓ 加这两行：显著降显存占用（牺牲一点速度）
    try:
        model.encoder.gradient_checkpointing_enable()
    except Exception:
        pass
    for i, layer in enumerate(model.encoder.encoder.layer):
        if i < 1:
            for p in layer.parameters():
                p.requires_grad = False
    torch.cuda.empty_cache()  # 清理上一次 config 的遗留碎片
    head_params = list(model.classifier.parameters())
    backbone_params = [p for n,p in model.named_parameters() if not n.startswith("classifier")]
    optimizer = AdamW(
        [
            {"params": backbone_params, "lr": cfg["LR"]},
            {"params": head_params,     "lr": cfg["LR"] * cfg.get("HEAD_LR_MULT", 1.0)},
        ],
        weight_decay=cfg["WEIGHT_DECAY"]
    )

    total_steps  = cfg["EPOCHS"] * math.ceil(len(train_loader) / cfg["GRAD_ACCUM"])
    warmup_steps = min(200, int(total_steps * cfg["WARMUP_RATIO"]))
    from transformers import get_cosine_schedule_with_warmup
    scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)

    scaler = torch.amp.GradScaler("cuda", enabled=(USE_FP16 and device.type == "cuda"))
    early  = EarlyStopper(patience=2, mode="max")

    best_auc   = -1.0
    best_path  = run_dir / "best.safetensors"
    global_step = 0
    # 2) Train loop with early stop
    for epoch in range(1, cfg["EPOCHS"] + 1):
        running_loss = 0.0
        num_steps = 0
        model.train()
        prog = tqdm(train_loader, desc=f"Train epoch {epoch}",
                    leave=False, dynamic_ncols=True)
        optimizer.zero_grad(set_to_none=True)

        first_optim_step_done = False
        accum = int(cfg.get("GRAD_ACCUM", 1))
        max_gn = float(cfg.get("MAX_GRAD_NORM", 0.0))

        # ------------ 训练步循环【注意：在 epoch 里面】------------
        for step, batch in enumerate(prog, start=1):
            try:
                ids  = batch["input_ids"].to(device)
                attn = batch["attention_mask"].to(device)
                gb   = batch["group_bounds"].to(device)
                y    = batch["labels"].to(device)

                with torch.amp.autocast("cuda", enabled=(USE_FP16 and device.type == "cuda")):
                    out = model(ids, attn, gb, labels=y)
                    loss_raw = out["loss"]
                    loss = loss_raw / accum

                running_loss += float(loss_raw.detach().item())
                num_steps += 1

                scaler.scale(loss).backward()

                if step % accum == 0:
                    if max_gn > 0:
                        scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_gn)

                    scaler.step(optimizer)
                    scaler.update()

                    if first_optim_step_done:
                        scheduler.step()
                    else:
                        first_optim_step_done = True

                    optimizer.zero_grad(set_to_none=True)

                # 进度条信息
                try:
                    current_lr = scheduler.get_last_lr()[0] if first_optim_step_done else cfg["LR"]
                except Exception:
                    current_lr = cfg["LR"]

                avg_loss = running_loss / max(1, num_steps)
                prog.set_postfix(step=step, avg_loss=f"{avg_loss:.4f}", lr=f"{current_lr:.1e}")

            except torch.cuda.OutOfMemoryError:
                torch.cuda.empty_cache()
                continue

        # （可选）尾包刷新：若未被 accum 整除，按需补一次 step
        # ...

        prog.close()  # 关闭本轮进度条，避免后面的 print 被插断

        # ------------ 验证与日志【注意：也在 epoch 里面】------------
        avg_train_loss = running_loss / max(1, num_steps)
        print(f"[TRAIN] epoch={epoch} avg_loss={avg_train_loss:.4f}")

        val_m = evaluate(model, val_loader, device)
        print(f"[VAL] AUC={val_m['auc']:.4f}  F1={val_m['f1']:.4f}  "
            f"ACC={val_m['acc']:.4f}  P={val_m['precision']:.4f}  R={val_m['recall']:.4f}")

        if (not np.isnan(val_m["auc"])) and (val_m["auc"] > best_auc):
            best_auc = val_m["auc"]
            save_file(model.state_dict(), str(best_path))

            # 选阈值（例如按 F1 最大）
            val_thr = pick_threshold(val_m["y_true"], val_m["y_prob"], min_precision=0.65)
            print(f"[VAL] Picked threshold = {val_thr:.3f}")
            with (run_dir / "best_threshold.txt").open("w") as f:
                f.write(str(val_thr))

            with (run_dir / "val_metrics.json").open("w", encoding="utf-8") as f:
                json.dump({
                    "auc": val_m["auc"], "f1": float(val_m["f1"]),
                    "acc": float(val_m["acc"]), "precision": float(val_m["precision"]),
                    "recall": float(val_m["recall"]), "threshold": float(val_thr)
                }, f, indent=2)
            print(f"[OK] Saved best (safetensors) -> {best_path}")


        if early.step(val_m["auc"]):
            print(f"[EARLY STOP] Best AUC={early.best:.4f}")
            break

        torch.cuda.empty_cache()

    state_dict = load_file(str(best_path))
    model.load_state_dict(state_dict, strict=False)

    val_m  = evaluate(model, val_loader, device)
    val_thr = pick_threshold(val_m["y_true"], val_m["y_prob"], min_precision=0.75)
    (run_dir / "best_threshold.txt").write_text(str(val_thr))
    print(f"[VAL] Recomputed threshold on best = {val_thr:.3f}")

    # 3) Evaluate TEST on best
    state_dict = load_file(str(best_path))
    model.load_state_dict(state_dict, strict=False)

    test_m = evaluate(model, test_loader, device)  # 先拿到 y_true / y_prob

    # 读取验证时保存的最佳阈值
    try:
        val_thr = float((run_dir / "best_threshold.txt").read_text().strip())
    except Exception:
        val_thr = 0.5

    y_true = test_m["y_true"]
    y_prob = test_m["y_prob"]
    y_pred = (y_prob >= val_thr).astype(int)

    # 诊断：分数分布是否在 val/test 发生漂移
    print(
        f"[SCORE SHIFT] val mean prob={float(val_m['y_prob'].mean()):.3f}  "
        f"test mean prob={float(test_m['y_prob'].mean()):.3f}"
    )

    test_at_thr = {
        "auc":  float(roc_auc_score(y_true, y_prob)),
        "f1":   float(f1_score(y_true, y_pred)),
        "acc":  float(accuracy_score(y_true, y_pred)),
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "recall":    float(recall_score(y_true, y_pred, zero_division=0)),
        "cm": confusion_matrix(y_true, y_pred),
    }

    with (run_dir / "test_metrics.json").open("w", encoding="utf-8") as f:
        json.dump({
            "auc": test_at_thr["auc"], "f1": test_at_thr["f1"],
            "acc": test_at_thr["acc"], "precision": test_at_thr["precision"],
            "recall": test_at_thr["recall"], "cm": test_at_thr["cm"].tolist(),
            "threshold": float(val_thr),
        }, f, indent=2)

    print("[TEST]", json.dumps({
        "auc": test_at_thr["auc"], "f1": test_at_thr["f1"],
        "acc": test_at_thr["acc"], "precision": test_at_thr["precision"],
        "recall": test_at_thr["recall"]
    }, indent=2))

    return {"val": val_m, "test": test_at_thr, "ckpt": str(best_path)}



In [None]:
# === 过拟合能力测试 ===
import random
from torch.utils.data import Subset

N_SMALL = 64   # 选 64 个样本
subset_idx = random.sample(range(len(train_ds)), N_SMALL)
small_train_ds = Subset(train_ds, subset_idx)

small_train_loader = DataLoader(
    small_train_ds, batch_size=8, shuffle=True,
    num_workers=0, pin_memory=(device.type=="cuda"),
    collate_fn=collator
)

# 用相同 val_loader
model = CodeClassifier(
    MODEL_NAME,
    dropout_rate=cfg["DROPOUT_RATE"],
    hidden_mult=cfg["HEAD_WIDTH"],
    pos_weight=None,
    pool=cfg.get("POOL", "logit_mean"),
).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
losses = []

for epoch in range(15):
    model.train()
    total = 0
    for batch in small_train_loader:
        ids, attn, gb, y = (batch["input_ids"].to(device),
                            batch["attention_mask"].to(device),
                            batch["group_bounds"].to(device),
                            batch["labels"].to(device))
        out = model(ids, attn, gb, labels=y)
        loss = out["loss"]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total += loss.item()
    print(f"Epoch {epoch+1}: train loss={total/len(small_train_loader):.4f}")
    val_m = evaluate(model, val_loader, device)
    print(f" → val AUC={val_m['auc']:.4f}, F1={val_m['f1']:.3f}")


Epoch 1: train loss=0.6461


                                                     

 → val AUC=0.4839, F1=0.000
Epoch 2: train loss=0.6306


                                                     

 → val AUC=0.4986, F1=0.000
Epoch 3: train loss=0.6076


                                                     

 → val AUC=0.4932, F1=0.000
Epoch 4: train loss=0.5956


                                                     

 → val AUC=0.4928, F1=0.138
Epoch 5: train loss=0.5846


                                                     

 → val AUC=0.4929, F1=0.209
Epoch 6: train loss=0.5685


                                                     

 → val AUC=0.4926, F1=0.306
Epoch 7: train loss=0.5597


                                                     

 → val AUC=0.4926, F1=0.132
Epoch 8: train loss=0.5130


                                                     

 → val AUC=0.4881, F1=0.414
Epoch 9: train loss=0.4578


                                                     

 → val AUC=0.4896, F1=0.296
Epoch 10: train loss=0.3885


                                                     

 → val AUC=0.4826, F1=0.409
Epoch 11: train loss=0.3073


                                                     

 → val AUC=0.4854, F1=0.105
Epoch 12: train loss=0.2509


                                                     

 → val AUC=0.4973, F1=0.157
Epoch 13: train loss=0.1893


                                                     

 → val AUC=0.5024, F1=0.491
Epoch 14: train loss=0.0933


                                                     

 → val AUC=0.4979, F1=0.131
Epoch 15: train loss=0.0625


                                                     

 → val AUC=0.4993, F1=0.358




In [4]:
with open(RESULTS_CSV, "w", newline="", encoding="utf-8") as fcsv:
    writer = csv.DictWriter(fcsv, fieldnames=FIELDNAMES)
    writer.writeheader()

    for combo in itertools.product(*values):
        cfg = dict(zip(keys, combo))
        run_name = f"run_{start_ts}_{hash('|'.join(f'{k}={cfg[k]}' for k in sorted(cfg))) & 0xfffffff}"
        run_dir  = TUNE_DIR / run_name

        try:
            result = train_eval_once(cfg, run_dir, seed=42)
        except torch.cuda.OutOfMemoryError:
            print("[OOM] Skipping config due to CUDA OOM:", "|".join(f"{k}={cfg[k]}" for k in sorted(cfg)))
            torch.cuda.empty_cache()
            # 只写我们声明过的列
            row = {
                "config": "|".join(f"{k}={cfg[k]}" for k in sorted(cfg)),
                **{k: cfg[k] for k in HYPER_COLS},
                "val_auc": 0.0, "val_f1": 0.0, "val_acc": 0.0,
                "test_auc": 0.0, "test_f1": 0.0, "test_acc": 0.0,
                "ckpt": "OOM", "run_dir": str(run_dir)
            }
            rows.append(row)
            writer.writerow(row); fcsv.flush()
            continue

        row = {
            "config": "|".join(f"{k}={cfg[k]}" for k in sorted(cfg)),
            **{k: cfg[k] for k in HYPER_COLS},
            "val_auc":  float(result["val"]["auc"]),
            "val_f1":   float(result["val"]["f1"]),
            "val_acc":  float(result["val"]["acc"]),
            "test_auc": float(result["test"]["auc"]),
            "test_f1":  float(result["test"]["f1"]),
            "test_acc": float(result["test"]["acc"]),
            "ckpt":     result["ckpt"],
            "run_dir":  str(run_dir)
        }
        rows.append(row)
        writer.writerow(row); fcsv.flush()

# 选最优
rows_sorted = sorted(rows, key=lambda r: r["val_auc"], reverse=True)
best = rows_sorted[0]
print("\n=================== Best by val_auc ===================")
for k in ["config","LR","WARMUP_RATIO","WEIGHT_DECAY","GRAD_ACCUM","EPOCHS","val_auc","val_f1","test_auc","test_f1","ckpt","run_dir"]:
    print(f"{k}: {best[k]}")

  return t.to(
                                                                                                       

[TRAIN] epoch=1 avg_loss=0.8502


                                                     

[VAL] AUC=0.7768  F1=0.0000  ACC=0.5485  P=0.0000  R=0.0000
[VAL] Picked threshold = 0.379
[OK] Saved best (safetensors) -> outputs\tuning\run_1764395687_199660170\best.safetensors


                                                                                                       

[TRAIN] epoch=2 avg_loss=0.7490


                                                     

[VAL] AUC=0.8866  F1=0.8011  ACC=0.8061  P=0.7460  R=0.8650
[VAL] Picked threshold = 0.376
[OK] Saved best (safetensors) -> outputs\tuning\run_1764395687_199660170\best.safetensors


                                                                                                       

[TRAIN] epoch=3 avg_loss=0.4696


                                                     

[VAL] AUC=0.9491  F1=0.8605  ACC=0.8670  P=0.8177  R=0.9080
[VAL] Picked threshold = 0.098
[OK] Saved best (safetensors) -> outputs\tuning\run_1764395687_199660170\best.safetensors


                                                                                                       

[TRAIN] epoch=4 avg_loss=0.3497


                                                     

[VAL] AUC=0.9617  F1=0.8495  ACC=0.8753  P=0.9338  R=0.7791
[VAL] Picked threshold = 0.016
[OK] Saved best (safetensors) -> outputs\tuning\run_1764395687_199660170\best.safetensors


                                                                                                       

[TRAIN] epoch=5 avg_loss=0.3105


                                                     

[VAL] AUC=0.9693  F1=0.8851  ACC=0.8892  P=0.8324  R=0.9448
[VAL] Picked threshold = 0.027
[OK] Saved best (safetensors) -> outputs\tuning\run_1764395687_199660170\best.safetensors


                                                                                                       

[TRAIN] epoch=6 avg_loss=0.2763


                                                     

[VAL] AUC=0.9713  F1=0.8896  ACC=0.8975  P=0.8663  R=0.9141
[VAL] Picked threshold = 0.023
[OK] Saved best (safetensors) -> outputs\tuning\run_1764395687_199660170\best.safetensors


                                                                                                       

[TRAIN] epoch=7 avg_loss=0.2617


                                                     

[VAL] AUC=0.9711  F1=0.8812  ACC=0.8864  P=0.8352  R=0.9325


                                                                                                       

[TRAIN] epoch=8 avg_loss=0.2274


                                                     

[VAL] AUC=0.9671  F1=0.8786  ACC=0.8837  P=0.8306  R=0.9325


                                                                                                       

[TRAIN] epoch=9 avg_loss=0.2045


                                                     

[VAL] AUC=0.9658  F1=0.8739  ACC=0.8809  P=0.8371  R=0.9141
[EARLY STOP] Best AUC=0.9713


                                                     

[VAL] Recomputed threshold on best = 0.023


                                                     

[SCORE SHIFT] val mean prob=0.466  test mean prob=0.476
[TEST] {
  "auc": 0.9744098442993471,
  "f1": 0.7735042735042735,
  "acc": 0.7356608478802993,
  "precision": 0.6306620209059234,
  "recall": 1.0
}


                                                                                                       

[TRAIN] epoch=1 avg_loss=0.8594


                                                     

[VAL] AUC=0.5510  F1=0.0000  ACC=0.5485  P=0.0000  R=0.0000
[VAL] Picked threshold = 0.306
[OK] Saved best (safetensors) -> outputs\tuning\run_1764395687_175185057\best.safetensors


                                                                                                       

[TRAIN] epoch=2 avg_loss=0.8221


                                                     

[VAL] AUC=0.8034  F1=0.0000  ACC=0.5485  P=0.0000  R=0.0000
[VAL] Picked threshold = 0.337
[OK] Saved best (safetensors) -> outputs\tuning\run_1764395687_175185057\best.safetensors


                                                                                                       

[TRAIN] epoch=3 avg_loss=0.7172


                                                     

[VAL] AUC=0.8584  F1=0.6494  ACC=0.7368  P=0.8148  R=0.5399
[VAL] Picked threshold = 0.166
[OK] Saved best (safetensors) -> outputs\tuning\run_1764395687_175185057\best.safetensors


                                                                                                       

[TRAIN] epoch=4 avg_loss=0.5080


                                                     

[VAL] AUC=0.9383  F1=0.8468  ACC=0.8587  P=0.8294  R=0.8650
[VAL] Picked threshold = 0.162
[OK] Saved best (safetensors) -> outputs\tuning\run_1764395687_175185057\best.safetensors


                                                                                                       

[TRAIN] epoch=5 avg_loss=0.3998


                                                     

[VAL] AUC=0.9558  F1=0.8650  ACC=0.8781  P=0.8650  R=0.8650
[VAL] Picked threshold = 0.048
[OK] Saved best (safetensors) -> outputs\tuning\run_1764395687_175185057\best.safetensors


                                                                                                       

[TRAIN] epoch=6 avg_loss=0.3420


                                                     

[VAL] AUC=0.9663  F1=0.8647  ACC=0.8864  P=0.9357  R=0.8037
[VAL] Picked threshold = 0.021
[OK] Saved best (safetensors) -> outputs\tuning\run_1764395687_175185057\best.safetensors


                                                                                                       

[TRAIN] epoch=7 avg_loss=0.3038


                                                     

[VAL] AUC=0.9643  F1=0.8682  ACC=0.8864  P=0.9122  R=0.8282


                                                                                                       

[TRAIN] epoch=8 avg_loss=0.2858


                                                     

[VAL] AUC=0.9557  F1=0.8459  ACC=0.8587  P=0.8333  R=0.8589


                                                                                                             

[TRAIN] epoch=9 avg_loss=0.2679


                                                     

[VAL] AUC=0.9673  F1=0.8544  ACC=0.8753  P=0.9041  R=0.8098
[VAL] Picked threshold = 0.007
[OK] Saved best (safetensors) -> outputs\tuning\run_1764395687_175185057\best.safetensors


                                                                                                        

[TRAIN] epoch=10 avg_loss=0.2466


                                                     

[VAL] AUC=0.9658  F1=0.8696  ACC=0.8837  P=0.8805  R=0.8589


                                                                                                        

[TRAIN] epoch=11 avg_loss=0.2368


                                                     

[VAL] AUC=0.9660  F1=0.8854  ACC=0.8975  P=0.8938  R=0.8773


                                                                                                        

[TRAIN] epoch=12 avg_loss=0.2252


                                                     

[VAL] AUC=0.9660  F1=0.8815  ACC=0.8920  P=0.8735  R=0.8896
[EARLY STOP] Best AUC=0.9673


                                                     

[VAL] Recomputed threshold on best = 0.007


                                                     

[SCORE SHIFT] val mean prob=0.403  test mean prob=0.409
[TEST] {
  "auc": 0.967754897036665,
  "f1": 0.7718550106609808,
  "acc": 0.7331670822942643,
  "precision": 0.6284722222222222,
  "recall": 1.0
}

config: DROPOUT_RATE=0.1|EPOCHS=20|GRAD_ACCUM=8|HEAD_LR_MULT=3.0|HEAD_WIDTH=1.0|LR=1e-05|MAX_GRAD_NORM=1.0|NEG_WEIGHT=1.2|POOL=topk2|WARMUP_RATIO=0.06|WEIGHT_DECAY=0.01
LR: 1e-05
WARMUP_RATIO: 0.06
WEIGHT_DECAY: 0.01
GRAD_ACCUM: 8
EPOCHS: 20
val_auc: 0.9713391584557228
val_f1: 0.8895522388059701
test_auc: 0.9744098442993471
test_f1: 0.7735042735042735
ckpt: outputs\tuning\run_1764395687_199660170\best.safetensors
run_dir: outputs\tuning\run_1764395687_199660170
