In [41]:
# 导入库与基础配置
import os
from pathlib import Path
import random

import numpy as np
import torch
from transformers import AutoTokenizer

# Device selection: prefer Apple MPS, then CUDA, and fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Seed every random number generator used by the notebook with the same value.
SEED = 302
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Checkpoint directory (created on demand) and the two checkpoint files kept in it.
CHECKPOINT_DIR = Path("checkpoint")
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
LAST_CKPT = CHECKPOINT_DIR.joinpath("model1_last.pt")
BEST_CKPT = CHECKPOINT_DIR.joinpath("model1_best.pt")

print("Checkpoint dir:", CHECKPOINT_DIR.resolve())


Using device: cuda
Checkpoint dir: /home/ljy/projects/acafeed/checkpoint


In [27]:
import kagglehub

# Download the latest version of the "Cornell-University/arxiv" dataset through kagglehub.
# `path` holds whatever dataset_download() returns; it is printed below as the location of the dataset files.
path = kagglehub.dataset_download("Cornell-University/arxiv")

print("Path to dataset files:", path)

Path to dataset files: /home/ljy/.cache/kagglehub/datasets/Cornell-University/arxiv/versions/260


In [42]:
# 定义数据集与多标签模型（Model1）

import pandas as pd
from datasets import Dataset

# To reuse the raw data, rebuild the same data flow as model.ipynb from the arXiv metadata JSON.
# The snapshot is looked up inside the directory returned by kagglehub.dataset_download()
# (`path`, assigned in the download cell) instead of a hard-coded ".../versions/260" folder,
# so the file that was actually downloaded is the one that gets read.
file_path = os.path.join(path, "arxiv-metadata-oai-snapshot.json")

# Read the JSON-lines file in chunks of 100000 records, keeping only the two columns used below.
chunks = pd.read_json(file_path, lines=True, chunksize=100000)
dfs = []
for chunk in chunks:
    dfs.append(chunk[["title", "categories"]])

df_small = pd.concat(dfs, ignore_index=True)
raw_df = df_small.rename(columns={"categories": "Category", "title": "Title"})
raw_df["Category"] = raw_df["Category"].str.split(" ")  # space-separated string -> list of category codes
raw_df["Title"] = raw_df["Title"].str.strip()

# Count category occurrences and drop the low-frequency ones (fewer than 20 papers).
all_categories = [c for sub in raw_df["Category"] for c in sub]
category_counts = pd.Series(all_categories).value_counts()
filtered_categories = category_counts[category_counts >= 20].index.tolist()

# Label <-> id mappings; ids follow the order of `filtered_categories`.
# NOTE(review): checkpoints trained earlier depend on this ordering — a different
# dataset version may yield a different label space; confirm before resuming training.
category_to_id = {cat: idx for idx, cat in enumerate(filtered_categories)}
label2id = category_to_id
id2label = {v: k for k, v in category_to_id.items()}

import numpy as np

def encode_labels(cats):
    """Turn a list of category codes into a multi-hot float vector.

    The vector has one slot per entry of ``category_to_id``; slots of the
    categories present in ``cats`` are set to 1.0. Categories that are not in
    ``category_to_id`` are ignored.
    """
    hot_positions = [category_to_id[name] for name in cats if name in category_to_id]
    vector = np.zeros(len(category_to_id), dtype=float)
    # Explicit integer dtype keeps the index valid when no category matched.
    vector[np.asarray(hot_positions, dtype=np.intp)] = 1.0
    return vector

# Keep only the papers that carry at least one retained category. Membership is tested
# against the `category_to_id` dict (hash lookup); its keys are exactly the entries of
# `filtered_categories`, so the result is the same as scanning that list, without the
# linear scan per category.
filtered_df = raw_df[raw_df["Category"].apply(lambda cats: any(cat in category_to_id for cat in cats))].reset_index(drop=True)
filtered_df["label"] = filtered_df["Category"].apply(encode_labels)

from sklearn.model_selection import train_test_split

# Hold out 5% of the filtered papers for evaluation.
train_df, test_df = train_test_split(filtered_df, test_size=0.05, random_state=SEED)
print(f"Training samples: {len(train_df)}, Testing samples: {len(test_df)}")
print(f"Number of labels: {len(category_to_id)}")

# Convert to HF Datasets, limited to a subset for quicker experiments.
TRAINNUM = 50000
TESTNUM = 1000
# Randomly select the subset; the size is capped at the split size so that sample()
# does not raise when a split holds fewer rows than requested.
train_ds = Dataset.from_pandas(
    train_df.reset_index(drop=True).sample(min(TRAINNUM, len(train_df)), random_state=SEED)
).shuffle(seed=SEED)
test_ds = Dataset.from_pandas(
    test_df.reset_index(drop=True).sample(min(TESTNUM, len(test_df)), random_state=SEED)
).shuffle(seed=SEED)

max_length = 64
# Tokenizer matching the model: SciBERT (SciVocab, uncased)
tokenizer_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)


def tokenize(batch):
    """Tokenize the ``Title`` entries of a batch and attach the label vectors.

    Titles are truncated/padded to ``max_length`` tokens; the batch's ``label``
    column is copied into the returned encoding under the ``labels`` key.
    """
    encoding = tokenizer(
        batch["Title"],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    encoding["labels"] = batch["label"]
    return encoding

# Columns kept after tokenization.
cols = ["input_ids", "attention_mask", "labels"]


def _to_model_inputs(ds):
    """Tokenize ``ds``, drop every column not listed in ``cols`` and switch it to torch format."""
    encoded = ds.map(tokenize, batched=True)
    unwanted = [name for name in encoded.column_names if name not in cols]
    encoded = encoded.remove_columns(unwanted)
    encoded.set_format(type="torch")
    return encoded


train_tokenized = _to_model_inputs(train_ds)
test_tokenized = _to_model_inputs(test_ds)

print(f"训练集: {len(train_tokenized)} 样本, 测试集: {len(test_tokenized)} 样本")

# Define the model: SciBERT with a multi-label sequence-classification head
from transformers import AutoConfig, AutoModelForSequenceClassification

base_model_name = "allenai/scibert_scivocab_uncased"
config = AutoConfig.from_pretrained(base_model_name)

# Classification head: one output per retained category, trained as a multi-label problem.
config.num_labels = len(category_to_id)
config.problem_type = "multi_label_classification"
config.id2label = id2label
config.label2id = label2id

# Stronger regularisation to mitigate overfitting.
config.hidden_dropout_prob = 0.2
config.attention_probs_dropout_prob = 0.1
# Some models read `classifier_dropout`; presumably it is ignored where unsupported — TODO confirm.
config.classifier_dropout = 0.2

model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    config=config,
    ignore_mismatched_sizes=True,
).to(device)

print("Model initialized:", type(model).__name__)
print("Num labels:", model.config.num_labels)
print("Problem type:", model.config.problem_type)


Training samples: 2740089, Testing samples: 144216
Number of labels: 174


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

训练集: 50000 样本, 测试集: 1000 样本


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model initialized: BertForSequenceClassification
Num labels: 174
Problem type: multi_label_classification


In [29]:
# 实现 checkpoint 保存与加载工具函数

from typing import Dict, Any


def save_checkpoint(state: Dict[str, Any], filename: Path):
    """Save a training state to ``filename``, creating parent directories as needed.

    The state is first written to a temporary sibling file and then moved onto
    ``filename`` with ``os.replace``. If the save is interrupted or fails, a
    checkpoint already stored at ``filename`` is left untouched instead of
    being truncated.

    Args:
        state: Picklable training state. Example::

            {
                'epoch': int,
                'global_step': int,
                'model_state': model.state_dict(),
                'optimizer_state': optimizer.state_dict(),
                'best_metric': float
            }
        filename: Destination path (``str`` or ``Path``).

    Raises:
        Whatever ``torch.save`` or ``os.replace`` raises; the temporary file is
        removed before the exception propagates.
    """
    import os  # local import keeps the function usable on its own

    filename = Path(filename)
    filename.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = filename.with_name(filename.name + ".tmp")
    try:
        torch.save(state, tmp_path)
        os.replace(tmp_path, filename)
    finally:
        # The temporary file only still exists when the write or the rename failed.
        if tmp_path.exists():
            tmp_path.unlink()
    print(f"Checkpoint saved to {filename}")


def load_checkpoint(model, optimizer, filename: Path):
    """Restore a training state from ``filename``.

    Loads ``ckpt["model_state"]`` into ``model`` and, when ``optimizer`` is not
    ``None`` and the file contains ``"optimizer_state"``, loads that into
    ``optimizer``. Tensors are mapped onto the notebook-level ``device``.

    Returns:
        ``(start_epoch, global_step, best_metric)``; ``(0, 0, None)`` when the
        file does not exist. Missing keys default to 0, 0 and ``None``.
    """
    filename = Path(filename)
    if filename.exists():
        ckpt = torch.load(filename, map_location=device)
        model.load_state_dict(ckpt["model_state"])

        has_optimizer_state = "optimizer_state" in ckpt
        if optimizer is not None and has_optimizer_state:
            optimizer.load_state_dict(ckpt["optimizer_state"])

        start_epoch, global_step, best_metric = (
            ckpt.get(key, fallback)
            for key, fallback in (("epoch", 0), ("global_step", 0), ("best_metric", None))
        )
        print(f"Loaded checkpoint from {filename}: epoch={start_epoch}, global_step={global_step}, best_metric={best_metric}")
        return start_epoch, global_step, best_metric

    # Nothing to restore: report it and hand back the "fresh start" defaults.
    print(f"No checkpoint found at {filename}, start from scratch.")
    return 0, 0, None


# Simple smoke test: write a placeholder state only when no "last" checkpoint exists yet.
if LAST_CKPT.exists():
    print("Existing checkpoint found, skip dummy save.")
else:
    dummy_state = dict(
        epoch=0,
        global_step=0,
        model_state=model.state_dict(),
        best_metric=None,
    )
    save_checkpoint(dummy_state, LAST_CKPT)


Existing checkpoint found, skip dummy save.


In [43]:
# 训练设置与 Trainer 初始化（集成 checkpoint 保存与自动加载）

from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import f1_score

# Metric helpers: logits -> probabilities
def sigmoid_np(x):
    """Element-wise logistic sigmoid of ``x`` (NumPy array or scalar).

    Computed as ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp``, which is
    mathematically equal to ``1 / (1 + exp(-x))`` but does not overflow for
    large negative inputs (the naive form evaluates ``exp(-x)`` and emits an
    overflow RuntimeWarning there).
    """
    return np.exp(-np.logaddexp(0, -x))

THRESH = 0.4  # Multi-label decision threshold: a label is predicted when its sigmoid probability exceeds this value


def compute_metrics(eval_pred):
    """Compute multi-label metrics from a ``(logits, labels)`` pair.

    Logits go through ``sigmoid_np`` and are binarised with ``THRESH``.

    Returns:
        dict with ``exact_match`` (fraction of rows whose whole label vector is
        predicted correctly), ``f1_micro`` and ``f1_macro``.
    """
    logits, labels = eval_pred
    preds = (sigmoid_np(logits) > THRESH).astype(int)

    scores = {"exact_match": (preds == labels).all(axis=1).mean()}
    for averaging in ("micro", "macro"):
        scores[f"f1_{averaging}"] = f1_score(labels, preds, average=averaging, zero_division=0)
    return scores


output_dir = CHECKPOINT_DIR  # reuse the checkpoint directory as the Trainer output directory

training_args = TrainingArguments(
    output_dir=str(output_dir),
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    learning_rate=2e-5,
    eval_strategy="epoch",          # evaluate once per epoch
    save_strategy="epoch",          # save every epoch so the best one can be selected
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",
    greater_is_better=True,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_steps=20,
    report_to="none",
    # `use_mps_device` is intentionally not passed: the argument is deprecated in
    # transformers, which selects the MPS device on its own when it is available.
    fp16=False,
    dataloader_num_workers=0,
)

# Manual optimizer, kept so load_checkpoint() has something to restore into.
# It is NOT handed to the Trainer below, which creates its own optimizer.
optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate)

# Automatically load the latest custom checkpoint (if it exists).
start_epoch, global_step, best_metric = load_checkpoint(model, optimizer, LAST_CKPT)
if best_metric is None:
    best_metric = 0.0

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    compute_metrics=compute_metrics,
)
# Early stopping: stop after 2 evaluations without an improvement of at least 1e-4
# in the monitored metric, to further limit overfitting.
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=1e-4))

print("Trainer initialized. start_epoch=", start_epoch, "global_step=", global_step, "best_metric=", best_metric)


Loaded checkpoint from checkpoint/model1_last.pt: epoch=10, global_step=5474, best_metric=0.5403552508569648
Trainer initialized. start_epoch= 10 global_step= 5474 best_metric= 0.5403552508569648


In [44]:
# 主训练（使用 Trainer 自带循环 + 早停 + 最优模型自动加载）

from transformers.trainer_utils import get_last_checkpoint

print(f"Start training (will evaluate each epoch, early stopping enabled)")

# Resume only from HuggingFace checkpoint-* directories; the custom .pt files are
# deliberately not used for resuming (per the original note, doing so causes errors).
last_hf_ckpt = None
try:
    last_hf_ckpt = get_last_checkpoint(str(output_dir))
except Exception as err:
    # Best effort: a failed lookup must not abort training, but it is reported
    # instead of being swallowed silently.
    print(f"Could not look up a HF checkpoint in {output_dir} ({err!r}); training from the current weights.")
    last_hf_ckpt = None

if last_hf_ckpt:
    print(f"Resuming from HF checkpoint: {last_hf_ckpt}")
    train_result = trainer.train(resume_from_checkpoint=last_hf_ckpt)
else:
    train_result = trainer.train()

# With load_best_model_at_end=True the best weights are loaded into `model` once training ends.
eval_metrics = trainer.evaluate()
print("Eval metrics:", eval_metrics)

# Record the epoch actually reached: early stopping can end training before
# num_train_epochs, so the configured value is only used as a fallback.
trained_epochs = trainer.state.epoch
if trained_epochs is None:
    trained_epochs = training_args.num_train_epochs

# Custom save: store the state under both the "last" and the "best" file names.
state = {
    "epoch": int(round(trained_epochs)),
    "global_step": int(train_result.global_step),
    "model_state": model.state_dict(),
    "best_metric": float(eval_metrics.get("eval_f1_micro", 0.0)),
}
save_checkpoint(state, LAST_CKPT)
# `model` currently holds the best weights, so a copy goes to BEST_CKPT for downstream inference.
# NOTE(review): BEST_CKPT is overwritten unconditionally, even if an earlier run scored higher — confirm this is intended.
save_checkpoint(state, BEST_CKPT)

print("Training finished. Best eval_f1_micro=", eval_metrics.get("eval_f1_micro", None))


Start training (will evaluate each epoch, early stopping enabled)
Resuming from HF checkpoint: checkpoint/checkpoint-5474


Epoch,Training Loss,Validation Loss,Exact Match,F1 Micro,F1 Macro
8,0.0259,0.024013,0.285,0.556888,0.319164
9,0.0254,0.023588,0.289,0.557556,0.320219
10,0.0241,0.023514,0.292,0.561826,0.324386


Eval metrics: {'eval_loss': 0.023513667285442352, 'eval_exact_match': 0.292, 'eval_f1_micro': 0.5618262523779328, 'eval_f1_macro': 0.32438641061709894, 'eval_runtime': 1.2133, 'eval_samples_per_second': 824.209, 'eval_steps_per_second': 13.187, 'epoch': 10.0}
Checkpoint saved to checkpoint/model1_last.pt
Checkpoint saved to checkpoint/model1_last.pt
Checkpoint saved to checkpoint/model1_best.pt
Training finished. Best eval_f1_micro= 0.5618262523779328
Checkpoint saved to checkpoint/model1_best.pt
Training finished. Best eval_f1_micro= 0.5618262523779328


In [45]:
# 推理与多标签预测示例（使用 best 或 last checkpoint）

# Prefer the best checkpoint; fall back to the last one.
if BEST_CKPT.exists():
    load_checkpoint(model, optimizer=None, filename=BEST_CKPT)
else:
    load_checkpoint(model, optimizer=None, filename=LAST_CKPT)

model.to(device)
model.eval()

# Draw 3 random rows. randint's upper bound is exclusive, so len(filtered_df)
# (not len - 1) is needed for the last row to be eligible.
sample_index = np.random.randint(0, len(filtered_df), 3)
sample_rows = filtered_df.iloc[sample_index]
sample_titles = sample_rows["Title"].tolist()
# Take the true categories from the sampled rows themselves: looking them up again by
# title scans the whole frame and returns the wrong row when a title occurs more than once.
sample_categories = sample_rows["Category"].tolist()

encoded = tokenizer(sample_titles, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

encoded = {k: v.to(device) for k, v in encoded.items()}

with torch.no_grad():
    outputs = model(**encoded)
    probs = torch.sigmoid(outputs.logits).cpu()

for title, true_categories, prob in zip(sample_titles, sample_categories, probs):
    # Labels whose probability exceeds the decision threshold, plus the 5 most probable labels.
    pred_indices = (prob > THRESH).nonzero(as_tuple=True)[0].tolist()
    pred_labels = [(id2label[i], float(prob[i])) for i in pred_indices]
    top5 = sorted([(id2label[i], float(p)) for i, p in enumerate(prob)], key=lambda x: x[1], reverse=True)[:5]

    print("\n" + "=" * 80)
    print("Title:", title[:100])
    print(f"预测标签 (>{THRESH}): {len(pred_labels)} 个")
    for label, score in pred_labels:
        print(f"  - {label}: {score:.3f}")
    print(f"真实标签: {true_categories}")
    print("\nTop 5 标签:")
    for label, score in top5:
        print(f"  - {label}: {score:.3f}")


Loaded checkpoint from checkpoint/model1_best.pt: epoch=10, global_step=7820, best_metric=0.5618262523779328

Title: Morphing the left atrium geometry: The role of the pulmonary veins on
  flow patterns and thrombus f
预测标签 (>0.4): 2 个
  - physics.bio-ph: 0.513
  - physics.med-ph: 0.429
真实标签: ['physics.med-ph', 'physics.flu-dyn']

Top 5 标签:
  - physics.bio-ph: 0.513
  - physics.med-ph: 0.429
  - physics.flu-dyn: 0.268
  - q-bio.TO: 0.262
  - q-bio.CB: 0.096

Title: Upon the existence of short-time approximations of any polynomial order
  for the computation of den
预测标签 (>0.4): 1 个
  - math.NA: 0.735
真实标签: ['physics.med-ph', 'physics.flu-dyn']

Top 5 标签:
  - physics.bio-ph: 0.513
  - physics.med-ph: 0.429
  - physics.flu-dyn: 0.268
  - q-bio.TO: 0.262
  - q-bio.CB: 0.096

Title: Upon the existence of short-time approximations of any polynomial order
  for the computation of den
预测标签 (>0.4): 1 个
  - math.NA: 0.735
真实标签: ['math-ph', 'cond-mat.stat-mech', 'math.MP', 'physics.chem-ph']

Top 