In [1]:
# Cell 1: Imports & Config

import os, csv, time, random
from dataclasses import dataclass
from datetime import datetime
from collections import Counter

import numpy as np
import torch
import torch.nn as nn

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from torch.optim import AdamW

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import pandas as pd

def nowstamp():
    return datetime.now().strftime("%Y%m%d_%H%M%S")

# ---- Config ----
@dataclass
class Config:
    # Project / Paths
    project_name: str = "distilbert_bbcnews_classweight"
    base_dir: str = "/Users/mh/Downloads/Mini Project/Jan_TextClassification_Iteration"
    results_dir: str = "results"
    models_dir: str = "models"
    logs_dir: str = "logs"

    # Dataset
    dataset_name: str = "SetFit/bbc-news"   # 安定して使いやすいことが多い
    text_col: str = "text"
    label_col: str = "label"               # ←データ次第で "category" 等になることあり
    val_ratio: float = 0.1

    # Model
    model_name: str = "distilbert-base-uncased"
    max_seq_len: int = 128
    batch_size: int = 16

    # Train
    lr: float = 2e-5
    epochs: int = 3
    seed: int = 42
    early_stopping_patience: int = 2
    debug_max_steps_per_epoch: int | None = None  # 例: 200 で軽量実行

    # Class weight
    use_class_weight: bool = False

cfg = Config()

# ---- Device ----
DEVICE = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE:", DEVICE)

# ---- Seed ----
random.seed(cfg.seed)
np.random.seed(cfg.seed)
torch.manual_seed(cfg.seed)

# ---- Make dirs ----
os.makedirs(cfg.base_dir, exist_ok=True)
os.chdir(cfg.base_dir)
os.makedirs(cfg.results_dir, exist_ok=True)
os.makedirs(cfg.models_dir, exist_ok=True)
os.makedirs(cfg.logs_dir, exist_ok=True)

print("CWD:", os.getcwd())
print("dirs:", cfg.results_dir, cfg.models_dir, cfg.logs_dir)

DEVICE: mps
CWD: /Users/mh/Downloads/Mini Project/Jan_TextClassification_Iteration
dirs: results models logs


In [2]:
# Cell 2: Dataset load & split

raw = load_dataset(cfg.dataset_name)

# 多くのHFデータは train/test だけ、あるいは train/validation/test がある
if "validation" in raw:
    train_full = raw["train"]
    val_ds = raw["validation"]
    test_ds = raw["test"] if "test" in raw else None
else:
    # train から val を作る
    train_full = raw["train"].shuffle(seed=cfg.seed)
    split = train_full.train_test_split(test_size=cfg.val_ratio, seed=cfg.seed)
    train_full = split["train"]
    val_ds = split["test"]
    test_ds = raw["test"] if "test" in raw else None

print("train:", len(train_full), "val:", len(val_ds), "test:", (len(test_ds) if test_ds is not None else None))
print("columns:", train_full.column_names)

# label_col が想定と違う場合に備えて、候補を表示
print("example:", {k: train_full[0][k] for k in train_full.column_names if k in [cfg.text_col, cfg.label_col]})

train: 1102 val: 123 test: 1000
columns: ['text', 'label', 'label_text']
example: {'text': 'wales want rugby league training wales could follow england s lead by training with a rugby league club.  england have already had a three-day session with leeds rhinos  and wales are thought to be interested in a similar clinic with rivals st helens. saints coach ian millward has given his approval  but if it does happen it is unlikely to be this season. saints have a week s training in portugal next week  while wales will play england in the opening six nations match on 5 february.  we have had an approach from wales   confirmed a saints spokesman.  it s in the very early stages but it is something we are giving serious consideration to.  st helens  who are proud of their welsh connections  are obvious partners for the welsh rugby union  despite a spat in 2001 over the collapse of kieron cunningham s proposed £500 000 move to union side swansea. a similar cross-code deal that took iestyn harri

In [3]:
# Cell 3: Ensure label is ClassLabel (encode if needed)

from datasets import ClassLabel

label_feature = train_full.features.get(cfg.label_col, None)
print("label feature:", label_feature)

# label が string(Value) の場合は class_encode_column をかける
# これで train_full.features[label_col].num_classes が使えるようになる
if not isinstance(label_feature, ClassLabel):
    train_full = train_full.class_encode_column(cfg.label_col)
    val_ds = val_ds.class_encode_column(cfg.label_col)
    if test_ds is not None:
        test_ds = test_ds.class_encode_column(cfg.label_col)

label_feature = train_full.features[cfg.label_col]
num_labels = label_feature.num_classes
label_names = label_feature.names

print("num_labels:", num_labels)
print("labels:", label_names)

label feature: Value('int64')
num_labels: 5
labels: ['0', '1', '2', '3', '4']


In [4]:
# Cell 4: Tokenize & DataLoader

tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, use_fast=True)

def tokenize_batch(examples):
    enc = tokenizer(
        examples[cfg.text_col],
        truncation=True,
        padding="max_length",
        max_length=cfg.max_seq_len,
    )
    # ★重要：学習用キーは "labels" に統一する（HuggingFaceが期待）
    enc["labels"] = examples[cfg.label_col]
    return enc

train_tok = train_full.map(tokenize_batch, batched=True, remove_columns=train_full.column_names)
val_tok   = val_ds.map(tokenize_batch, batched=True, remove_columns=val_ds.column_names)

cols = ["input_ids", "attention_mask", "labels"]
train_tok.set_format(type="torch", columns=cols)
val_tok.set_format(type="torch", columns=cols)

train_loader = DataLoader(train_tok, batch_size=cfg.batch_size, shuffle=True)
val_loader   = DataLoader(val_tok, batch_size=cfg.batch_size, shuffle=False)

print("batch keys:", next(iter(val_loader)).keys())
print("batch shapes:", next(iter(val_loader))["input_ids"].shape)

Map:   0%|          | 0/123 [00:00<?, ? examples/s]

batch keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
batch shapes: torch.Size([16, 128])


In [5]:
# Cell 5: Model & Optimizer (+ class weight)

model = AutoModelForSequenceClassification.from_pretrained(
    cfg.model_name,
    num_labels=num_labels,
).to(DEVICE)

optimizer = AdamW(model.parameters(), lr=cfg.lr)
print("model ready:", cfg.model_name)

# ---- class weights ----
# counts を train_full（ラベル列）から作る
counts = Counter(train_full[cfg.label_col])  # 0..num_labels-1 の頻度

# ラベルIDは 0..(num_labels-1) を仮定
freq = torch.tensor([counts[i] for i in range(num_labels)], dtype=torch.float)

weights = 1.0 / (freq + 1e-9)
weights = weights / weights.mean()  # 平均1に正規化（扱いやすい）

print("counts:", counts)
print("freq:", freq.tolist())
print("weights:", weights.tolist())

# criterion（重み付きCrossEntropyLoss）
if cfg.use_class_weight:
    criterion = nn.CrossEntropyLoss(weight=weights.to(DEVICE))
else:
    criterion = nn.CrossEntropyLoss()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model ready: distilbert-base-uncased
counts: Counter({1: 264, 2: 248, 4: 222, 0: 184, 3: 184})
freq: [184.0, 264.0, 248.0, 184.0, 222.0]
weights: [1.1715819835662842, 0.8165571093559265, 0.8692381381988525, 1.1715819835662842, 0.9710409045219421]


In [6]:
# Cell 6: eval_model

from sklearn.metrics import accuracy_score, f1_score

def eval_model(model, dataloader):
    """
    dataloader 全体に対して
      - 平均 loss
      - accuracy
      - macro F1
      - all_labels, all_preds
    を返す。
    """
    # 1) model を eval モード
    model.eval()

    # 2) 正解ラベル・予測ラベル用リスト
    all_labels, all_preds = [], []

    # 3) loss 集計用
    total_loss, n_steps = 0.0, 0

    # 4) 勾配は不要
    with torch.no_grad():
        for batch in dataloader:
            # 4-1) DEVICE へ転送
            batch = {k: v.to(DEVICE) for k, v in batch.items()}

            # 4-2) forward（HuggingFaceモデルへ）
            out = model(**batch)
            logits = out.logits  # [B, num_labels]

            # 4-3) loss を取得（原則 out.loss でもOKだが、criterionで統一してもよい）
            # ここでは「学習と同じ定義」に揃えるため criterion を使う
            # （class weight を eval loss にも反映したい場合）
            labels = batch["labels"] if "labels" in batch else batch["label"]
            loss = criterion(logits, labels)

            total_loss += float(loss.item())
            n_steps += 1

            # 4-4) preds を作る
            preds = logits.argmax(dim=-1)

            # 4-5) CPUへ戻して list に蓄積
            all_labels.extend(labels.detach().cpu().numpy().tolist())
            all_preds.extend(preds.detach().cpu().numpy().tolist())

    # 5) 平均 loss
    avg_loss = total_loss / max(1, n_steps)

    # 6) 指標
    acc = accuracy_score(all_labels, all_preds)
    f1  = f1_score(all_labels, all_preds, average="macro")

    # 7) return
    return avg_loss, acc, f1, all_labels, all_preds

In [7]:
# Cell 7: Training loop（logging + early stopping）

from tqdm.auto import tqdm

ts = nowstamp()
run_name = f"{cfg.project_name}_{ts}"
save_dir = os.path.join(cfg.models_dir, run_name)
os.makedirs(save_dir, exist_ok=True)

log_path = os.path.join(cfg.logs_dir, f"train_log_{ts}.tsv")
print("Save dir:", save_dir)
print("Log   :", log_path)

with open(log_path, "w") as f:
    f.write("epoch\tsteps\ttrain_loss\tval_loss\tval_acc\tval_f1\n")

best_val_loss = float("inf")
epochs_no_improve = 0

for ep in range(1, cfg.epochs + 1):
    model.train()
    total_loss, n_steps = 0.0, 0

    progress = tqdm(train_loader, total=len(train_loader), desc=f"Epoch {ep}/{cfg.epochs}")

    for step, batch in enumerate(progress):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}

        optimizer.zero_grad()

        out = model(**batch)
        logits = out.logits
        labels = batch["labels"] if "labels" in batch else batch["label"]

        # ★重み付きloss（または通常loss）
        loss = criterion(logits, labels)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += float(loss.item())
        n_steps += 1

        progress.set_postfix({"loss": float(loss.item())})

        if cfg.debug_max_steps_per_epoch is not None and (step + 1) >= cfg.debug_max_steps_per_epoch:
            break

    train_loss = total_loss / max(1, n_steps)
    val_loss, val_acc, val_f1, _, _ = eval_model(model, val_loader)

    if DEVICE == "mps":
        torch.mps.empty_cache()

    print(f"\nEpoch {ep}/{cfg.epochs} steps={n_steps} "
          f"train_loss={train_loss:.4f} val_loss={val_loss:.4f} "
          f"val_acc={val_acc:.4f} val_f1={val_f1:.4f}")

    with open(log_path, "a") as f:
        f.write(f"{ep}\t{n_steps}\t{train_loss:.6f}\t{val_loss:.6f}\t{val_acc:.6f}\t{val_f1:.6f}\n")

    # Early stopping & best model save
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        print("  -> best model updated & saved.")
    else:
        epochs_no_improve += 1
        print(f"  -> no improvement ({epochs_no_improve}/{cfg.early_stopping_patience})")
        if epochs_no_improve >= cfg.early_stopping_patience:
            print("  -> Early stopping triggered.")
            break

Save dir: models/distilbert_bbcnews_classweight_20260112_182403
Log   : logs/train_log_20260112_182403.tsv


Epoch 1/3:   0%|          | 0/69 [00:00<?, ?it/s]


Epoch 1/3 steps=69 train_loss=0.8877 val_loss=0.2056 val_acc=0.9919 val_f1=0.9904
  -> best model updated & saved.


Epoch 2/3:   0%|          | 0/69 [00:00<?, ?it/s]


Epoch 2/3 steps=69 train_loss=0.1297 val_loss=0.0862 val_acc=0.9837 val_f1=0.9830
  -> best model updated & saved.


Epoch 3/3:   0%|          | 0/69 [00:00<?, ?it/s]


Epoch 3/3 steps=69 train_loss=0.0294 val_loss=0.0574 val_acc=0.9919 val_f1=0.9904
  -> best model updated & saved.


In [8]:
# Cell 8: errors.tsv + confusion matrix

# ベストモデルをロードし直す（保存済みを使って評価を安定させる）
best_model = AutoModelForSequenceClassification.from_pretrained(save_dir).to(DEVICE)

val_loss, val_acc, val_f1, y_true, y_pred = eval_model(best_model, val_loader)

print("FINAL val_loss:", val_loss, "val_acc:", val_acc, "val_f1:", val_f1)

# val 元データ（テキスト/ラベル）も参照できるように index を揃える
val_texts = val_ds[cfg.text_col]
val_labels = val_ds[cfg.label_col]  # int

# errors.tsv
errors_path = os.path.join(cfg.results_dir, f"errors_{ts}.tsv")
with open(errors_path, "w", newline="") as f:
    w = csv.writer(f, delimiter="\t")
    w.writerow(["idx", "text", "true_id", "true_label", "pred_id", "pred_label"])
    for i, (t, p) in enumerate(zip(y_true, y_pred)):
        if t != p:
            w.writerow([i, val_texts[i], t, label_names[t], p, label_names[p]])

print("Saved:", errors_path)

# confusion matrix
cm = confusion_matrix(y_true, y_pred, labels=list(range(num_labels)))
cm_path = os.path.join(cfg.results_dir, f"confusion_{ts}.csv")
pd.DataFrame(cm, index=label_names, columns=label_names).to_csv(cm_path)

print("Saved:", cm_path)

# ついでに簡易レポート
print(classification_report(y_true, y_pred, target_names=label_names))

FINAL val_loss: 0.05743349879048765 val_acc: 0.991869918699187 val_f1: 0.9904273504273504
Saved: results/errors_20260112_182403.tsv
Saved: results/confusion_20260112_182403.csv
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       0.96      1.00      0.98        22
           2       1.00      1.00      1.00        27
           3       1.00      1.00      1.00        26
           4       1.00      0.95      0.97        20

    accuracy                           0.99       123
   macro avg       0.99      0.99      0.99       123
weighted avg       0.99      0.99      0.99       123

