In [1]:
#Cell 0: フォルダ作成
import os, time, random
from dataclasses import dataclass
from datetime import datetime

PROJECT_DIR = "/Users/mh/Downloads/Mini Project/Dec30_BBCNews"
RESULTS_DIR = os.path.join(PROJECT_DIR, "results")
MODELS_DIR  = os.path.join(PROJECT_DIR, "models")
LOGS_DIR    = os.path.join(PROJECT_DIR, "logs")

for d in [RESULTS_DIR, MODELS_DIR, LOGS_DIR]:
    os.makedirs(d, exist_ok=True)

print("PROJECT_DIR:", PROJECT_DIR)
print("RESULTS_DIR:", RESULTS_DIR)
print("MODELS_DIR :", MODELS_DIR)
print("LOGS_DIR   :", LOGS_DIR)

def nowstamp():
    return datetime.now().strftime("%Y%m%d_%H%M%S")

PROJECT_DIR: /Users/mh/Downloads/Mini Project/Dec30_BBCNews
RESULTS_DIR: /Users/mh/Downloads/Mini Project/Dec30_BBCNews/results
MODELS_DIR : /Users/mh/Downloads/Mini Project/Dec30_BBCNews/models
LOGS_DIR   : /Users/mh/Downloads/Mini Project/Dec30_BBCNews/logs


In [2]:
#Cell 1: Imports & Config
import numpy as np
import pandas as pd
import torch

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
from torch.utils.data import DataLoader
from torch.optim import AdamW

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# --- device ---
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("DEVICE:", DEVICE)

@dataclass
class BBCNewsConfig:
    model_name: str = "distilbert-base-uncased"
    dataset_name: str = "SetFit/bbc-news"   # ← bbc-news
    text_col: str = "text"
    label_col: str = "label"

    max_len: int = 256
    batch_size: int = 16
    lr: float = 2e-5
    epochs: int = 3
    seed: int = 42

    # 軽量化（Macでも回る）
    max_train_examples: int = 8000
    max_val_examples: int = 1000
    max_test_examples: int = 2000

    # デバッグ（1epochの最大step数。Noneで無制限）
    debug_max_steps_per_epoch: int | None = 300

    # early stopping
    early_stopping_patience: int = 1

    # 出力先
    results_dir: str = RESULTS_DIR
    models_dir: str = MODELS_DIR
    logs_dir: str = LOGS_DIR

cfg = BBCNewsConfig()
cfg
#print(cfg)

DEVICE: mps


BBCNewsConfig(model_name='distilbert-base-uncased', dataset_name='SetFit/bbc-news', text_col='text', label_col='label', max_len=256, batch_size=16, lr=2e-05, epochs=3, seed=42, max_train_examples=8000, max_val_examples=1000, max_test_examples=2000, debug_max_steps_per_epoch=300, early_stopping_patience=1, results_dir='/Users/mh/Downloads/Mini Project/Dec30_BBCNews/results', models_dir='/Users/mh/Downloads/Mini Project/Dec30_BBCNews/models', logs_dir='/Users/mh/Downloads/Mini Project/Dec30_BBCNews/logs')

In [3]:
#Cell 2: Seed固定 & Datasetロード + 分割
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(cfg.seed)

ds = load_dataset(cfg.dataset_name)
print(ds)

# SetFit/bbc-news は train / test があることが多い。なければ自前split。
if "train" in ds and "test" in ds:
    train_full = ds["train"].shuffle(seed=cfg.seed)
    test_ds    = ds["test"].shuffle(seed=cfg.seed)
else:
    # 念のため fallback
    full = ds[list(ds.keys())[0]].shuffle(seed=cfg.seed)
    tmp = full.train_test_split(test_size=0.2, seed=cfg.seed)
    train_full, test_ds = tmp["train"], tmp["test"]

# train -> train/val
split = train_full.train_test_split(test_size=0.1, seed=cfg.seed)
train_ds, val_ds = split["train"], split["test"]

# 軽量化（必要なら）
def take_n(d, n):
    if n is None: 
        return d
    n = min(n, len(d))
    return d.select(range(n))

train_ds = take_n(train_ds, cfg.max_train_examples)
val_ds   = take_n(val_ds, cfg.max_val_examples)
test_ds  = take_n(test_ds, cfg.max_test_examples)

print("train:", len(train_ds), "val:", len(val_ds), "test:", len(test_ds))
print(train_ds[0])

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 1225
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 1000
    })
})
train: 1102 val: 123 test: 1000
{'text': 'wales want rugby league training wales could follow england s lead by training with a rugby league club.  england have already had a three-day session with leeds rhinos  and wales are thought to be interested in a similar clinic with rivals st helens. saints coach ian millward has given his approval  but if it does happen it is unlikely to be this season. saints have a week s training in portugal next week  while wales will play england in the opening six nations match on 5 february.  we have had an approach from wales   confirmed a saints spokesman.  it s in the very early stages but it is something we are giving serious consideration to.  st helens  who are proud of their welsh connections  are obvious partners for the welsh r

In [4]:
#Cell 3: Tokenizer & Tokenize（-100はここでは不要）
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, use_fast=True)

def tokenize_batch(batch):
    return tokenizer(
        batch[cfg.text_col],
        truncation=True,
        max_length=cfg.max_len,
    )

train_tok = train_ds.map(tokenize_batch, batched=True, remove_columns=[cfg.text_col])
val_tok   = val_ds.map(tokenize_batch, batched=True, remove_columns=[cfg.text_col])
test_tok  = test_ds.map(tokenize_batch, batched=True, remove_columns=[cfg.text_col])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

cols = ["input_ids", "attention_mask", cfg.label_col]
train_tok.set_format(type="torch", columns=cols)
val_tok.set_format(type="torch", columns=cols)
test_tok.set_format(type="torch", columns=cols)

train_tok[0]

{'label': tensor(2),
 'input_ids': tensor([  101,  3575,  2215,  4043,  2223,  2731,  3575,  2071,  3582,  2563,
          1055,  2599,  2011,  2731,  2007,  1037,  4043,  2223,  2252,  1012,
          2563,  2031,  2525,  2018,  1037,  2093,  1011,  2154,  5219,  2007,
          7873, 24091,  2015,  1998,  3575,  2024,  2245,  2000,  2022,  4699,
          1999,  1037,  2714,  9349,  2007,  9169,  2358, 24074,  1012,  6586,
          2873,  4775,  4971,  7652,  2038,  2445,  2010,  6226,  2021,  2065,
          2009,  2515,  4148,  2009,  2003,  9832,  2000,  2022,  2023,  2161,
          1012,  6586,  2031,  1037,  2733,  1055,  2731,  1999,  5978,  2279,
          2733,  2096,  3575,  2097,  2377,  2563,  1999,  1996,  3098,  2416,
          3741,  2674,  2006,  1019,  2337,  1012,  2057,  2031,  2018,  2019,
          3921,  2013,  3575,  4484,  1037,  6586, 14056,  1012,  2009,  1055,
          1999,  1996,  2200,  2220,  5711,  2021,  2009,  2003,  2242,  2057,
          2024,  3

In [5]:
#Cell 4: DataLoader
train_loader = DataLoader(
    train_tok,
    batch_size=cfg.batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
val_loader = DataLoader(
    val_tok,
    batch_size=cfg.batch_size,
    shuffle=False,
    collate_fn=data_collator,
)
test_loader = DataLoader(
    test_tok,
    batch_size=cfg.batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

batch = next(iter(train_loader))
{k: (v.shape if hasattr(v, "shape") else type(v)) for k, v in batch.items()}

{'input_ids': torch.Size([16, 256]),
 'attention_mask': torch.Size([16, 256]),
 'labels': torch.Size([16])}

In [6]:
#Cell 5: Model & Optimizer
import numpy as np

# label は Value(int) なので num_classes は無い → データから数える
unique_labels = sorted(set(train_ds[cfg.label_col]))
num_labels = len(unique_labels)

print("unique_labels:", unique_labels)
print("num_labels:", num_labels)

# label_names も自前で作る（今回は 0..N-1 をそのまま名前にする）
label_names = [str(i) for i in unique_labels]
print("label_names:", label_names)

model = AutoModelForSequenceClassification.from_pretrained(
    cfg.model_name,
    num_labels=num_labels,
).to(DEVICE)

optimizer = AdamW(model.parameters(), lr=cfg.lr)

unique_labels: [0, 1, 2, 3, 4]
num_labels: 5
label_names: ['0', '1', '2', '3', '4']


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
#Cell 6: eval_model
from sklearn.metrics import accuracy_score, f1_score

# Cell X: eval_model (with key check + robust label key handling)
from sklearn.metrics import accuracy_score, f1_score
import torch

def eval_model(model, dataloader):
    """
    returns: avg_loss, acc, f1_macro, all_labels, all_preds
    """

    # 0) （デバッグ）dataloader が返す batch のキーを 1回だけ確認する
    #    ※重い処理ではないが、毎回の評価で何度も表示したくないので try/except にする
    try:
        first_batch = next(iter(dataloader))
        print("[eval_model] batch keys:", first_batch.keys())
    except StopIteration:
        raise ValueError("dataloader is empty (no batches).")

    # 1) model を eval モードにする
    model.eval()

    # 2) 正解ラベル・予測ラベルを入れるリストを用意する
    all_labels = []
    all_preds  = []

    # 3) total_loss と n_steps を初期化する
    total_loss = 0.0
    n_steps = 0

    # 4) 勾配計算をオフにして dataloader をループする
    with torch.no_grad():
        for batch in dataloader:
            # 4-1) バッチを DEVICE に転送する
            batch = {k: v.to(DEVICE) for k, v in batch.items()}

            # 4-2) model(**batch) で出力を得る
            out = model(**batch)

            # 4-3) loss を蓄積し、ステップ数を数える
            loss = out.loss
            total_loss += float(loss.item())
            n_steps += 1

            # 4-4) logits から予測ラベル（argmax）を計算する
            logits = out.logits
            preds = logits.argmax(dim=-1)

            # 4-5) labels キー名の揺れ（labels / label / cfg.label_col）を吸収して取り出す
            if "cfg" in globals() and hasattr(cfg, "label_col") and cfg.label_col in batch:
                labels = batch[cfg.label_col]
            elif "labels" in batch:
                labels = batch["labels"]
            elif "label" in batch:
                labels = batch["label"]
            else:
                raise KeyError(f"batch has no labels key. keys={list(batch.keys())}")

            # 4-6) CPU に戻して list に追加する
            all_labels.extend(labels.detach().cpu().numpy().tolist())
            all_preds.extend(preds.detach().cpu().numpy().tolist())

    # 5) 平均 loss を計算する
    avg_loss = total_loss / max(1, n_steps)

    # 6) accuracy と macro F1 を計算する
    acc = accuracy_score(all_labels, all_preds)
    f1  = f1_score(all_labels, all_preds, average="macro")

    # 7) (avg_loss, acc, f1, all_labels, all_preds) を返す
    return avg_loss, acc, f1, all_labels, all_preds

In [8]:
#Cell 7: Training loop（logging + early stopping）
from tqdm.auto import tqdm

ts = nowstamp()
run_name = f"distilbert_bbcnews_{ts}"
save_dir = os.path.join(cfg.models_dir, run_name)
os.makedirs(save_dir, exist_ok=True)

log_path = os.path.join(cfg.logs_dir, f"train_log_{ts}.tsv")

print("Save dir:", save_dir)
print("Log   :", log_path)

with open(log_path, "w") as f:
    f.write("epoch\tsteps\ttrain_loss\tval_loss\tval_acc\tval_f1\n")

best_val_loss = float("inf")
epochs_no_improve = 0

for ep in range(1, cfg.epochs + 1):
    model.train()
    total_loss = 0.0
    n_steps = 0

    progress = tqdm(train_loader, total=len(train_loader), desc=f"Epoch {ep}/{cfg.epochs}")

    for step, batch in enumerate(progress): # enumerate()「今、何番目の処理か」
        batch = {k: v.to(DEVICE) for k, v in batch.items()}

        optimizer.zero_grad()
        out = model(**batch)
        loss = out.loss
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item()
        n_steps += 1
        progress.set_postfix({"loss": float(loss.item())})

        if cfg.debug_max_steps_per_epoch is not None and (step + 1) >= cfg.debug_max_steps_per_epoch:
            break

    train_loss = total_loss / max(1, n_steps)
    val_loss, val_acc, val_f1, _, _ = eval_model(model, val_loader)

    if DEVICE == "mps":
        torch.mps.empty_cache()

    print(f"\nEpoch {ep}/{cfg.epochs} steps={n_steps} "
          f"train_loss={train_loss:.4f} val_loss={val_loss:.4f} "
          f"val_acc={val_acc:.4f} val_f1={val_f1:.4f}")

    with open(log_path, "a") as f:
        f.write(f"{ep}\t{n_steps}\t{train_loss:.6f}\t{val_loss:.6f}\t{val_acc:.6f}\t{val_f1:.6f}\n")

    # Early stopping & best model save
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        print("  -> best model updated & saved.")
    else:
        epochs_no_improve += 1
        print(f"  -> no improvement ({epochs_no_improve}/{cfg.early_stopping_patience})")
        if epochs_no_improve >= cfg.early_stopping_patience:
            print("  -> Early stopping triggered.")
            break

Save dir: /Users/mh/Downloads/Mini Project/Dec30_BBCNews/models/distilbert_bbcnews_20260104_143108
Log   : /Users/mh/Downloads/Mini Project/Dec30_BBCNews/logs/train_log_20260104_143108.tsv


Epoch 1/3:   0%|          | 0/69 [00:00<?, ?it/s]

[eval_model] batch keys: KeysView({'input_ids': tensor([[  101, 10687,  2753,  ...,  2718,  2774,   102],
        [  101, 17235,  2850,  ...,     0,     0,     0],
        [  101,  7436, 17853,  ..., 18510,  4626,   102],
        ...,
        [  101,  3617, 11972,  ...,  2241,  2006,   102],
        [  101,  3163,  2538,  ...,  2272,  2117,   102],
        [  101,  6261,  2557,  ..., 14534,  2005,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([3, 1, 3, 0, 3, 1, 3, 3, 4, 3, 3, 2, 1, 0, 2, 0])})

Epoch 1/3 steps=69 train_loss=0.8598 val_loss=0.1928 val_acc=0.9919 val_f1=0.9904
  -> best model updated & saved.


Epoch 2/3:   0%|          | 0/69 [00:00<?, ?it/s]

[eval_model] batch keys: KeysView({'input_ids': tensor([[  101, 10687,  2753,  ...,  2718,  2774,   102],
        [  101, 17235,  2850,  ...,     0,     0,     0],
        [  101,  7436, 17853,  ..., 18510,  4626,   102],
        ...,
        [  101,  3617, 11972,  ...,  2241,  2006,   102],
        [  101,  3163,  2538,  ...,  2272,  2117,   102],
        [  101,  6261,  2557,  ..., 14534,  2005,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([3, 1, 3, 0, 3, 1, 3, 3, 4, 3, 3, 2, 1, 0, 2, 0])})

Epoch 2/3 steps=69 train_loss=0.1187 val_loss=0.0556 val_acc=0.9919 val_f1=0.9904
  -> best model updated & saved.


Epoch 3/3:   0%|          | 0/69 [00:00<?, ?it/s]

[eval_model] batch keys: KeysView({'input_ids': tensor([[  101, 10687,  2753,  ...,  2718,  2774,   102],
        [  101, 17235,  2850,  ...,     0,     0,     0],
        [  101,  7436, 17853,  ..., 18510,  4626,   102],
        ...,
        [  101,  3617, 11972,  ...,  2241,  2006,   102],
        [  101,  3163,  2538,  ...,  2272,  2117,   102],
        [  101,  6261,  2557,  ..., 14534,  2005,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([3, 1, 3, 0, 3, 1, 3, 3, 4, 3, 3, 2, 1, 0, 2, 0])})

Epoch 3/3 steps=69 train_loss=0.0324 val_loss=0.0522 val_acc=0.9919 val_f1=0.9904
  -> best model updated & saved.


In [9]:
#Cell 8: Test評価 + Confusion Matrix + errors.tsv 保存
# best model をロード（念のため）
model = AutoModelForSequenceClassification.from_pretrained(save_dir).to(DEVICE)

test_loss, test_acc, test_f1, y_true, y_pred = eval_model(model, test_loader)
print(f"TEST  loss={test_loss:.4f} acc={test_acc:.4f} f1={test_f1:.4f}")

print("\nClassification report:")
print(classification_report(y_true, y_pred, target_names=label_names))

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
cm_df = pd.DataFrame(cm, index=label_names, columns=label_names)
cm_path = os.path.join(cfg.results_dir, f"confusion_matrix_{ts}.csv")
cm_df.to_csv(cm_path)
print("Saved:", cm_path)
cm_df

[eval_model] batch keys: KeysView({'input_ids': tensor([[  101,  3956,  3504,  ...,  2006,  3956,   102],
        [  101,  3782,  6473,  ...,  2793,  2790,   102],
        [  101,  2844,  5157,  ...,  2000,  1002,   102],
        ...,
        [  101,  9580, 21208,  ...,  2009,  1999,   102],
        [  101,  7842, 16294,  ...,  2052,  2663,   102],
        [  101,  6172,  6925,  ...,  4791,  2013,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([1, 3, 1, 4, 0, 1, 4, 0, 2, 4, 4, 0, 2, 0, 2, 3])})
TEST  loss=0.1214 acc=0.9720 f1=0.9716

Classification report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       189
           1       0.96      0.96      0.96       224
           2       1.00      1.00      1.00       236
   

Unnamed: 0,0,1,2,3,4
0,185,3,1,0,0
1,6,214,0,0,4
2,0,1,235,0,0
3,5,1,0,169,1
4,1,5,0,0,169


In [10]:
# errors.tsv（誤分類を保存）
# 元データ（test_ds）からテキストも引っ張る
errors = []
for i, (yt, yp) in enumerate(zip(y_true, y_pred)):
    if yt != yp:
        ex = test_ds[i]
        errors.append({
            "idx": i,
            "true_id": yt,
            "pred_id": yp,
            "true_label": label_names[yt],
            "pred_label": label_names[yp],
            "text": ex[cfg.text_col],
        })

err_df = pd.DataFrame(errors)
err_path = os.path.join(cfg.results_dir, f"errors_{ts}.tsv")
err_df.to_csv(err_path, sep="\t", index=False)
print("Saved:", err_path)
err_df.head(10)

Saved: /Users/mh/Downloads/Mini Project/Dec30_BBCNews/results/errors_20260104_143108.tsv


Unnamed: 0,idx,true_id,pred_id,true_label,pred_label,text
0,36,0,1,0,1,pc ownership to double by 2010 the number of...
1,74,3,1,3,1,music mogul fuller sells company pop idol supr...
2,109,0,1,0,1,fast lifts rise into record books two high-spe...
3,119,4,1,4,1,baa support ahead of court battle uk airport o...
4,214,1,4,1,4,ban on forced retirement under 65 employers wi...
5,256,4,1,4,1,eu fraud clampdown urged eu member states are ...
6,265,1,0,1,0,orange colour clash set for court a row over t...
7,267,3,0,3,0,new media battle for bafta awards the bbc lead...
8,329,0,2,0,2,piero gives rugby perspective bbc sport unveil...
9,335,3,0,3,0,pupils to get anti-piracy lessons lessons on m...
