In [1]:
# Kaggle HF Trainer: full fine-tuning on GLUE using Hugging Face task-specific head
# Default: bert-base-uncased with AutoModelForSequenceClassification (no manual head)
# Run in a Kaggle notebook cell. Turn Internet and GPU on.

!pip -q install "transformers>=4.43" "datasets>=2.20" "evaluate>=0.4" "accelerate>=0.33" scikit-learn wandb -U

import os
import json
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

import numpy as np
import torch

from datasets import load_dataset
import evaluate

from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed,
)

# ---------------------------------
# Config
# ---------------------------------
@dataclass
class RunConfig:
    task_name: str = "sst2"              # cola sst2 mrpc qqp stsb mnli qnli rte wnli
    model_name: str = "bert-base-uncased" # any compatible encoder model
    output_dir: str = "/kaggle/working/glue_ft"
    num_train_epochs: float = 3.0
    per_device_train_batch_size: int = 32
    per_device_eval_batch_size: int = 64
    learning_rate: float = 2e-5
    weight_decay: float = 0.01
    warmup_ratio: float = 0.06
    seed: int = 42
    save_strategy: str = "epoch"         # or "steps"
    eval_strategy: str = "epoch"         # or "steps"
    save_total_limit: int = 2
    fp16: bool = True                     # set False on CPU
    bf16: bool = False                    # set True on A100/Hopper if desired
    # W&B settings
    wandb_enable: bool = True
    wandb_project: Optional[str] = "glue-ft"
    wandb_entity: Optional[str] = None
    wandb_run_name: Optional[str] = None
    wandb_offline_fallback: bool = True

CFG = RunConfig()

GLUE_SENTENCE_KEYS = {
    "cola": ("sentence", None),
    "sst2": ("sentence", None),
    "mrpc": ("sentence1", "sentence2"),
    "qqp": ("question1", "question2"),
    "stsb": ("sentence1", "sentence2"),
    "mnli": ("premise", "hypothesis"),
    "qnli": ("question", "sentence"),
    "rte": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

def is_regression_task(task: str) -> bool:
    return task.lower() == "stsb"

# ---------------------------------
# Weights & Biases setup
# ---------------------------------

def setup_wandb(cfg: RunConfig, task: str) -> str:
    """Login and init W&B. Uses Kaggle Secrets if available or env WANDB_API_KEY.
    Falls back to offline if no key and offline fallback is True.
    Returns a run_name string."""
    import wandb

    run_name = cfg.wandb_run_name or f"{task}-{cfg.model_name}-{datetime.utcnow().strftime('%Y%m%d-%H%M%S')}"

    # Try environment variable or Kaggle Secrets
    key = os.environ.get("WANDB_API_KEY")
    if key is None:
        try:
            from kaggle_secrets import UserSecretsClient
            key = UserSecretsClient().get_secret("WANDB_API_KEY")
        except Exception:
            key = None

    if key:
        wandb.login(key=key)
    elif cfg.wandb_offline_fallback:
        os.environ["WANDB_MODE"] = "offline"

    wandb.init(
        project=cfg.wandb_project,
        entity=cfg.wandb_entity,
        name=run_name,
        config={k: v for k, v in vars(cfg).items() if "wandb" not in k},
    )
    return run_name

# ---------------------------------
# Main
# ---------------------------------

def main(cfg: RunConfig):
    os.makedirs(cfg.output_dir, exist_ok=True)
    set_seed(cfg.seed)

    task = cfg.task_name.lower()
    if task not in GLUE_SENTENCE_KEYS:
        raise ValueError(f"Unknown GLUE task: {task}")

    # ---------------- Data ----------------
    raw = load_dataset("glue", task)

    if is_regression_task(task):
        num_labels = 1
        label_list = None
    else:
        label_list = raw["train"].features["label"].names
        num_labels = len(label_list)

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, use_fast=True)

    sent1_key, sent2_key = GLUE_SENTENCE_KEYS[task]

    def preprocess(batch):
        if sent2_key is None:
            tokenized = tokenizer(batch[sent1_key], truncation=True)
        else:
            tokenized = tokenizer(batch[sent1_key], batch[sent2_key], truncation=True)
        if "label" in batch:
            tokenized["labels"] = batch["label"]
        return tokenized

    encoded = raw.map(preprocess, batched=True, remove_columns=raw["train"].column_names)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # ---------------- Metrics ----------------
    metric = evaluate.load("glue", task)

    def compute_metrics(eval_pred):
        preds, labels = eval_pred
        if is_regression_task(task):
            preds = np.squeeze(preds)
            res = metric.compute(predictions=preds, references=labels)
            # avg Pearson + Spearman as an extra combined score
            res["combined_score"] = float((res.get("pearson", 0.0) + res.get("spearmanr", 0.0)) / 2.0)
            return res
        else:
            preds = np.argmax(preds, axis=1)
            res = metric.compute(predictions=preds, references=labels)
            if task in {"mrpc", "qqp"}:
                res["combined_score"] = float((res.get("f1", 0.0) + res.get("accuracy", 0.0)) / 2.0)
            return res

    if task == "cola":
        best_metric = "matthews_correlation"
    elif task == "stsb":
        best_metric = "combined_score"
    elif task in {"mrpc", "qqp"}:
        best_metric = "f1"
    else:
        best_metric = "accuracy"

    # ---------------- Model (HF-native) ----------------
    # Full finetuning via AutoModelForSequenceClassification (no custom head code)
    config = AutoConfig.from_pretrained(cfg.model_name, num_labels=num_labels)
    if is_regression_task(task):
        config.problem_type = "regression"  # ensures MSE loss and float outputs
    model = AutoModelForSequenceClassification.from_pretrained(cfg.model_name, config=config)

    # ---------------- Splits ----------------
    train_ds = encoded["train"]
    if task == "mnli":
        eval_ds = encoded["validation_matched"]
        eval_mm_ds = encoded["validation_mismatched"]
    else:
        eval_ds = encoded["validation"]
        eval_mm_ds = None

    # ---------------- Trainer ----------------
    if cfg.wandb_enable:
        run_name = setup_wandb(cfg, task)
        report_targets = ["wandb"]
    else:
        run_name = f"{task}-{cfg.model_name}"
        report_targets = ["none"]

    args = TrainingArguments(
        output_dir=cfg.output_dir,
        learning_rate=cfg.learning_rate,
        per_device_train_batch_size=cfg.per_device_train_batch_size,
        per_device_eval_batch_size=cfg.per_device_eval_batch_size,
        num_train_epochs=cfg.num_train_epochs,
        weight_decay=cfg.weight_decay,
        warmup_ratio=cfg.warmup_ratio,
        eval_strategy=cfg.eval_strategy,
        save_strategy=cfg.save_strategy,
        save_total_limit=cfg.save_total_limit,
        load_best_model_at_end=True,
        metric_for_best_model=best_metric,
        greater_is_better=True,
        fp16=cfg.fp16 and torch.cuda.is_available(),
        bf16=cfg.bf16 and torch.cuda.is_available(),
        logging_steps=50,
        report_to=report_targets,
        run_name=run_name,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # ---------------- Train ----------------
    trainer.train()

    # Log a small marker so the run has a final step in W&B
    try:
        if cfg.wandb_enable:
            import wandb
            wandb.log({"final/global_step": trainer.state.global_step})
    except Exception:
        pass

    # Save best model for later evaluation / reuse
    best_dir = os.path.join(cfg.output_dir, "best_model")
    os.makedirs(best_dir, exist_ok=True)
    trainer.save_model(best_dir)          # saves config + model weights
    tokenizer.save_pretrained(best_dir)

    # ---------------- Evaluate ----------------
    val_metrics = trainer.evaluate(eval_dataset=eval_ds)
    print("Validation:", val_metrics)

    if cfg.wandb_enable:
        try:
            import wandb
            wandb.log({f"val/{k}": v for k, v in val_metrics.items()})
        except Exception:
            pass

    if eval_mm_ds is not None:
        mm_metrics = trainer.evaluate(eval_dataset=eval_mm_ds)
        print("Validation mismatched:", mm_metrics)
        if cfg.wandb_enable:
            try:
                import wandb
                wandb.log({f"val_mm/{k}": v for k, v in mm_metrics.items()})
            except Exception:
                pass

    with open(os.path.join(cfg.output_dir, "val_metrics.json"), "w") as f:
        json.dump(val_metrics, f, indent=2)
    if eval_mm_ds is not None:
        with open(os.path.join(cfg.output_dir, "val_mm_metrics.json"), "w") as f:
            json.dump(mm_metrics, f, indent=2)

    # Optionally dump logits for later analysis
    def dump_preds(ds, name):
        preds = trainer.predict(ds)
        np.save(os.path.join(cfg.output_dir, f"{name}_logits.npy"), preds.predictions)
        np.save(os.path.join(cfg.output_dir, f"{name}_labels.npy"), preds.label_ids)

    dump_preds(eval_ds, "val")
    if eval_mm_ds is not None:
        dump_preds(eval_mm_ds, "val_mismatched")

    if "test" in encoded:
        test_ds = encoded["test"]
    # Ensure no label columns exist to avoid CrossEntropy on invalid targets
    for col in ("label", "labels"):
        if col in test_ds.column_names:
            test_ds = test_ds.remove_columns(col)
    try:
        test_preds = trainer.predict(test_ds, metric_key_prefix="test").predictions
        np.save(os.path.join(cfg.output_dir, "test_logits.npy"), test_preds)
    except Exception as e:
        print("[WARN] Skipping test prediction due to:", e)
    
    print("Saved best model to:", best_dir)

    # W&B finish
    try:
        if cfg.wandb_enable:
            import wandb
            wandb.finish()
    except Exception:
        pass

# ---------------------------------
# Helper: reload a saved checkpoint and evaluate (HF-native)
# ---------------------------------

def load_for_eval(checkpoint_dir: str, task: str):
    task = task.lower()
    raw = load_dataset("glue", task)

    is_reg = is_regression_task(task)
    if is_reg:
        num_labels = 1
    else:
        num_labels = len(raw["train"].features["label"].names)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir, use_fast=True)

    sent1_key, sent2_key = GLUE_SENTENCE_KEYS[task]

    def preprocess(batch):
        if sent2_key is None:
            tokenized = tokenizer(batch[sent1_key], truncation=True)
        else:
            tokenized = tokenizer(batch[sent1_key], batch[sent2_key], truncation=True)
        if "label" in batch:
            tokenized["labels"] = batch["label"]
        return tokenized

    encoded = raw.map(preprocess, batched=True, remove_columns=raw["train"].column_names)
    eval_ds = encoded["validation_matched"] if task == "mnli" else encoded["validation"]

    # load the exact model back (config carries num_labels and problem type)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

    metric = evaluate.load("glue", task)

    def compute_metrics(eval_pred):
        preds, labels = eval_pred
        if is_reg:
            preds = np.squeeze(preds)
            res = metric.compute(predictions=preds, references=labels)
            res["combined_score"] = float((res.get("pearson", 0.0) + res.get("spearmanr", 0.0)) / 2.0)
            return res
        preds = np.argmax(preds, axis=1)
        res = metric.compute(predictions=preds, references=labels)
        if task in {"mrpc", "qqp"}:
            res["combined_score"] = float((res.get("f1", 0.0) + res.get("accuracy", 0.0)) / 2.0)
        return res

    collator = DataCollatorWithPadding(tokenizer)
    args = TrainingArguments(output_dir=os.path.join(checkpoint_dir, "eval_tmp"), per_device_eval_batch_size=64, report_to=["none"]) 
    trainer = Trainer(model=model, args=args, eval_dataset=eval_ds, tokenizer=tokenizer, data_collator=collator, compute_metrics=compute_metrics)
    metrics = trainer.evaluate()
    print("Reloaded checkpoint metrics:", metrics)
    return metrics


if __name__ == "__main__":
    main(CFG)

"""
Quick how to on Kaggle

1. Keep Internet and GPU on.
2. To use W&B, add your key via Kaggle Secrets: Workspace > Add-ons > Secrets > Create new secret with key WANDB_API_KEY. Or set it in an env var.
3. Run this cell. Default model is bert-base-uncased with AutoModelForSequenceClassification.
4. Artifacts:
   - best model at /kaggle/working/glue_ft/best_model
   - JSON metrics in the output dir
   - optional logits .npy files for validation and test
5. Later evaluation:

from Kaggle_HF_Trainer_GLUE_full_finetune import load_for_eval
load_for_eval("/kaggle/working/glue_ft/best_model", "sst2")

W&B tips
- Set CFG.wandb_enable = True to log. Set False to disable.
- Project and entity are configurable: CFG.wandb_project, CFG.wandb_entity.
- If no key is found and offline fallback is True, the run will log locally in offline mode.

Notes
- This is full finetuning. Nothing is frozen.
- For STS-B, config.problem_type="regression" is set and MSE is used automatically.
- To switch models, set CFG.model_name (for example, roberta-base).
"""

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m108.2 MB/s[0m eta [36m0:00:00[0m00:01[0m:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.8/375.8 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m123.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m95.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m31.7 MB/s[0m eta [36m0

2025-11-01 10:06:50.849613: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761991611.049698      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761991611.105289      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1524,0.215355,0.928899
2,0.0959,0.273911,0.915138
3,0.068,0.275303,0.932339


Validation: {'eval_loss': 0.2753034234046936, 'eval_accuracy': 0.9323394495412844, 'eval_runtime': 1.3989, 'eval_samples_per_second': 623.343, 'eval_steps_per_second': 10.008, 'epoch': 3.0}


Saved best model to: /kaggle/working/glue_ft/best_model


0,1
eval/accuracy,▇▁██
eval/loss,▁███
eval/runtime,▁▁▄█
eval/samples_per_second,██▅▁
eval/steps_per_second,██▅▁
final/global_step,▁
test/accuracy,▁
test/loss,▁
test/runtime,▁█
test/samples_per_second,█▁

0,1
eval/accuracy,0.93234
eval/loss,0.2753
eval/runtime,1.3989
eval/samples_per_second,623.343
eval/steps_per_second,10.008
final/global_step,6315
test/accuracy,0.93234
test/loss,0.2753
test/runtime,2.8434
test/samples_per_second,640.433


'\nQuick how to on Kaggle\n\n1. Keep Internet and GPU on.\n2. To use W&B, add your key via Kaggle Secrets: Workspace > Add-ons > Secrets > Create new secret with key WANDB_API_KEY. Or set it in an env var.\n3. Run this cell. Default model is bert-base-uncased with AutoModelForSequenceClassification.\n4. Artifacts:\n   - best model at /kaggle/working/glue_ft/best_model\n   - JSON metrics in the output dir\n   - optional logits .npy files for validation and test\n5. Later evaluation:\n\nfrom Kaggle_HF_Trainer_GLUE_full_finetune import load_for_eval\nload_for_eval("/kaggle/working/glue_ft/best_model", "sst2")\n\nW&B tips\n- Set CFG.wandb_enable = True to log. Set False to disable.\n- Project and entity are configurable: CFG.wandb_project, CFG.wandb_entity.\n- If no key is found and offline fallback is True, the run will log locally in offline mode.\n\nNotes\n- This is full finetuning. Nothing is frozen.\n- For STS-B, config.problem_type="regression" is set and MSE is used automatically.\n