# DistilBERT Base Code Training

In [1]:

# Install PyTorch 2.8.0 with CUDA 12.9 support
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu129

# Install HuggingFace Transformers + Datasets for DistilBERT training
%pip install transformers datasets accelerate
%pip install tqdm


Looking in indexes: https://download.pytorch.org/whl/cu129
Collecting torch
  Using cached https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp312-cp312-win_amd64.whl.metadata (29 kB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cu129/torchvision-0.23.0%2Bcu129-cp312-cp312-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu129/torchaudio-2.8.0%2Bcu129-cp312-cp312-win_amd64.whl.metadata (7.4 kB)
Collecting filelock (from torch)
  Using cached filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Using cached https://download.pytorch.org/whl/typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx>=2.5.1 (from torch)
  Using cached networkx-3.6.1-py3-none-any.whl.metadata (6.8 kB)
Collecting jinja2 (from torch)
  Using cached https:

In [2]:
# Code Cell 1.2 — Simulated 8 GB VRAM Cap (Allocator-Level)

import torch

assert torch.cuda.is_available(), "CUDA is not available."

# Explicit CUDA device index (required)
device_index = 0

# RTX 5060 Ti = 16 GB physical VRAM
TOTAL_VRAM_GB = 16
TARGET_VRAM_GB = 8

memory_fraction = TARGET_VRAM_GB / TOTAL_VRAM_GB

# Limit how much VRAM PyTorch is allowed to reserve
torch.cuda.set_per_process_memory_fraction(memory_fraction, device=device_index)

# Clear any cached allocations and reset stats
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

print(
    f"PyTorch VRAM usage capped at ~{TARGET_VRAM_GB} GB "
    f"({memory_fraction:.2f} of total device memory) on cuda:{device_index}"
)


PyTorch VRAM usage capped at ~8 GB (0.50 of total device memory) on cuda:0


In [3]:
# Code Cell 2

import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")


  from .autonotebook import tqdm as notebook_tqdm


PyTorch version: 2.8.0+cu129
CUDA available: True
CUDA device: NVIDIA GeForce RTX 5060 Ti


In [4]:
from datasets import load_dataset
from transformers import DistilBertTokenizerFast

dataset = load_dataset("ag_news")

print("Dataset splits:", dataset)

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

MAX_LENGTH = 128 

def tokenize_batch(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )

tokenized_dataset = dataset.map(tokenize_batch, batched=True)

tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)

train_dataset = tokenized_dataset["train"]
test_dataset = tokenized_dataset["test"]

print("Sample tokenized batch shape:")
print("Train dataset example input_ids shape:", train_dataset[0]["input_ids"].shape)
print("Train dataset example attention_mask shape:", train_dataset[0]["attention_mask"].shape)
print("Label example:", train_dataset[0]["label"])


Dataset splits: DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})
Sample tokenized batch shape:
Train dataset example input_ids shape: torch.Size([128])
Train dataset example attention_mask shape: torch.Size([128])
Label example: tensor(2)


In [5]:
from torch.utils.data import DataLoader

BATCH_SIZE = 128
NUM_EPOCHS = 30
SUBSET_TRAIN_SIZE = 10_000

print(f"Batch size: {BATCH_SIZE}")
print(f"Planned epochs: {NUM_EPOCHS}")
print(f"Using training subset size: {SUBSET_TRAIN_SIZE}")

train_subset = train_dataset.select(range(SUBSET_TRAIN_SIZE))

NUM_WORKERS = 8

train_loader = DataLoader(
    train_subset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    pin_memory=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True
)
print("Train batches (subset):", len(train_loader))
print("Test batches:", len(test_loader))
print(f"Dataloaders initialized with num_workers={NUM_WORKERS}, pin_memory=True")

metrics = {
    "epoch": [],
    "train_samples_per_sec": [],
    "epoch_time_sec": [],
    "train_loss": [],
    "val_accuracy": []
}
print("Metric tracking dict initialized:", list(metrics.keys()))


Batch size: 128
Planned epochs: 30
Using training subset size: 10000
Train batches (subset): 79
Test batches: 60
Dataloaders initialized with num_workers=8, pin_memory=True
Metric tracking dict initialized: ['epoch', 'train_samples_per_sec', 'epoch_time_sec', 'train_loss', 'val_accuracy']


In [6]:
# Code Cell 5

from transformers import DistilBertForSequenceClassification

# 1. Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 2. Load DistilBERT model for 4-class classification (AG News)
num_labels = 4
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels
)
model.to(device)
print("Model loaded and moved to device.")

# 3. Define optimizer (AdamW is standard for transformers)
learning_rate = 5e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)

print("Optimizer initialized (AdamW) with learning rate:", learning_rate)


Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and moved to device.
Optimizer initialized (AdamW) with learning rate: 5e-05


In [7]:
# Code Cell 6

import time
import torch.nn.functional as F
model.train()

def evaluate(model, data_loader, device):
    """Simple evaluation loop to compute validation accuracy."""
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            correct += (preds == labels).sum().item()
            total += labels.size(0)

    model.train() 
    return correct / total if total > 0 else 0.0


total_train_samples = len(train_dataset)

print("Starting training...")
for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start_time = time.time()
    running_loss = 0.0
    num_batches = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{NUM_EPOCHS}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    epoch_end_time = time.time()
    epoch_time = epoch_end_time - epoch_start_time
    samples_per_sec = total_train_samples / epoch_time if epoch_time > 0 else 0.0
    avg_train_loss = running_loss / num_batches if num_batches > 0 else 0.0

    val_acc = evaluate(model, test_loader, device)

    metrics["epoch"].append(epoch)
    metrics["train_samples_per_sec"].append(samples_per_sec)
    metrics["epoch_time_sec"].append(epoch_time)
    metrics["train_loss"].append(avg_train_loss)
    metrics["val_accuracy"].append(val_acc)

    print(
        f"Epoch {epoch}/{NUM_EPOCHS} | "
        f"Time: {epoch_time:.2f}s | "
        f"Throughput: {samples_per_sec:.2f} samples/s | "
        f"Train Loss: {avg_train_loss:.4f} | "
        f"Val Acc: {val_acc:.4f}"
    )

print("Training complete.")


Starting training...


Epoch 1/30: 100%|██████████| 79/79 [00:35<00:00,  2.22it/s]


Epoch 1/30 | Time: 35.65s | Throughput: 3366.02 samples/s | Train Loss: 0.4826 | Val Acc: 0.8914


Epoch 2/30: 100%|██████████| 79/79 [00:35<00:00,  2.25it/s]


Epoch 2/30 | Time: 35.19s | Throughput: 3410.30 samples/s | Train Loss: 0.2238 | Val Acc: 0.9125


Epoch 3/30: 100%|██████████| 79/79 [00:35<00:00,  2.24it/s]


Epoch 3/30 | Time: 35.21s | Throughput: 3408.34 samples/s | Train Loss: 0.1385 | Val Acc: 0.9109


Epoch 4/30: 100%|██████████| 79/79 [00:35<00:00,  2.24it/s]


Epoch 4/30 | Time: 35.22s | Throughput: 3407.37 samples/s | Train Loss: 0.0918 | Val Acc: 0.8989


Epoch 5/30: 100%|██████████| 79/79 [00:35<00:00,  2.24it/s]


Epoch 5/30 | Time: 35.29s | Throughput: 3400.06 samples/s | Train Loss: 0.0683 | Val Acc: 0.9049


Epoch 6/30: 100%|██████████| 79/79 [00:35<00:00,  2.24it/s]


Epoch 6/30 | Time: 35.22s | Throughput: 3406.78 samples/s | Train Loss: 0.0471 | Val Acc: 0.9101


Epoch 7/30: 100%|██████████| 79/79 [00:35<00:00,  2.24it/s]


Epoch 7/30 | Time: 35.31s | Throughput: 3398.00 samples/s | Train Loss: 0.0307 | Val Acc: 0.9054


Epoch 8/30: 100%|██████████| 79/79 [00:35<00:00,  2.24it/s]


Epoch 8/30 | Time: 35.27s | Throughput: 3402.58 samples/s | Train Loss: 0.0245 | Val Acc: 0.9089


Epoch 9/30: 100%|██████████| 79/79 [00:35<00:00,  2.24it/s]


Epoch 9/30 | Time: 35.31s | Throughput: 3398.62 samples/s | Train Loss: 0.0181 | Val Acc: 0.9033


Epoch 10/30: 100%|██████████| 79/79 [00:35<00:00,  2.24it/s]


Epoch 10/30 | Time: 35.26s | Throughput: 3403.58 samples/s | Train Loss: 0.0167 | Val Acc: 0.9022


Epoch 11/30: 100%|██████████| 79/79 [00:36<00:00,  2.17it/s]


Epoch 11/30 | Time: 36.48s | Throughput: 3289.69 samples/s | Train Loss: 0.0143 | Val Acc: 0.8947


Epoch 12/30: 100%|██████████| 79/79 [00:35<00:00,  2.19it/s]


Epoch 12/30 | Time: 35.99s | Throughput: 3333.82 samples/s | Train Loss: 0.0248 | Val Acc: 0.8928


Epoch 13/30: 100%|██████████| 79/79 [00:35<00:00,  2.20it/s]


Epoch 13/30 | Time: 35.86s | Throughput: 3346.13 samples/s | Train Loss: 0.0153 | Val Acc: 0.9039


Epoch 14/30: 100%|██████████| 79/79 [00:36<00:00,  2.19it/s]


Epoch 14/30 | Time: 36.05s | Throughput: 3328.78 samples/s | Train Loss: 0.0110 | Val Acc: 0.9053


Epoch 15/30: 100%|██████████| 79/79 [00:36<00:00,  2.17it/s]


Epoch 15/30 | Time: 36.45s | Throughput: 3292.57 samples/s | Train Loss: 0.0065 | Val Acc: 0.9071


Epoch 16/30: 100%|██████████| 79/79 [00:36<00:00,  2.17it/s]


Epoch 16/30 | Time: 36.44s | Throughput: 3293.43 samples/s | Train Loss: 0.0079 | Val Acc: 0.9066


Epoch 17/30: 100%|██████████| 79/79 [00:36<00:00,  2.18it/s]


Epoch 17/30 | Time: 36.28s | Throughput: 3307.25 samples/s | Train Loss: 0.0078 | Val Acc: 0.9025


Epoch 18/30: 100%|██████████| 79/79 [00:36<00:00,  2.18it/s]


Epoch 18/30 | Time: 36.23s | Throughput: 3311.85 samples/s | Train Loss: 0.0076 | Val Acc: 0.9018


Epoch 19/30: 100%|██████████| 79/79 [00:35<00:00,  2.20it/s]


Epoch 19/30 | Time: 35.83s | Throughput: 3348.84 samples/s | Train Loss: 0.0069 | Val Acc: 0.8938


Epoch 20/30: 100%|██████████| 79/79 [00:35<00:00,  2.21it/s]


Epoch 20/30 | Time: 35.68s | Throughput: 3362.96 samples/s | Train Loss: 0.0091 | Val Acc: 0.9001


Epoch 21/30: 100%|██████████| 79/79 [00:35<00:00,  2.22it/s]


Epoch 21/30 | Time: 35.66s | Throughput: 3365.03 samples/s | Train Loss: 0.0071 | Val Acc: 0.9042


Epoch 22/30: 100%|██████████| 79/79 [00:35<00:00,  2.22it/s]


Epoch 22/30 | Time: 35.53s | Throughput: 3377.44 samples/s | Train Loss: 0.0058 | Val Acc: 0.9059


Epoch 23/30: 100%|██████████| 79/79 [00:35<00:00,  2.21it/s]


Epoch 23/30 | Time: 35.72s | Throughput: 3359.73 samples/s | Train Loss: 0.0079 | Val Acc: 0.9026


Epoch 24/30: 100%|██████████| 79/79 [00:35<00:00,  2.21it/s]


Epoch 24/30 | Time: 35.73s | Throughput: 3358.40 samples/s | Train Loss: 0.0111 | Val Acc: 0.9047


Epoch 25/30: 100%|██████████| 79/79 [00:35<00:00,  2.19it/s]


Epoch 25/30 | Time: 36.00s | Throughput: 3333.47 samples/s | Train Loss: 0.0046 | Val Acc: 0.9007


Epoch 26/30: 100%|██████████| 79/79 [00:36<00:00,  2.19it/s]


Epoch 26/30 | Time: 36.12s | Throughput: 3322.34 samples/s | Train Loss: 0.0023 | Val Acc: 0.8980


Epoch 27/30: 100%|██████████| 79/79 [00:36<00:00,  2.18it/s]


Epoch 27/30 | Time: 36.28s | Throughput: 3307.70 samples/s | Train Loss: 0.0074 | Val Acc: 0.8862


Epoch 28/30: 100%|██████████| 79/79 [00:36<00:00,  2.18it/s]


Epoch 28/30 | Time: 36.29s | Throughput: 3306.70 samples/s | Train Loss: 0.0072 | Val Acc: 0.9038


Epoch 29/30: 100%|██████████| 79/79 [00:36<00:00,  2.19it/s]


Epoch 29/30 | Time: 36.06s | Throughput: 3328.19 samples/s | Train Loss: 0.0075 | Val Acc: 0.9021


Epoch 30/30: 100%|██████████| 79/79 [00:36<00:00,  2.18it/s]


Epoch 30/30 | Time: 36.30s | Throughput: 3305.69 samples/s | Train Loss: 0.0043 | Val Acc: 0.9036
Training complete.
