# DistilBERT Base Code Training

In [None]:
# 1) ROCm environment packages
# Every wheel below carries the win_amd64 platform tag, i.e. it targets 64-bit Windows.
# --no-cache-dir makes pip skip its local cache for these downloads.
# Each package is installed by its own %pip line, in the order written here.
%pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.1.1/rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl
%pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.1.1/rocm_sdk_devel-0.1.dev0-py3-none-win_amd64.whl
%pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.1.1/rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl
%pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.1.1/rocm-0.1.dev0.tar.gz

# 2) PyTorch ROCm wheels
# The cp312 tag in these filenames marks builds for CPython 3.12, so the
# notebook kernel has to run that interpreter version for pip to accept them.
%pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.1.1/torch-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl
%pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.1.1/torchaudio-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl
%pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.1.1/torchvision-0.24.0+rocmsdk20251116-cp312-cp312-win_amd64.whl

# 3) Extra deps for DistilBERT benchmark
# NOTE(review): these four packages are unpinned, so a fresh install may
# resolve to different versions over time - consider pinning for reproducibility.
%pip install transformers datasets accelerate
%pip install tqdm

In [None]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

# Check GPU / backend availability
# Prints the PyTorch build, whether torch.cuda reports a usable device, and
# the CUDA / HIP version strings so it is clear which backend this build targets.
print("PyTorch version:", torch.__version__)
cuda_available = torch.cuda.is_available()
print("GPU backend available (torch.cuda):", cuda_available)
# getattr with a default keeps these lines safe on builds lacking either attribute.
print("torch.version.cuda:", getattr(torch.version, "cuda", None))
print("torch.version.hip :", getattr(torch.version, "hip", None))

if cuda_available:
    print("GPU device name:", torch.cuda.get_device_name(0))
else:
    # Plain ASCII hyphen on purpose: the previous dash character had been
    # corrupted by an encoding round-trip and printed as garbage.
    print("No GPU detected by torch.cuda - using CPU.")


In [None]:

from datasets import load_dataset
from transformers import DistilBertTokenizerFast
# Token length used by tokenize_batch for both padding and truncation.
MAX_LENGTH = 128

# Step 1: fetch the AG News dataset (downloaded automatically when not cached).
dataset = load_dataset("ag_news")
print("Dataset splits:", dataset)

# Step 2: tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize_batch(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the module-level tokenizer.

    Sequences are truncated and padded using the ``"max_length"`` padding
    strategy with ``max_length=MAX_LENGTH``.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw inputs
            (supplied by ``dataset.map(..., batched=True)`` in this notebook).

    Returns:
        The result of the tokenizer call for those inputs.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(batch["text"], **encode_options)
# Tokenize all splits batch-wise, then expose only the model inputs and the
# label column, formatted as torch tensors.
tokenized_dataset = dataset.map(tokenize_batch, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

train_dataset, test_dataset = tokenized_dataset["train"], tokenized_dataset["test"]

# Sanity check on the first training row (fetched once and reused below).
first_example = train_dataset[0]
print("Sample tokenized batch shape:")
print("Train dataset example input_ids shape:", first_example["input_ids"].shape)
print("Train dataset example attention_mask shape:", first_example["attention_mask"].shape)
print("Label example:", first_example["label"])


In [None]:
from torch.utils.data import DataLoader
# 1. Training configuration
BATCH_SIZE = 128
NUM_EPOCHS = 30
SUBSET_TRAIN_SIZE = 10_000   # reduced to 10k for faster epochs
NUM_WORKERS = 8              # worker count handed to both DataLoaders

print(
    f"Batch size: {BATCH_SIZE}",
    f"Planned epochs: {NUM_EPOCHS}",
    f"Using training subset size: {SUBSET_TRAIN_SIZE}",
    sep="\n",
)

# Keep only the first SUBSET_TRAIN_SIZE rows of the training split.
train_subset = train_dataset.select(range(SUBSET_TRAIN_SIZE))

# Options shared by both loaders; only the dataset and the shuffle flag differ.
loader_options = {
    "batch_size": BATCH_SIZE,
    "num_workers": NUM_WORKERS,
    "pin_memory": True,
}

# Training data is shuffled; the test split is read in its stored order.
train_loader = DataLoader(train_subset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

print("Train batches (subset):", len(train_loader))
print("Test batches:", len(test_loader))
print(f"Dataloaders initialized with num_workers={NUM_WORKERS}, pin_memory=True")

# Per-epoch history: each tracked quantity gets its own, initially empty list
# that the training loop appends to once per epoch.
metrics = {
    name: []
    for name in (
        "epoch",
        "train_samples_per_sec",
        "epoch_time_sec",
        "train_loss",
        "val_accuracy",
    )
}

print("Metric tracking dict initialized:", [*metrics])


In [None]:
from transformers import DistilBertForSequenceClassification
# Pick the compute device: "cuda" when torch.cuda reports an available
# device, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

if device.type == "cuda":
    try:
        print("GPU detected:", torch.cuda.get_device_name(0))
    except Exception:
        # The name lookup is best-effort only. Catching Exception (instead of a
        # bare except) lets KeyboardInterrupt and SystemExit still propagate.
        print("GPU detected (no name available)")
else:
    print("WARNING: Running on CPU. ROCm GPU was not detected.")

# Hyperparameters for the classification head and the optimizer.
# NOTE(review): 4 is presumably the number of AG News classes - confirm
# against the dataset's label feature.
num_labels = 4
learning_rate = 5e-5

# Load the pretrained checkpoint with a num_labels-way sequence
# classification head, then move the model onto the selected device.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=num_labels
)
model.to(device)
print("Model loaded and moved to device.")

# AdamW over all model parameters at the configured learning rate.
optimizer = AdamW(model.parameters(), lr=learning_rate)
print("Optimizer initialized (AdamW) with learning rate:", learning_rate)


In [None]:
import time
import torch.nn.functional as F

# Put the model into training mode before the epoch loop in this cell runs.
model.train()

def evaluate(model, data_loader, device):
    """Compute classification accuracy of ``model`` over ``data_loader``.

    The model is run in eval mode without gradient tracking; the predicted
    class of each sample is the argmax over the last dimension of the logits
    and is compared with the batch's ``"label"`` entry.

    The train/eval mode the model had on entry is restored on exit, even if
    an exception is raised while iterating, so calling this function does not
    silently flip a model that was deliberately left in eval mode.

    Args:
        model: Module with ``eval()``, ``train(mode)`` and a ``training``
            flag, callable with ``input_ids`` / ``attention_mask`` keyword
            arguments and returning an object that has a ``.logits`` tensor.
        data_loader: Iterable of batches, each a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Fraction of correctly classified samples as a float, or ``0.0`` when
        the loader yields no samples.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0

    try:
        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["label"].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )
                preds = torch.argmax(outputs.logits, dim=-1)

                correct += (preds == labels).sum().item()
                total += labels.size(0)
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)

    return correct / total if total > 0 else 0.0

# Number of samples processed per epoch; used as the throughput numerator.
total_train_samples = len(train_subset)
print("Effective train samples (subset):", total_train_samples)

print("Starting training...")
for epoch in range(1, NUM_EPOCHS + 1):
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be skewed by system clock adjustments (unlike time.time).
    epoch_start_time = time.perf_counter()
    running_loss = 0.0
    num_batches = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{NUM_EPOCHS}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()

        # Passing labels makes the model return the training loss directly.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Epoch timing and throughput (training only; validation below is excluded)
    epoch_time = time.perf_counter() - epoch_start_time
    samples_per_sec = total_train_samples / epoch_time if epoch_time > 0 else 0.0
    # Mean of the per-batch losses (not weighted by batch size).
    avg_train_loss = running_loss / num_batches if num_batches > 0 else 0.0

    # Validation accuracy
    val_acc = evaluate(model, test_loader, device)

    # Record this epoch's numbers in the shared metrics history.
    metrics["epoch"].append(epoch)
    metrics["train_samples_per_sec"].append(samples_per_sec)
    metrics["epoch_time_sec"].append(epoch_time)
    metrics["train_loss"].append(avg_train_loss)
    metrics["val_accuracy"].append(val_acc)

    print(
        f"Epoch {epoch}/{NUM_EPOCHS} | "
        f"Time: {epoch_time:.2f}s | "
        f"Throughput: {samples_per_sec:.2f} samples/s | "
        f"Train Loss: {avg_train_loss:.4f} | "
        f"Val Acc: {val_acc:.4f}"
    )

print("Training complete.")
