###  Weights and Biases

### Tracking Experiments Using WandB

In [15]:
# Log in to your W&B account
import wandb
import random
import math

In [16]:
wandb.login()

True

### Tracking Dummy Machine Learning Training Loop

In [17]:
import wandb
import random

project="basic-intro"
config = {
    "learning_rate": 0.02,
    "architecture": "CNN",
    "dataset": "CIFAR-100",
    "epochs": 10,
}

with wandb.init(project=project, config=config) as run:
  # This block simulates a training loop logging metrics
  epochs = 10
  offset = random.random() / 5
  for epoch in range(2, epochs):
      acc = 1 - 2 ** -epoch - random.random() / epoch - offset
      loss = 2 ** -epoch + random.random() / epoch + offset

      # 2️. Log metrics from your script to W&B
      run.log({"acc": acc, "loss": loss})


0,1
acc,▁▅▇█████
loss,▇█▆▄▃▁▂▁

0,1
acc,0.83354
loss,0.10415


### Tracking ML Experiment using PyTorch

W&B runs automatically log metrics, system information, hyperparameters, terminal output and you’ll see an interactive table with model inputs and outputs.

In [18]:
# !pip install --upgrade Pillow

In [19]:
# !pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu

In [20]:
import wandb
import torch, torchvision
import torch.nn as nn
from torchvision.datasets import MNIST
import torchvision.transforms as T

MNIST.mirrors = [
    mirror for mirror in MNIST.mirrors if "http://yann.lecun.com/" not in mirror
]

device = "cuda:0" if torch.cuda.is_available() else "cpu"


def get_dataloader(is_train, batch_size, slice=5):
    "Get a training dataloader"
    full_dataset = MNIST(
        root=".", train=is_train, transform=T.ToTensor(), download=True
    )
    sub_dataset = torch.utils.data.Subset(
        full_dataset, indices=range(0, len(full_dataset), slice)
    )
    loader = torch.utils.data.DataLoader(
        dataset=sub_dataset,
        batch_size=batch_size,
        shuffle=True if is_train else False,
        pin_memory=False,
        num_workers=2,
    )
    return loader


def get_model(dropout):
    "A simple model"
    model = nn.Sequential(
        nn.Flatten(),
        nn.Linear(28 * 28, 256),
        nn.BatchNorm1d(256),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Linear(256, 10),
    ).to(device)
    return model


def validate_model(model, valid_dl, loss_func, log_images=False, batch_idx=0):
    "Compute performance of the model on the validation dataset and log a wandb.Table"
    model.eval()
    val_loss = 0.0
    with torch.inference_mode():
        correct = 0
        for i, (images, labels) in enumerate(valid_dl):
            images, labels = images.to(device), labels.to(device)

            # Forward pass ➡
            outputs = model(images)
            val_loss += loss_func(outputs, labels) * labels.size(0)

            # Compute accuracy and accumulate
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels).sum().item()

            # Log one batch of images to the dashboard, always same batch_idx.
            if i == batch_idx and log_images:
                log_image_table(images, predicted, labels, outputs.softmax(dim=1))
    return val_loss / len(valid_dl.dataset), correct / len(valid_dl.dataset)


In [21]:
def log_image_table(images, predicted, labels, probs):
    "Log a wandb.Table with (img, pred, target, scores)"
    # Create a wandb Table to log images, labels and predictions to
    table = wandb.Table(
        columns=["image", "pred", "target"] + [f"score_{i}" for i in range(10)]
    )
    for img, pred, targ, prob in zip(
        images.to("cpu"), predicted.to("cpu"), labels.to("cpu"), probs.to("cpu")
    ):
        table.add_data(wandb.Image(img[0].numpy() * 255), pred, targ, *prob.numpy())

    with wandb.init() as run:
        run.log({"predictions_table": table}, commit=False)

In [23]:
import wandb

config = {
    "epochs": 5,
    "batch_size": 128,
    "lr": 1e-3,
    "dropout": random.uniform(0.01, 0.80),
}

project = "pytorch-intro"

# initialise a wandb run
with wandb.init(project=project, config=config) as run:

    try:
        # Optionally copy your config
        config = run.config

        # Get the data
        train_dl = get_dataloader(is_train=True, batch_size=config.batch_size)
        valid_dl = get_dataloader(is_train=False, batch_size=2 * config.batch_size)
        n_steps_per_epoch = math.ceil(len(train_dl.dataset) / config.batch_size)

        # A simple MLP model
        model = get_model(config.dropout)

        # Make the loss and optimizer
        loss_func = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

        # Training
        example_ct = 0
        step_ct = 0
        for epoch in range(config.epochs):
            model.train()
            for step, (images, labels) in enumerate(train_dl):
                images, labels = images.to(device), labels.to(device)

                outputs = model(images)
                train_loss = loss_func(outputs, labels)
                optimizer.zero_grad()
                train_loss.backward()
                optimizer.step()

                example_ct += len(images)
                metrics = {
                    "train/train_loss": train_loss,
                    "train/epoch": (step + 1 + (n_steps_per_epoch * epoch))
                    / n_steps_per_epoch,
                    "train/example_ct": example_ct,
                }

                if step + 1 < n_steps_per_epoch:
                    # Log train metrics to wandb
                    run.log(metrics)

                step_ct += 1

                val_loss, accuracy = validate_model(
                    model, valid_dl, loss_func, log_images=(epoch == (config.epochs - 1))
                )

            # Log train and validation metrics to wandb
            val_metrics = {"val/val_loss": val_loss, "val/val_accuracy": accuracy}
            run.log({**metrics, **val_metrics})

            # Save the model checkpoint to wandb
            torch.save(model, "my_model.pt")
            run.log_model(
                "./my_model.pt",
                "my_mnist_model",
                aliases=[f"epoch-{epoch+1}_dropout-{round(run.config.dropout, 4)}"],
            )

            print(
                f"Epoch: {epoch+1}, Train Loss: {train_loss:.3f}, Valid Loss: {val_loss:3f}, Accuracy: {accuracy:.2f}"
            )

        # If you had a test set, this is how you could log it as a Summary metric
        run.summary["test_accuracy"] = 0.8

    except wandb.errors.UsageError as e:
        print(f"An error occurred: {e}")

    # finally:
    #     run.finish()


Epoch: 1, Train Loss: 0.277, Valid Loss: 0.377812, Accuracy: 0.89
Epoch: 2, Train Loss: 0.394, Valid Loss: 0.311924, Accuracy: 0.90
Epoch: 3, Train Loss: 0.301, Valid Loss: 0.273264, Accuracy: 0.92
Epoch: 4, Train Loss: 0.272, Valid Loss: 0.255016, Accuracy: 0.92


0,1
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇█████
train/example_ct,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
train/train_loss,█▄▄▄▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▂▂▁▁▁▁▂▁▂▁▂▁▁▁▁▁
val/val_accuracy,▁▄▆█
val/val_loss,█▄▂▁

0,1
train/epoch,4.01064
train/example_ct,48128.0
train/train_loss,0.39767
val/val_accuracy,0.9245
val/val_loss,0.25502


An error occurred: Run (jx10coux) is finished. The call to `log` will be ignored. Please make sure that you are using an active run.
