In [1]:
import wandb

import os
from torch import optim, nn, utils, Tensor
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor

import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger

from dotenv import load_dotenv

## Basics

In [2]:
load_dotenv(dotenv_path=".env")

for key, value in os.environ.items():
    if key == "WANDB_API_KEY":
        print(f"{key}: {value}")

WANDB_API_KEY: 5d4665253966ac5e6e7af16679e76f021f4b865d


In [3]:
wandb.login(verify=True, key=os.environ.get("WANDB_API_KEY"))
# wandb.login(key=os.getenv("WANDB_API_KEY"))

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\Maciek\_netrc
wandb: Currently logged in as: maciej-kaczkowski (maciej-kaczkowski-wut) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


True

## Example 


In [4]:
class LitAutoEncoder(pl.LightningModule):
    def __init__(self, lr=1e-3, inp_size=28, optimizer="Adam"):
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Linear(inp_size * inp_size, 64), nn.ReLU(), nn.Linear(64, 3)
        )
        self.decoder = nn.Sequential(
            nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, inp_size * inp_size)
        )
        self.lr = lr

        # save hyper-parameters to self.hparamsm auto-logged by wandb
        self.save_hyperparameters()

    def training_step(self, batch, batch_idx):
        x, y = batch
        x = x.view(x.size(0), -1)
        z = self.encoder(x)
        x_hat = self.decoder(z)
        loss = nn.functional.mse_loss(x_hat, x)

        # log metrics to wandb
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.lr)
        return optimizer

In [5]:
# init the autoencoder
autoencoder = LitAutoEncoder(lr=1e-3, inp_size=28)

In [6]:
# setup data
batch_size = 32
dataset = MNIST(os.getcwd(), download=True, transform=ToTensor())
train_loader = utils.data.DataLoader(dataset, shuffle=True)

In [7]:
# initialise the wandb logger and name your wandb project
wandb_logger = WandbLogger(project="test-project")

In [8]:
# add your batch size to the wandb config
wandb_logger.experiment.config["batch_size"] = batch_size

In [9]:
# pass wandb_logger to the Trainer
trainer = pl.Trainer(limit_train_batches=750, max_epochs=5, logger=wandb_logger)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [10]:
# train the model
trainer.fit(model=autoencoder, train_dataloaders=train_loader)

You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 50.4 K
1 | decoder | Sequential | 51.2 K
---------------------------------------
101 K     Trainable params
0         Non-trainable params
101 K     Total params
0.407     Total estimated model params size (MB)
d:\__repos\ml_concepts\venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoade

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


In [11]:
# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

0,1
epoch,▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆███████
train_loss,▆▅█▇▅▄▇▅▄▅▃▄▆▁▄▅▇▃▆▄▆▃▅▃▄▃▁▆▄▆▆▂▆▆▅▆▆▅▄▄
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██

0,1
epoch,4.0
train_loss,0.04152
trainer/global_step,3749.0
