In [None]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pl_bolts.datamodules import MNISTDataModule
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics
import wandb
# Log in to Weights & Biases up front, before any sweep or run is created below.
wandb.login()

In [2]:
class Model(pl.LightningModule):
    """Two-layer MLP classifier with 10 output classes.

    Inputs are flattened, so each sample must contain 28 * 28 = 784 values.

    Args:
        config: Mapping-like object supporting ``config[key]`` lookup that
            provides ``hidden_size`` (width of the hidden layer) and
            ``learning_rate`` (step size passed to Adam).
    """

    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        # Read the width once so both linear layers are guaranteed to agree.
        hidden_size = config['hidden_size']
        self.model = nn.Sequential(
            nn.Flatten(),
            nn.Linear(28 * 28, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 10),
        )

    def forward(self, x):
        """Return the raw (unnormalized) class logits for ``x``."""
        return self.model(x)

    def configure_optimizers(self):
        """Build an Adam optimizer over all parameters using ``config['learning_rate']``."""
        return torch.optim.Adam(self.parameters(), lr=self.config['learning_rate'])

    def compute_loss_and_accuracy(self, batch):
        """Run a forward pass on ``batch`` and score it.

        Args:
            batch: ``(inputs, targets)`` pair. Targets are compared directly
                against the argmax of the logits, so they are expected to be
                integer class indices.

        Returns:
            Tuple ``(loss, accuracy)``: the cross-entropy loss and the
            fraction of samples whose predicted class equals the target,
            both as scalar tensors.
        """
        x, y = batch
        logits = self(x)

        loss = F.cross_entropy(logits, y)
        preds = torch.argmax(logits, dim=1)
        # Computed with plain torch instead of torchmetrics.functional.accuracy:
        # newer torchmetrics releases require a `task` argument, so the
        # two-argument call fails there. For hard label predictions this mean
        # of exact matches is the same quantity.
        accuracy = (preds == y).float().mean()

        return loss, accuracy

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` / ``train_acc`` for the batch and return the loss."""
        loss, accuracy = self.compute_loss_and_accuracy(batch)
        self.log("train_loss", loss)
        self.log("train_acc", accuracy)

        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` / ``val_acc`` for the batch; nothing is returned."""
        loss, accuracy = self.compute_loss_and_accuracy(batch)
        self.log("val_loss", loss)
        self.log("val_acc", accuracy)

In [3]:
# Random-search sweep that minimizes the `val_loss` metric.
sweep_config = {
    'method': 'random',
    'metric': {'name': 'val_loss', 'goal': 'minimize'},
    'parameters': {
        'hidden_size': {'values': [128, 256, 512]},
        # Sampled uniformly over [0, 0.1].
        'learning_rate': {'distribution': 'uniform', 'min': 0, 'max': 0.1},
        'batch_size': {'values': [32, 64, 128]},
    },
}

# Register the sweep under the "test" project and keep its ID for the agent.
sweep_id = wandb.sweep(sweep_config, project="test")

Create sweep with ID: kjcr1wl7
Sweep URL: https://wandb.ai/maximilienlc/test/sweeps/kjcr1wl7


In [4]:
def train_fn():
    """Train one ``Model`` for a single W&B run.

    Takes no arguments so it can be handed to ``wandb.agent`` as the run
    function. Hyperparameters (``hidden_size``, ``learning_rate``,
    ``batch_size``) are read from ``wandb.config`` after ``wandb.init()``.
    """
    # The context manager closes the run when training ends or raises.
    with wandb.init():

        wandb_logger = WandbLogger(project="test")

        model = Model(wandb.config)

        data_module = MNISTDataModule(num_workers=8, batch_size=wandb.config['batch_size'])

        trainer = pl.Trainer(
            max_epochs=3,
            # `gpus=-1` is deprecated; this is the equivalent spelling for
            # "use every available GPU".
            accelerator="gpu",
            devices=-1,
            logger=wandb_logger,
        )

        trainer.fit(model, data_module)

In [5]:
# Run 5 sweep trials in this process, calling train_fn once per trial.
wandb.agent(sweep_id, function=train_fn, count=5)

[34m[1mwandb[0m: Agent Starting Run: n167qfz8 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.03810167602541973
wandb: ERROR Failed to sample metric: Not Supported


  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_deprecation(
  rank_zero_deprecation(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 407 K 
-------------------------------------
407 K     Trainable params
0         Non-trainable params
407 K     Total params
1.628     Total estimated model params size (MB)


Epoch 2: 100%|██████████| 469/469 [00:05<00:00, 84.41it/s, loss=0.236, v_num=qfz8]


wandb: ERROR Failed to serialize metric: division by zero


0,1
epoch,▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅████████
train_acc,▃▅▇▄▅▁▅▅▇▆▇▇█▇█▆▇▇▇▇▄█
train_loss,▆▃▃▆▂█▇▅▃▄▃▄▃▂▁▃▁▂▄▄▆▁
trainer/global_step,▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇███
val_acc,▁█▃
val_loss,▁▆█

0,1
epoch,2.0
train_acc,0.96094
train_loss,0.11817
trainer/global_step,1124.0
val_acc,0.929
val_loss,0.29071


[34m[1mwandb[0m: Agent Starting Run: i4aszmp1 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.06243240327483501
wandb: ERROR Failed to sample metric: Not Supported


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 203 K 
-------------------------------------
203 K     Trainable params
0         Non-trainable params
203 K     Total params
0.814     Total estimated model params size (MB)


Epoch 2: 100%|██████████| 938/938 [00:09<00:00, 95.26it/s, loss=0.498, v_num=zmp1] 


wandb: ERROR Failed to serialize metric: division by zero


0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
train_acc,▂▃▁▃█▂▃▄▅▄▆▃▄▃▆▄▅▅▅▄▅▅▆▅▄▄▂▅▄▅▅▆▅▅▅▇▆▆▄▂
train_loss,▆▄▅▃▁█▆▃▃▄▂▅▃▃▂▄▂▃▃▃▃▂▂▅▄▃▄▂▃▄▂▂▅▃▃▁▂▂▂▃
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
val_acc,█▆▁
val_loss,▂▁█

0,1
epoch,2.0
train_acc,0.8125
train_loss,0.52327
trainer/global_step,2249.0
val_acc,0.8725
val_loss,0.62803


[34m[1mwandb[0m: Agent Starting Run: b74pmsim with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.05705226822041537
wandb: ERROR Failed to sample metric: Not Supported


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 407 K 
-------------------------------------
407 K     Trainable params
0         Non-trainable params
407 K     Total params
1.628     Total estimated model params size (MB)


Epoch 2: 100%|██████████| 469/469 [00:05<00:00, 86.83it/s, loss=0.321, v_num=msim]


wandb: ERROR Failed to serialize metric: division by zero


0,1
epoch,▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅████████
train_acc,▃▆▅▅▅▆▇▁▅▇▄█▅▅▇▅▅▅▆▆▅▆
train_loss,▅▂▂▃▂▄▂█▂▁▅▁▃▃▃▂▄▃▃▃▂▂
trainer/global_step,▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇███
val_acc,▅▁█
val_loss,▂█▁

0,1
epoch,2.0
train_acc,0.92969
train_loss,0.27692
trainer/global_step,1124.0
val_acc,0.92475
val_loss,0.31048


[34m[1mwandb[0m: Agent Starting Run: q9ioew2h with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.046147615970653935
wandb: ERROR Failed to sample metric: Not Supported


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 101 K 
-------------------------------------
101 K     Trainable params
0         Non-trainable params
101 K     Total params
0.407     Total estimated model params size (MB)


Epoch 2: 100%|██████████| 938/938 [00:09<00:00, 99.87it/s, loss=0.37, v_num=ew2h]  


wandb: ERROR Failed to serialize metric: division by zero


0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
train_acc,▃▆▅▅▂▇▆▇▅▃▄▁▆▆▆▆▇▅▆▆▅▅▅▅█▆▇▅▆▇▅▆▅▆▅▆▆▄▄▇
train_loss,▅▃▄▄▅▁▂▁▃▄▅█▃▄▃▄▃▄▃▃▄▇█▂▁▅▂▅▂▁▅▃▄▁▃▃▆▅▄▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
val_acc,▁▄█
val_loss,█▄▁

0,1
epoch,2.0
train_acc,0.95312
train_loss,0.22967
trainer/global_step,2249.0
val_acc,0.91683
val_loss,0.36608


[34m[1mwandb[0m: Agent Starting Run: 4in1w4cd with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.01315060474598724
wandb: ERROR Failed to sample metric: Not Supported


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 407 K 
-------------------------------------
407 K     Trainable params
0         Non-trainable params
407 K     Total params
1.628     Total estimated model params size (MB)


Epoch 1:  53%|█████▎    | 990/1875 [00:09<00:08, 99.59it/s, loss=0.228, v_num=w4cd] 

wandb: ERROR Failed to serialize metric: division by zero


Epoch 2: 100%|██████████| 1875/1875 [00:18<00:00, 100.66it/s, loss=0.175, v_num=w4cd] 


0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
train_acc,▁▆▅▁▅▆▆▅▅█▃▃▃▆▃▆▆▃▅█▃▅▆▅█▃███▆▃▆▆▃▆▃▅▁▅▆
train_loss,█▁▂▄▂▁▂▂▂▁▃▄▄▁▂▁▂▃▂▁▂▂▁▂▁▂▁▁▁▂▃▂▁▃▂▂▂▅▂▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_acc,▁▆█
val_loss,▁▂█

0,1
epoch,2.0
train_acc,0.96875
train_loss,0.07635
trainer/global_step,4499.0
val_acc,0.94875
val_loss,0.23533
