<a href="https://colab.research.google.com/github/JayThibs/Deep-Learning-With-Python-Projects/blob/master/How_to_Use_PyTorch_Lightning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PyTorch Lightning Masterclass

In [None]:
import torch
from torch import nn
from torch import optim
from torchvision import datasets, transforms
from torch.utils.data import random_split
from torch.nn import functional as F

In [None]:
%%capture
!pip install pytorch_lightning==1.1.4

------
# PyTorch Lightning

1. Model
2. Optimizer
3. Data
4. Training Loop "the magic"
5. Validation Loop "the validation magic"

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy

class ResNet(pl.LightningModule):
  """Small fully-connected MNIST classifier with one residual (skip) connection.

  The module bundles the model, optimizer, data handling and the train /
  validation steps so that ``pl.Trainer().fit(model)`` can run it directly.
  """

  def __init__(self):
      super().__init__()
      self.l1 = nn.Linear(28 * 28, 64)
      self.l2 = nn.Linear(64, 64)
      self.l3 = nn.Linear(64, 10)
      self.do = nn.Dropout(0.1)
      self.loss = nn.CrossEntropyLoss()

      # Populated by setup(). Deliberately NOT called `self.train` / `self.val`:
      # assigning to `self.train` would shadow nn.Module.train(), which is
      # needed to switch the module into training mode.
      self.train_set = None
      self.val_set = None

  def forward(self, x):
      """Return class logits for a batch of flattened images (batch, 784)."""
      h1 = nn.functional.relu(self.l1(x))
      h2 = nn.functional.relu(self.l2(h1))
      # Residual / skip connection: the output of the first layer is added back
      # to the output of the second one before dropout.
      do = self.do(h2 + h1)
      logits = self.l3(do)
      return logits

  def configure_optimizers(self): # this is an added method from pl.LightningModule (not found in nn.Module)
      # Define my optimizer
      optimizer = optim.SGD(self.parameters(), lr=1e-2)
      return optimizer

  def _shared_step(self, batch):
      """Flatten the images, run the forward pass, return (loss, accuracy)."""
      x, y = batch

      b = x.size(0)
      x = x.view(b, -1)

      logits = self(x)

      loss = self.loss(logits, y) # J: this is the loss
      preds = torch.argmax(logits, dim=1)
      acc = accuracy(preds, y)
      return loss, acc

  def training_step(self, batch, batch_idx): # this is an added method from pl.LightningModule (not found in nn.Module)
      """Compute and log the training loss/accuracy; return the loss tensor."""
      loss, acc = self._shared_step(batch)

      # Calling self.log will surface up scalars for you in TensorBoard
      self.log('train_loss', loss, prog_bar=True)
      self.log('train_acc', acc, prog_bar=True)
      return loss

  def validation_step(self, batch, batch_idx):
      """Return the per-batch validation metrics.

      training_step() returns a plain loss tensor (and logs *train* metrics),
      so it cannot be reused here; the shared computation lives in
      _shared_step() instead.
      """
      loss, acc = self._shared_step(batch)
      return {'val_loss': loss, 'val_acc': acc}

  def validation_epoch_end(self, val_step_outputs):
      """Average the per-batch validation metrics and log them."""
      avg_val_loss = torch.stack([out['val_loss'] for out in val_step_outputs]).mean()
      avg_val_acc = torch.stack([out['val_acc'] for out in val_step_outputs]).mean()

      self.log('val_loss', avg_val_loss, prog_bar=True)
      self.log('val_acc', avg_val_acc, prog_bar=True)

  def prepare_data(self):
      datasets.MNIST('./data', train=True, download=True, transform=transforms.ToTensor()) # downloading data here

  def setup(self, stage=None): # for multiple gpus
      """Build the train/validation split.

      `stage` is accepted (and ignored) because the Trainer passes it when it
      calls this hook; the default keeps a bare `model.setup()` call working.
      """
      # include (e.g. image) transforms here.
      dataset = datasets.MNIST('./data', train=True, download=False, transform=transforms.ToTensor())
      self.train_set, self.val_set = random_split(dataset, [55000, 5000])

  def train_dataloader(self):
      # Iterate over the training split only, so the validation images stay unseen.
      train_loader = torch.utils.data.DataLoader(self.train_set, batch_size=10)
      return train_loader

  def val_dataloader(self):
      val_loader = torch.utils.data.DataLoader(self.val_set, batch_size=10)
      return val_loader

model = ResNet()

In [None]:
# use num_nodes for number of machines on a cluster
# So, if you have gpus=8 and num_nodes=32, that means you are training across 8*32 total gpus.
# Need to include in your "SLURM" script that you are using 8*32, PL will do the rest.
# Keep the download in the model's "prepare_data" method and the dataset split in "setup":
# the download should only happen once, while the split is needed by every process.
# Fall back to CPU when no GPU is visible so the cell still runs on a CPU-only runtime.
trainer = pl.Trainer(
    progress_bar_refresh_rate=20,
    max_epochs=5,
    gpus=1 if torch.cuda.is_available() else 0,
)
trainer.fit(model)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type             | Params
------------------------------------------
0 | l1   | Linear           | 50.2 K
1 | l2   | Linear           | 4.2 K 
2 | l3   | Linear           | 650   
3 | do   | Dropout          | 0     
4 | loss | CrossEntropyLoss | 0     
------------------------------------------
55.1 K    Trainable params
0         Non-trainable params
55.1 K    Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




1

In [None]:
!ls lightning_logs/version_2/checkpoints # best saved checkpoint

'epoch=4-step=29999.ckpt'


In [None]:
# # Define a simple model
# model = nn.Sequential(
#     nn.Linear(28 * 28, 64),
#     nn.ReLU(),
#     nn.Linear(64, 64),
#     nn.ReLU(),
#     nn.Linear(64, 10)
# )

In [None]:
#  # Define a more flexible model

# class ResNet(nn.Module):
#   def __init__(self):
#     super().__init__()
#     self.l1 = nn.Linear(28 * 28, 64)
#     self.l2 = nn.Linear(64, 64)
#     self.l3 = nn.Linear(64, 10)
#     self.do = nn.Dropout(0.1)

#   def forward(self, x):
#     h1 = nn.functional.relu(self.l1(x))
#     h2 = nn.functional.relu(self.l2(h1))
#     # this part right here increases the training speed of the model, 
#     # it's what we call "highway networks", the calcs skip layers
#     do = self.do(h2 + h1)
#     logits = self.l3(do)
#     return logits

# model = ResNet().cuda() # .cuda() to send the model to gpu

In [None]:
# # Define my optimizer
# params = model.parameters()
# optimizer = optim.SGD(model.parameters(), lr=1e-2)

In [None]:
# # Define my loss
# loss = nn.CrossEntropyLoss()

In [None]:
# # train, val split
# train_data = datasets.MNIST('./data', train=True, download=True, transform=transforms.ToTensor())
# train, val = random_split(train_data, [55000, 5000])
# train_loader = torch.utils.data.DataLoader(train, batch_size=10) # since we have 10 classes, it's actually better to have a batch size of 10
# val_loader = torch.utils.data.DataLoader(val, batch_size=10)

In [None]:
# # My training loop
# nb_epochs = 5
# for epoch in range(nb_epochs):
#   losses = list()
#   accuracies = list()
#   model.train() # because we use Dropout
#   for batch in train_loader:
#     x, y = batch

#     # x size: b * 1 * 28 * 28
#     b = x.size(0)
#     x = x.view(b, -1).cuda() # reshape image to vector (aka flatten)

#     # 1. forward
#     logits = model(x) # logit is the output of the last layer in your model

#     # import pdb; pdb.set_trace()

#     # 2. compute the objective function
#     J = loss(logits, y.cuda()) # J is the objective function

#     # 3. Cleaning the gradients
#     model.zero_grad()
#     # or optimizer.zero_grad()
#     # params.grad.zero_()


#     # 4. Accumulate the partial derivatives of J wrt parameters (wrt mean "with respect to")
#     J.backward() # accumulates the new gradients to the previous one
#     # params.grad.add_(dJ/dparams)

#     # 5. step in the opposite direction of the gradient
#     optimizer.step()
#     # in the optimizer step, we are basically doing:
#     # with torch.no_grad(): params = params - eta * params.grad
#     # Meaning, don't make any computational graph here

#     losses.append(J.item())
#     accuracies.append(y.eq(logits.detach().argmax(dim=1).cpu()).float().mean())

#   print(f'Epoch {epoch + 1}', end=', ')
#   print(f'Validation loss: {torch.tensor(losses).mean():.2f}', end=', ')
#   print(f'Validation accuracy: {torch.tensor(accuracies).mean():.2f}')

#   losses = list()
#   accuracies = list()
#   model.eval()

#   for batch in val_loader:
#     x, y = batch

#     # x size: b * 1 * 28 * 28
#     b = x.size(0)
#     x = x.view(b, -1).cuda()

#     # 1. forward
#     with torch.no_grad(): # only compute the final output, do not keep track of gradients and computational graphs
#       logits = model(x) # logit is the output of the last layer in your model

#     # 2. compute the objective function
#     J = loss(logits, y.cuda()) # J is the objective function

#     losses.append(J.item())
#     accuracies.append(y.eq(logits.detach().argmax(dim=1).cpu()).float().mean())

#   print(f'Epoch {epoch + 1}', end=', ')
#   print(f'Validation loss: {torch.tensor(losses).mean():.2f}', end=', ')
#   print(f'Validation accuracy: {torch.tensor(accuracies).mean():.2f}')

Epoch 1, Validation loss: 0.51, Validation accuracy: 0.86
Epoch 1, Validation loss: 0.25, Validation accuracy: 0.92
Epoch 2, Validation loss: 0.25, Validation accuracy: 0.93
Epoch 2, Validation loss: 0.18, Validation accuracy: 0.94
Epoch 3, Validation loss: 0.19, Validation accuracy: 0.95
Epoch 3, Validation loss: 0.15, Validation accuracy: 0.95
Epoch 4, Validation loss: 0.15, Validation accuracy: 0.96
Epoch 4, Validation loss: 0.12, Validation accuracy: 0.96
Epoch 5, Validation loss: 0.13, Validation accuracy: 0.96
Epoch 5, Validation loss: 0.11, Validation accuracy: 0.96


# Implementing a PyTorch Trainer

In [None]:
%%capture
!pip  install pytorch-lightning

In [None]:
import torch
from torch import nn
import pytorch_lightning as pl
from torchvision.datasets import MNIST
from torch.optim import Adam

In [None]:
# for access to MNIST via CloudFlare protect
# Build an opener that sends a browser-like User-agent header and install it as
# the global urllib opener, so later urllib requests carry that header.
from six.moves import urllib
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

In [None]:
# MNIST(...) on its own is a Dataset that yields (PIL image, int) pairs. Convert
# the images to tensors and wrap the dataset in a DataLoader so that iterating
# `train_loader` yields batched (images, labels) tensors.
train_loader = torch.utils.data.DataLoader(
    MNIST('', download=True, transform=transforms.ToTensor()),
    batch_size=32,
)
encoder = nn.Linear(28*28, 10)
optimizer = Adam(encoder.parameters())

In [None]:
class NormCallback(pl.Callback):
  """Callback that records the L2 norm of the encoder's weight gradient.

  The most recent value is kept in `last_grad_norm` (None until a gradient
  exists) so it can be inspected or logged after a step.
  """

  def __init__(self):
    super().__init__()
    self.last_grad_norm = None

  def _record_grad_norm(self, pl_module):
    """Store the gradient norm of `pl_module.encoder.weight`, if a gradient exists."""
    # Read from the module handed to the hook rather than a global `model`;
    # nn.Linear exposes its parameter as `.weight`.
    grad = pl_module.encoder.weight.grad
    if grad is not None:  # no backward pass has run yet
      self.last_grad_norm = torch.norm(grad, 2)

  def on_batch_start(self, trainer, pl_module, batch=None, batch_idx=None, dataloader_idx=None):
    # The extra arguments are optional so the hook works whether or not the
    # caller supplies batch information.
    self._record_grad_norm(pl_module)

  def on_after_backward(self, trainer, pl_module):
    # Gradients are fresh right after backward(), which makes this the natural
    # place to measure their norm.
    self._record_grad_norm(pl_module)

In [None]:
class Trainer:
  """Minimal hand-written training loop that shows where hooks and callbacks fire.

  Args:
    max_epochs: number of passes over `train_loader`.
    callbacks: optional iterable of callback objects. The default is None
      rather than a list because a mutable default would be shared between
      instances.
  """

  def __init__(self, max_epochs=10, callbacks=None):
    self.max_epochs = max_epochs
    # Always end up with a list, whether or not callbacks were supplied.
    self.callbacks = list(callbacks) if callbacks is not None else []

  def fit(self, model, train_loader, optimizer=None):
    """Train `model` on `train_loader`.

    If no optimizer is given, the model is asked for one through
    `model.configure_optimizers()`.
    """
    if optimizer is None:
      optimizer = model.configure_optimizers()
    self._train(model, train_loader, optimizer)

  def on_after_backward(self, model):
    """Fire the `on_after_backward` hook on the model, then on every callback."""
    model_hook = getattr(model, 'on_after_backward', None)
    if callable(model_hook):
      model_hook() # hook from model
    for cb in self.callbacks:
      cb_hook = getattr(cb, 'on_after_backward', None)
      if callable(cb_hook):
        cb_hook(self, model) # hook from callback: (trainer, module)

  def _train(self, model, train_loader, optimizer):
    """Run the epoch/batch loop: training step, backward, hooks, optimizer step."""
    for epoch in range(self.max_epochs):
      # batch_idx counts batches within the current epoch.
      for batch_idx, batch in enumerate(train_loader):
        loss = model.training_step(batch, batch_idx)
        loss.backward()

        self.on_after_backward(model)

        optimizer.step()
        optimizer.zero_grad()

In [None]:
# Here's the model we put in the Trainer, and we include the hooks like "on_batch_start" and "training_step"
class LitModel(pl.LightningModule):
  """Single linear layer classifier used to demonstrate hooks with the Trainer."""

  def __init__(self):
    # The parent initializer must run before sub-modules are assigned.
    super().__init__()
    self.encoder = nn.Linear(28*28, 10)

  def forward(self, x): # for inference/predictions
    return self.encoder(x)

  def training_step(self, batch, batch_ids):
    """Return the cross-entropy loss for one (images, labels) batch.

    backward() is intentionally NOT called here: the training loop that
    invokes this step is responsible for it.
    """
    x, y = batch
    x = x.view(x.size(0), -1) # flatten each image to a vector
    logit = self.encoder(x)
    loss = nn.functional.cross_entropy(logit, y)
    return loss

  def configure_optimizers(self):
    # Lets a trainer build the optimizer when none is passed in explicitly.
    return Adam(self.parameters())

  def on_batch_start(self, batch):
    # Placeholder hook kept from the walkthrough; the hand-written Trainer in
    # this notebook never calls it. TODO: confirm the intended return value.
    return (1, 2)

In [None]:
cb = NormCallback()
# The trainer needs an object that provides `training_step`, i.e. the
# LightningModule, not the bare nn.Linear encoder. The optimizer has to be
# built from that model's parameters.
model = LitModel()
optimizer = Adam(model.parameters())
trainer = Trainer(max_epochs=10, callbacks=[cb])
trainer.fit(model, train_loader, optimizer)

## What's a hook?

Imagine this is your program's training process:

START ------(HOOK 1)-----------(HOOK 2)---------------- ENDS

A hook lets you do something during the training process.

So what you do is that you create a method inside your Model class and you are able to call it during the training process. It helps make your code more reusable because you are able to simply call methods/function in your training loop.

Let's say you want to print a statement or calculate some type of number, you can just call it from your model instead of writing it all in your training loop.

The model can have hooks, but our callback class can also have hooks.

## What's a callback?

A callback is a term from the software engineering world. You can think of it as if you are sending something to a website and you are "waiting for it to call back to you." 

"A callback is any executable code that is passed as an argument to other code; that other code is expected to call back (execute) the argument at a given time."

Maybe you need to log something for your research, such as the loss. Or maybe inspect your gradients. Or take the norm of the weight gradient (`torch.norm(encoder.weight.grad, 2)`).

Instead of putting it in your training loop, you simply create a class for your callback. Inside the callback, you will be placing all of your hooks, there can be many!

A hook in your callback can be as simple as a message saying that your training has started to something like logging loss.

In [None]:
class MyCallback(pl.Callback):
  """Skeleton callback: each method defined on it is one hook."""

  def hook1(self):
    """Placeholder hook; put the code to run at this point of training here."""
    pass

# Integrate Weights & Biases with PyTorch

Use W&B for machine learning experiment tracking, dataset versioning, and project collaboration.

This section shows you how to integrate W&B with your PyTorch code to add experiment tracking to your pipeline. That includes:

1. Storing hyperparameters and metadata in a config.
2. Tracking your model with wandb.watch to automatically log model gradients and parameters. We'll also grab a bunch of system metrics, like GPU and CPU utilization.
3. Using the wandb.log API to log everything else, like the loss.
4. Saving your model in Netron-compatible format so it can be viewed on W&B.

## Install, Import, and Log In



In [None]:
import random

import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from tqdm.notebook import tqdm

# Ensure deterministic behavior
# Use a fixed integer seed. Python salts str hashes per interpreter process, so
# seeds derived from hash("...") change from run to run and give no
# reproducibility; `hash(...) % 2**32 - 1` can also evaluate to -1, which is
# not a valid NumPy seed.
SEED = 42
torch.backends.cudnn.deterministic = True
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Device configuration
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# To make MNIST downloadable from Yann's Cloudflare blockage
# Build an opener that sends a browser-like User-agent header and install it as
# the global urllib opener, so later urllib requests carry that header.
from six.moves import urllib
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

### Step 0: Install W&B

In [None]:
%%capture
!pip install wandb --upgrade

### Step 1: Import W&B and Login

In order to log data to our web service, you'll need to log in.

If this is your first time using W&B, you'll need to sign up for a free account at the link that appears.

In [None]:
import wandb

wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Define the Pipeline

### Step 2: Track metadata and hyperparameters with wandb.init

Programmatically, the first thing we do is define our experiment:
what are the hyperparameters? what metadata is associated with this run?

It's a pretty common workflow to store this information in a `config` dictionary
(or similar object)
and then access it as needed.

For this example, we're only letting a few hyperparameters vary
and hand-coding the rest.
But any part of your model can be part of the `config`!

We also include some metadata: we're using the MNIST dataset and a convolutional
architecture. If we later work with, say,
fully-connected architectures on CIFAR in the same project,
this will help us separate our runs.

In [None]:
# this could be done with a yaml file or other approaches, here we use a dictionary.

# Hyperparameters and run metadata for the experiment, keyed by name.
config = {
    'epochs': 5,
    'classes': 10,
    'kernels': [16, 32],
    'batch_size': 128,
    'learning_rate': 0.005,
    'dataset': 'MNIST',
    'architecture': 'CNN',
}

Now, let's define the overall pipeline, which is pretty typical for model-training:

1. we first `make` a model, plus associated data and optimizer, then
2. we `train` the model accordingly and finally
3. `test` it to see how training went.

We'll implement these functions below.

In [None]:
# For this tutorial, we are using pure PyTorch and not PyTorch Lightning.
# Therefore, we need to list things off one by one, instead of having PL's boilerplate code.

def model_pipeline(hyperparameters):
  """Run one tracked experiment end to end and return the trained model.

  The whole build/train/test sequence runs inside the `wandb.init` context,
  so the run is active for exactly as long as the `with` block lasts.

  Args:
    hyperparameters: configuration handed to `wandb.init` as `config`.

  Returns:
    The model produced by `make` after `train` and `test` have run on it.
  """
  with wandb.init(config=hyperparameters, project='pytorch-lit-mnist-test'):
    # Read the settings back from wandb so what is logged is what is executed.
    run_config = wandb.config

    # Build the model, the data loaders, the loss and the optimizer.
    net, train_batches, test_batches, loss_fn, opt = make(run_config)
    print(net)

    # Fit the model, then measure its final performance.
    train(net, train_batches, loss_fn, opt, run_config)
    test(net, test_batches)

  return net

The only difference here from a standard pipeline
is that it all occurs inside the context of `wandb.init`.
Calling this function sets up a line of communication
between your code and our servers.

Passing the `config` dictionary to `wandb.init`
immediately logs all that information to us,
so you'll always know what hyperparameter values
you set your experiment to use.

To ensure the values you chose and logged are always the ones that get used
in your model, we recommend using the `wandb.config` copy of your object.
Check the definition of `make` below to see some examples.

> *Side Note*: We take care to run our code in separate processes,
so that any issues on our end
(e.g. a giant sea monster attacks our data centers)
don't crash your code.
Once the issue is resolved (e.g. the Kraken returns to the deep)
you can log the data with `wandb sync`.

In [None]:
# This is the same classic PyTorch we would use, except that we will be using the
# config file to input all our hyperparameters.

def make(config):
  """Build everything the experiment needs from `config`.

  Reads `batch_size`, `kernels`, `classes` and `learning_rate` from `config`.

  Returns:
    (model, train_loader, test_loader, criterion, optimizer)
  """
  # Data: fetch both splits first, then wrap each one in a loader.
  train_set = get_data(train=True)
  test_set = get_data(train=False)
  train_loader = make_loader(train_set, batch_size=config.batch_size)
  test_loader = make_loader(test_set, batch_size=config.batch_size)

  # Model, moved to the configured device.
  net = ConvNet(config.kernels, config.classes).to(device)

  # Loss and optimizer.
  loss_fn = nn.CrossEntropyLoss()
  adam = torch.optim.Adam(net.parameters(), lr=config.learning_rate)

  return net, train_loader, test_loader, loss_fn, adam

## Define the Data Loading and Model

Now, we need to specify how the data is loaded and what the model looks like.

This part is very important, but it's no different from what it would be without wandb, so we won't dwell on it.

In [None]:
# Functions for loading the data with PyTorch. Pure PyTorch, no different with wandb.

def get_data(slice=5, train=True):
  """Download MNIST and return every `slice`-th example of the chosen split.

  Args:
    slice: step between kept indices (equivalent to indexing with [::slice]).
    train: True for the training split, False for the test split.

  Returns:
    A `torch.utils.data.Subset` over the selected indices.
  """
  mnist = torchvision.datasets.MNIST(
      root='.', train=train, transform=transforms.ToTensor(), download=True
  )
  kept_indices = range(0, len(mnist), slice)
  return torch.utils.data.Subset(mnist, indices=kept_indices)

# making the data loader

def make_loader(dataset, batch_size, num_workers=None):
  """Wrap `dataset` in a shuffling DataLoader.

  Args:
    dataset: the dataset to iterate over.
    batch_size: number of examples per batch.
    num_workers: number of loader worker processes. When None (the default),
      up to 8 workers are used, capped at the number of CPUs reported by the
      operating system so small machines are not oversubscribed.

  Returns:
    A `torch.utils.data.DataLoader` with shuffling enabled.
  """
  import os

  if num_workers is None:
    num_workers = min(8, os.cpu_count() or 1)

  loader = torch.utils.data.DataLoader(dataset=dataset,
                                      batch_size=batch_size,
                                      shuffle=True,
                                      # pinned memory only helps host-to-GPU copies
                                      pin_memory=torch.cuda.is_available(),
                                      num_workers=num_workers)
  return loader

Defining the model is normally the fun part!

But nothing changes with `wandb`,
so we're gonna stick with a standard ConvNet architecture.

Don't be afraid to mess around with this and try some experiments --
all your results will be logged on [wandb.ai](https://wandb.ai)!

In [None]:
# Conventional and convolutional neural network

class ConvNet(nn.Module):
  """Two-block convolutional classifier for 1x28x28 images.

  Args:
    kernels: output channel counts of the two conv blocks, e.g. [16, 32].
    classes: number of output classes.
  """

  def __init__(self, kernels, classes=10):
    super().__init__()

    self.layer1 = nn.Sequential(
        nn.Conv2d(1, kernels[0], kernel_size=5, stride=1, padding=2),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2)
    )
    self.layer2 = nn.Sequential(
        # The input channels must equal layer1's output channels (kernels[0]);
        # a literal 16 only works for the default configuration.
        nn.Conv2d(kernels[0], kernels[1], kernel_size=5, stride=1, padding=2),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2)
    )
    # Two stride-2 poolings reduce 28x28 to 7x7.
    self.fc = nn.Linear(7 * 7 * kernels[-1], classes)

  def forward(self, x):
    """Return class logits of shape (batch, classes)."""
    x = self.layer1(x)
    x = self.layer2(x)
    x = x.reshape(x.size(0), -1) # flatten the feature maps per example
    x = self.fc(x)
    return x

## Define Training Logic

Moving on in our `model_pipeline`, it's time to specify how we `train`.

Two `wandb` functions come into play here: `watch` and `log`.

### Step 3: Track gradients with wandb.watch and everything else with wandb.log

`wandb.watch` will log the gradients and the parameters of your model,
every `log_freq` steps of training.

All you need to do is call it before you start training.

The rest of the training code remains the same:
we iterate over epochs and batches,
running forward and backward passes
and applying our `optimizer`.

In [None]:
def train(model, loader, criterion, optimizer, config):
  """Train `model` for `config.epochs` epochs, reporting to wandb as it goes.

  Args:
    model: the network to optimize.
    loader: iterable yielding (images, labels) batches.
    criterion: loss function, also passed to `wandb.watch`.
    optimizer: optimizer stepping the model's parameters.
    config: object exposing `epochs`.
  """
  # tell wandb to watch what the model gets up to: gradients, weights, and more!
  wandb.watch(model, criterion, log='all', log_freq=10)

  # Run training and track with wandb
  example_ct = 0 # number of examples seen
  batch_ct = 0 # number of batches seen
  for epoch in tqdm(range(config.epochs)):
    for images, labels in loader:

      loss = train_batch(images, labels, model, optimizer, criterion)
      example_ct += len(images)
      batch_ct += 1

      # Report metrics every 25th batch. batch_ct already counts the batch just
      # processed, so no extra +1 is needed (that made reports land on batches
      # 24, 49, ...).
      if batch_ct % 25 == 0:
        train_log(loss, example_ct, epoch)

def train_batch(images, labels, model, optimizer, criterion):
  """Run one optimization step on a single batch and return its loss.

  Moves the batch to `device`, computes the loss, clears old gradients,
  backpropagates and applies the optimizer update.
  """
  inputs = images.to(device)
  targets = labels.to(device)

  # Forward pass and loss.
  batch_loss = criterion(model(inputs), targets)

  # Backward pass on freshly zeroed gradients, then the parameter update.
  optimizer.zero_grad()
  batch_loss.backward()
  optimizer.step()

  return batch_loss

The only difference is in the logging code: where previously you might have reported metrics by printing to the terminal, now you pass the same information to `wandb.log`.

`wandb.log` expects a dictionary with strings as keys.
These strings identify the objects being logged, which make up the values.
You can also optionally log which `step` of training you're on.

> *Side Note*: I like to use the number of examples the model has seen,
since this makes for easier comparison across batch sizes,
but you can use raw steps or batch count. For longer training runs, it can also make sense to log by `epoch`.

In [None]:
# example_ct is the number of images that our neural network has seen
# every step (which doesn't need to be 1), we're telling wandb to log the following

def train_log(loss, example_ct, epoch):
  """Send the current loss and epoch to wandb and echo the loss to stdout.

  Args:
    loss: loss value convertible with `float()`.
    example_ct: number of examples seen so far; used as the wandb `step`.
    epoch: current epoch index.
  """
  loss = float(loss)
  padded_count = str(example_ct).zfill(5)
  message = f'Loss after ' + padded_count + f' examples: {loss:.3f}'

  # wandb.log takes a dict of named values; `step` is the x-axis position.
  wandb.log(dict(epoch=epoch, loss=loss), step=example_ct)
  print(message)

## Define Testing Logic

Once the model is done training we want to test it: run it against some fresh data from production, perhaps, or apply it to some hand-curated "hard examples".

### Optional Step 4: Call wandb.save

#### 4️⃣ Optional Step 4: Call `wandb.save`

This is also a great time to save the model's architecture
and final parameters to disk.
For maximum compatibility, we'll `export` our model in the
[Open Neural Network eXchange (ONNX) format](https://onnx.ai/).

Passing that filename to `wandb.save` ensures that the model parameters
are saved to W&B's servers: no more losing track of which `.h5` or `.pb`
corresponds to which training runs!

For more advanced `wandb` features for storing, versioning, and distributing
models, check out our [Artifacts tools](https://www.wandb.com/artifacts).

In [None]:
def test(model, test_loader):
  """Evaluate `model` on `test_loader`, log the accuracy and export to ONNX.

  The accuracy is printed and logged to wandb as `test_accuracy`. The model is
  then exported to `model.onnx`, using the last batch of images as the example
  input, and that file is handed to `wandb.save`.
  """
  model.eval()

  # No gradients are needed for evaluation.
  with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
      images = images.to(device)
      labels = labels.to(device)
      predicted = torch.max(model(images).data, 1)[1] # index of the largest logit
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

    print(f'Accuracy of the model on the {total} ' +
          f'test images: {100 * correct / total}%')

    wandb.log({'test_accuracy': correct / total})

  # save the model in the ONNX format
  onnx_path = 'model.onnx'
  torch.onnx.export(model, images, onnx_path)
  wandb.save(onnx_path)

Now that we've defined the whole pipeline and slipped in
those few lines of W&B code,
we're ready to run our fully-tracked experiment.

We'll report a few links to you:
our documentation,
the Project page, which organizes all the runs in a project, and
the Run page, where this run's results will be stored.

Navigate to the Run page and check out these tabs:

1. **Charts**, where the model gradients, parameter values, and loss are logged throughout training
2. **System**, which contains a variety of system metrics, including Disk I/O utilization, CPU and GPU metrics (watch that temperature soar 🔥), and more
3. **Logs**, which has a copy of anything pushed to standard out during training
4. **Files**, where, once training is complete, you can click on the `model.onnx` to view our network with the [Netron model viewer](https://github.com/lutzroeder/netron).

Once the run is finished
(i.e. the `with wandb.init` block is exited),
we'll also print a summary of the results in the cell output.

In [None]:
# Build, train and analyze the model with the pipeline
model = model_pipeline(config)

ConvNet(
  (layer1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Linear(in_features=1568, out_features=10, bias=True)
)


  cpuset_checked))


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))



Loss after 03072 examples: 0.528
Loss after 06272 examples: 0.159
Loss after 09472 examples: 0.189
Loss after 12640 examples: 0.109
Loss after 15840 examples: 0.145
Loss after 19040 examples: 0.117
Loss after 22240 examples: 0.150
Loss after 25408 examples: 0.055
Loss after 28608 examples: 0.055
Loss after 31808 examples: 0.091
Loss after 35008 examples: 0.107
Loss after 38176 examples: 0.008
Loss after 41376 examples: 0.098
Loss after 44576 examples: 0.047
Loss after 47776 examples: 0.043
Loss after 50944 examples: 0.025
Loss after 54144 examples: 0.036
Loss after 57344 examples: 0.061

Accuracy of the model on the 2000 test images: 97.7%


VBox(children=(Label(value=' 0.00MB of 0.11MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.01773711097…

0,1
_runtime,7.0
_timestamp,1617664074.0
_step,57344.0
epoch,4.0
loss,0.06123
test_accuracy,0.977


0,1
_runtime,▁▁▁▁▃▃▃▃▃▅▅▅▅▅▆▆▆▆█
_timestamp,▁▁▁▁▃▃▃▃▃▅▅▅▅▅▆▆▆▆█
_step,▁▁▂▂▃▃▃▄▄▄▅▅▆▆▆▇▇██
epoch,▁▁▁▃▃▃▃▅▅▅▅▆▆▆▆███
loss,█▃▃▂▃▂▃▂▂▂▂▁▂▂▁▁▁▂
test_accuracy,▁


# Supercharge your Training with PyTorch Lightning + Wandb

In [None]:
%%capture
!pip install -qqq wandb pytorch-lightning

In [None]:
# numpy for non-GPU array math
import numpy as np

# Vanilla PyTorch
import torch
from torch.nn import functional as F
from torch import nn
from torch.utils.data import DataLoader, random_split

# Torchvision for CV
from torchvision.datasets import MNIST
from torchvision import transforms

In [None]:
# PyTorch Lightning
import pytorch_lightning as pl
# Use a fixed integer seed: Python salts str hashes per interpreter process, so
# hash('...') yields a different seed on every run. This literal is the seed
# shown in this cell's recorded output.
pl.seed_everything(2151364606)

# weights and biases
import wandb

# lightning plus wandb
from pytorch_lightning.loggers import WandbLogger

wandb.login()

Global seed set to 2151364606


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

> _Note_: If you're executing your training in a terminal, rather than a notebook, you don't need to include `wandb.login()` in your script.
Instead, call `wandb login` in the terminal and we'll keep you logged in for future runs.

# 🏗️ Building a Model with Lightning

In PyTorch Lightning, models are built with `LightningModule` ([docs here](https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html)), which has all the functionality of a vanilla `torch.nn.Module` (🍦) but with a few delicious cherries of added functionality on top (🍨).
These cherries are there to cut down on boilerplate and
help separate out the ML engineering code
from the actual machine learning.

For example, the mechanics of iterating over batches
as part of an epoch are extracted away,
so long as you define what happens on the `training_step`.

To make a working model out of a `LightningModule`,
we need to define a new `class` and add a few methods on top.

We'll demonstrate this process with `LitMLP`,
which applies a two-layer perceptron
(aka two fully-connected layers and
a fully-connected softmax readout layer)
to input `Tensors`.

> _Note_: It is common in the Lightning community to shorten "Lightning" to "Lit".
This sometimes makes it sound like your code was written by
[Travis Scott](https://www.urbandictionary.com/define.php?term=it%27s%20lit).
We consider this a good thing.

# `__init__ `and `forward`

First, we need to add two methods that
are part of any vanilla PyTorch model.

Those methods are:
* `__init__` to do any setup, just like any Python class
* `forward` for inference, just like a PyTorch Module

The `forward` pass method is standard,
and it'll be different for every project,
so we won't comment on it.

The `__init__` method,
which `init`ializes new instances of the class,
is a good place to log hyperparameter information to `wandb`.

This is done with the `save_hyperparameters` method,
which captures all of the arguments to the initializer
and adds them to a dictionary at `self.hparams` --
that all comes for free as part of the `LightningModule`.

> _Note_: `hparams` is logged to `wandb` as the `config`,
so you'll never lose track of the arguments you used to run a model again!

In [None]:
# PyTorch Lightning deals with all the boilerplate code like `to(device)`

class LitMLP(pl.LightningModule):
    """Multi-layer perceptron classifier with W&B logging and ONNX export.

    Inputs are flattened and passed through two ReLU-activated fully-connected
    layers, followed by a fully-connected readout that emits per-class
    log-probabilities (``log_softmax``).

    Args:
        in_dims: shape of ONE input sample, without the batch dimension,
            e.g. ``(1, 28, 28)``. A plain integer is accepted as well.
        n_classes: number of output classes.
        n_layer_1: width of the first hidden layer.
        n_layer_2: width of the second hidden layer.
        lr: learning rate handed to Adam in ``configure_optimizers``.
    """

    def __init__(self, in_dims,
                 n_classes=10, n_layer_1=128, n_layer_2=256, lr=1e-4):
        super().__init__()

        # we flatten the input Tensors and pass them through an MLP;
        # np.prod yields a NumPy scalar, so cast it to a plain int for nn.Linear
        self.layer_1 = nn.Linear(int(np.prod(in_dims)), n_layer_1)
        self.layer_2 = nn.Linear(n_layer_1, n_layer_2)
        self.layer_3 = nn.Linear(n_layer_2, n_classes)

        # log hyperparameters: stores the __init__ arguments in self.hparams
        self.save_hyperparameters()

        # compute the accuracy -- no need to roll your own!
        # one metric object per stage so their running state never mixes
        self.train_acc = pl.metrics.Accuracy()
        self.valid_acc = pl.metrics.Accuracy()
        self.test_acc = pl.metrics.Accuracy()

    def forward(self, x):
        """
        Defines a forward pass using the Stem-Learner-Task
        design pattern from Deep Learning Design Patterns:
        https://www.manning.com/books/deep-learning-design-patterns

        The first axis of ``x`` is treated as the batch axis; everything
        after it is flattened. Returns log-probabilities over the classes.
        """
        batch_size = x.size(0)

        # stem: flatten
        x = x.view(batch_size, -1)

        # learner: two fully-connected layers
        x = F.relu(self.layer_1(x))
        x = F.relu(self.layer_2(x))

        # task: compute class logits, then normalise them to log-probabilities
        x = self.layer_3(x)
        x = F.log_softmax(x, dim=1)

        return x

    # convenient method to get the loss on a batch
    def loss(self, xs, ys):
        """Run the model on ``xs`` and return ``(outputs, nll_loss)``.

        The outputs are the log-probabilities produced by ``forward``, which
        is the input format ``F.nll_loss`` expects.
        """
        log_probs = self(xs)  # this calls self.forward
        loss = F.nll_loss(log_probs, ys)
        return log_probs, loss

    # training loop

    def training_step(self, batch, batch_idx):
        """Compute the loss on one training batch and log loss / accuracy."""
        xs, ys = batch
        logits, loss = self.loss(xs, ys)  # run the loss function up there ^^
        preds = torch.argmax(logits, 1)  # take the highest predicted number

        # logging metrics we calculated by hand
        self.log('train/loss', loss, on_epoch=True)

        # logging a pl.Metric
        self.train_acc(preds, ys)
        self.log('train/acc', self.train_acc, on_epoch=True)

        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer using the learning rate saved in ``hparams``."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams['lr'])

    # Saving in ONNX format

    def _export_onnx(self, model_filename):
        """Export the current weights to ``model_filename`` as ONNX and upload it to W&B."""
        # in_dims may be an int or a sequence; normalise it to a tuple of ints
        sample_shape = tuple(int(d) for d in np.atleast_1d(self.hparams['in_dims']))
        # forward() treats the first axis as the batch axis, so the dummy input
        # needs an explicit batch dimension in front of the sample shape.
        # Without it, the first sample axis is mistaken for the batch, which
        # only works when that axis happens to have size 1.
        dummy_input = torch.zeros((1, *sample_shape), device=self.device)
        torch.onnx.export(self, dummy_input, model_filename)  # save weights and biases in onnx file
        wandb.save(model_filename)

    # Testing the model and logging

    def test_step(self, batch, batch_idx):
        """Compute loss / accuracy on one test batch and log them per epoch."""
        xs, ys = batch
        logits, loss = self.loss(xs, ys)
        preds = torch.argmax(logits, 1)

        self.test_acc(preds, ys)
        self.log('test/loss_epoch', loss, on_step=False, on_epoch=True)
        self.log('test/acc_epoch', self.test_acc, on_step=False, on_epoch=True)

    def test_epoch_end(self, test_step_outputs):  # args are defined as part of pl API
        """Export the final model to ``model_final.onnx`` once testing is done."""
        self._export_onnx('model_final.onnx')

    def validation_step(self, batch, batch_idx):
        """Compute loss / accuracy on one validation batch and return the model outputs."""
        xs, ys = batch
        logits, loss = self.loss(xs, ys)
        preds = torch.argmax(logits, 1)
        self.valid_acc(preds, ys)

        self.log('valid/loss_epoch', loss)  # defaults on val/test is on_epoch only
        self.log('valid/acc_epoch', self.valid_acc)

        # returned so validation_epoch_end can build a histogram from them
        return logits

    def validation_epoch_end(self, validation_step_outputs):
        """Snapshot the model as ONNX and log a histogram of the validation outputs."""
        # Saving our model weights and biases every epoch.
        # This way, if we overfit, we can just roll back our weights to the saved weights at the best epoch
        self._export_onnx(f'model_{str(self.global_step).zfill(5)}.onnx')

        # flatten validation step outputs as a pytorch tensor and turn into a histogram
        flattened_logits = torch.flatten(torch.cat(validation_step_outputs))
        self.logger.experiment.log(
            {'valid/logits': wandb.Histogram(flattened_logits.to('cpu')),
             'global_step': self.global_step}
        )


# Logging more fancy stuff with W&B

If you only want to log things like metrics, then you can do that with Lightning. However, if you want to start logging more fancy stuff like your model inputs and outputs (different media), then you will need to combine W&B with PyTorch Lightning.

This is useful for debugging: the aggregate metrics can look fine while one class is performing poorly, and sometimes the only way to find out is to look at the inputs and outputs directly. It's a tool for finding silent bugs.

In [None]:
# For every validation_epoch, log input images and output predictions using W&B's Image logger.

class ImagePredictionLogger(pl.Callback):
    """Callback that logs a fixed set of validation images with the model's predictions to W&B."""

    def __init__(self, val_samples, num_samples=12):
        super().__init__()
        images, labels = val_samples
        # keep only the first `num_samples` examples of the batch
        self.val_imgs = images[:num_samples]
        self.val_labels = labels[:num_samples]

    def on_validation_epoch_end(self, trainer, pl_module):
        # move the stored images onto the device the model lives on
        images = self.val_imgs.to(device=pl_module.device)

        # run the model on the images and take the highest-scoring class for each
        predictions = torch.argmax(pl_module(images), 1)

        # one captioned wandb.Image per (image, prediction, label) triple
        captioned = []
        for image, pred, label in zip(images, predictions, self.val_labels):
            captioned.append(
                wandb.Image(image, caption=f'Pred:{pred}, Label:{label}')
            )

        trainer.logger.experiment.log({
            'examples': captioned,
            'global_step': trainer.global_step,
        })

# Defining our data pipeline

You can use either plain PyTorch (`DataLoader`) or PyTorch Lightning (`DataModules`).

`DataModules` are a more structured definition, which allows for additional optimizations such as automated distribution of workload between CPU & GPU.
Using `DataModules` is recommended whenever possible!

A `DataModule` is also defined by an interface:
* `prepare_data` (optional) which is called only once and on 1 GPU -- typically something like the data download step we have below
* `setup`, which is called on each GPU separately and accepts `stage` to define if we are at `fit` or `test` step
* `train_dataloader`, `val_dataloader` and `test_dataloader` to load each dataset

In [None]:
class MNISTDataModule(pl.LightningDataModule):
    """LightningDataModule that downloads MNIST and serves train/val/test loaders.

    Args:
        data_dir: directory the MNIST files are downloaded to and read from.
        batch_size: training batch size; the validation and test loaders use
            ten times this value.
        val_size: number of images from the training set held out for
            validation. The rest of the training set is used for training.
    """

    def __init__(self, data_dir='./', batch_size=128, val_size=5000):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.val_size = val_size
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            # presumably the MNIST pixel mean / std -- confirm before reusing elsewhere
            transforms.Normalize((0.1307,), (0.3081,)),
        ])

    def prepare_data(self):
        """Download the train and test files; no dataset state is kept here."""
        # download data, train then test
        MNIST(self.data_dir, train=True, download=True)
        MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        """Build the datasets needed for ``stage`` ('fit', 'test', or None for both)."""
        # we set up only relevant datasets when stage is specified
        if stage == 'fit' or stage is None:
            mnist = MNIST(self.data_dir, train=True, transform=self.transform)
            # derive the training size from the dataset instead of hard-coding it
            n_train = len(mnist) - self.val_size
            self.mnist_train, self.mnist_val = random_split(
                mnist, [n_train, self.val_size])
        if stage == 'test' or stage is None:
            self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)

    # we define separate DataLoader for each of train/val/test
    def train_dataloader(self):
        """Training loader; reshuffled every epoch so batch order is not fixed."""
        return DataLoader(self.mnist_train, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Validation loader; larger batches, fixed order."""
        return DataLoader(self.mnist_val, batch_size=10 * self.batch_size)

    def test_dataloader(self):
        """Test loader; larger batches, fixed order."""
        return DataLoader(self.mnist_test, batch_size=10 * self.batch_size)

# Build the data module, then download and split the data right away so a
# validation batch is available before training starts.
mnist = MNISTDataModule()
mnist.prepare_data()
mnist.setup()

# Take the first validation batch; its images are what we log predictions on.
val_loader = mnist.val_dataloader()
samples = next(iter(val_loader))

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=9912422.0), HTML(value='')))


Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=28881.0), HTML(value='')))


Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=1648877.0), HTML(value='')))


Extracting ./MNIST/raw/t10k-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=4542.0), HTML(value='')))


Extracting ./MNIST/raw/t10k-labels-idx1-ubyte.gz to ./MNIST/raw

Processing...
Done!


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


# Making a Trainer

The `DataLoader` and the `LightningModule`
are brought together by a `Trainer`,
which orchestrates data loading,
gradient calculation,
optimizer logic,
and logging. 

Luckily, we don't need to sub-class the `Trainer`,
we just need to configure it with keyword arguments.

In [None]:
# Create the W&B logger for the 'lit-wandb' project; it is handed to the Trainer via `logger=`.
wandb_logger = WandbLogger(project='lit-wandb')

> _Note_: Check out [the documentation](https://docs.wandb.com/library/integrations/lightning) for customization options. I like `group`s and `tag`s!

We can then set up our `Trainer` and customize several options, such as gradient accumulation, half precision training and distributed computing.

We'll stick to the basics for this example,
but half-precision training and easy scaling to distributed settings are two of the major reasons why folks like PyTorch Lightning!

In [None]:
# Use every visible GPU when CUDA is available; otherwise fall back to the CPU
# so the notebook still runs on a runtime without a GPU (requesting GPUs with
# gpus=-1 when none are present makes the Trainer fail at construction).
trainer = pl.Trainer(
    logger=wandb_logger,    # W&B integration
    log_every_n_steps=50,   # set the logging frequency
    gpus=-1 if torch.cuda.is_available() else None,  # all GPUs, or CPU if there are none
    max_epochs=5,           # number of epochs
    deterministic=True,     # keep it deterministic
    callbacks=[ImagePredictionLogger(samples)] # see Callbacks section
    )

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


# Running our Model

In [None]:
# Instantiate the MLP for inputs of shape (1, 28, 28)
model = LitMLP(in_dims=(1, 28, 28))

# Train the model on the MNIST data module
trainer.fit(model, mnist)

# Evaluate on the test set. ckpt_path=None is passed explicitly instead of the
# default (which uses the best checkpoint); per the Lightning docs, None tests
# the weights the model currently holds.
trainer.test(datamodule=mnist, ckpt_path=None)

# Close the W&B run
wandb.finish()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[34m[1mwandb[0m: Currently logged in as: [33mjacquesthibs[0m (use `wandb login --relogin` to force relogin)



  | Name      | Type     | Params
---------------------------------------
0 | layer_1   | Linear   | 100 K 
1 | layer_2   | Linear   | 33.0 K
2 | layer_3   | Linear   | 2.6 K 
3 | train_acc | Accuracy | 0     
4 | valid_acc | Accuracy | 0     
5 | test_acc  | Accuracy | 0     
---------------------------------------
136 K     Trainable params
0         Non-trainable params
136 K     Total params
0.544     Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]







HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test/acc_epoch': 0.9505000114440918, 'test/loss_epoch': 0.16673018038272858}
--------------------------------------------------------------------------------


VBox(children=(Label(value=' 0.52MB of 0.52MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss_step,0.08813
train/acc_step,0.97727
epoch,4.0
trainer/global_step,2150.0
_runtime,79.0
_timestamp,1617744445.0
_step,48.0
train/loss_epoch,0.17654
train/acc_epoch,0.94845
test/loss_epoch,0.16673


0,1
train/loss_step,█▅▃▃▂▃▂▂▃▂▂▁▁▂▂▂▂▁▂▂▂▂▂▂▂▁▂▂▁▁▂▁▂▁▁▂▁▁▁▁
train/acc_step,▁▃▅▅▇▆▆▆▅▇▆▇▇▇▇▆▅▇▇▇▇▆▇▆▆▆▆▆█▇▆█▇▇▇▆▇▇▇█
epoch,▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆█████████
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/loss_epoch,█▂▂▁▁
train/acc_epoch,▁▆▇██
test/loss_epoch,▁


> _Note_: In notebooks, we need to call `wandb.finish()` to indicate when we've finished our run. This isn't necessary in scripts.