All the building blocks needed to construct a neural network in PyTorch are provided by the `torch.nn` module.

This example builds a neural network to classify images in the FashionMNIST dataset.


In [12]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [13]:
# Choose the device used for training: prefer CUDA, then the MPS backend,
# and fall back to the CPU when neither accelerator is available
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print("Using {} device".format(device))

Using cuda device


# Define a class for building a neural network

In [14]:
class NeuralNetwork(nn.Module):
  """
  Fully connected classifier: flatten, two hidden Linear + ReLU layers, then
  a final Linear layer that produces the logits.

  With the default arguments the network expects 28 x 28 inputs and returns
  10 logits per sample.
  """

  def __init__(self, in_features=28*28, hidden_features=512, num_classes=10):
    """
    Build the layers of the network.

    Args:
      in_features: Number of values per sample once the input has been
        flattened (default 28*28 = 784).
      hidden_features: Width of each of the two hidden layers (default 512).
      num_classes: Number of logits produced per sample (default 10).
    """
    super().__init__()
    self.flatten = nn.Flatten()
    self.linear_relu_stack = nn.Sequential(
        nn.Linear(in_features, hidden_features),
        nn.ReLU(),
        nn.Linear(hidden_features, hidden_features),
        nn.ReLU(),
        nn.Linear(hidden_features, num_classes),
    )

  def forward(self, x):
    """
    Run a batch of inputs through the network.

    Args:
      x: Input tensor whose first dimension is the batch; the remaining
        dimensions are flattened and must hold ``in_features`` values.

    Returns:
      Tensor of raw (unnormalised) logits with ``num_classes`` values per
      sample.
    """
    x = self.flatten(x)
    logits = self.linear_relu_stack(x)
    return logits

In [15]:
# Instantiate the network, move its parameters to the selected device, and
# print the resulting layer structure
model = NeuralNetwork()
model = model.to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [16]:
#   A random minibatch holding a single 28 x 28 "image", created on `device`
X = torch.rand(1, 28, 28, device=device)

#   Calling the model runs its forward function and returns the raw logits
logits = model(X)

#   Softmax along dim 1 converts the logits into values between 0 and 1
softmax = nn.Softmax(dim=1)
pred_probab = softmax(logits)

#   The predicted class is the index of the largest value along dim 1
y_pred = pred_probab.argmax(dim=1)

print("Predicted class: {}".format(y_pred))

Predicted class: tensor([1], device='cuda:0')


## Understanding model layers

In [17]:
#   Image size in pixels, stored as a 2-tuple; the bare expression on the
#   last line makes the notebook display its first entry
img_size = (28, 28)
img_size[0]

28

In [18]:
#   Create a random minibatch of 3 images whose remaining two dimensions
#   are taken from img_size, then show the resulting tensor shape
input_img = torch.rand(3, *img_size)
print(input_img.shape)

torch.Size([3, 28, 28])


In [19]:
#   nn.Flatten keeps the first (batch) dimension and collapses all the
#   others, so each 28 x 28 image becomes one row of 784 pixel values
flatten = nn.Flatten(start_dim=1, end_dim=-1)
flat_img = flatten(input_img)
print(flat_img.shape)

torch.Size([3, 784])


### nn.Linear

In [20]:
#   nn.Linear applies a linear transformation to its input using the layer's
#   weights and biases; here each flattened image is mapped to 20 features
layer1 = nn.Linear(img_size[0] * img_size[1], 20)
hidden1 = layer1(flat_img)
print(hidden1.shape)

torch.Size([3, 20])


### nn.ReLU

In [21]:
##  ReLU is a non-linear activation. Placing it between linear layers lets
##  the network build complex mappings from its inputs to its outputs
##  (and therefore learn a wide variety of phenomena).

##  Element-wise rule: y = x when x > 0, otherwise y = 0.
print("Before ReLU: {}".format(hidden1))
relu = nn.ReLU()
hidden1 = relu(hidden1)
print("After ReLU: {}".format(hidden1))

Before ReLU: tensor([[ 2.6399e-01, -1.7711e-01, -1.5105e-01, -2.3161e-01, -3.0043e-01,
          7.5082e-02, -6.6009e-03, -3.2847e-02, -4.5309e-02, -3.0627e-01,
         -9.4765e-02, -8.4134e-01, -3.2944e-01, -2.4268e-01,  2.3555e-01,
         -2.8056e-01, -2.2567e-02, -5.4899e-02, -3.4474e-01, -4.0673e-01],
        [ 1.5378e-01, -2.8663e-01,  2.9110e-03, -1.3168e-01, -1.7538e-01,
          1.3143e-01,  7.3966e-03,  4.1303e-01,  2.2569e-01, -4.3359e-01,
         -1.3441e-01, -5.0002e-01, -2.0246e-01, -4.0853e-01,  3.5603e-01,
         -2.3559e-01, -1.8593e-01,  2.9791e-02,  2.8838e-01, -2.8012e-01],
        [ 3.5205e-01, -4.4486e-01, -4.2821e-01, -1.9053e-02, -2.3811e-01,
          1.9506e-01, -3.9960e-01,  3.7430e-01, -2.1434e-02, -1.9904e-01,
          2.3641e-01, -7.8826e-01, -1.6622e-01, -4.5980e-01,  1.2718e-01,
         -1.5431e-01, -3.5686e-01, -6.7784e-05,  4.8975e-02, -5.5473e-01]],
       grad_fn=<AddmmBackward0>)
After ReLU: tensor([[0.2640, 0.0000, 0.0000, 0.0000, 0.0000, 0

### nn.Sequential

In [22]:
##  nn.Sequential is an ordered container of modules: data is passed through
##  them in the order in which they are listed.

seq_modules = nn.Sequential(
    flatten,
    layer1,
    nn.ReLU(),
    nn.Linear(in_features=20, out_features=10),
)

##  Push a fresh random minibatch of 3 images through the whole stack
input_img = torch.rand(3, 28, 28)
logits = seq_modules(input_img)

In [23]:
##  The last linear layer returns logits: raw scores anywhere in (-inf, inf).
##  nn.Softmax rescales them to values between 0 and 1; dim=1 is the
##  dimension along which the rescaled values sum to 1.

softmax = nn.Softmax(dim=1)
pred_probab = softmax(logits)

In [27]:
#   Show the whole model, then every named parameter together with its size
#   and a preview of its first two entries along the leading dimension
print("Model structure: {}".format(model))

layer_template = "Layer: {} | Size: {} | Values: {} \n"
for name, param in model.named_parameters():
    preview = param[:2]
    print(layer_template.format(name, param.size(), preview))

Model structure: NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)
Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values: tensor([[ 0.0315,  0.0254,  0.0256,  ..., -0.0116, -0.0069, -0.0164],
        [-0.0226, -0.0326,  0.0136,  ..., -0.0335, -0.0276, -0.0195]],
       device='cuda:0', grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values: tensor([0.0341, 0.0261], device='cuda:0', grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values: tensor([[-0.0070, -0.0411,  0.0338,  ..., -0.0288,  0.0358,  0.0399],
        [-0.0325, -0.0072,  0.0408,  ..., -0.0227, -0.0253, -0.0226]],
       device='cuda:0', grad_fn=<SliceBack