In [2]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

We want to run on an accelerator (a CUDA GPU or Apple's MPS backend) when one is available, so check for each in turn and fall back to the CPU if neither is present.

In [3]:
# Pick the compute backend in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


now define the neural network
subclass nn.Module
initialise the network's layers in __init__
every nn.Module subclass implements its operations on the input data in the forward method

In [4]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten, two Linear+ReLU hidden layers, linear output.

    The defaults reproduce the original fixed architecture
    (784 -> 512 -> 512 -> 10); pass different sizes to reuse the same
    structure for other input sizes or class counts.

    Args:
        in_features: Number of values per sample once flattened
            (default ``28 * 28``).
        hidden_features: Width of each of the two hidden layers (default 512).
        num_classes: Number of output logits per sample (default 10).
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=10):
        super().__init__()
        # Collapses every dimension after the batch dimension into one.
        self.flatten = nn.Flatten()
        # Layers are created in the same order as before, so parameter names
        # (linear_relu_stack.0, .2, .4) and random initialisation order are unchanged.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Flatten ``x`` per sample and return the raw class scores (logits).

        No softmax is applied here; the caller normalises the logits if
        probabilities are needed.
        """
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

create an instance of the network
move it to the device
print its structure

In [5]:
# Instantiate the network, then relocate its parameters onto the chosen device.
model = NeuralNetwork()
model = model.to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


Calling the model on the input returns a 2-dimensional tensor with dim=0 corresponding to each output of 10 raw predicted values for each class, and dim=1 corresponding to the individual values of each output. We get the prediction probabilities by passing it through an instance of the nn.Softmax module.

softmax converts the raw outputs (logits) into a predicted probability for each class; for a given input, these probabilities sum to 1

In [6]:
# Run a single random 28x28 input through the model and read off the top class.
X = torch.rand(1, 28, 28, device=device)
logits = model(X)

# Normalise along dim 1 (the class scores of each sample), then take the
# index of the largest probability in each row.
pred_probab = nn.Softmax(dim=1)(logits)
y_pred = pred_probab.argmax(dim=1)
print(f"Predicted class: {y_pred}")

Predicted class: tensor([1])


sample of 3 images of size 28x28

In [7]:
# A mini-batch of three random 28x28 "images" to trace through the layers.
input_image = torch.rand(3, 28, 28)
print(input_image.shape)

torch.Size([3, 28, 28])


initialise nn.Flatten to convert each 2D 28x28 image into a contiguous array of 784 pixel values

dimension at dim=0 is maintained (i.e. 3)

In [8]:
# Collapse each 28x28 image to a 784-vector; the leading batch dimension stays.
flatten = nn.Flatten()
flat_image = flatten(input_image)
print(flat_image.shape)

torch.Size([3, 784])


nn.Linear applies a linear transformation to the input

uses its stored weights and biases

In [9]:
# Project each flattened 784-value image down to 20 hidden features.
layer1 = nn.Linear(28 * 28, 20)
hidden1 = layer1(flat_image)
print(hidden1.shape)

torch.Size([3, 20])


non-linear activations create the complex mappings between the model's inputs and outputs

applied after the linear transformations to introduce nonlinearity

this helps nn learn a wide variety of phenomena

here we use nn.ReLU between our linear layers but there are other activations to introduce nonlinearity that you can use

In [10]:
# Print the activations, apply ReLU (negatives become 0), and print them again.
# Note: hidden1 is rebound to the post-ReLU tensor.
print(f"Before ReLU: {hidden1}\n\n")
relu = nn.ReLU()
hidden1 = relu(hidden1)
print(f"After ReLU: {hidden1}")

Before ReLU: tensor([[ 0.1099,  0.2619, -0.6145, -0.1715, -0.1676,  0.6837, -0.2687,  0.0374,
         -0.1929, -0.0753,  0.5777, -0.2202, -0.0323, -0.3198,  0.2157,  0.0267,
          0.4238, -0.0433, -0.0225,  0.2625],
        [ 0.1114,  0.2204, -0.7426, -0.0093, -0.4769,  0.4209, -0.3581,  0.2782,
         -0.1460, -0.0359,  0.5744, -0.3687, -0.2297, -0.2222,  0.1656, -0.1844,
          0.5251, -0.1242, -0.3958, -0.0666],
        [ 0.1953,  0.4200, -0.4616,  0.0877, -0.2641,  0.6450, -0.3378,  0.3888,
         -0.5240,  0.0343,  0.4985,  0.0515, -0.0881, -0.4135,  0.1396,  0.0665,
          0.3488,  0.0184, -0.0731, -0.2351]], grad_fn=<AddmmBackward0>)


After ReLU: tensor([[0.1099, 0.2619, 0.0000, 0.0000, 0.0000, 0.6837, 0.0000, 0.0374, 0.0000,
         0.0000, 0.5777, 0.0000, 0.0000, 0.0000, 0.2157, 0.0267, 0.4238, 0.0000,
         0.0000, 0.2625],
        [0.1114, 0.2204, 0.0000, 0.0000, 0.0000, 0.4209, 0.0000, 0.2782, 0.0000,
         0.0000, 0.5744, 0.0000, 0.0000, 0.0000, 0.16

nn.Sequential - an ordered container of modules

data passed through all the modules in the same order as defined

can use sequential containers to put together a quick network like seq_modules

so basically, I think it's a way to define the process and flow of the network, so that the same ordered chain of modules can be applied to data repeatedly and concisely

In [11]:
# Chain the flatten and first linear layer built earlier with a ReLU and a new
# 20 -> 10 output layer; data flows through them in exactly this order.
seq_modules = nn.Sequential(flatten, layer1, nn.ReLU(), nn.Linear(20, 10))

# Fresh random batch of three 28x28 images, pushed through the whole chain.
input_image = torch.rand(3, 28, 28)
logits = seq_modules(input_image)

nn.Softmax - the last linear layer of the network returns logits

logits are raw values in the range $(-\infty, \infty)$

they are passed to the softmax module

logits are scaled to values in range [0,1]

these represent the model's predicted probabilities for each class

the 'dim' parameter indicates the dimension along which the values must sum to 1

In [12]:
# Softmax over dim=1: the values along that dimension are rescaled to sum to 1.
softmax = nn.Softmax(dim=1)
# Convert the logits from the sequential model into per-class probabilities.
pred_probab = softmax(logits)

Many layers inside a neural network are parameterized, i.e. have associated weights and biases that are optimized during training

Subclassing nn.Module automatically tracks all fields defined inside your model object, and makes all parameters accessible using your model’s parameters() or named_parameters() methods

In this example, we iterate over each parameter, and print its size and a preview of its values

In [13]:
# Show the architecture first, then walk every registered parameter tensor.
print(f"Model structure: {model}\n\n")

for name, param in model.named_parameters():
    # Preview only the first two entries along dim 0 to keep the output short.
    preview = param[:2]
    print(f"Layer: {name} | Size: {param.size()} | Values : {preview} \n")

Model structure: NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values : tensor([[ 0.0312, -0.0226, -0.0019,  ..., -0.0211,  0.0104, -0.0333],
        [ 0.0220,  0.0353,  0.0101,  ..., -0.0035,  0.0062,  0.0029]],
       grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values : tensor([ 0.0033, -0.0351], grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values : tensor([[-0.0331,  0.0173, -0.0274,  ...,  0.0431, -0.0371,  0.0045],
        [ 0.0379, -0.0194,  0.0105,  ...,  0.0231,  0.0312,  0.0216]],
       grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.bias | 