In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

### Get Device for Training

In [2]:
# Choose the compute backend in order of preference: CUDA, then MPS, then CPU.
# The MPS check is only evaluated when CUDA is unavailable.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


### Define the Class

We define our neural network by subclassing **nn.Module** and initialize its layers in `__init__`.
- Every nn.Module subclass implements the operations on input data in the forward method.

In [3]:
#Subclassing refers to the process of creating a new class (the subclass - NeuralNetwork)
#that inherits attributes and behaviors from an existing class (the superclass - nn.Module).

#a constructor is a special method within a class that is automatically called when an object of that class is created or instantiated.

class NeuralNetwork(nn.Module):
    """Fully connected classifier for 28x28 inputs.

    Each sample is flattened to 784 values and passed through three linear
    layers (784 -> 512 -> 512 -> 10), with a ReLU after each of the first two.
    The ten outputs are raw, unnormalised scores ("logits").
    """

    def __init__(self):
        # Run nn.Module's own constructor first so sub-modules assigned below
        # are registered on this object.
        super().__init__()
        # nn.Flatten() defaults to start_dim=1: dim 0 (the batch) is kept and
        # the remaining dims are collapsed into one.
        self.flatten = nn.Flatten()
        hidden_units = 512
        # Layers run in the order listed; the final Linear has no activation,
        # so the stack returns logits.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 10),
        )

    def forward(self, x):
        """Flatten ``x`` per sample and return the stack's logits."""
        return self.linear_relu_stack(self.flatten(x))

In [4]:
# Build the network, move its parameters to the selected device, then print
# the module tree.
model = NeuralNetwork()
model = model.to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


To use the model, we pass it the input data. This executes the model’s forward, along with some background operations. 
- **Do not call *model.forward()* directly!**

- The code below demonstrates how to use a PyTorch model (model) to make predictions on a random input tensor (X) of shape (1, 28, 28) and then extract the predicted class.

In [5]:
# One random 28x28 input (batch of 1), created directly on the model's device.
X = torch.rand((1, 28, 28), device=device)
# Call the module itself rather than model.forward().
logits = model(X)
# Softmax along dim 1 turns the raw scores (logits) of each sample into
# probabilities, which are easier to interpret.
pred_probab = nn.Softmax(dim=1)(logits)
# The index of the largest probability along dim 1 is the predicted class.
y_pred = pred_probab.argmax(dim=1)
print(f"Predicted class: {y_pred}")

Predicted class: tensor([0])


### Model Layers
- **Breakdown of the layers in the FashionMNIST model**

In [6]:
# A minibatch of 3 random images, each 28x28.
input_image = torch.rand((3, 28, 28))
print(input_image.shape)

torch.Size([3, 28, 28])


#### nn.Flatten

In [7]:
# nn.Flatten turns every 2D 28x28 image into a contiguous run of 784 pixel
# values; the leading (batch) dimension is kept.
flatten = nn.Flatten()
flat_image = flatten(input_image)
print(flat_image.shape)

torch.Size([3, 784])


#### nn.Linear

In [9]:
# nn.Linear applies a linear transformation to its input using the weights
# and biases it stores: here 784 input features are mapped to 20 outputs.
layer1 = nn.Linear(28 * 28, 20)
hidden1 = layer1(flat_image)
print(hidden1.shape)

torch.Size([3, 20])


- Non-linear activations are what create the complex mappings between the model’s inputs and outputs. They are applied after linear transformations to introduce nonlinearity, helping neural networks learn a wide variety of phenomena.

#### nn.ReLU

In [10]:
# Print the linear layer's output before and after the activation.
# NOTE: hidden1 is rebound to the activated tensor below, so re-running this
# cell on its own prints already-rectified values under "Before ReLU".
print(f"Before ReLU: {hidden1}\n\n")
relu_layer = nn.ReLU()
hidden1 = relu_layer(hidden1)
print(f"After ReLU: {hidden1}")

Before ReLU: tensor([[ 0.0859, -0.6808,  0.2399,  0.0856, -0.1637,  0.0539, -0.0994, -0.1990,
         -0.6223,  0.0593,  0.0030,  0.3440,  0.1669, -0.2074, -0.0371,  0.4668,
          0.0810,  0.0818,  0.4791, -0.4517],
        [-0.2074, -0.4285,  0.2611,  0.0362, -0.1046, -0.0901, -0.1770,  0.2857,
         -0.7465, -0.1358, -0.1198,  0.5912,  0.0560,  0.1943,  0.4678,  0.2722,
         -0.0309,  0.0732,  0.3472, -0.1581],
        [-0.1182, -0.5810,  0.2936,  0.2258, -0.2580, -0.0518, -0.5427, -0.1304,
         -0.4635, -0.3667, -0.1084,  0.2728, -0.2227, -0.2490,  0.1111,  0.4727,
         -0.3359,  0.2391,  0.4377, -0.5289]], grad_fn=<AddmmBackward0>)


After ReLU: tensor([[0.0859, 0.0000, 0.2399, 0.0856, 0.0000, 0.0539, 0.0000, 0.0000, 0.0000,
         0.0593, 0.0030, 0.3440, 0.1669, 0.0000, 0.0000, 0.4668, 0.0810, 0.0818,
         0.4791, 0.0000],
        [0.0000, 0.0000, 0.2611, 0.0362, 0.0000, 0.0000, 0.0000, 0.2857, 0.0000,
         0.0000, 0.0000, 0.5912, 0.0560, 0.1943, 0.46

- nn.Sequential is an ordered container of modules. The data is passed through all the modules in the same order as defined. You can use sequential containers to put together a quick network like seq_modules.

#### nn.Sequential

In [11]:
# Chain the modules in execution order: flatten -> layer1 -> ReLU -> a new
# 20-to-10 linear layer. `flatten` and `layer1` are the instances created in
# the earlier cells.
seq_modules = nn.Sequential(flatten, layer1, nn.ReLU(), nn.Linear(20, 10))
# A fresh random batch of 3 images, each 28x28, pushed through the container.
input_image = torch.rand((3, 28, 28))
logits = seq_modules(input_image)

- The last linear layer of the neural network returns logits - raw values in $(-\infty, \infty)$ - which are passed to the nn.Softmax module.
- The logits are scaled to values in $[0, 1]$ representing the model’s predicted probabilities for each class.
- dim parameter indicates the dimension along which the values must sum to 1.

#### nn.Softmax

In [12]:
# dim=1 is the dimension along which the resulting values sum to 1.
softmax = nn.Softmax(dim=1)
# Turn the raw logits into predicted probabilities for each class.
pred_probab = softmax(logits)

In [13]:
# Show the predicted probabilities as this cell's output.
pred_probab

tensor([[0.1339, 0.0930, 0.0663, 0.1290, 0.1003, 0.1001, 0.0782, 0.1156, 0.1036,
         0.0800],
        [0.1381, 0.1054, 0.0771, 0.1193, 0.0902, 0.0964, 0.0947, 0.0925, 0.1076,
         0.0787],
        [0.1318, 0.0941, 0.0817, 0.1147, 0.0823, 0.1045, 0.0926, 0.0954, 0.1128,
         0.0902]], grad_fn=<SoftmaxBackward0>)

### Model Parameters

- Many layers inside a neural network are parameterized, i.e. have associated weights and biases that are optimized during training. 
- Subclassing nn.Module automatically tracks all fields defined inside your model object, and makes all parameters accessible using your **model’s parameters()** or **named_parameters()** methods.

In [14]:
# Print the module tree, then each parameter's name, size and a short preview.
print(f"Model structure: {model}\n\n")

for name, param in model.named_parameters():
    param_size = param.size()
    # Slice to the first two entries along dim 0 to keep the output short.
    first_rows = param[:2]
    print(f"Layer: {name} | Size: {param_size} | Values : {first_rows} \n")

Model structure: NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values : tensor([[ 0.0245,  0.0324, -0.0072,  ..., -0.0309, -0.0332,  0.0102],
        [-0.0018,  0.0276,  0.0044,  ..., -0.0313,  0.0024, -0.0218]],
       grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values : tensor([ 0.0259, -0.0124], grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values : tensor([[-0.0409, -0.0207,  0.0323,  ...,  0.0018,  0.0057, -0.0339],
        [ 0.0302,  0.0235,  0.0251,  ...,  0.0137,  0.0371,  0.0289]],
       grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.bias | 