# Compose Nueral Network model

torch.nn provide all of components for composing neural network

All module of pytorch is subclass of nn.Module

In [1]:
import os 
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

## Getting device for training

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

Using cuda device


In [3]:
class NeuralNetwork(nn.Module):
  def __init__(self):
    super(NeuralNetwork, self).__init__()
    self.flatten = nn.Flatten()
    self.linear_relu_stack = nn.Sequential(
        nn.Linear(28*28, 512),
        nn.ReLU(),
        nn.Linear(512, 512),
        nn.ReLU(),
        nn.Linear(512, 10),
        nn.ReLU()
    )

  def forward(self, x):
    x = self.flatten(x)
    logits = self.linear_relu_stack(x)
    return logits

Create instance of NeuralNetwork and move to device, and print structure

In [5]:
model = NeuralNetwork().to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
    (5): ReLU()
  )
)


Do not call model.forward()

In [7]:
X = torch.rand(1, 28, 28, device = device)
logits = model(X)
pred_probab = nn.Softmax(dim=1)(logits)
print(f"Predicted probability : {pred_probab}")
y_pred = pred_probab.argmax(1)
print(f"Predicted class: {y_pred}")

Predicted probability : tensor([[0.0978, 0.1032, 0.0989, 0.0978, 0.0978, 0.0978, 0.0978, 0.0989, 0.1081,
         0.1016]], device='cuda:0', grad_fn=<SoftmaxBackward>)
Predicted class: tensor([8], device='cuda:0')


## Layer

In [8]:
input_image = torch.rand(3, 28, 28)
print(input_image.size())

torch.Size([3, 28, 28])


sustian mini-batch dimension of dim=0

In [9]:
# nn.Flatten
flatten = nn.Flatten()
flat_image = flatten(input_image)
print(flat_image.size())

torch.Size([3, 784])


In [10]:
# nn.Linear

layer1 = nn.Linear(in_features = 28*28, out_features=20)
hidden1 = layer1(flat_image)
print(hidden1.size())

torch.Size([3, 20])


In [11]:
# nn.ReLU
print(f"Before ReLU: {hidden1}\n\n")
hidden1 = nn.ReLU()(hidden1)
print(f"Ater ReLU: {hidden1}")

Before ReLU: tensor([[ 0.2926,  1.0303, -0.0877,  0.2363, -0.3285, -0.3749,  0.1224, -0.0027,
          0.1360,  0.2933,  0.1547, -0.4837, -0.6447,  0.3367, -0.2275, -0.1536,
          0.0609, -0.3961, -0.0052,  0.0616],
        [ 0.3709,  0.7450,  0.0831,  0.6098, -0.4641, -0.0414, -0.0506,  0.0745,
         -0.0319,  0.2704,  0.1502, -0.5864, -0.0793, -0.1945, -0.2818,  0.3000,
          0.0265, -0.2618, -0.0499,  0.1321],
        [ 0.4215,  0.9400, -0.3301,  0.2909, -0.2525,  0.1410,  0.0050,  0.0733,
          0.1948,  0.0424,  0.1537, -0.5199,  0.0779,  0.2661,  0.2082,  0.2866,
         -0.0782, -0.1430,  0.3006,  0.2886]], grad_fn=<AddmmBackward>)


Ater ReLU: tensor([[0.2926, 1.0303, 0.0000, 0.2363, 0.0000, 0.0000, 0.1224, 0.0000, 0.1360,
         0.2933, 0.1547, 0.0000, 0.0000, 0.3367, 0.0000, 0.0000, 0.0609, 0.0000,
         0.0000, 0.0616],
        [0.3709, 0.7450, 0.0831, 0.6098, 0.0000, 0.0000, 0.0000, 0.0745, 0.0000,
         0.2704, 0.1502, 0.0000, 0.0000, 0.0000, 0.0000

In [12]:
# nn.Sequential

seq_modules = nn.Sequential(
    flatten,
    layer1,
    nn.ReLU(),
    nn.Linear(20, 10)
)
input_image = torch.rand(3, 28, 28)
logits = seq_modules(input_image)

In [14]:
# nn.Softmax
# parameter dim is dimension of sum that is 1
softmax = nn.Softmax(dim=1)
pred_probab = softmax(logits)

In [15]:
# Parameter of Model

print("Model structure: ", model, "\n\n")

for name, param in model.named_parameters():
  print(f"Layer: {name} | Size : {param.size()} | Values : {param[:2]} \n")

Model structure:  NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
    (5): ReLU()
  )
) 


Layer: linear_relu_stack.0.weight | Size : torch.Size([512, 784]) | Values : tensor([[-0.0121,  0.0042, -0.0092,  ...,  0.0201,  0.0266,  0.0209],
        [ 0.0324,  0.0206, -0.0276,  ...,  0.0033,  0.0278,  0.0249]],
       device='cuda:0', grad_fn=<SliceBackward>) 

Layer: linear_relu_stack.0.bias | Size : torch.Size([512]) | Values : tensor([ 0.0281, -0.0014], device='cuda:0', grad_fn=<SliceBackward>) 

Layer: linear_relu_stack.2.weight | Size : torch.Size([512, 512]) | Values : tensor([[-0.0020,  0.0364,  0.0045,  ..., -0.0426,  0.0280, -0.0052],
        [-0.0085,  0.0044,  0.0400,  ..., -0.0399, -0.0180, -0.0029]],
       device='c

# Automatic differentiate

In [25]:
import torch

x = torch.ones(5) # input tensor
y = torch.zeros(3) # expected output
w = torch.randn(5, 3, requires_grad = True)
b = torch.randn(3, requires_grad = True)
# for calculate grad of loss
z = torch.matmul(x, w) + b
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)

In [26]:
# reference of backprobagation gradient is saved in grad_fn
print("Gradient function for z = ", z.grad_fn)
print("Gradinet function for loss = ", loss.grad_fn)

Gradient function for z =  <AddBackward0 object at 0x7f36b4f19c10>
Gradinet function for loss =  <BinaryCrossEntropyWithLogitsBackward object at 0x7f36b4f19d90>


We only can get the grad of leaf which the requires_grad attribute is set to True

In [27]:
# Calculate Gradient
# call is code
loss.backward()
print(w.grad)
print(b.grad)

tensor([[0.3107, 0.3064, 0.0399],
        [0.3107, 0.3064, 0.0399],
        [0.3107, 0.3064, 0.0399],
        [0.3107, 0.3064, 0.0399],
        [0.3107, 0.3064, 0.0399]])
tensor([0.3107, 0.3064, 0.0399])


In [28]:
# Stop tracking the gradient

z = torch.matmul(x, w) + b
print(z.requires_grad)

with torch.no_grad():
  z = torch.matmul(x, w) + b

print(z.requires_grad)

True
False


In [29]:
# Same output

z = torch.matmul(x, w) + b
z_det = z.detach()
print(z_det.requires_grad)

False


## Additional information of Operation Graph

In [31]:
# Optional Reading : degree of tensor change and Jacobian Product

inp = torch.eye(5, requires_grad = True)
out = (inp+1).pow(2)

out.backward(torch.ones_like(inp), retain_graph = True)
print("First call\n", inp.grad)
# degree of tensor change changes if call backward with same parameter
out.backward(torch.ones_like(inp), retain_graph = True)
print("\nSecond call\n", inp.grad)
# So we must make grad atrribute 0 first.
# In real training, optimizer help this process
inp.grad.zero_()
out.backward(torch.ones_like(inp), retain_graph = True)
print("\nCall after zeroing gradients\n", inp.grad)

First call
 tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.],
        [2., 2., 2., 2., 4.]])

Second call
 tensor([[8., 4., 4., 4., 4.],
        [4., 8., 4., 4., 4.],
        [4., 4., 8., 4., 4.],
        [4., 4., 4., 8., 4.],
        [4., 4., 4., 4., 8.]])

Call after zeroing gradients
 tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.],
        [2., 2., 2., 2., 4.]])
