In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
class NN(nn.Module):
    def __init__(self):
        super(NN,self).__init__()
#         convolutional layers
#         input - 1 image , output - 6 images, filters - 5x5
        self.conv1 = nn.Conv2d(1,6,5)
        self.conv2 = nn.Conv2d(6,16,5)
#         fully connected layers
        self.fc1 = nn.Linear(16*5*5, 120) # 5*5 image dimens
        self.fc2 = nn.Linear(120,84)
        self.fc3 = nn.Linear(84,10)
    
    def forward(self,x):
        x = F.max_pool2d(F.relu(self.conv1(x)),(2,2))
        x = F.max_pool2d(F.relu(self.conv2(x)),2)
        x = torch.flatten(x,1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
net = NN()
print(net)

NN(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


You just have to define the forward function, and the `backward` function (where gradients are computed) is automatically defined for you using `autograd`. You can use any of the Tensor operations in the forward function.

The learnable parameters of a model are returned by `net.parameters()`

In [3]:
params = list(net.parameters())
print(len(params))
print(params[0].size())
print(params[1].size())
print(params[2].size())

print(params[3].size())
print(params[4].size())
print(params[5].size())

print(params[6].size())
print(params[7].size())
print(params[8].size())

10
torch.Size([6, 1, 5, 5])
torch.Size([6])
torch.Size([16, 6, 5, 5])
torch.Size([16])
torch.Size([120, 400])
torch.Size([120])
torch.Size([84, 120])
torch.Size([84])
torch.Size([10, 84])


In [4]:
print(list(net.parameters()))

[Parameter containing:
tensor([[[[ 0.1129, -0.1543,  0.1894, -0.0873,  0.0558],
          [ 0.1122,  0.0764,  0.0842, -0.1780, -0.0038],
          [ 0.1025,  0.1396,  0.1447,  0.1959,  0.0023],
          [ 0.1412, -0.1511, -0.0797, -0.0573,  0.0873],
          [ 0.0733,  0.0653,  0.0248,  0.1874,  0.0979]]],


        [[[ 0.2000, -0.0453,  0.0401, -0.0214,  0.0733],
          [-0.1756, -0.0392,  0.1902,  0.0837, -0.0861],
          [ 0.1155,  0.0031, -0.0609, -0.0369,  0.0726],
          [-0.1490, -0.1675, -0.1598,  0.0702,  0.0430],
          [-0.1798, -0.1586,  0.0250,  0.0433, -0.1062]]],


        [[[ 0.0743,  0.1898,  0.0143,  0.1941,  0.0569],
          [-0.1562,  0.0343, -0.1326,  0.1083,  0.1096],
          [ 0.0588, -0.1843,  0.0973,  0.0279,  0.1441],
          [-0.0956,  0.1857,  0.0722,  0.1358, -0.0069],
          [-0.0865,  0.1120, -0.1386, -0.1523, -0.0560]]],


        [[[-0.0784,  0.0010, -0.1464, -0.0872,  0.0580],
          [ 0.0723,  0.1299, -0.1781, -0.1723, -0.046

In [5]:
# Let’s try a random 32x32 input. Note: expected input size of this net (LeNet) is 32x32
input = torch.randn(1, 1, 32, 32)
out = net(input)
print(out)

tensor([[ 0.1010, -0.0959,  0.0516, -0.0998, -0.1566, -0.0059,  0.0391, -0.0844,
          0.0312, -0.0267]], grad_fn=<AddmmBackward0>)


Zero the gradient buffers of all parameters and backprops with random gradients:

In [6]:
net.zero_grad()
out.backward(torch.randn(1,10))

In [7]:
print(out)

tensor([[ 0.1010, -0.0959,  0.0516, -0.0998, -0.1566, -0.0059,  0.0391, -0.0844,
          0.0312, -0.0267]], grad_fn=<AddmmBackward0>)


In [8]:
a = torch.tensor([[[1,2],[2,4]]])
a

tensor([[[1, 2],
         [2, 4]]])

In [9]:
print(a.unsqueeze(2))

tensor([[[[1, 2]],

         [[2, 4]]]])


## NOTE

``torch.nn`` only supports mini-batches. The entire ``torch.nn`` package only supports inputs that are a mini-batch of samples, and not a single sample.

For example, nn.Conv2d will take in a 4D Tensor of ``nSamples x nChannels x Height x Width``.

If you have a single sample, just use ``input.unsqueeze(0)`` to add a fake batch dimension.

## Recap

- ``torch.tensor`` - *a multidimensional array* with support autograd and also stores the gradient with respect to that tensor
- ```nn.Module``` - Neural network module.*Convenient way of encapsulating parameters*
- ```nn.Parameter``` -  kind of Tensor, that is automatically registered as a parameter when assigned as an attribute to a Module.
- ```autograd.Function``` - Implements ``forward`` and ``backward`` definitions of an autograd operation. Every Tensor operation creates at least a single Function node that connects to functions that created a Tensor and encodes its history.```

## Loss

A loss function takes the (output, target) pair of inputs, and computes a value that estimates how far away the output is from the target.

There are several different loss functions under the nn package . A simple loss is: nn.MSELoss which computes the mean-squared error between the output and the target.

In [10]:
output = net(input)
target = torch.randn(10)  # a dummy target, for example
target = target.view(1, -1)  # make it the same shape as output
criterion = nn.MSELoss()

loss = criterion(output, target)
print(loss)

tensor(2.0263, grad_fn=<MseLossBackward0>)


Now, if you follow loss in the backward direction, using its .grad_fn attribute, you will see a graph of computations that looks like this:

input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d<br>
      -> flatten -> linear -> relu -> linear -> relu -> linear(output probabilities)<br>
      -> MSELoss<br>
      -> loss
      
So, when we call loss.backward(), the whole graph is differentiated w.r.t. the neural net parameters, and all Tensors in the graph that have requires_grad=True will have their .grad Tensor accumulated with the gradient.

In [11]:
# mse loss
print(loss.grad_fn)

<MseLossBackward0 object at 0x000001DF6577D520>


In [12]:
# linear layer of output probab
print(loss.grad_fn.next_functions[0][0])

<AddmmBackward0 object at 0x000001DF4F0826D0>


In [13]:
# ReLU layer
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])

<AccumulateGrad object at 0x000001DF65791580>


In [14]:
net.zero_grad()
# gradients are now set to zero
print("Conv1 bias before backward",net.conv1.bias.grad)
loss.backward()
print("Conv2 bias after backward",net.conv1.bias.grad)

Conv1 bias before backward None
Conv2 bias after backward tensor([ 0.0007, -0.0157,  0.0008,  0.0151,  0.0213, -0.0062])


In [15]:
# we can update the weights using gradient descent
# wt = wt - learning rate * gradient

learning_rate = 0.01
for f in net.parameters():
    f.data.sub_(f.grad.data*learning_rate)

In [17]:
# this weight updates happen in an optimizer `torch.optim`
optimizer = torch.optim.SGD(net.parameters(),lr = 1e-3)

# in training loop 
optimizer.zero_grad()
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()

In [18]:
import torch.optim as optim

# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)

# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()# Does the update