# PyTorch 2 - torch.nn

Based on:<br>
* [torch.nn tutorial](https://pytorch.org/tutorials/beginner/nn_tutorial.html)
* [torch.nn docs](https://pytorch.org/docs/stable/nn.html)

## Imports

In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# check version number
torch.__version__

'1.0.1.post2'

In [3]:
# check if CUDA/GPU is available
torch.cuda.is_available()

False

In [4]:
# if CUDA/GPU is available print CUDA version
# (an expression nested inside an `if` is not echoed by the notebook,
# so it has to be printed explicitly)
if torch.cuda.is_available():
    print(torch.version.cuda)

## torch.nn.Module

In [5]:
# create class instance
lin = nn.Linear(10,5)

In [6]:
lin

Linear(in_features=10, out_features=5, bias=True)

In [7]:
# access weights and biases
lin.weight, lin.bias

(Parameter containing:
 tensor([[-0.1394,  0.0499,  0.0403,  0.2514,  0.0284,  0.2818, -0.0930, -0.1490,
          -0.0486, -0.1828],
         [-0.0944,  0.2403,  0.1509,  0.2226,  0.0861, -0.0057, -0.2761, -0.1078,
          -0.0332, -0.2867],
         [ 0.2942, -0.1235, -0.0893, -0.0893,  0.1752,  0.1301, -0.0922, -0.1348,
          -0.2011, -0.2492],
         [-0.2428, -0.2016, -0.0362,  0.0569, -0.2164,  0.1301, -0.0795, -0.2278,
          -0.2559, -0.0671],
         [-0.2640,  0.0109,  0.0875,  0.2905,  0.1151,  0.2092, -0.0602,  0.2911,
           0.0954, -0.2006]], requires_grad=True), Parameter containing:
 tensor([ 0.2120, -0.1809, -0.2026, -0.1381,  0.2816], requires_grad=True))

In [9]:
# add ".data" to get only the underlying tensor (without the Parameter wrapper)
# (for a tensor containing a single value, .item() returns it as a Python number)
lin.weight.data

tensor([[ 0.2379, -0.1646,  0.0473, -0.1541, -0.1795,  0.1025,  0.0527,  0.2035,
          0.0800, -0.0874],
        [-0.1407,  0.0054,  0.0078,  0.2456, -0.3129, -0.0460,  0.1887,  0.1275,
          0.2313,  0.2686],
        [-0.0712, -0.2136,  0.1927, -0.2583, -0.2773, -0.0944,  0.0034, -0.2987,
         -0.1498,  0.1514],
        [ 0.0770,  0.1774, -0.0245, -0.2541, -0.2489, -0.1568,  0.1003,  0.2622,
         -0.2504, -0.2198],
        [ 0.0106, -0.1580, -0.2525,  0.2910, -0.0370,  0.2523,  0.1660, -0.1993,
          0.0585, -0.2405]])

In [10]:
#dir(lin.weight)

In [11]:
# .data gets the tensor, and in combination with .numpy() an np array.
lin.weight.data.numpy()

array([[ 0.23786381, -0.1645965 ,  0.04727349, -0.1540767 , -0.17953578,
         0.10246778,  0.05267373,  0.2035304 ,  0.08003432, -0.08739886],
       [-0.14065963,  0.0054408 ,  0.0078055 ,  0.24562249, -0.31291765,
        -0.04602849,  0.18874463,  0.12750387,  0.23134974,  0.26857302],
       [-0.07116151, -0.21360278,  0.1927335 , -0.25826943, -0.27726087,
        -0.09444   ,  0.00337893, -0.29869476, -0.14975534,  0.15139988],
       [ 0.07702318,  0.17735332, -0.02446824, -0.2540539 , -0.2488738 ,
        -0.15676379,  0.10031453,  0.26223698, -0.25042427, -0.21981898],
       [ 0.01058552, -0.15795133, -0.25250363,  0.29097852, -0.03704199,
         0.25232926,  0.166044  , -0.19929628,  0.05854887, -0.24048506]],
      dtype=float32)

In [8]:
conv = nn.Conv2d(3,32,3)

In [9]:
conv(torch.randn(1,3,28,28)).shape

torch.Size([1, 32, 26, 26])

See [torch.nn docs](https://pytorch.org/docs/stable/nn.html) for all the options!

## Neural networks

In [10]:
my_first_neural_network = nn.Sequential(nn.Linear(64,32),nn.ReLU(),nn.Linear(32,1),nn.Sigmoid())

In [11]:
my_first_neural_network

Sequential(
  (0): Linear(in_features=64, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=1, bias=True)
  (3): Sigmoid()
)

In [12]:
# run tensor through network
my_first_neural_network(torch.randn(1,64))

tensor([[0.5647]], grad_fn=<SigmoidBackward>)

In [27]:
# squeeze output to get rank-0 tensor
my_first_neural_network(torch.randn(1,64)).squeeze()

tensor(0.5640, grad_fn=<SqueezeBackward0>)

In [None]:
# get the output with .item()
my_first_neural_network(torch.randn(1,64)).item()

In [18]:
# print all the parameters and their gradients
# (.grad is None for a parameter until a backward pass has populated it)
for name, param in my_first_neural_network.named_parameters():
    print(name, ':\nparam:', param, '\nparam.grad:', param.grad)

0.weight :
param: Parameter containing:
tensor([[-0.1235, -0.0776, -0.0610,  ...,  0.0916,  0.0746, -0.1060],
        [-0.0233, -0.0515, -0.0584,  ...,  0.0418, -0.0975, -0.1035],
        [-0.0341,  0.0495, -0.0390,  ..., -0.0283,  0.0513, -0.0460],
        ...,
        [-0.0357,  0.1239, -0.0695,  ...,  0.0321, -0.0954,  0.1101],
        [ 0.0496, -0.0230,  0.0795,  ...,  0.1218,  0.1227,  0.0270],
        [ 0.1110,  0.0312, -0.0408,  ...,  0.0363,  0.1101,  0.0979]],
       requires_grad=True) nparam.grad: None
0.bias :
param: Parameter containing:
tensor([-0.1072,  0.0293, -0.1230,  0.0637,  0.0511, -0.0294, -0.0866, -0.0430,
        -0.0413, -0.0589,  0.0043, -0.0906,  0.0062, -0.0553, -0.0051, -0.0693,
         0.1203,  0.1058,  0.0142,  0.0726,  0.0804,  0.0819,  0.0177,  0.0371,
         0.0147, -0.0824,  0.0406, -0.1154,  0.1220,  0.0925, -0.1125, -0.0042],
       requires_grad=True) nparam.grad: None
2.weight :
param: Parameter containing:
tensor([[-0.1486, -0.0795, -0.0716,  

In [19]:
# easy way to get NN parameters with requires_grad == True
param_dict = {name: param for name, param in my_first_neural_network.named_parameters() if param.requires_grad}

In [20]:
param_dict

{'0.weight': Parameter containing:
 tensor([[-0.1235, -0.0776, -0.0610,  ...,  0.0916,  0.0746, -0.1060],
         [-0.0233, -0.0515, -0.0584,  ...,  0.0418, -0.0975, -0.1035],
         [-0.0341,  0.0495, -0.0390,  ..., -0.0283,  0.0513, -0.0460],
         ...,
         [-0.0357,  0.1239, -0.0695,  ...,  0.0321, -0.0954,  0.1101],
         [ 0.0496, -0.0230,  0.0795,  ...,  0.1218,  0.1227,  0.0270],
         [ 0.1110,  0.0312, -0.0408,  ...,  0.0363,  0.1101,  0.0979]],
        requires_grad=True), '0.bias': Parameter containing:
 tensor([-0.1072,  0.0293, -0.1230,  0.0637,  0.0511, -0.0294, -0.0866, -0.0430,
         -0.0413, -0.0589,  0.0043, -0.0906,  0.0062, -0.0553, -0.0051, -0.0693,
          0.1203,  0.1058,  0.0142,  0.0726,  0.0804,  0.0819,  0.0177,  0.0371,
          0.0147, -0.0824,  0.0406, -0.1154,  0.1220,  0.0925, -0.1125, -0.0042],
        requires_grad=True), '2.weight': Parameter containing:
 tensor([[-0.1486, -0.0795, -0.0716,  0.0629, -0.0544,  0.0718, -0.1625, -0

In [21]:
# freeze all parameters in a NN
for param in my_first_neural_network.parameters():
    param.requires_grad_(False)

In [22]:
param_dict = {name: param for name, param in my_first_neural_network.named_parameters() if param.requires_grad}
param_dict

{}

In [15]:
my_first_neural_network[1] = nn.SELU()

In [16]:
my_first_neural_network

Sequential(
  (0): Linear(in_features=64, out_features=32, bias=True)
  (1): SELU()
  (2): Linear(in_features=32, out_features=1, bias=True)
  (3): Sigmoid()
)

In [23]:
# unfreeze the last layer that has parameters (the final Linear at index -2; the Sigmoid at -1 has none)
my_first_neural_network[-2].weight.requires_grad_(True);
my_first_neural_network[-2].bias.requires_grad_(True);

In [24]:
param_dict = {name: param for name, param in my_first_neural_network.named_parameters() if param.requires_grad}
param_dict

{'2.weight': Parameter containing:
 tensor([[-0.1486, -0.0795, -0.0716,  0.0629, -0.0544,  0.0718, -0.1625, -0.1192,
          -0.0579, -0.0227,  0.1214, -0.1168,  0.1385, -0.0454, -0.0547, -0.1180,
           0.0825, -0.0096, -0.1004, -0.0854, -0.0659, -0.1035,  0.1126, -0.1017,
           0.0130,  0.0360, -0.1637,  0.1634,  0.0767, -0.1760,  0.0429, -0.1716]],
        requires_grad=True), '2.bias': Parameter containing:
 tensor([0.0449], requires_grad=True)}

## Weight initialisation

See [Udacity DL PyTorch weight init notebook](https://github.com/udacity/deep-learning-v2-pytorch/tree/master/weight-initialization).

In [26]:
[m.__class__.__name__ for m in my_first_neural_network]

['Linear', 'ReLU', 'Linear', 'Sigmoid']

In [46]:
# custom weights initialization (from https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html)
def weights_init(m):
    """Initialise the parameters of a single module in place (DCGAN style).

    Meant to be handed to ``nn.Module.apply``, which calls it once for every
    submodule. The layer kind is recognised by a substring match on the
    module's class name:

    * name contains ``'Conv'``      -- weight ~ N(0.0, 0.02); bias untouched
    * name contains ``'BatchNorm'`` -- weight ~ N(1.0, 0.02); bias set to 0
    * name contains ``'Linear'``    -- weight and bias ~ N(0.0, 0.02)

    Modules matching none of these are left unchanged. A parameter that is
    absent or ``None`` (e.g. ``bias=False``, ``affine=False``, or a container
    class that merely has a matching name) is skipped instead of raising.

    Args:
        m: the module whose parameters should be initialised.
    """
    classname = m.__class__.__name__
    # getattr with a default: nn.Module raises AttributeError for unknown
    # attributes, and registered-but-disabled parameters are stored as None.
    weight = getattr(m, 'weight', None)
    bias = getattr(m, 'bias', None)

    if 'Conv' in classname:
        if weight is not None:
            nn.init.normal_(weight.data, 0.0, 0.02)
    elif 'BatchNorm' in classname:
        if weight is not None:
            nn.init.normal_(weight.data, 1.0, 0.02)
        if bias is not None:
            nn.init.constant_(bias.data, 0)
    elif 'Linear' in classname:
        if weight is not None:
            nn.init.normal_(weight.data, 0.0, 0.02)
        if bias is not None:
            nn.init.normal_(bias.data, 0.0, 0.02)

In [72]:
# Apply the weights_init function to initialize all weights
my_first_neural_network.apply(weights_init)

Sequential(
  (0): Linear(in_features=64, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=1, bias=True)
  (3): Sigmoid()
)

## Custom nn classes

Minimal code to subclass nn.Module in PyTorch:
```
class NewClass(nn.Module):
   def __init__(self): # override constructor
      super().__init__() # call super class constructor
      ...

   def forward(self, ...):
      ...
      return ...
```

In [19]:
# Based on https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py
class GeLU(nn.Module):
    """Gaussian Error Linear Unit activation (tanh approximation).

    Evaluates ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise. The module defines no parameters of its own.
    """

    def forward(self, x):
        """Return the activation applied element-wise to tensor ``x``."""
        cubic_term = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(math.sqrt(2 / math.pi) * cubic_term)
        return 0.5 * x * gate

In [20]:
gelu = GeLU()

In [21]:
gelu(torch.randn(6,3))

tensor([[-0.1478, -0.0914,  0.1858],
        [-0.1656,  0.1073, -0.0489],
        [ 0.3415, -0.1373, -0.0405],
        [ 0.3330, -0.1672, -0.0821],
        [ 0.6911,  0.7304,  1.3309],
        [-0.0269,  0.2888,  1.0044]])

## torch.nn.functional or F

See https://discuss.pytorch.org/t/whats-the-difference-between-torch-nn-functional-and-torch-nn/681/2?u=micpie

## Other useful tensor operations

In [46]:
# squeeze removes all dimensions (axes) of length 1:
torch.randn(3,1,5,1,1).squeeze().shape

torch.Size([3, 5])

In [47]:
# the same can also be achieved with .view (the target shape must then be given explicitly):
torch.randn(3,1,5,1,1).view(3,5).shape

torch.Size([3, 5])