In [2]:
# Display the value of every top-level expression in a cell, not only the last one.
# Later cells rely on this to show several tensors from a single cell.
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = 'all' 

In [8]:
import torch
import torch.nn as nn
import numpy as np

# Linear layer

In [7]:
# Toy batch: 10 learning examples (rows), each described by 4 features (columns).
inputs = torch.randn((10, 4))

# in_features must equal inputs.shape[1]; out_features is the number of
# features of the target.
fc = nn.Linear(4, 1)

# One value per example, so the result has shape (10, 1).
outputs = fc(inputs)

In [8]:
# Echo the input batch and the layer's outputs, then confirm the output shape.
inputs
outputs

outputs.shape

tensor([[ 0.3102,  0.1271, -0.0823,  1.0858],
        [-1.1686,  1.6650,  1.3797, -0.9915],
        [ 0.3897,  0.3723,  1.0115,  0.1784],
        [ 1.9373, -1.0086,  0.7447,  0.2432],
        [ 0.4305,  0.0627,  0.7668, -1.9446],
        [ 1.0522, -0.2573,  0.6940,  0.0913],
        [-0.0278, -0.0746, -0.0560, -1.1732],
        [-1.1034,  0.8108, -1.1170, -0.2516],
        [-1.1515,  1.2772, -1.6101, -0.8164],
        [-0.6352,  0.9345,  0.4916,  1.1171]])

tensor([[-0.2397],
        [-0.9325],
        [-0.7593],
        [-0.8897],
        [-1.2447],
        [-0.7963],
        [-0.7541],
        [-0.1225],
        [-0.1838],
        [-0.2319]], grad_fn=<AddmmBackward>)

torch.Size([10, 1])

In [9]:
# Inspect the learnable parameters of the 'fc' linear layer.
# The displayed weight has one row of 4 values (out_features x in_features)
# and the bias has a single value (one per output feature).
fc.weight
fc.bias

Parameter containing:
tensor([[-0.2009, -0.0747, -0.2266,  0.2618]], requires_grad=True)

Parameter containing:
tensor([-0.4708], requires_grad=True)

The computation behind the linear layer is just basic matrix algebra (a matrix product plus a bias):
$$outputs = inputs \times weight^T + bias$$
Let's manually implement it

In [12]:
# Manual forward pass: multiply by the transposed weights, then add the bias.
# The values match the outputs of the 'fc' layer shown earlier.
inputs @ fc.weight.t() + fc.bias

tensor([[-0.2397],
        [-0.9325],
        [-0.7593],
        [-0.8897],
        [-1.2447],
        [-0.7963],
        [-0.7541],
        [-0.1225],
        [-0.1838],
        [-0.2319]], grad_fn=<AddBackward0>)

In [17]:
# bias=False removes the additive term, leaving a pure matrix product.
fc1 = nn.Linear(4, 1, bias=False)

fc1(inputs)
# Manual equivalent: with no bias, inputs x weight^T alone reproduces fc1(inputs).
inputs.mm(fc1.weight.t())

tensor([[-0.4864],
        [ 1.7685],
        [ 0.5358],
        [-0.1849],
        [ 1.2545],
        [ 0.1672],
        [ 0.4731],
        [-0.0845],
        [ 0.1134],
        [ 0.1043]], grad_fn=<MmBackward>)

tensor([[-0.4864],
        [ 1.7685],
        [ 0.5358],
        [-0.1849],
        [ 1.2545],
        [ 0.1672],
        [ 0.4731],
        [-0.0845],
        [ 0.1134],
        [ 0.1043]], grad_fn=<MmBackward>)

# Convolutional layers

The difference among Conv1d, Conv2d and Conv3d is the shape of the input they expect. Note that PyTorch places the channel dimension right after the batch dimension:
* Conv1d is used for 1D signals such as audio, with the input shape (batch_size, channels, W)
* Conv2d is used for images, with the input shape (batch_size, channels, H, W)
* Conv3d is usually used for videos, where you have a frame for each time step, with the input shape (batch_size, channels, D, H, W)

Here we focus on Conv2D

In [6]:
# 3 input channels -> 16 output channels, 3x3 kernel, stride of 2 on both axes.
conv2d = nn.Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2))

# The inputs are laid out as [batch_size, in_channels, height, width].
inputs = torch.rand((100, 3, 120, 160))
outputs = conv2d(inputs)

# Result layout: [batch_size, out_channels, height, width].
outputs.shape

torch.Size([100, 16, 59, 79])

You may be wondering how the output height and width are calculated. Here is the formula:

In [9]:
# inputs shape: [batch_size, in_channels, input_height, input_width]
# conv2d parameters: in_channels, out_channels, kernel_size, stride, padding, dilation
# outputs shape: [batch_size, out_channels, output_height, output_width]


def conv_output_dim(size, kernel, stride=1, padding=0, dilation=1):
    """Return the output length of one spatial axis of a convolution.

    Implements
        floor((size + 2*padding - dilation*(kernel - 1) - 1) / stride + 1)

    Args:
        size: input length along the axis (height or width).
        kernel: kernel length along the same axis.
        stride: step between consecutive kernel positions.
        padding: padding added on each side of the axis.
        dilation: spacing between kernel elements.

    Returns:
        The output length along that axis, as an int.
    """
    # Integer floor division keeps the arithmetic exact; it equals
    # floor(x / stride + 1) because adding 1 commutes with floor.
    return (size + 2 * padding - dilation * (kernel - 1) - 1) // stride + 1


batch_size, in_channels, input_height, input_width = 100, 3, 120, 160
out_channels, kernel_size, stride, padding, dilation = 16, [3,3], [2,2], [0,0], [1,1]
# the default value for stride, padding, dilation are [1,1], [0,0], [1,1]

# Index 0 of each per-axis list refers to the height, index 1 to the width.
output_height = conv_output_dim(input_height, kernel_size[0], stride[0], padding[0], dilation[0])
output_width = conv_output_dim(input_width, kernel_size[1], stride[1], padding[1], dilation[1])

print("Output size: {}".format([batch_size, out_channels, output_height, output_width]))
print("Weight/kernel size: {}".format([out_channels, in_channels, kernel_size[0], kernel_size[1]]))

Output size: [100, 16, 59, 79]
Weight/kernel size: [16, 3, 3, 3]


# Max Pool layer

nn.MaxPool2d applies 2D max pooling over an input signal composed of several input planes. The input shape is [batch_size, in_channels, image_height, image_width].

In [15]:
# Pooling window of 3 rows x 2 columns; stride is not given, so the layer's default applies.
maxpool = nn.MaxPool2d((3, 2), padding=0, dilation=1)

# Input layout: [batch_size, in_channels, height, width].
inputs = torch.rand((100, 3, 120, 160))
outputs = maxpool(inputs)

outputs.shape

torch.Size([100, 3, 40, 80])

Here is the formula to calculate the output height and width:

In [16]:
## ------------ MaxPool2d --------------- ##

# inputs shape:  [batch_size, in_channels, input_height, input_width]
# outputs shape: [batch_size, in_channels, output_height, output_width]
# (the printed output size below reuses in_channels for the channel axis)

batch_size, in_channels = 100, 3
input_height, input_width = 120, 160

# default values are: stride=kernel_size, padding=[0,0], dilation=[1,1]
kernel_size = [3, 2]
padding = [0, 0]
dilation = [1, 1]
stride = kernel_size

# Apply the same size formula to each spatial axis (index 0 = height, 1 = width).
output_height, output_width = (
    int(np.floor((extent + 2*padding[axis] - dilation[axis]*(kernel_size[axis]-1) - 1) / stride[axis] + 1))
    for axis, extent in enumerate((input_height, input_width))
)

print("Output size: {}".format([batch_size, in_channels, output_height, output_width]))

Output size: [100, 3, 40, 80]


# Batch Normalization

## BatchNorm1d

BatchNorm1d usually applies Batch Normalization over a 2D input, which is in the shape of [batch_size, num_features]. The mean and standard deviation are calculated per feature over the mini-batch, and the weight and bias are learnable parameter vectors.

In [19]:
# Batch norm over 5 input features (the first argument is the feature count).
batchNorm1 = nn.BatchNorm1d(num_features=5, affine=True)

# A batch of 10 examples laid out as [batch_size, num_features].
inputs = torch.randn((10, 5))
outputs = batchNorm1(inputs)
outputs

tensor([[-0.2533,  0.7277,  0.9117, -0.5479, -0.6629],
        [ 1.8718,  0.9728, -0.3642,  1.8587,  0.0546],
        [ 0.1330,  2.1461, -1.2436, -0.4471, -0.3075],
        [-0.5346, -0.4766, -0.5465,  0.7113, -0.3958],
        [ 0.0586,  0.0893, -0.3842, -1.0007, -0.0269],
        [ 0.7026, -1.0440, -1.3213,  0.3033,  0.1506],
        [ 0.1498, -0.0740, -0.3236,  0.2132, -1.6511],
        [-0.6272, -0.4069,  1.7836, -0.5955,  2.1632],
        [ 0.6795, -0.4418,  1.4164,  1.1529, -0.5701],
        [-2.1803, -1.4923,  0.0716, -1.6482,  1.2458]],
       grad_fn=<NativeBatchNormBackward>)

Let's reproduce the above calculation:

In [20]:
# Per-column statistics over the batch: first the mean, then the biased
# variance (unbiased=False).
torch.mean(inputs, dim=0)
torch.var(inputs, dim=0, unbiased=False)

tensor([ 1.8866e-01, -5.7365e-01, -9.9263e-02, -3.3903e-04, -3.8356e-01])

tensor([1.2932, 0.7139, 1.5251, 1.4939, 0.5185])

$$ y = \frac{x-E[x]}{\sqrt{Var[x]+\epsilon}}*weight + bias$$

In [24]:
# Manual batch norm: centre each column, divide by sqrt(biased variance + eps),
# then apply the layer's weight and bias.
(
    (inputs - inputs.mean(dim=0))
    .div(torch.sqrt(inputs.var(dim=0, unbiased=False) + batchNorm1.eps))
    .mul(batchNorm1.weight)
    .add(batchNorm1.bias)
)

tensor([[-0.2533,  0.7277,  0.9117, -0.5479, -0.6629],
        [ 1.8718,  0.9728, -0.3642,  1.8587,  0.0546],
        [ 0.1330,  2.1461, -1.2436, -0.4471, -0.3075],
        [-0.5346, -0.4766, -0.5465,  0.7113, -0.3958],
        [ 0.0586,  0.0893, -0.3842, -1.0007, -0.0269],
        [ 0.7026, -1.0440, -1.3213,  0.3033,  0.1506],
        [ 0.1498, -0.0740, -0.3236,  0.2132, -1.6511],
        [-0.6272, -0.4069,  1.7836, -0.5955,  2.1632],
        [ 0.6795, -0.4418,  1.4164,  1.1529, -0.5701],
        [-2.1803, -1.4923,  0.0716, -1.6482,  1.2458]], grad_fn=<AddBackward0>)

## BatchNorm2d

BatchNorm2d applies Batch Normalization over a 4D input with the shape of [batch_size, in_channels, height, width]. The mean and standard deviation are calculated per channel, over the batch, height and width dimensions.

In [26]:
# Batch norm for 4D input; the first parameter (3) is the in_channels of the input.
batchNorm2d = nn.BatchNorm2d(num_features=3, affine=True)

# Layout: [batch_size, in_channels, height, width].
inputs = torch.randn((2, 3, 2, 2))
outputs = batchNorm2d(inputs)

outputs

tensor([[[[ 0.0825, -0.2377],
          [ 2.4320, -0.7050]],

         [[-0.8265, -0.7808],
          [ 1.2174, -1.3867]],

         [[ 0.9693, -0.7551],
          [-0.3473, -0.7084]]],


        [[[-0.3053, -0.4256],
          [-1.0903,  0.2493]],

         [[-0.1257,  0.1201],
          [ 1.8087, -0.0265]],

         [[-1.2152, -0.6561],
          [ 0.9815,  1.7313]]]], grad_fn=<NativeBatchNormBackward>)

Let's reproduce that with the formula below:
$$ y = \frac{x-E[x]}{\sqrt{Var[x]+\epsilon}}*weight + bias$$

In [39]:
# Per-channel mean and biased variance of `inputs`.
# Reducing over dims (0, 2, 3) -- batch, height and width -- leaves one value
# per channel, for any number of channels (no hard-coded channel indices).
# view(-1, 1, 1) reshapes those values to [channels, 1, 1] and expand_as
# repeats them to the full shape of `inputs`.

means = inputs.mean(dim=(0, 2, 3)).view(-1, 1, 1).expand_as(inputs)
# NOTE: `vars` shadows the Python builtin of the same name; the name is kept
# because the following cell reads it.
vars = inputs.var(dim=(0, 2, 3), unbiased=False).view(-1, 1, 1).expand_as(inputs)

In [38]:
# Manual batch norm: weight * (x - mean) / sqrt(var + eps) + bias, with the
# per-channel weight and bias reshaped to [channels, 1, 1] and expanded to the
# shape of `inputs`.
(
    batchNorm2d.weight.view(-1, 1, 1).expand_as(inputs)
    .mul(inputs - means)
    .div(torch.sqrt(vars + batchNorm2d.eps))
    .add(batchNorm2d.bias.view(-1, 1, 1).expand_as(inputs))
)


tensor([[[[ 0.0825, -0.2377],
          [ 2.4320, -0.7050]],

         [[-0.8265, -0.7808],
          [ 1.2174, -1.3867]],

         [[ 0.9693, -0.7551],
          [-0.3473, -0.7084]]],


        [[[-0.3053, -0.4256],
          [-1.0903,  0.2493]],

         [[-0.1257,  0.1201],
          [ 1.8087, -0.0265]],

         [[-1.2152, -0.6561],
          [ 0.9815,  1.7313]]]], grad_fn=<AddBackward0>)