In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import torch.nn.functional as F

In [3]:
import torch
torch.__version__

'1.4.0'

## Does nn.Conv2d init work well?

[Jump_to lesson 9 video](https://course.fast.ai/videos/?lesson=9&t=21)

In [4]:
#export
import sys
sys.path.append('../')
from exp.nb_02 import *


def get_data():
    """Download the pickled MNIST archive and return its train/valid arrays as tensors.

    Returns:
        A lazy ``map`` yielding ``x_train, y_train, x_valid, y_valid`` (in that
        order), each converted with ``tensor``. The third split stored in the
        archive is discarded.
    """
    archive = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(archive, 'rb') as fh:
        # The pickle holds three (inputs, labels) pairs; only the first two are used.
        train_set, valid_set, _ = pickle.load(fh, encoding='latin-1')
    x_train, y_train = train_set
    x_valid, y_valid = valid_set
    return map(tensor, (x_train, y_train, x_valid, y_valid))


def normalize(x, m, s):
    """Standardize ``x``: subtract ``m`` and divide the result by ``s``."""
    centered = x - m
    return centered / s

In [5]:
torch.nn.modules.conv._ConvNd.reset_parameters??

In [6]:
# Signature: torch.nn.modules.conv._ConvNd.reset_parameters(self)
# Docstring: <no docstring>
# Source:   
#     def reset_parameters(self):
#         init.kaiming_uniform_(self.weight, a=math.sqrt(5))
#         if self.bias is not None:
#             fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
#             bound = 1 / math.sqrt(fan_in)
#             init.uniform_(self.bias, -bound, bound)
# File:      ~/anaconda3/envs/dl-zoo/lib/python3.7/site-packages/torch/nn/modules/conv.py
# Type:      function

In [7]:
# Load the data, then standardize both splits with the *training* mean/std so
# the validation set is scaled by exactly the same statistics.
x_train, y_train, x_valid, y_valid = get_data()
train_mean, train_std = x_train.mean(), x_train.std()
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

In [8]:
# Reshape to N x C x H x W (a single channel, 28 x 28 pixels) so the images can be fed to Conv layers
x_train = x_train.view(-1, 1, 28, 28)
x_valid = x_valid.view(-1, 1, 28, 28)
x_train.shape,x_valid.shape

(torch.Size([50000, 1, 28, 28]), torch.Size([10000, 1, 28, 28]))

In [9]:
# n: number of training samples (first dimension of x_train)
n, *_ = x_train.shape
# c: largest label + 1, i.e. the class count if labels start at 0 (a 0-dim tensor, not a Python int)
c = y_train.max() + 1
# nh: number of output channels of the conv layer examined below
nh = 32
n, c

(50000, tensor(10))

In [10]:
l1 = nn.Conv2d(1, nh, 5)

In [11]:
x = x_valid[:100]

In [12]:
x.shape

torch.Size([100, 1, 28, 28])

In [13]:
def stats(x):
    """Return the ``(mean, std)`` pair of ``x``, as computed by its own methods."""
    mean, std = x.mean(), x.std()
    return mean, std

- weight shape in Conv2D is `out_channels x input_channels x filter_height x filter_width`.

In [14]:
l1.weight.shape

torch.Size([32, 1, 5, 5])

In [15]:
stats(l1.weight), stats(l1.bias)

((tensor(6.3121e-06, grad_fn=<MeanBackward0>),
  tensor(0.1165, grad_fn=<StdBackward0>)),
 (tensor(-0.0369, grad_fn=<MeanBackward0>),
  tensor(0.1172, grad_fn=<StdBackward0>)))

In [16]:
t = l1(x)
stats(t)

(tensor(-0.0375, grad_fn=<MeanBackward0>),
 tensor(0.6822, grad_fn=<StdBackward0>))

- Kaiming normal is generally used when the activation function is either **ReLU** or **Leaky ReLU**.
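- Here the output of the conv layer is not passed through any activation, which is equivalent to a leaky ReLU with negative slope **a = 1** (the identity), so we use `a = 1` (gain = 1).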

In [17]:
# a=1 gives a gain of sqrt(2 / (1 + 1**2)) = 1; the output below is taken
# straight from l1 with no activation applied.
init.kaiming_normal_(l1.weight, a=1.)
stats(l1(x))

(tensor(-0.0269, grad_fn=<MeanBackward0>),
 tensor(1.0910, grad_fn=<StdBackward0>))

Looks like it is working.

In [18]:
def f1(x, a=0):
    """Run ``x`` through the global conv layer ``l1``, then a leaky ReLU with negative slope ``a``.

    With the default ``a=0`` the activation is a plain ReLU.
    """
    conv_out = l1(x)
    return F.leaky_relu(conv_out, negative_slope=a)

In [19]:
# a=0 matches the default negative slope of f1 (plain ReLU), so init and
# activation agree here.
init.kaiming_normal_(l1.weight, a=0)
stats(f1(x))

(tensor(0.4253, grad_fn=<MeanBackward0>),
 tensor(0.8559, grad_fn=<StdBackward0>))

The mean is no longer zero: even though the inputs were normalized, ReLU replaces every value below zero with zero, which pushes the mean of the activations above zero.

In [20]:
# Re-create the layer so its weights come from nn.Conv2d's own default
# initialization again, then measure the activations after ReLU.
l1 = nn.Conv2d(1, nh, 5)
stats(f1(x))

(tensor(0.1911, grad_fn=<MeanBackward0>),
 tensor(0.3712, grad_fn=<StdBackward0>))

The std of the activations with the default initialization is only about **0.37**, which is not good at all.

In [21]:
l1.weight.shape

torch.Size([32, 1, 5, 5])

In [22]:
# receptive field size (filter_height x filter_width)
rec_fs = l1.weight[0, 0].numel()
rec_fs

25

In [23]:
nf, ni= l1.weight.shape[:2]
nf,ni

(32, 1)

In [24]:
fan_in  = ni * rec_fs
fan_out = nf * rec_fs
fan_in, fan_out

(25, 800)

$$gain = \sqrt\frac{2}{1 + a ^2}$$

In [25]:
def gain(a):
    """Kaiming gain for a leaky ReLU with negative slope ``a``: ``sqrt(2 / (1 + a^2))``."""
    slope_sq = a ** 2
    return math.sqrt(2.0 / (1 + slope_sq))

$a = 0$ is Relu.

In [26]:
gain(0), gain(0.01), gain(0.1), gain(1), gain(math.sqrt(5.))

(1.4142135623730951,
 1.4141428569978354,
 1.4071950894605838,
 1.0,
 0.5773502691896257)

The first 3 gains are close to $\sqrt{2}$.

PyTorch uses *kaiming_uniform* for initializing the weight tensors of Conv2D layer. The mean and std of a uniform distribution on $[a, b]$ are $\frac{a + b}{2}$ and $\frac{b - a}{\sqrt{12}}$.

In [27]:
(1 - (-1)) / math.sqrt(12)

0.5773502691896258

The gain for $a = \sqrt{5}$ is $\sqrt{\frac{2}{1 + 5}} = \frac{1}{\sqrt{3}} \approx 0.577$, which is the same as the std of uniform[-1, 1].

In [28]:
torch.zeros(10000).uniform_(-1, 1).std()

tensor(0.5748)

In [29]:
def kaiming2(x, a, use_fan_out=False):
    """Re-initialize the conv weight ``x`` in place with a Kaiming-uniform distribution.

    Args:
        x: weight tensor whose first two dimensions are (out_channels,
            in_channels); the remaining dimensions form the receptive field.
        a: negative slope handed to ``gain`` to compute the scaling factor.
        use_fan_out: scale by fan-out (``out_channels * receptive field``)
            instead of fan-in (``in_channels * receptive field``).

    Returns:
        None. ``x`` is modified in place.
    """
    nf, ni, *_ = x.shape
    # Number of elements in a single filter slice (e.g. 5 * 5 = 25).
    rec_fs = x[0, 0].shape.numel()
    fan = nf * rec_fs if use_fan_out else ni * rec_fs
    std = gain(a) / math.sqrt(fan)
    # A uniform distribution on [-bound, bound] has std = bound / sqrt(3).
    bound = math.sqrt(3.) * std
    # Mutate under no_grad instead of going through ``.data`` so autograd does
    # not record the in-place fill; this is how torch.nn.init does it as well.
    with torch.no_grad():
        x.uniform_(-bound, bound)

Since $std = \frac{b - a}{\sqrt{12}}$ for a uniform distribution on $[a, b]$, and assuming $a = -b$, we get $std = \frac{2b}{\sqrt{12}} = \sqrt\frac{4b^2}{12} = \frac{b}{\sqrt{3}}$
$$\rightarrow bound = std * \sqrt{3}$$

In [30]:
# Hand-rolled Kaiming-uniform init with a=0 (ReLU gain), then check the
# activations after ReLU.
kaiming2(l1.weight, a=0);
stats(f1(x))

(tensor(0.4547, grad_fn=<MeanBackward0>),
 tensor(0.8313, grad_fn=<StdBackward0>))

In [31]:
# a=sqrt(5) is the value _ConvNd.reset_parameters passes to kaiming_uniform_,
# so this reproduces the default Conv2d weight initialization.
kaiming2(l1.weight, a=math.sqrt(5.))
stats(f1(x))

(tensor(0.2163, grad_fn=<MeanBackward0>),
 tensor(0.3927, grad_fn=<StdBackward0>))

In [32]:
class Flatten(nn.Module):
    """Module that collapses its input into a single 1-D tensor."""

    def forward(self, x):
        # view(-1) folds every dimension, the batch one included, into one axis.
        flat = x.view(-1)
        return flat

In [33]:
# Four stride-2 conv layers with a ReLU after each of the first three. The last
# conv has a single output channel, which is average-pooled to 1x1 and then
# flattened, giving one value per input image.
m = nn.Sequential(
    nn.Conv2d(1, 8, 5, stride=2, padding=2), nn.ReLU(),
    nn.Conv2d(8, 16, 3, stride=2, padding=1), nn.ReLU(),
    nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
    nn.Conv2d(32, 1, 3, stride=2, padding=1),
    nn.AdaptiveAvgPool2d(1),
    Flatten(),
)

In [34]:
y = y_valid[:100].float()

In [35]:
t = m(x)
stats(t)

(tensor(0.0107, grad_fn=<MeanBackward0>),
 tensor(0.0080, grad_fn=<StdBackward0>))

That is bad because the input had a std = 1, the first hidden layer had std = 0.4 and the 4th layer had a std ~ 0. That is not good at all because that means you can't learn anything with 4 layers and you can't create a model with more layers due to this issue.

In [36]:
# Compare the model outputs with the float labels purely to obtain gradients
# to inspect. mse comes from exp.nb_02 (presumably mean squared error).
l = mse(t,y)
l.backward()

In [37]:
stats(m[0].weight.grad)

(tensor(0.0023), tensor(0.0378))

The gradients also have close to zero std.

In [38]:
init.kaiming_uniform_??

In [39]:
# Re-initialize every conv layer: Kaiming-uniform weights using
# kaiming_uniform_'s default arguments (rather than the a=sqrt(5) that
# _ConvNd.reset_parameters passes) and biases set to zero.
for l in m:
    if isinstance(l,nn.Conv2d):
        init.kaiming_uniform_(l.weight)
        l.bias.data.zero_()

In [40]:
t = m(x)
stats(t)

(tensor(-0.6975, grad_fn=<MeanBackward0>),
 tensor(0.3601, grad_fn=<StdBackward0>))

In [41]:
# Clear the gradients left by the earlier backward pass (run before the layers
# were re-initialized). backward() accumulates into .grad, so without this the
# statistics below would describe the sum of both passes.
m.zero_grad()
l = mse(t,y)
l.backward()
stats(m[0].weight.grad)

(tensor(0.6462), tensor(0.6631))

With this initialization, the stds of both the outputs and the gradients are much better and are no longer close to zero.

## Export

In [42]:
!python ../src/notebook2script.py 02a_why_sqrt5.ipynb

Converted 02a_why_sqrt5.ipynb to exp/nb_02a.py
