In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

## Does nn.Conv2d init work well?

In [4]:
#export 
from exp.nb_02 import * 

def get_data():
    path = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(path, 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    return map(tensor, (x_train,y_train,x_valid,y_valid))

def normalize(x, m, s): return (x-m)/s

In [None]:
torch.nn.modules.conv._ConvNd.reset_parameters??

In [5]:
x_train,y_train,x_valid,y_valid = get_data()
train_mean,train_std = x_train.mean(),x_train.std()
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

In [6]:
x_train = x_train.view(-1,1,28,28)
x_valid = x_valid.view(-1,1,28,28)
x_train.shape,x_valid.shape

(torch.Size([50000, 1, 28, 28]), torch.Size([10000, 1, 28, 28]))

In [7]:
n,*_ = x_train.shape
c = y_train.max()+1
nh = 32
n,c

(50000, tensor(10))

In [10]:
l1 = nn.Conv2d(1, nh, 5)

In [11]:
x = x_valid[:100]

In [12]:
x.shape

torch.Size([100, 1, 28, 28])

In [13]:
def stats(x): return x.mean(),x.std()

In [14]:
l1.weight.shape

torch.Size([32, 1, 5, 5])

In [15]:
stats(l1.weight),stats(l1.bias)

((tensor(0.0076, grad_fn=<MeanBackward0>),
  tensor(0.1144, grad_fn=<StdBackward0>)),
 (tensor(0.0273, grad_fn=<MeanBackward0>),
  tensor(0.1132, grad_fn=<StdBackward0>)))

In [16]:
t = l1(x)

In [17]:
stats(t)

(tensor(0.0457, grad_fn=<MeanBackward0>),
 tensor(0.6746, grad_fn=<StdBackward0>))

In [18]:
init.kaiming_normal_(l1.weight, a=1.)
stats(l1(x))

(tensor(0.0285, grad_fn=<MeanBackward0>),
 tensor(1.0588, grad_fn=<StdBackward0>))

In [19]:
import torch.nn.functional as F

In [21]:
def f1(x, a=0): return F.leaky_relu(l1(x), a)

In [23]:
init.kaiming_normal_(l1.weight, a=0)
stats(f1(x))

(tensor(0.5159, grad_fn=<MeanBackward0>),
 tensor(0.9249, grad_fn=<StdBackward0>))

In [24]:
l1 = nn.Conv2d(1, nh, 5)
stats(f1(x))

(tensor(0.2264, grad_fn=<MeanBackward0>),
 tensor(0.3803, grad_fn=<StdBackward0>))

In [25]:
l1.weight.shape

torch.Size([32, 1, 5, 5])

In [26]:
#receptive field size 
rec_fs = l1.weight[0,0].numel()
rec_fs

25

In [27]:
nf, ni, *_ = l1.weight.shape
nf, ni

(32, 1)

In [28]:
fan_in = ni*rec_fs
fan_out = nf*rec_fs
fan_in, fan_out

(25, 800)

In [29]:
#formula for kaimining.init
def gain(a): return math.sqrt(2.0 / (1 + a**2))

In [30]:
gain(1), gain(0), gain(0.01), gain(math.sqrt(5.))

(1.0, 1.4142135623730951, 1.4141428569978354, 0.5773502691896257)

In [31]:
torch.zeros(10000).uniform_(-1,1).std()

tensor(0.5813)

In [32]:
def kaiming2(x,a, use_fan_out=False):
    nf,ni,*_ = x.shape
    rec_fs = x[0,0].shape.numel()
    fan = nf*rec_fs if use_fan_out else ni*rec_fs
    std = gain(a) / math.sqrt(fan)
    bound = math.sqrt(3.) * std
    x.data.uniform_(-bound,bound)

In [33]:
kaiming2(l1.weight, a=0);
stats(f1(x))

(tensor(0.5465, grad_fn=<MeanBackward0>),
 tensor(1.0243, grad_fn=<StdBackward0>))

In [34]:
#Pytorch's version
kaiming2(l1.weight, a=math.sqrt(5.))
stats(f1(x))

(tensor(0.2456, grad_fn=<MeanBackward0>),
 tensor(0.4309, grad_fn=<StdBackward0>))

In [35]:
class Flatten(nn.Module): 
    def forward(self, x): return x.view(-1)

In [52]:
m = nn.Sequential(
    nn.Conv2d(1,8,5, stride=2, padding=1), 
    nn.ReLU(),
    nn.Conv2d(8,16,3, stride=2, padding=1), 
    nn.ReLU(),
    nn.Conv2d(16,32,3, stride=2, padding=1),
    nn.ReLU(), 
    nn.Conv2d(32,1,3, stride=2, padding=1), 
    nn.AdaptiveAvgPool2d(1),
    Flatten(), 
)

In [53]:
m

Sequential(
  (0): Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
  (1): ReLU()
  (2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (3): ReLU()
  (4): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (5): ReLU()
  (6): Conv2d(32, 1, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (7): AdaptiveAvgPool2d(output_size=1)
  (8): Flatten()
)

In [54]:
y = y_valid[:100].float()
x = x_valid[:100]

In [55]:
t = m(x)
stats(t)

(tensor(-0.0298, grad_fn=<MeanBackward0>),
 tensor(0.0078, grad_fn=<StdBackward0>))

In [56]:
y.shape, t.shape

(torch.Size([100]), torch.Size([100]))

In [57]:
l = mse(t,y)
l.backward()

In [58]:
stats(m[0].weight.grad)

(tensor(-0.0045), tensor(0.0422))

In [59]:
# init.kaiming_uniform_??

In [60]:
for l in m:
    if isinstance(l,nn.Conv2d):
        init.kaiming_uniform_(l.weight)
        l.bias.data.zero_()

In [61]:
t = m(x)
stats(t)

(tensor(0.1404, grad_fn=<MeanBackward0>),
 tensor(0.3267, grad_fn=<StdBackward0>))

In [62]:
l = mse(t,y)
l.backward()
stats(m[0].weight.grad)

(tensor(-0.1129), tensor(0.4858))

## Export

In [63]:
! python notebook2script.py 02a_why_sqrt5.ipynb

Converted 02a_why_sqrt5.ipynb to exp/nb_02a.py
