In [1]:
import torch
from torch import nn
from d2l import torch as d2l
from torch.nn import functional as F
from train_epoch.train import train_ch6

In [2]:
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Normalize ``X`` and return the output plus the updated moving statistics.

    The mode is picked from the autograd state rather than from a flag:
    when gradients are disabled (e.g. inside ``torch.no_grad()``) the supplied
    moving statistics are used for normalization and returned unchanged;
    otherwise the statistics of the current batch are used and the moving
    averages are updated.

    Args:
        X: Input tensor. With gradients enabled it must have rank 2
            (statistics taken over dim 0) or rank 4 (statistics taken over
            dims 0, 2 and 3).
        gamma: Scale applied to the normalized input.
        beta: Shift added after scaling.
        moving_mean: Running mean used when gradients are disabled.
        moving_var: Running variance used when gradients are disabled.
        eps: Constant added to the variance before the square root.
        momentum: Weight kept on the previous moving statistics; the current
            batch contributes ``1 - momentum``.

    Returns:
        A tuple ``(Y, moving_mean, moving_var)`` where ``Y`` is the scaled and
        shifted normalized input and both statistics are detached from the
        autograd graph.

    Raises:
        AssertionError: If gradients are enabled and ``X`` is not rank 2 or 4.
    """
    if not torch.is_grad_enabled():
        # Gradients disabled: normalize with the stored running statistics.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # Rank-2 input: one mean/variance per column.
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # Rank-4 input: one mean/variance per index of dim 1; keepdim so
            # the statistics broadcast against X.
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # The running averages are bookkeeping only, so keep them out of the
        # autograd graph instead of recording the ops and discarding them later.
        with torch.no_grad():
            moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
            moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta
    # detach() is the supported replacement for the legacy `.data` attribute.
    return Y, moving_mean.detach(), moving_var.detach()

In [3]:
class BatchNorm(nn.Module):
    """Batch-normalization layer built on the ``batch_norm`` function.

    ``gamma`` (scale, initialized to ones) and ``beta`` (shift, initialized to
    zeros) are trainable parameters. ``moving_mean`` and ``moving_var`` are
    registered as buffers so they are included in ``state_dict()`` and follow
    ``Module.to()`` / ``Module.cuda()``. As plain attributes they would be
    dropped when a checkpoint is saved and reloaded.

    Note: state dicts saved while the statistics were plain attributes do not
    contain ``moving_mean`` / ``moving_var``; loading such a checkpoint
    requires ``strict=False``.

    Args:
        num_features: Size of the normalized dimension (dim 1 of the
            parameter shape).
        num_dims: ``2`` gives parameters of shape ``(1, num_features)``; any
            other value gives ``(1, num_features, 1, 1)``.
        eps: Constant added to the variance before the square root.
        momentum: Weight kept on the previous moving statistics.
    """

    def __init__(self, num_features, num_dims, eps=1e-5, momentum=0.9):
        super().__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        self.eps = eps
        self.momentum = momentum
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Buffers: non-trainable state that is saved/loaded with the module.
        self.register_buffer("moving_mean", torch.zeros(shape))
        self.register_buffer("moving_var", torch.ones(shape))

    def forward(self, X):
        """Normalize ``X`` and store the statistics returned by ``batch_norm``."""
        # Buffers already follow Module.to(); this guard covers inputs that
        # arrive on a different device than the module currently lives on.
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        # Assigning a tensor to a registered buffer name keeps it a buffer.
        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma, self.beta, self.moving_mean, self.moving_var,
            eps=self.eps, momentum=self.momentum
        )
        return Y

In [4]:
# LeNet-style stack with the custom BatchNorm layer placed between every
# convolution / hidden linear layer and its sigmoid activation.
net = nn.Sequential(
    # Convolutional stage 1
    nn.Conv2d(1, 6, kernel_size=5),
    BatchNorm(6, num_dims=4),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    # Convolutional stage 2
    nn.Conv2d(6, 16, kernel_size=5),
    BatchNorm(16, num_dims=4),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    # Fully connected head
    nn.Linear(16*4*4, 120),
    BatchNorm(120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    BatchNorm(84, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(84, 10),
)

In [22]:
# Run configuration for the custom-BatchNorm network.
lr = 0.1
num_epochs = 10
batch_size = 256

train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Checkpoint locations are passed straight through to train_ch6.
train_ch6(
    net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu(),
    save_path="models/leNet_norm",
    load_dir="models/leNet_norm/best.ckpt",
)

load model from models/leNet_norm/best.ckpt
training on cuda:0


Epoch 1/10: 100%|██████████| 60000/60000 [00:02<00:00, 20151.29img/s, loss=0.2694, train_acc=0.9022]


epoch 0, loss 0.269, train acc 0.902, test_acc 0.893


Epoch 2/10: 100%|██████████| 60000/60000 [00:02<00:00, 21063.02img/s, loss=0.2607, train_acc=0.9053]


epoch 1, loss 0.261, train acc 0.905, test_acc 0.893


Epoch 3/10: 100%|██████████| 60000/60000 [00:02<00:00, 21343.80img/s, loss=0.2575, train_acc=0.9063]


epoch 2, loss 0.258, train acc 0.906, test_acc 0.887
Saved best model checkpoint to models/leNet_norm\0508121931\best.ckpt with test accuracy 0.887


Epoch 4/10: 100%|██████████| 60000/60000 [00:02<00:00, 21575.63img/s, loss=0.2542, train_acc=0.9082]


epoch 3, loss 0.254, train acc 0.908, test_acc 0.895


Epoch 5/10: 100%|██████████| 60000/60000 [00:02<00:00, 21419.66img/s, loss=0.2517, train_acc=0.9080]


epoch 4, loss 0.252, train acc 0.908, test_acc 0.896


Epoch 6/10: 100%|██████████| 60000/60000 [00:02<00:00, 21479.81img/s, loss=0.2494, train_acc=0.9090]


epoch 5, loss 0.249, train acc 0.909, test_acc 0.897
Saved best model checkpoint to models/leNet_norm\0508121931\best.ckpt with test accuracy 0.897


Epoch 7/10: 100%|██████████| 60000/60000 [00:02<00:00, 21233.71img/s, loss=0.2473, train_acc=0.9102]


epoch 6, loss 0.247, train acc 0.910, test_acc 0.894


Epoch 8/10: 100%|██████████| 60000/60000 [00:02<00:00, 21700.43img/s, loss=0.2452, train_acc=0.9115]


epoch 7, loss 0.245, train acc 0.911, test_acc 0.895


Epoch 9/10: 100%|██████████| 60000/60000 [00:02<00:00, 20788.88img/s, loss=0.2437, train_acc=0.9119]


epoch 8, loss 0.244, train acc 0.912, test_acc 0.898
Saved best model checkpoint to models/leNet_norm\0508121931\best.ckpt with test accuracy 0.898


Epoch 10/10: 100%|██████████| 60000/60000 [00:02<00:00, 21360.23img/s, loss=0.2417, train_acc=0.9116]


epoch 9, loss 0.242, train acc 0.912, test_acc 0.894
loss 0.242, train acc 0.912, test acc 0.894
70225.0 examples/sec on cuda:0


In [5]:
# Show the scale (gamma) and shift (beta) of the layer at index 1, flattened to 1-D.
net[1].gamma.reshape(-1), net[1].beta.reshape(-1)

(tensor([1., 1., 1., 1., 1., 1.], grad_fn=<ViewBackward0>),
 tensor([0., 0., 0., 0., 0., 0.], grad_fn=<ViewBackward0>))

In [6]:
# Same layer layout as the earlier network, but using PyTorch's built-in
# nn.BatchNorm2d / nn.BatchNorm1d layers.
net = nn.Sequential(
    # Convolutional stage 1
    nn.Conv2d(1, 6, kernel_size=5),
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    # Convolutional stage 2
    nn.Conv2d(6, 16, kernel_size=5),
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    # Fully connected head
    nn.Linear(256, 120),
    nn.BatchNorm1d(120),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    nn.BatchNorm1d(84),
    nn.Sigmoid(),
    nn.Linear(84, 10),
)

In [9]:
# Run configuration for the built-in-BatchNorm network.
lr = 0.1
num_epochs = 10
batch_size = 256

train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

train_ch6(
    net,
    train_iter,
    test_iter,
    num_epochs,
    lr,
    d2l.try_gpu(),
    save_path="models/leNet_norm",
    load_dir="models/leNet_norm/best.ckpt",
)

load model from models/leNet_norm/best.ckpt
training on cuda:0
pretrain test acc: 0.8797


Epoch 1/10: 100%|██████████| 60000/60000 [00:02<00:00, 22700.15img/s, loss=0.2438, train_acc=0.9109]


epoch 0, loss 0.244, train acc 0.911, test_acc 0.895


Epoch 2/10: 100%|██████████| 60000/60000 [00:02<00:00, 22597.36img/s, loss=0.2378, train_acc=0.9142]


epoch 1, loss 0.238, train acc 0.914, test_acc 0.890


Epoch 3/10: 100%|██████████| 60000/60000 [00:02<00:00, 22897.75img/s, loss=0.2351, train_acc=0.9138]


epoch 2, loss 0.235, train acc 0.914, test_acc 0.898
Saved best model checkpoint to models/leNet_norm\0508123535\best.ckpt with test accuracy 0.898


Epoch 4/10: 100%|██████████| 60000/60000 [00:02<00:00, 22402.94img/s, loss=0.2328, train_acc=0.9148]


epoch 3, loss 0.233, train acc 0.915, test_acc 0.899


Epoch 5/10: 100%|██████████| 60000/60000 [00:02<00:00, 22109.15img/s, loss=0.2311, train_acc=0.9163]


epoch 4, loss 0.231, train acc 0.916, test_acc 0.899


Epoch 6/10: 100%|██████████| 60000/60000 [00:02<00:00, 22710.35img/s, loss=0.2293, train_acc=0.9163]


epoch 5, loss 0.229, train acc 0.916, test_acc 0.899
Saved best model checkpoint to models/leNet_norm\0508123535\best.ckpt with test accuracy 0.899


Epoch 7/10: 100%|██████████| 60000/60000 [00:02<00:00, 22764.62img/s, loss=0.2284, train_acc=0.9169]


epoch 6, loss 0.228, train acc 0.917, test_acc 0.898


Epoch 8/10: 100%|██████████| 60000/60000 [00:02<00:00, 23087.15img/s, loss=0.2263, train_acc=0.9181]


epoch 7, loss 0.226, train acc 0.918, test_acc 0.899


Epoch 9/10: 100%|██████████| 60000/60000 [00:02<00:00, 22819.99img/s, loss=0.2252, train_acc=0.9176]


epoch 8, loss 0.225, train acc 0.918, test_acc 0.900
Saved best model checkpoint to models/leNet_norm\0508123535\best.ckpt with test accuracy 0.900


Epoch 10/10: 100%|██████████| 60000/60000 [00:02<00:00, 22991.33img/s, loss=0.2239, train_acc=0.9185]


epoch 9, loss 0.224, train acc 0.919, test_acc 0.882
loss 0.224, train acc 0.919, test acc 0.882
90975.0 examples/sec on cuda:0
