# **VGG**

Visual Geometry Group (VGG) is basicly a bigger and deeper AlexNet with repeated VGG blocks.

In [None]:
%matplotlib inline
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
import numpy as np
import torchvision
import torchvision.datasets as datasets
from torchvision import transforms
import torch.optim as optim
import time

batch_size = 128
num_epochs = 5
# Note not to flip two transform types, otherwise data type would be wrong.
transform = transforms.Compose([transforms.Resize(224),
                                transforms.ToTensor(),
                              ]) 

mnist_trainset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
mnist_testset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

# **Implementation for VGG11**
Key innovation in VGG is that the group layers into blocks which then turns into parameterizable repeated blocks that used for computer vision.

In [None]:
def vgg_block(num_convs, num_in_channels, num_out_channels):
    ''' Basic VGG block.
        Inputs: 
                num_convs: number of convolutional layers in this VGG block
                num_in_channels: number of input channels of the VGG block
                num_out_channels: number of output channels of the VGG block
        Output: the VGG block at the given shape
    '''
    blk = []
    for _ in range(num_convs):
        blk.append(nn.Conv2d(num_in_channels, num_out_channels, kernel_size=3, padding=1))
        blk.append(nn.ReLU())
        num_in_channels = num_out_channels
    blk.append(nn.MaxPool2d(kernel_size=2,stride=2))
    return nn.Sequential(*blk)


class flatten(nn.Module):
    ''' Flatten convotional layers output for classifier.'''
    def forward(self, x):
        return x.view(x.shape[0], -1)

# Establish VGG using vgg_block
def VGG(conv_arch):
    net = []
    num_in_channels = 1
    for (num_convs, num_out_channels) in conv_arch:
        net.append(vgg_block(num_convs, num_in_channels, num_out_channels))
        # Only the first input channel in a VGG block is different from the 
        # number of output channel in the same block. The number of first input
        # channels is the number of output channels in the previous VGG block
        num_in_channels = num_out_channels

    vgg_net=nn.Sequential(
                      *net,
                      flatten(),
                      # Classifier using three fully connected layers,
                      # with ReLU activation function and dropout rate at 0.5.
                      nn.Linear(512*7*7, 4096),
                      nn.ReLU(),
                      nn.Dropout(0.5),
                      nn.Linear(4096, 4096),
                      nn.ReLU(),
                      nn.Dropout(0.5),
                      nn.Linear(4096, 10)
                     )
    return vgg_net

def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.normal_(0.0, 0.01)
    elif classname.find('Linear') != -1:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.normal_(0.0, 0.01)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.01)
        m.bias.data.fill_(0)

def evaluate_accuracy(data_iter, net):
    """Evaluate accuracy of a model on the given data set."""
    acc_sum,n = 0,0
    for (imgs, labels) in data_iter:
        # send data to the GPU if cuda is availabel
        if torch.cuda.is_available():
            imgs = imgs.cuda()
            labels = labels.cuda()
        net.eval()
        with torch.no_grad():
            labels = labels.long()
            acc_sum += torch.sum((torch.argmax(net(imgs), dim=1) == labels)).float()
            n += labels.shape[0]
    return acc_sum.item()/n

# **Train with Implemented VGG**

In [None]:
# Loading training set and test set using DataLoader.
train_loader = torch.utils.data.DataLoader(mnist_trainset, batch_size=batch_size,
    shuffle=True, num_workers=0)
test_loader = torch.utils.data.DataLoader(mnist_testset, batch_size=batch_size,
    shuffle=True, num_workers=0)

# Architecure of VGG-11
conv_arch = ((1, 64), (1, 128), (2, 256), (2, 512), (2, 512))


if torch.cuda.is_available():
    print('Training using GPU.')
    net = VGG(conv_arch).cuda()
else:
    print('Training using CPU.')
    net = VGG(conv_arch)

#Initialize network parameters.
net.apply(weights_init)

#Loss function
if torch.cuda.is_available():
    loss = nn.CrossEntropyLoss().cuda()
else:
    loss = nn.CrossEntropyLoss()

# Train using SGD optimizer 
lr= 0.05 # Compare to LeNet, the learning rate is much smaller due to much larget images
opt_n = optim.SGD(net.parameters(), lr=lr)

# Training stage
from tqdm import tqdm
for epoch in range(1, num_epochs+1):
    train_loader_iter = iter(train_loader)
    train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
    
    for _, (imgs, labels) in tqdm(enumerate(train_loader_iter)):
        net.train()
        opt_n.zero_grad()
        if torch.cuda.is_available():
            imgs = imgs.cuda()
            labels = labels.cuda()
        # Label prediction from LeNet
        y_hat = net(imgs)
        l = loss(y_hat, labels)
        # Backprobagation
        l.backward()
        opt_n.step()

        # Calculate tarining error
        with torch.no_grad():
            labels = labels.long()
            train_l_sum += l.item()
            train_acc_sum += (torch.sum(torch.argmax(y_hat, dim=1) == labels)).float().item()
            n += labels.shape[0]
    # calculate testing error every epoch.
    test_acc = evaluate_accuracy(iter(test_loader), net)
    print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
          % (epoch, train_l_sum/n, train_acc_sum/n, test_acc,
            time.time() - start))


            



Training using GPU.


469it [06:55,  1.13it/s]
0it [00:00, ?it/s]

epoch 1, loss 0.0126, train acc 0.399, test acc 0.783, time 437.1 sec


469it [06:57,  1.12it/s]
0it [00:00, ?it/s]

epoch 2, loss 0.0037, train acc 0.825, test acc 0.857, time 439.6 sec


469it [06:58,  1.12it/s]
0it [00:00, ?it/s]

epoch 3, loss 0.0027, train acc 0.871, test acc 0.881, time 439.7 sec


469it [06:57,  1.12it/s]
0it [00:00, ?it/s]

epoch 4, loss 0.0023, train acc 0.889, test acc 0.890, time 439.5 sec


469it [06:57,  1.12it/s]


epoch 5, loss 0.0021, train acc 0.903, test acc 0.896, time 439.6 sec


# **Train with Pytorch VGG model**
In Pytorch, you can also call the implemented and pretrained VGG models. In the following notebook, we call VGG11 to compare with the self-implemented model. 

We can also call VGG13, VGG16, VGG19 (and the versions with batrh normalization).

In [1]:
# Sample code
import torchvision.models as models
pretrained = True
net = models.vgg11(pretrained=pretrained).cuda()
print(net)

Downloading: "https://download.pytorch.org/models/vgg11-bbd30ac9.pth" to /root/.cache/torch/hub/checkpoints/vgg11-bbd30ac9.pth


HBox(children=(FloatProgress(value=0.0, max=531456000.0), HTML(value='')))


VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
