# **AlexNet**

AlexNet improves the LeNet using bigger and deeper networks. It further improves the performance with ReLU, Dropout, and MaxPolling.

In this notebook, the 28$\times$28 Fasion-MNIST is scale up to 224$\times$224, with 10 class.

In [1]:
%matplotlib inline
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
import numpy as np
import torchvision
import torchvision.datasets as datasets
from torchvision import transforms
import torch.optim as optim
import time

batch_size = 128
num_epochs = 5
# Note not to flip two transform types, otherwise data type would be wrong.
transform = transforms.Compose([transforms.Resize(224),
                                transforms.ToTensor(),
                              ]) 

mnist_trainset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
mnist_testset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

AlexNet uses extra three sucessive convolutional layers to learn the features, and uses ReLU, dropout in the classification stage.

In [2]:
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 96, kernel_size=11, stride=4, padding=1)
        self.conv2 = nn.Conv2d(96, 256, kernel_size=5, padding=2)
        self.conv3 = nn.Conv2d(256, 384, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(384, 384, kernel_size=3, padding=1)
        self.conv5 = nn.Conv2d(384, 256, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(6400, 4096)
        self.fc2 = nn.Linear(4096, 4096)
        self.fc3 = nn.Linear(4096, 10)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = self.maxpool(self.relu(self.conv1(x)))
        x = self.maxpool(self.relu(self.conv2(x)))
        # Then, use three sucessive convolutional layers and a smaller covolution window
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        # dimensional reduction using maxpooling
        x = self.maxpool(x)
        # Flatten convotional layers output for classifier.
        x = x.view(x.shape[0], -1)
        # Classifier using three fully connected layers.
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        x = self.fc3(x)
        return x

def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.normal_(0.0, 0.01)
    elif classname.find('Linear') != -1:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.normal_(0.0, 0.01)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.01)
        m.bias.data.fill_(0)

def evaluate_accuracy(data_iter, net):
    """Evaluate accuracy of a model on the given data set."""
    acc_sum,n = 0,0
    for (imgs, labels) in data_iter:
        # send data to the GPU if cuda is availabel
        if torch.cuda.is_available():
            imgs = imgs.cuda()
            labels = labels.cuda()
        net.eval()
        with torch.no_grad():
            labels = labels.long()
            acc_sum += torch.sum((torch.argmax(net(imgs), dim=1) == labels)).float()
            n += labels.shape[0]
    return acc_sum.item()/n

In [28]:
# Loading training set and test set using DataLoader.
train_loader = torch.utils.data.DataLoader(mnist_trainset, batch_size=batch_size,
    shuffle=True, num_workers=0)
test_loader = torch.utils.data.DataLoader(mnist_testset, batch_size=batch_size,
    shuffle=True, num_workers=0)

if torch.cuda.is_available():
    print('Training using GPU.')
    net = LeNet().cuda()
else:
    print('Training using CPU.')
    net = LeNet()

#Initialize network parameters.
net.apply(weights_init)

#Loss function
if torch.cuda.is_available():
    loss = nn.CrossEntropyLoss().cuda()
else:
    loss = nn.CrossEntropyLoss()

# Train using SGD optimizer 
lr= 0.01 # Compare to LeNet, the learning rate is much smaller due to much larget images
opt_n = optim.SGD(net.parameters(), lr=lr)

# Training stage
for epoch in range(1, num_epochs+1):
    train_loader_iter = iter(train_loader)
    train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
    
    for (imgs, labels) in train_loader_iter:
        net.train()
        opt_n.zero_grad()
        if torch.cuda.is_available():
            imgs = imgs.cuda()
            labels = labels.cuda()
        # Label prediction from LeNet
        y_hat = net(imgs)
        l = loss(y_hat, labels)
        # Backprobagation
        l.backward()
        opt_n.step()

        # Calculate tarining error
        with torch.no_grad():
            labels = labels.long()
            train_l_sum += l.item()
            train_acc_sum += (torch.sum(torch.argmax(y_hat, dim=1) == labels)).float().item()
            n += labels.shape[0]
    # calculate testing error every epoch.
    test_acc = evaluate_accuracy(iter(test_loader), net)
    print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
          % (epoch, train_l_sum/n, train_acc_sum/n, test_acc,
            time.time() - start))


            



Training using GPU.
epoch 1, loss 0.0081, train acc 0.616, test acc 0.777, time 64.2 sec
epoch 2, loss 0.0044, train acc 0.791, test acc 0.824, time 64.2 sec
epoch 3, loss 0.0037, train acc 0.826, test acc 0.848, time 64.3 sec
epoch 4, loss 0.0033, train acc 0.845, test acc 0.861, time 64.4 sec
epoch 5, loss 0.0030, train acc 0.859, test acc 0.870, time 64.1 sec
