# AlexNet with MNIST dataset

`Author: YUAN Yanzhe`

- This notebook is a reproduction of the [AlexNet paper](https://dl.acm.org/doi/pdf/10.1145/3065386).
  - If you want to fine-tune hyperparameters, it is recommended to expose them as arguments of the model's constructor.
    - e.g. def \_\_init\_\_(param) 
- The code runs on Google Colab, GPU mode


**一些细节**
- AlexNet结构：
  - 两层: conv-relu-pool: conv增加特征数、减小image的size，pool下采样。
  - 三层conv, +pool: conv-relu-conv-relu-conv: 总的来说特征数不变，采用311结构让image的size不变。
  - fc层：**减少特征到10**
    - linear-relu-dropout：减少特征数+dropout
    - linear-relu-dropout：特征数不变+dropout
    - output：linear降特征数到10
- 论文数据集是ImageNet；本notebook使用MNIST（resize到224×224）。
- AlexNet与LeNet的区别
  - 层数更深
  - 采用ReLU
  - 采用Dropout
  - 采用图像增广技术来扩大数据集
    - 比如：翻转、裁剪和颜色变化
- 可看出分类效果比LeNet好
- trick:
  - Conv2d(in,out,3,1,1) 保留image的height和width，只改变feature_num
  - MaxPool2d(2,2) 保留feature_num，将image的height和width减半
- 可以用类似于这样的语句来测试网络每层输出的维度（named_children获取一级子模块及其名字；named_modules会返回所有子模块，包括子模块的子模块）：

  ```python
  X = torch.rand((1, 1, 224, 224))
  for name, layer in net.named_children():
      X = layer(X)
      print(name, ' output shape:\t', X.shape)
  ```

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
# Switch the working directory to the project folder on the mounted Drive.
# NOTE(review): presumably so that relative paths (the default dataset root ''
# and the local d2lzh_pytorch module) resolve there -- confirm.
os.chdir('/content/drive/MyDrive/Colab Notebooks/d2dl_pytorch')

In [34]:
# Import Packages
import torch
from torch import nn as nn
from torch import optim as optim
from torch.utils import data as Data

import torchvision
from torchvision import datasets
from torchvision import transforms

import numpy as np
import pandas as pd 
import time

import d2lzh_pytorch as d2dl

# Report the installed PyTorch version.
print(torch.__version__)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device on:', device)

1.7.0+cu101
device on: cuda


In [35]:
# Hyperparameters
batch_size = 256  # samples per mini-batch, passed to the data loaders
num_epochs = 5  # number of passes over the training set in train_model
learning_rate = 0.001  # learning rate handed to the Adam optimizer

# Number of target classes.
# NOTE(review): this global is not read anywhere in the visible code.
num_classes = 10

# Load Data
# Note: parameters without defaults must be listed before parameters with defaults
# (otherwise Python raises "SyntaxError: non-default argument follows default argument").
def load_data_from_mnist(batch_size, resize=None, root='', download=False):
    """Build train/test ``DataLoader`` objects over the MNIST dataset.

    Args:
        batch_size: Number of samples per mini-batch for both loaders.
        resize: Optional size handed to ``transforms.Resize`` before the
            images are converted to tensors; a falsy value skips resizing.
        root: Dataset root directory passed to ``torchvision.datasets.MNIST``.
        download: Forwarded to ``torchvision.datasets.MNIST``. The default
            ``False`` keeps the previous behaviour, i.e. the dataset must
            already exist under ``root``.

    Returns:
        A ``(train_iterator, test_iterator)`` tuple. The training loader is
        shuffled; the test loader keeps dataset order.
    """
    trans = []
    if resize:
        trans.append(transforms.Resize(resize))
    trans.append(transforms.ToTensor())
    transform = transforms.Compose(trans)

    train_data = torchvision.datasets.MNIST(
        root=root, train=True, transform=transform, download=download)
    test_data = torchvision.datasets.MNIST(
        root=root, train=False, transform=transform, download=download)

    train_iterator = Data.DataLoader(
        train_data, batch_size=batch_size, shuffle=True, num_workers=4)
    # Shuffling does not change evaluation metrics; keeping the test set in
    # order matches load_data_fashion_mnist and makes evaluation reproducible.
    test_iterator = Data.DataLoader(
        test_data, batch_size=batch_size, shuffle=False, num_workers=4)

    return train_iterator, test_iterator

def load_data_fashion_mnist(batch_size, resize=None, root=''):
    """Create the Fashion-MNIST training and test loaders.

    The dataset is downloaded into ``root`` when needed. Images are optionally
    resized (when ``resize`` is truthy) and then converted to tensors.

    Returns:
        ``(train_iter, test_iter)``: the training loader is shuffled, the
        test loader is not; both use ``batch_size`` and 4 worker processes.
    """
    steps = [torchvision.transforms.ToTensor()]
    if resize:
        # Resizing has to happen before the tensor conversion.
        steps.insert(0, torchvision.transforms.Resize(size=resize))
    pipeline = torchvision.transforms.Compose(steps)

    train_set, test_set = (
        torchvision.datasets.FashionMNIST(
            root=root, train=is_train, download=True, transform=pipeline)
        for is_train in (True, False)
    )

    loader_options = {'batch_size': batch_size, 'num_workers': 4}
    train_iter = torch.utils.data.DataLoader(train_set, shuffle=True, **loader_options)
    test_iter = torch.utils.data.DataLoader(test_set, shuffle=False, **loader_options)
    return train_iter, test_iter

# Build the data loaders with images resized to 224 (the size the model's
# first fully connected layer is dimensioned for).
# Alternative dataset (Fashion-MNIST), kept for quick switching:
#train_iterator, test_iterator = load_data_fashion_mnist(batch_size,resize=224)
train_iterator, test_iterator = load_data_from_mnist(batch_size,resize=224)

# Define Model
class alexNet(nn.Module):
    """AlexNet-style CNN for single-channel images.

    The network has a convolutional feature extractor (``cnn_layer``)
    followed by a three-layer fully connected classifier (``fc_layer``).
    The first linear layer expects ``256 * 5 * 5`` features, which is what
    the extractor produces for 224x224 inputs
    (224 -> 54 -> 26 -> 26 -> 12 -> 12 -> 12 -> 12 -> 5 per spatial side).

    Args:
        num_classes: Width of the output layer. Defaults to 10, the value
            that was previously hard-coded.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.cnn_layer = nn.Sequential(
            # Large-kernel, stride-4 convolution: more channels, smaller image.
            nn.Conv2d(in_channels=1, out_channels=96, kernel_size=11, stride=4),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            # Increase the channel count; padding=2 keeps the image size.
            nn.Conv2d(96, 256, 5, 1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(3, 2),
            # Three consecutive 3x3/stride-1/padding-1 convolutions: the image
            # size is preserved while channels go 256 -> 384 -> 384 -> 256.
            nn.Conv2d(256, 384, 3, 1, 1),
            nn.ReLU(),
            nn.Conv2d(384, 384, 3, 1, 1),
            nn.ReLU(),
            nn.Conv2d(384, 256, 3, 1, 1),
            nn.ReLU(),
            # Final down-sampling max pool.
            nn.MaxPool2d(3, 2),
        )
        self.fc_layer = nn.Sequential(
            nn.Linear(256 * 5 * 5, 4096),
            nn.ReLU(),
            # Dropout after each hidden layer to reduce overfitting.
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            # Output layer: one score per class.
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)`` for ``x``."""
        y = self.cnn_layer(x)
        # Flatten everything except the batch dimension for the classifier.
        y = self.fc_layer(y.view(x.shape[0], -1))
        return y

# Instantiate the model and print its layer structure.
net = alexNet()
print(net)

# Cross-entropy loss on the raw class scores, optimized with Adam over all
# model parameters at the configured learning rate.
loss_func = nn.CrossEntropyLoss()
optimizor = optim.Adam(net.parameters(), lr=learning_rate)



AlexNet(
  (cnn_layer): Sequential(
    (0): Conv2d(1, 96, kernel_size=(11, 11), stride=(4, 4))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU()
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc_layer): Sequential(
    (0): Linear(in_features=6400, out_features=4096, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=4096, out_features=4096, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=F

In [36]:
# Train Model
def evaluate_model(net, test_iterator, device):
    """Return the classification accuracy of ``net`` on ``test_iterator`` in percent.

    Args:
        net: Either an ``nn.Module`` or a plain callable. A module is moved to
            ``device`` and evaluated in eval mode (dropout disabled); its
            previous train/eval mode is restored afterwards. A plain callable
            is invoked on the batches as-is (no device transfer); if it takes
            an ``is_training`` argument it is called with ``is_training=False``.
        test_iterator: Iterable of ``(X, y)`` batches.
        device: Device used for module evaluation.

    Returns:
        Accuracy as a float in ``[0, 100]``; ``0.0`` when the iterator yields
        no examples.
    """
    is_module = isinstance(net, nn.Module)
    print('testing on:', device)
    if is_module:
        net = net.to(device)
        # Remember the caller's mode so it is not silently switched to train.
        was_training = net.training
        net.eval()  # eval mode will shut off dropout function
    else:
        print('is this your self-defined nn module?? we are not considering GPU if so')
        takes_training_flag = 'is_training' in net.__code__.co_varnames

    correct, num_exp = 0.0, 0
    try:
        with torch.no_grad():
            for X, y in test_iterator:
                if is_module:
                    scores = net(X.to(device))
                    correct += (scores.argmax(1) == y.to(device)).float().sum().cpu().item()
                elif takes_training_flag:
                    correct += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
                else:
                    correct += (net(X).argmax(dim=1) == y).float().sum().item()
                num_exp += y.size(0)
    finally:
        if is_module:
            net.train(was_training)

    if num_exp == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0
    return correct / num_exp * 100

def train_model(num_epochs, train_iterator, test_iterator, loss_func, optimizor, net, device):
    """Train ``net`` for ``num_epochs`` epochs and print per-epoch metrics.

    For every mini-batch the inputs and labels are moved to ``device``, the
    loss is computed with ``loss_func`` and one ``optimizor`` step is taken.
    After each epoch the model is scored on ``test_iterator`` via
    ``evaluate_model`` and one summary line is printed: mean batch loss,
    training accuracy, test accuracy and elapsed time (evaluation included).
    """
    net = net.to(device)
    print('training on:', device)
    report = 'Epoch: {}, Average loss: {:.4f}, Average accuracy: {:.2f}%, Test Accuracy: {:.2f}%, time: {:.1f}sec'

    for epoch_idx in range(num_epochs):
        epoch_start = time.time()
        loss_sum, hit_count = 0.0, 0.0
        batch_count, sample_count = 0, 0

        for inputs, labels in train_iterator:
            inputs, labels = inputs.to(device), labels.to(device)

            logits = net(inputs)
            batch_loss = loss_func(logits, labels)

            # Standard update: clear old gradients, backpropagate, step.
            optimizor.zero_grad()
            batch_loss.backward()
            optimizor.step()

            # Running statistics for the epoch summary.
            loss_sum += batch_loss.cpu().item()
            hit_count += (logits.argmax(1) == labels).sum().cpu().item()
            batch_count += 1
            sample_count += labels.size(0)

        held_out_acc = evaluate_model(net, test_iterator, device)
        print(report.format(
            epoch_idx + 1,
            loss_sum / batch_count,
            hit_count / sample_count * 100,
            held_out_acc,
            time.time() - epoch_start,
        ))

# Run the training loop; train_model prints one summary line per epoch.
train_model(num_epochs,train_iterator,test_iterator,loss_func,optimizor,net,device)
        
# Prediction

training on: cuda
testing on: cuda
Epoch: 1, Average loss: 0.3895, Average accuracy: 86.67%, Test Accuracy: 98.39%, time: 50.2sec
testing on: cuda
Epoch: 2, Average loss: 0.0620, Average accuracy: 98.20%, Test Accuracy: 98.66%, time: 50.5sec
testing on: cuda
Epoch: 3, Average loss: 0.0453, Average accuracy: 98.63%, Test Accuracy: 99.10%, time: 50.2sec
testing on: cuda
Epoch: 4, Average loss: 0.0372, Average accuracy: 98.92%, Test Accuracy: 99.11%, time: 50.0sec
testing on: cuda
Epoch: 5, Average loss: 0.0331, Average accuracy: 99.00%, Test Accuracy: 99.11%, time: 50.0sec


In [21]:
# Sanity check: print the tensor shape of the first training batch, then stop.
for X,y in train_iterator:
    print(X.size())
    break

torch.Size([256, 1, 224, 224])
