# AlexNet Model Implementation
---
## 1. Load CIFAR-10
- 구현에 쓰인 학습 데이터셋은 `pytorch.datasets`에 내장된 CIFAR10을 사용함. 

In [1]:
import torch
import torch.nn as nn
from torchvision import transforms, datasets

In [2]:
devices = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(devices)

cuda:0


In [3]:
BATCH_SIZE = 256
EPOCHS = 20

In [4]:
# transforms : 이미지 데이터를 로딩할 때 모듈의 입력값으로 사용할 수 있도록 변환

transform = transforms.Compose([
#     transforms.Resize(227),
    transforms.ToTensor(),
#     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

In [5]:
# CIFAR-10

train_set = datasets.CIFAR10('./CIFAR-10', download=True, train=True, transform=transform)
# train_loader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)

test_set = datasets.CIFAR10('./CIFAR-10', download=True, train=False, transform=transform)
# test_loader = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=True)

print(train_set.data.shape)
print(test_set.data.shape)

Files already downloaded and verified
Files already downloaded and verified
(50000, 32, 32, 3)
(10000, 32, 32, 3)


In [6]:
# 데이터 정규화를 위해 평균, 표준편차 값 계산

import numpy as np

meanRGB = [np.mean(x.numpy(), axis=(1, 2)) for x, _ in train_set]
stdRGB = [np.std(x.numpy(), axis=(1, 2)) for x, _ in train_set]

meanR = np.mean([m[0] for m in meanRGB])
meanG = np.mean([m[1] for m in meanRGB])
meanB = np.mean([m[2] for m in meanRGB])

stdR = np.mean([s[0] for s in stdRGB])
stdG = np.mean([s[1] for s in stdRGB])
stdB = np.mean([s[2] for s in stdRGB])

print(meanR, meanG, meanB)
print(stdR, stdG, stdB)

0.49139965 0.48215845 0.4465309
0.20220213 0.19931543 0.20086348


In [7]:
transformer = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize(227),
    transforms.Normalize([meanR, meanG, meanB], [stdR, stdG, stdB])
])

In [8]:
train_set.transform = transformer
test_set.transform = transformer

train_loader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=True)

## 2. 모델 구축

In [9]:
# 모델 구축

class AlexNet(nn.Module):
    def __init__(self, input_size=227, num_classes=10):
        super(AlexNet, self).__init__()
        
        self.cnnLayer = nn.Sequential(
            
            # 1st Conv : conv, relu, lrn, pool
            # 227 -> 55
            nn.Conv2d(in_channels=3, out_channels=96, kernel_size=11, padding=0, stride=4),
            # inplace=True : input으로 들어온 값 자체를 수정함. 메모리 usage가 좀 좋아지나 input을 없앰. 
            nn.ReLU(inplace=True),
            nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2),
            # 55 -> 27
            nn.MaxPool2d(kernel_size=3, stride=2),
            
            
            # 2nd Conv : conv, relu, lrn, pool
            # 27 -> 27
            nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, padding=2, stride=1),
            nn.ReLU(inplace=True),
            nn.LocalResponseNorm(size=4, alpha=0.0001, beta=0.75, k=2),
            nn.MaxPool2d(kernel_size=3, stride=2),
            
            
            # 3rd Conv : conv, relu
            # 27 -> 27
            nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, padding=1, stride=1),
            nn.ReLU(inplace=True),
            
            
            # 4th Conv : conv, relu
            nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, padding=1, stride=1),
            nn.ReLU(inplace=True),
            
            
            # 5th Conv : conv, relu, pool
            nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, padding=1, stride=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2)
        )
        
        # fully connected layer
        self.fcLayer = nn.Sequential(
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, num_classes)
        )
        
        # 정규화 및 bias 초기화
        for layer in self.cnnLayer:
            if isinstance(layer, nn.Conv2d):
                # conv 레이어들을 가우시안 분포로 정규화하고 bias는 0
                nn.init.normal_(layer.weight, mean=0, std=0.01)
                nn.init.constant_(layer.bias, 0)
                
        # 그런데 2, 4, 5 conv는 1로 초기화
        nn.init.constant_(self.cnnLayer[4].bias, 1)
        nn.init.constant_(self.cnnLayer[10].bias, 1)
        nn.init.constant_(self.cnnLayer[12].bias, 1)
        
    def forward(self, train):
        output = self.cnnLayer(train)
        output = output.view(-1, 256 * 6 * 6)
        output = self.fcLayer(output)
        
        return output

In [10]:
alexnet = AlexNet(227, 10)
alexnet.to(devices)

# weight decay = 0.0005, momentum = 0.9, lr = 0.01
optimizer = torch.optim.SGD(alexnet.parameters(), lr=0.01, momentum=0.9, weight_decay=0.0005)
criterion = nn.CrossEntropyLoss().to(devices)

## 3. 모델 학습

In [11]:
from tqdm.notebook import tqdm

for epoch in range(EPOCHS): 
    epoch_loss = 0
    for data, classes in tqdm(train_loader):
        inputs, labels = data.to(devices), classes.to(devices)
        
        optimizer.zero_grad()
        outputs = alexnet(inputs)
        
        # 순전파, 역전파, 최적화
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
    correct = list(0. for i in range(1000))
    total = list(0. for i in range(1000))
    
    with torch.no_grad():
        for data, classes in test_loader:
            inputs, labels = data.to(devices), classes.to(devices)
            outputs = alexnet(inputs)
            _, predicted = torch.max(outputs, 1)
            c = (predicted == labels).squeeze()
            for i in range(labels.size()[0]):
                label = labels[i]
                correct[label] += c[i].item()
                total[label] += 1
                
    print('{0} : loss {1:.3f}, val_acc {2:.3f}'.format(epoch+1, epoch_loss, (sum(correct) / sum(total)) * 100))
print('끝')

  0%|          | 0/196 [00:00<?, ?it/s]

1 : loss 452.137, val_acc 9.550


  0%|          | 0/196 [00:00<?, ?it/s]

2 : loss 451.381, val_acc 10.330


  0%|          | 0/196 [00:00<?, ?it/s]

3 : loss 451.368, val_acc 9.700


  0%|          | 0/196 [00:00<?, ?it/s]

4 : loss 451.342, val_acc 9.990


  0%|          | 0/196 [00:00<?, ?it/s]

5 : loss 451.369, val_acc 9.880


  0%|          | 0/196 [00:00<?, ?it/s]

6 : loss 451.346, val_acc 9.760


  0%|          | 0/196 [00:00<?, ?it/s]

7 : loss 451.346, val_acc 10.070


  0%|          | 0/196 [00:00<?, ?it/s]

8 : loss 451.344, val_acc 10.080


  0%|          | 0/196 [00:00<?, ?it/s]

9 : loss 451.346, val_acc 10.440


  0%|          | 0/196 [00:00<?, ?it/s]

10 : loss 451.347, val_acc 9.700


  0%|          | 0/196 [00:00<?, ?it/s]

11 : loss 451.342, val_acc 10.120


  0%|          | 0/196 [00:00<?, ?it/s]

12 : loss 451.340, val_acc 9.630


  0%|          | 0/196 [00:00<?, ?it/s]

13 : loss 451.337, val_acc 10.000


  0%|          | 0/196 [00:00<?, ?it/s]

14 : loss 451.345, val_acc 10.000


  0%|          | 0/196 [00:00<?, ?it/s]

15 : loss 451.331, val_acc 9.870


  0%|          | 0/196 [00:00<?, ?it/s]

16 : loss 451.330, val_acc 10.340


  0%|          | 0/196 [00:00<?, ?it/s]

17 : loss 451.335, val_acc 10.140


  0%|          | 0/196 [00:00<?, ?it/s]

18 : loss 451.341, val_acc 9.960


  0%|          | 0/196 [00:00<?, ?it/s]

19 : loss 451.335, val_acc 9.950


  0%|          | 0/196 [00:00<?, ?it/s]

20 : loss 451.341, val_acc 10.070
끝


## 4. 회고 / TIL
---
- FashionMNIST에 이어서 CIFAR-10을 학습하는 AlexNet을 구현하고자 했음. 
- transforms 부분에서 normalize와 resize는 순서가 바뀌면 연산 결과가 달라질 것이라 생각했음. resize의 경우 interpolation을 하기 때문에 normalize가 먼저 이루어진 뒤에 interpolation된 것과 interplation 후 normalize한 결과가 다를 것이라 생각하고 transforms.Compose 정의 부분에서 고민에 빠졌는데, 실제 값을 찍어보니 결과가 같았음. 어쨌거나 normalize는 값을 특정한 range로 재설정하기 때문에 그런 것으로 결론지었음. 
- 앞서 구현한 FashionMNIST AlexNet 모델의 경우 LRN layer가 없었는데, 이번에 참고한 reference들에는 구현이 되어 있었다. 논문에 입각해 제대로 구현하려면 LRN layer가 있어야 할 것이다. 
- 모델 학습 부분의 코드가 복잡해 필사하면서 완벽히 이해하지 못했다. 아직 PyTorch가 낯설기도 하고... 많은 공부가 필요할 것 같다. 
- 학습 결과 loss와 val_acc 값이 이상하게 나왔다. 이 부분에 대해서도 개선이 필요함. 

## Reference
---
- https://papers.nips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf
- https://123okk2.tistory.com/171
- https://deep-learning-study.tistory.com/518