# [Book Reading Chapter7 Part2]

> [7.4 GooLeNet]

> [7.5 BatchNormalization]

> [7.6 ResNet]

> [7.7 DenseNet]


## Setting

### 1) Import Modules

In [1]:
# PyTorch Modeling
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim

# Data Loading
from torch.utils.data import DataLoader
import torchvision
from torchvision import transforms

# Reproducibility
import random
import numpy as np

# Draw plot
%matplotlib inline
import matplotlib.pyplot as plt
import pylab as pl
from IPython import display

# Others
import sys

### 2) Set Seed for reproducibility
https://pytorch.org/docs/stable/notes/randomness.html

In [2]:
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)
# torch.use_deterministic_algorithms(True)

### 3) Download Data
https://github.com/d2l-ai/d2l-en/blob/master/d2l/torch.py

In [3]:
def load_data_fashion_mnist(batch_size) -> dict :
    """
    Dive Into Deeplearning Github에 공개된 데이터 다운로드 코드를 변형
    Fashion MNIST 데이터를 './data' 경로에 다운로드하여 DataLoader로 반환 
    """ 
    trans = transforms.Compose([transforms.Resize(96),                          # 이미지를 96으로 Resize 후
                                transforms.ToTensor()])                         # Tensor로 바꿔주기 위한 세팅

    mnist_train = torchvision.datasets.FashionMNIST(                            # TorchVision에 있는 Fashion MNIST 데이터 로딩
        root="./data", train=True, transform=trans, download=True)            # ../data에 Transform을 적용한 채 Train 데이터 다운
    
    mnist_test = torchvision.datasets.FashionMNIST(
        root="./data", train=False, transform=trans, download=True)           # ../data에 Transform을 적용한 채 Test  데이터 다운
    
    return {
            'train' : DataLoader(mnist_train, batch_size, shuffle=True),              # DataLoader 형태로 반환 (Train은 Shuffle O)
            'test'  : DataLoader(mnist_test, batch_size, shuffle=False)             # DataLoader 형태로 반환 (Test는  Shuffle X)
            }

### 4) Default Variables for Training

In [4]:
# Dataloaders
dataloaders = load_data_fashion_mnist(5000)

# Device Setting
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

### 4) Code for Training Model

In [11]:
def Chapter7_Training(
                      model, 
                      dataloaders,
                      criterion,
                      optimizer,
                      num_epochs : int
                      ) -> None : 
    
    train_accuracy = 0
    train_loss = 0
    train_losses = []
    test_accuracy = 0
    test_loss = 0
    test_losses = []

    for num_epoch in range(num_epochs) : 
        for phase in ['train', 'test'] : 
            if phase == 'train' : 
                model.train()
            else : 
                model.eval()
            
            running_accuracy = 0
            running_loss = 0
            
            for X, y in dataloaders.get(phase) :    
                X = X.to(device)
                y = y.to(device)

                with torch.set_grad_enabled(phase == 'train') :
                    output = model(X)
                    loss = criterion(output, y)
                    
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_accuracy += (output.argmax(1)==y).sum().item()
                running_loss += loss.item()
                optimizer.zero_grad()

            epoch_accuracy = running_accuracy / len(dataloaders.get(phase).dataset)
            epoch_loss = running_loss / len(dataloaders.get(phase).dataset)


            if phase == 'train' : 
                train_accuracy = epoch_accuracy * 100
                train_loss = epoch_loss
                train_losses.append(train_loss)

            else :
                test_accuracy =  epoch_accuracy * 100
                test_loss = epoch_loss
                test_losses.append(test_loss)

            pl.plot(train_losses, c = 'r', marker = 'o')
            pl.plot(test_losses, c= 'b', marker = 'o')
            display.clear_output(wait=True)
            display.display(pl.gcf())

            sys.stdout.write("\r[EPOCH %d/%d] [Train Loss : %f] [Train Accuracy : %f] [Test  Loss : %f] [Test  Accuracy : %f]"
                  % (
                      num_epoch + 1,
                      num_epochs,
                      train_loss,
                      train_accuracy,
                      test_loss,
                      test_accuracy
                    )
                  )

    return model


### 5) Weight Initialization

In [12]:
def initialize_weights(model):
    # track all layers
    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.xavier_normal_(m.weight)

            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)

        elif isinstance(m, nn.Linear):
            nn.init.xavier_normal_(m.weight)
            nn.init.constant_(m.bias, 0)
    return model

# 7.4 GoogLeNet (2014.09 on arxiv)

### 1) Introduction

- 2014 ImageNet Challenge Winner
- Combined NiN + More Repeated Blocks
> 1.Inception Blocks<br>2.Deeper Architecture

### 2) Inception Blocks
<img src='https://raw.githubusercontent.com/JayHong99/2022_Summer_BookReading/master/Chapter7/Images/Inception_Block.png?raw=true' width='600'>

- Inception "We need to go deeper"에서 차용
- 4가지 평행 구조의 Layer
> (1x1 Conv)<br>(1x1 Conv) -> (3x3 Conv, pad1)<br> (1x1 Conv) -> (5x5 Conv, pad2)<br> (3x3 MaxPool, pad 1) -> (1x1 Conv)

특징
- 1x1 Conv 사용 -> Channel의 수 감소 (= 모델 복잡도 감소)
- 다양한 Filter Size -> 이미지를 다양하게 탐색
- 이미지의 크기는 input과 output 동일

### 3) Deeper Architecture

![GooLeNet Architecture](https://d2l.ai/_images/inception-full.svg)

- 이전 모델인 NiN은 4개의 Block을 사용
- VGG11도 5개의 Block 사용
- GooLeNet은 총 9개의 Inception Block 사용

결과적으로, 계산의 비용이 증가하고, 변형이 쉽지 않음

### 4) Implement with PyTorch

**<Inception Block 구현>**

In [13]:
class Inception(nn.Module) : 
    """
    c1, c2, c3, c4는 개별 path에서 Output Channel의 수 이다.
    """
    def __init__(self, in_channels, c1 : int, c2 : list, c3 : int, c4 : int, **kwargs) -> None: 
        super(Inception, self).__init__(**kwargs)
        """ 1번 경로 """
        self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size = 1)

        """ 2번 경로 """
        self.p2_1 = nn.Conv2d(in_channels, c2[0], kernel_size = 1)
        self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size = 3, padding = 1)

        """ 3번 경로 """
        self.p3_1 = nn.Conv2d(in_channels, c3[0], kernel_size = 1)
        self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size = 5, padding = 2)

        """ 4번 경로 """
        self.p4_1 = nn.MaxPool2d(kernel_size = 3, stride = 1, padding = 1)
        self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size = 1)

    def forward(self, x : torch.FloatTensor) -> torch.Tensor :
        p1 = F.relu(self.p1_1(x))                     # 1x1 Conv
        p2 = F.relu(self.p2_2(F.relu(self.p2_1(x))))  # 1x1 Conv -> ReLU -> 3x3 Conv -> ReLU
        p3 = F.relu(self.p3_2(F.relu(self.p3_1(x))))  # 1x1 Conv -> ReLU -> 3x3 Conv -> ReLU
        p4 = F.relu(self.p4_2(self.p4_1(x)))          # 3x3 MaxPool -> 1x1 Conv
        return torch.cat((p1, p2, p3, p4), dim = 1)   # 이미지의 크기는 모두 동일하니, channel의 방향으로 Concatenate

**<GooLeNet 구현>**

In [14]:
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size = 7, stride = 2, padding = 3),  # Kernel 7 with Stride 2 and padding 3 => 이미지 사이즈 절반 (64, 48,48)
                   nn.ReLU(), 
                   nn.MaxPool2d(kernel_size =  3, stride = 2, padding = 1))     # Kernel 3 with Stride 2 and Padding 1 => 이미지 사이즈 절반 (64, 24,24)


b2 = nn.Sequential(nn.Conv2d(64, 64, kernel_size = 1),                          # 1x1 Conv => (64, 24, 24)
                   nn.ReLU(),
                   nn.Conv2d(64, 192, kernel_size = 3, padding = 1),            # Kernel 3 with Padding 1 => 이미지 사이즈 동일 (192, 24,24)
                   nn.ReLU(),
                   nn.MaxPool2d(kernel_size = 3, stride = 2, padding = 1))      # Kernel 3 with Stride 2 and Padding 1 => 이미지 사이즈 절반 (192, 24,24)


b3 = nn.Sequential(Inception(192, 64, (96, 128), (16,32), 32),                  # Inception Block (192 => 64 + 128 + 32 + 32 = 256) => 이미지 사이즈 동일 (256, 12, 12)
                   Inception(256, 128, (128,192), (32, 96), 64),                # Inception Block (256 => 128 + 192 + 96 + 64 = 480) => 이미지 사이즈 동일 (480, 12, 12)
                   nn.MaxPool2d(kernel_size = 3, stride = 2, padding = 1))      # Kernel 3 with Stride 2 and Padding 1 => 이미지 사이즈 절반 (480, 6, 6)


b4 = nn.Sequential(Inception(480, 192, (96, 208), (16, 48), 64),                # Inception Block (480 => 192 + 208 + 48 + 64 = 512) => 이미지 사이즈 동일 (512, 48, 48)
                   Inception(512, 160, (112, 224), (24, 64), 64),               # Inception Block (512 => 160 + 224 + 64 + 64 = 512) => 이미지 사이즈 동일 (512, 6, 6)
                   Inception(512, 128, (128, 256), (24, 64), 64),               # Inception Block (512 => 128 + 256 + 64 + 64 = 256) => 이미지 사이즈 동일 (512, 6, 6)
                   Inception(512, 112, (144, 288), (32, 64), 64),               # Inception Block (512 => 112 + 288 + 64 + 64 = 528) => 이미지 사이즈 동일 (528, 6, 6)
                   Inception(528, 256, (160, 320), (32, 128), 128),             # Inception Block (528 => 256 + 320 + 128 + 128 = 256) => 이미지 사이즈 동일 (832, 6, 6)
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))            # Kernel 3 with Stride 2 and Padding 1 => 이미지 사이즈 절반 (832, 3, 3)


b5 = nn.Sequential(Inception(832, 256, (160, 320), (32, 128), 128),             # Inception Block (832 => 256 + 320 + 128 + 128 = 832) => 이미지 사이즈 동일 (832, 3, 3)
                   Inception(832, 384, (192, 384), (48, 128), 128),             # Inception Block (832 => 384 + 384 + 128 + 128 = 1024) => 이미지 사이즈 동일 (1024, 3, 3)
                   nn.AdaptiveAvgPool2d((1,1)),                                 # Adaptive AvgPool2d는 Image의 Output을 자동으로 맞추어 Pooling하는 Layer => (1024,1,1)
                   nn.Flatten())                                                # Flatten => (1024)

GooLeNet = nn.Sequential(b1, b2, b3, b4, b5, nn.Linear(1024, 10))               # FC는 단층으로, output = 10

**<맞게 구축했는지 파악>**

In [15]:
X = torch.rand(size=(1, 1, 96, 96))
for layer in GooLeNet:
    X = layer(X)
    print(layer.__class__.__name__,'output shape:\t', X.shape)

Sequential output shape:	 torch.Size([1, 64, 24, 24])
Sequential output shape:	 torch.Size([1, 192, 12, 12])
Sequential output shape:	 torch.Size([1, 480, 6, 6])
Sequential output shape:	 torch.Size([1, 832, 3, 3])
Sequential output shape:	 torch.Size([1, 1024])
Linear output shape:	 torch.Size([1, 10])


### 5) Training with PyTorch

In [16]:
model = nn.Sequential(b1, b2, b3, b4, b5, nn.Linear(1024, 10)).to(device)
model = initialize_weights(model)

num_epochs = 1000
learning_rate = 0.1
optimizer = optim.SGD(model.parameters(), lr = learning_rate)
criterion = nn.CrossEntropyLoss()

model = Chapter7_Training(model, dataloaders, criterion, optimizer, num_epochs)

AttributeError: module 'pylab' has no attribute 'plot'

# 7.5 BatchNormalization

## 1) Introduction

- Chapter 4에서 집 값 예측할 때 제일 먼저 했던 일이 Standard Scailing
- 필요성


> 결과에 차이가 크다 -> 







# 7.6 ResNet

# 7.7 DenseNet