# Identity Mappings in Deep Residual Networks 코드 실습

## 목표
1. shortcut connection은 identity Mapping이 최적임을 밝힘
2. Forward 방식은 Full pre-activation이 최적임을 밝힘

## 실험 방법

### DataSet
- CIFAR-10 Data Set

### Model
- ResNet-20
    - Residual Block 구성에 따라 다른 model 구성
        1. Original Block
        2. ConstantScaledBlock (0.5 on Shortcut & Residual Function)
        3. ExclusiveGating 
            - On Shortcut & Residual Function
            - Only on Shortcut
        4. ConvShortcut (1x1 conv on Shortcut)
        5. DropoutShortcut (0.5 on Shortcut)
    - Forward Process에 따라 다른 model 구성
        1. BN After Addition
        2. ReLu Before Addition
        3. ReLu Only Pre-Activation
        4. Full Pre-Activation
        


In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
from torch.utils.data import DataLoader as dataloader
import torch.nn.functional as F
import torch.optim as optim
import os

In [None]:
# Data loading: CIFAR-10 with augmentation applied to the training split only.

# Training pipeline: random 32x32 crop taken after 4-pixel padding, random
# horizontal flip, then conversion to a tensor.
train_transforms = transforms.Compose(
    [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ]
)
# Evaluation pipeline: tensor conversion only, no augmentation.
test_transforms = transforms.Compose([transforms.ToTensor()])

train_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=train_transforms,
)
# Download is disabled for the test split; presumably it relies on the files
# fetched into ./data by the training-split call above.
test_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=False,
    transform=test_transforms,
)

In [None]:
# Mini-batch iterators: shuffled batches of 128 for training and fixed-order
# batches of 100 for evaluation, each using one worker.
train_loader = dataloader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=1,
)
test_loader = dataloader(
    test_dataset,
    batch_size=100,
    shuffle=False,
    num_workers=1,
)

In [None]:
class OriginalBlock(nn.Module):
    """Basic residual unit: Conv-BN-ReLU-Conv-BN, shortcut addition, ReLU.

    The shortcut has no parameters. When ``stride`` is 1 the input is added
    unchanged; otherwise the input is zero-padded along the channel axis by
    ``out_channel - in_channel`` and subsampled with a 1x1 max-pool of the
    same stride so that it matches the residual branch.

    Args:
        in_channel: Number of channels of the incoming feature map.
        out_channel: Number of channels produced by the block.
        stride: Stride of the first convolution and of the shortcut
            subsampling.
    """

    def __init__(self, in_channel, out_channel, stride):
        super().__init__()
        self.stride = stride
        # Number of zero channels appended to the shortcut when it is padded.
        self.down_dim = out_channel - in_channel

        self.conv1 = nn.Conv2d(
            in_channel, out_channel,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channel)

        self.conv2 = nn.Conv2d(
            out_channel, out_channel,
            kernel_size=3, stride=1, padding=1, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channel)

        # Kernel size 1 with the block stride: pure spatial subsampling.
        self.pooling = nn.MaxPool2d(1, stride=stride)

    def forward(self, x):
        """Return ReLU(residual(x) + shortcut(x))."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(hidden))

        if self.stride == 1:
            shortcut = x
        else:
            # F.pad's last pair targets the channel axis of an NCHW tensor.
            padded = F.pad(x, (0, 0, 0, 0, 0, self.down_dim))
            shortcut = self.pooling(padded)

        return F.relu(residual + shortcut)

### Shortcut 형태에 따라 다른 Block 구분

In [None]:
class ConstantScaledBlock(OriginalBlock):
    """Residual unit whose residual and shortcut branches are both scaled by 0.5."""

    def __init__(self, in_channel, out_channel, stride):
        super().__init__(in_channel, out_channel, stride)

    def forward(self, x):
        """Return ReLU(0.5 * residual(x) + 0.5 * shortcut(x))."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        # Constant scaling of the residual function.
        residual = 0.5 * self.bn2(self.conv2(hidden))

        if self.stride == 1:
            shortcut = x
        else:
            # Zero-pad the channel axis and subsample to match the residual.
            shortcut = self.pooling(F.pad(x, (0, 0, 0, 0, 0, self.down_dim)))

        # Constant scaling of the shortcut.
        return F.relu(residual + 0.5 * shortcut)

In [None]:
class ExclusiveGating(OriginalBlock):
    """Residual unit with exclusive gating on both branches.

    A 1x1 convolution followed by a sigmoid computes a gate ``g(x)`` from the
    block input. The residual branch is multiplied element-wise by ``g(x)``
    and the shortcut by ``1 - g(x)``:

        out = ReLU(g(x) * F(x) + (1 - g(x)) * shortcut(x))

    Args:
        in_channel: Number of channels of the incoming feature map.
        out_channel: Number of channels produced by the block.
        stride: Stride of the first convolution; when it is not 1 the gate is
            computed by ``gating_down`` so that it matches the subsampled,
            channel-padded shortcut.
    """

    def __init__(self, in_channel, out_channel, stride):
        super().__init__(in_channel, out_channel, stride)
        # nn.Conv2d's `bias` argument is only an on/off flag, so the intended
        # initial bias of -6 has to be written explicitly after construction.
        # g(x) = sigmoid(Wx + b); used when input and output shapes agree.
        self.gating = nn.Conv2d(out_channel, out_channel, kernel_size=1, stride=1, bias=True)
        # Same gate for the down-sampling case (channel count and spatial
        # size change).
        self.gating_down = nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=stride, bias=True)
        for gate_conv in (self.gating, self.gating_down):
            nn.init.constant_(gate_conv.bias, -6.0)

    def forward(self, x):
        """Blend the residual branch and the shortcut with complementary gates."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(hidden))

        if self.stride != 1:
            gate = torch.sigmoid(self.gating_down(x))
            shortcut = self.pooling(F.pad(x, (0, 0, 0, 0, 0, self.down_dim)))
        else:
            gate = torch.sigmoid(self.gating(x))
            shortcut = x

        # Residual scaled by g(x), shortcut scaled by 1 - g(x).
        out = gate * residual + (1 - gate) * shortcut
        return F.relu(out)

In [None]:
class ShortCutOnlyGating(ExclusiveGating):
    """Residual unit that gates only the shortcut branch.

    The residual branch is left unscaled and the shortcut is multiplied
    element-wise by ``1 - g(x)``, where ``g(x)`` is the sigmoid of the
    inherited 1x1 gating convolution:

        out = ReLU(F(x) + (1 - g(x)) * shortcut(x))
    """

    def __init__(self, in_channel, out_channel, stride):
        super().__init__(in_channel, out_channel, stride)

    def forward(self, x):
        """Add the gated shortcut to the unscaled residual branch."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(hidden))  # no gate on the residual

        if self.stride != 1:
            gate = torch.sigmoid(self.gating_down(x))
            shortcut = self.pooling(F.pad(x, (0, 0, 0, 0, 0, self.down_dim)))
        else:
            gate = torch.sigmoid(self.gating(x))
            shortcut = x

        # The gate must multiply the shortcut signal; adding 1 - g(x) alone
        # would drop the input from the shortcut path entirely.
        out = residual + (1 - gate) * shortcut
        return F.relu(out)

In [None]:
class ConvShortcut(OriginalBlock):
    """Residual unit whose shortcut is a 1x1 convolution instead of identity."""

    def __init__(self, in_channel, out_channel, stride):
        super().__init__(in_channel, out_channel, stride)
        # Shortcut used when stride == 1.
        self.conv_shortcut = nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=1)
        # Shortcut used when stride != 1; it applies the block stride.
        self.conv_shortcut_down = nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=stride)

    def forward(self, x):
        """Return ReLU(residual(x) + conv1x1(x))."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(hidden))

        projection = self.conv_shortcut if self.stride == 1 else self.conv_shortcut_down
        return F.relu(residual + projection(x))

In [None]:
class DropoutShortcut(OriginalBlock):
    """Residual unit that applies dropout (p=0.5) to the shortcut branch.

    Dropout follows the module's train/eval mode: it is active after
    ``train()`` and disabled after ``eval()``.
    """

    def __init__(self, in_channel, out_channel, stride):
        super().__init__(in_channel, out_channel, stride)

    def forward(self, x):
        """Return ReLU(residual(x) + dropout(shortcut(x)))."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(hidden))

        if self.stride != 1:
            shortcut = self.pooling(F.pad(x, (0, 0, 0, 0, 0, self.down_dim)))
        else:
            shortcut = x

        # F.dropout defaults to training=True, which would keep dropping
        # activations during evaluation; tie it to the module mode instead.
        shortcut = F.dropout(shortcut, p=0.5, training=self.training)
        return F.relu(residual + shortcut)

### Forward 순서 변경에 따른 Block 구분

In [None]:
class OriginalBlock_BNAfterAddition(OriginalBlock):
    """Conv -> BN -> ReLU -> Conv -> shortcut addition -> BN -> ReLU."""

    def forward(self, x):
        """Apply the second batch norm after the shortcut has been added."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        residual = self.conv2(hidden)

        if self.stride == 1:
            shortcut = x
        else:
            shortcut = self.pooling(F.pad(x, (0, 0, 0, 0, 0, self.down_dim)))

        summed = residual + shortcut
        return F.relu(self.bn2(summed))

In [None]:
class OriginalBlock_ReLUBeforeAddition(OriginalBlock):
    """Conv -> BN -> ReLU -> Conv -> BN -> ReLU, then shortcut addition."""

    def forward(self, x):
        """Apply the final ReLU to the residual branch only, then add the shortcut."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        residual = F.relu(self.bn2(self.conv2(hidden)))

        if self.stride == 1:
            return residual + x
        return residual + self.pooling(F.pad(x, (0, 0, 0, 0, 0, self.down_dim)))

In [None]:
class OriginalBlock_ReLUOnlyPreActivation(OriginalBlock):
    """ReLU -> Conv -> BN -> ReLU -> Conv -> BN, then shortcut addition."""

    def forward(self, x):
        """Pre-activate the residual branch with ReLU; the shortcut uses the raw input."""
        activated = F.relu(x)
        hidden = F.relu(self.bn1(self.conv1(activated)))
        residual = self.bn2(self.conv2(hidden))

        if self.stride == 1:
            shortcut = x
        else:
            shortcut = self.pooling(F.pad(x, (0, 0, 0, 0, 0, self.down_dim)))
        return residual + shortcut

In [None]:
class OriginalBlock_FullPreActivation(OriginalBlock):
    """BN -> ReLU -> Conv -> BN -> ReLU -> Conv, then shortcut addition."""

    def __init__(self, in_channel, out_channel, stride):
        super().__init__(in_channel, out_channel, stride)
        # bn1 runs before conv1 in this variant, so it normalises the input
        # channels rather than the output channels.
        self.bn1 = nn.BatchNorm2d(in_channel)

    def forward(self, x):
        """Run both BN-ReLU pairs ahead of their convolutions, then add the shortcut."""
        pre = F.relu(self.bn1(x))
        mid = F.relu(self.bn2(self.conv1(pre)))
        residual = self.conv2(mid)

        if self.stride == 1:
            return residual + x
        return residual + self.pooling(F.pad(x, (0, 0, 0, 0, 0, self.down_dim)))

In [None]:
# Whole-network class: stem convolution, three residual stages, classifier.
class ResNet(nn.Module):
    """ResNet assembled from a configurable residual block.

    Args:
        block: Callable ``block(in_channels, out_channels, stride)`` returning
            the module used for every residual unit.
        num_blocks: Sequence of three ints, the number of units per stage.
        num_classes: Size of the classifier output (default 10).
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Channel count fed to the next block; advanced by _make_layer.
        self.in_planes = 16

        # Stem: one 3x3 convolution mapping the 3 input channels to 16
        # feature maps, followed by batch norm.
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)

        # Stages of width 16, 32 and 64; the last two start with stride 2.
        self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=2)

        self.linear = nn.Linear(64, num_classes)
        # Kept as an attribute but not applied in forward().
        self.softmax = nn.Softmax(1)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage: the first block uses ``stride``, the rest use 1."""
        block_strides = [stride]
        block_strides.extend(1 for _ in range(num_blocks - 1))

        stage = []
        for block_stride in block_strides:
            stage.append(block(self.in_planes, planes, block_stride))
            # Later blocks in the stage (and the next stage) receive
            # `planes` channels.
            self.in_planes = planes
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return the classifier output for a batch of images (no softmax applied)."""
        features = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            features = stage(features)
        # 8x8 average pooling, then flatten to (batch, features) for the
        # linear layer.
        pooled = F.avg_pool2d(features, 8)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)

#### ResNet20 with Original

In [None]:
# ResNet-20 factory
def ResNet20():
    """Build a ResNet with three stages of three OriginalBlock units each."""
    # Every block holds two convolutions and each of the three stages has
    # three blocks.
    blocks_per_stage = [3, 3, 3]
    return ResNet(OriginalBlock, blocks_per_stage)

In [None]:
# TensorBoard writer used to log the training and evaluation curves
from torch.utils.tensorboard import SummaryWriter
# Events for the OriginalBlock experiment are written under ./runs/OriginalBlock
writer = SummaryWriter('./runs/OriginalBlock')

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Instantiate the network
net = ResNet20()

# Move the network to the selected device
net = net.to(device)

learning_rate = 0.1
# NOTE(review): the name says resnet110 although this cell builds ResNet20 —
# confirm before renaming, since existing checkpoints may use this name.
file_name = 'resnet110_original.pth'

# Loss function: cross-entropy
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0001)

# Training step definition

def train(epoch):
    """Run one training epoch over ``train_loader`` and log epoch metrics.

    Uses the module-level ``net``, ``optimizer``, ``criterion``, ``device``
    and ``writer``. One "Train Loss" (average per-sample loss) and one
    "Train Error" (percent misclassified) scalar are written per epoch, so
    each TensorBoard step holds a single point.

    Args:
        epoch: Epoch index, used for the printed header and as the
            TensorBoard step.
    """
    print('Epoch: %d'%epoch)
    net.train()
    loss_sum = 0.0  # running sum of per-sample losses
    correct = 0
    total = 0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()

        outputs = net(inputs)
        loss = criterion(outputs, targets)
        # Back-propagate the loss and update the weights
        loss.backward()
        optimizer.step()

        batch_size = targets.size(0)
        # Assumes criterion returns the batch mean (the nn.CrossEntropyLoss
        # default); weighting by batch size keeps the epoch average
        # per-sample even when the last batch is smaller.
        loss_sum += loss.item() * batch_size
        total += batch_size
        _, predicted = outputs.max(1)
        correct += predicted.eq(targets).sum().item()

    if total == 0:
        # Empty loader: nothing to average or log.
        return

    average_loss = loss_sum / total
    error = ((total - correct) / total) * 100
    writer.add_scalar("Train Loss", average_loss, epoch)
    writer.add_scalar("Train Error", error, epoch)

    print('\nTotal average train accuracy:', correct / total)
    print('Total average train loss:', average_loss)

# Evaluation step definition

def test(epoch):
    """Evaluate ``net`` on ``test_loader``, log epoch metrics and save a checkpoint.

    Uses the module-level ``net``, ``criterion``, ``device``, ``writer`` and
    ``file_name``. One "Test Loss" (average per-sample loss) and one
    "Test Error" (percent misclassified) scalar are written per epoch. The
    model's ``state_dict`` is then saved to ``checkpoint/<file_name>``.

    Args:
        epoch: Epoch index, used for the printed header and as the
            TensorBoard step.
    """
    print('\n Test epoch: %d'%epoch)
    net.eval()
    loss_sum = 0.0  # running sum of per-sample losses
    correct = 0
    total = 0

    # Gradients are not needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            batch_size = targets.size(0)
            total += batch_size

            outputs = net(inputs)
            # Assumes criterion returns the batch mean (the
            # nn.CrossEntropyLoss default).
            loss_sum += criterion(outputs, targets).item() * batch_size
            _, predicted = outputs.max(1)
            correct += predicted.eq(targets).sum().item()

    if total > 0:
        average_loss = loss_sum / total
        error = ((total - correct) / total) * 100
        writer.add_scalar("Test Loss", average_loss, epoch)
        writer.add_scalar("Test Error", error, epoch)

        print('\nTotal average test accuarcy:', correct / total)
        print('Total average test loss:', average_loss)

    state = {
        'net': net.state_dict()
    }
    os.makedirs('checkpoint', exist_ok=True)
    # Join with a path separator so the file lands inside the checkpoint
    # directory rather than next to it as "checkpoint<file_name>".
    torch.save(state, os.path.join('checkpoint', file_name))
    print('모델이 저장되었습니다')

In [None]:
import time

def adjust_learning_rate(optimizer, epoch):
    """Set every parameter group's learning rate for the given epoch.

    Starts from the module-level ``learning_rate`` and divides it by 10 once
    ``epoch`` reaches 90 and again once it reaches 137.

    Args:
        optimizer: Object exposing ``param_groups``, a list of dicts with an
            ``'lr'`` entry.
        epoch: Current epoch index.
    """
    # The milestones were derived from 32000 and 48000 iterations at roughly
    # 350 iterations per epoch (45000 samples / batch size 128).
    # NOTE(review): the training set loaded in this notebook may not be
    # 45000 samples — confirm the milestones still match.
    current_lr = learning_rate
    for milestone in (90, 137):
        if epoch >= milestone:
            current_lr /= 10

    for group in optimizer.param_groups:
        group['lr'] = current_lr

start_time = time.time()

# 150 epochs: set the learning rate, train, evaluate, report elapsed seconds.
for epoch in range(0,150):
    adjust_learning_rate(optimizer, epoch)
    train(epoch)
    test(epoch)
    print('\n경과 시간:', time.time()-start_time)

# Flush buffered events and release the log file once the run is finished.
writer.close()

#### ResNet20 with ConstantScaledBlock (0.5 on Shortcut & Residual Function)

In [None]:
# Network definition for this experiment
def ResNet20():
    """Build a ResNet with three stages of three ConstantScaledBlock units each."""
    # Every block holds two convolutions and each of the three stages has
    # three blocks.
    return ResNet(ConstantScaledBlock, [3,3,3])

# Separate TensorBoard run directory for this experiment
writer = SummaryWriter('./runs/ConstantScaledBlock')

# Instantiate the network
net = ResNet20()

# Move the network to the selected device
net = net.to(device)

learning_rate = 0.1
# NOTE(review): the name says resnet110 although this cell builds ResNet20 —
# confirm before renaming.
file_name = 'resnet110_ConstantScaled.pth'

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0001)

start_time = time.time()

for epoch in range(0,150):
    adjust_learning_rate(optimizer, epoch)
    train(epoch)
    test(epoch)
    print('\n경과 시간:', time.time()-start_time)

# Flush buffered events and release the log file once the run is finished.
writer.close()

#### ResNet20 with ExclusiveGating (On Shortcut & Residual Function)

In [None]:
# Network definition for this experiment
def ResNet20():
    """Build a ResNet with three stages of three ExclusiveGating units each."""
    # Every block holds two convolutions and each of the three stages has
    # three blocks.
    return ResNet(ExclusiveGating, [3,3,3])

# Separate TensorBoard run directory for this experiment
writer = SummaryWriter('./runs/ExclusiveGatingBlock')

# Instantiate the network
net = ResNet20()

# Move the network to the selected device
net = net.to(device)

learning_rate = 0.1
# NOTE(review): the name says resnet110 although this cell builds ResNet20 —
# confirm before renaming.
file_name = 'resnet110_ExclusiveGating.pth'

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0001)

start_time = time.time()

for epoch in range(0,150):
    adjust_learning_rate(optimizer, epoch)
    train(epoch)
    test(epoch)
    print('\n경과 시간:', time.time()-start_time)

# Flush buffered events and release the log file once the run is finished.
writer.close()

#### ResNet20 with ExclusiveGating (Only Shortcut)

In [None]:
# Network definition for this experiment
def ResNet20():
    """Build a ResNet with three stages of three ShortCutOnlyGating units each."""
    # Every block holds two convolutions and each of the three stages has
    # three blocks.
    return ResNet(ShortCutOnlyGating, [3,3,3])

# Separate TensorBoard run directory for this experiment
writer = SummaryWriter('./runs/ShortCutGatingBlock')

# Instantiate the network
net = ResNet20()

# Move the network to the selected device
net = net.to(device)

learning_rate = 0.1
# NOTE(review): the name says resnet110 although this cell builds ResNet20 —
# confirm before renaming.
file_name = 'resnet110_ShortCutGating.pth'

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0001)

start_time = time.time()

for epoch in range(0,150):
    adjust_learning_rate(optimizer, epoch)
    train(epoch)
    test(epoch)
    print('\n경과 시간:', time.time()-start_time)

# Flush buffered events and release the log file once the run is finished.
writer.close()

#### ResNet20 with ConvShortcut

In [None]:
# Network definition for this experiment
def ResNet20():
    """Build a ResNet with three stages of three ConvShortcut units each."""
    # Every block holds two convolutions and each of the three stages has
    # three blocks.
    return ResNet(ConvShortcut, [3,3,3])

# Separate TensorBoard run directory for this experiment
writer = SummaryWriter('./runs/ConvShortcutBlock')

# Instantiate the network
net = ResNet20()

# Move the network to the selected device
net = net.to(device)

learning_rate = 0.1
# NOTE(review): the name says resnet110 although this cell builds ResNet20 —
# confirm before renaming.
file_name = 'resnet110_ConvShortcut.pth'

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0001)

start_time = time.time()

for epoch in range(0,150):
    adjust_learning_rate(optimizer, epoch)
    train(epoch)
    test(epoch)
    print('\n경과 시간:', time.time()-start_time)

# Flush buffered events and release the log file once the run is finished.
writer.close()

#### ResNet20 with DropoutShortcut

In [None]:
# Network definition for this experiment
def ResNet20():
    """Build a ResNet with three stages of three DropoutShortcut units each."""
    # Every block holds two convolutions and each of the three stages has
    # three blocks.
    return ResNet(DropoutShortcut, [3,3,3])

# Separate TensorBoard run directory for this experiment
writer = SummaryWriter('./runs/DropoutShortcutBlock')

# Instantiate the network
net = ResNet20()

# Move the network to the selected device
net = net.to(device)

learning_rate = 0.1
# NOTE(review): the name says resnet110 although this cell builds ResNet20 —
# confirm before renaming.
file_name = 'resnet110_DropoutShortcut.pth'

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0001)

start_time = time.time()

for epoch in range(0,150):
    adjust_learning_rate(optimizer, epoch)
    train(epoch)
    test(epoch)
    print('\n경과 시간:', time.time()-start_time)

# Flush buffered events and release the log file once the run is finished.
writer.close()

### Forward 순서에 따른 모델 실험

- Forward step에 따라 다른 model 구성
        1. BN After Addition
        2. ReLu Before Addition
        3. ReLu Only Pre-Activation
        4. Full Pre-Activation

#### ResNet20 with BN After Addition

In [None]:
# Network definition for this experiment
def ResNet20():
    """Build a ResNet with three stages of three OriginalBlock_BNAfterAddition units each."""
    # Every block holds two convolutions and each of the three stages has
    # three blocks.
    return ResNet(OriginalBlock_BNAfterAddition, [3,3,3])

# Separate TensorBoard run directory for this experiment
writer = SummaryWriter('./runs/BN_after_Addition')

# Instantiate the network
net = ResNet20()

# Move the network to the selected device
net = net.to(device)

learning_rate = 0.1
file_name = 'resnet20_BNafterAddition.pth'

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0001)

start_time = time.time()

for epoch in range(0,150):
    adjust_learning_rate(optimizer, epoch)
    train(epoch)
    test(epoch)
    print('\n경과 시간:', time.time()-start_time)

# Flush buffered events and release the log file once the run is finished.
writer.close()

#### ResNet20 with ReLu Before Addition

In [None]:
# Network definition for this experiment
def ResNet20():
    """Build a ResNet with three stages of three OriginalBlock_ReLUBeforeAddition units each."""
    # Every block holds two convolutions and each of the three stages has
    # three blocks.
    return ResNet(OriginalBlock_ReLUBeforeAddition, [3,3,3])

# Separate TensorBoard run directory for this experiment
writer = SummaryWriter('./runs/ReLu_Before_Addition')

# Instantiate the network
net = ResNet20()

# Move the network to the selected device
net = net.to(device)

learning_rate = 0.1
file_name = 'resnet20_RelubeforeAddition.pth'

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0001)

start_time = time.time()

for epoch in range(0,150):
    adjust_learning_rate(optimizer, epoch)
    train(epoch)
    test(epoch)
    print('\n경과 시간:', time.time()-start_time)

# Flush buffered events and release the log file once the run is finished.
writer.close()

#### ResNet20 with ReLu Only Pre-Activation

In [None]:
# Network definition for this experiment
def ResNet20():
    """Build a ResNet with three stages of three OriginalBlock_ReLUOnlyPreActivation units each."""
    # Every block holds two convolutions and each of the three stages has
    # three blocks.
    return ResNet(OriginalBlock_ReLUOnlyPreActivation, [3,3,3])

# Separate TensorBoard run directory for this experiment
writer = SummaryWriter('./runs/ReLu_Only_PreActivation')

# Instantiate the network
net = ResNet20()

# Move the network to the selected device
net = net.to(device)

learning_rate = 0.1
file_name = 'resnet20_ReluonlyPreActivation.pth'

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0001)

start_time = time.time()

for epoch in range(0,150):
    adjust_learning_rate(optimizer, epoch)
    train(epoch)
    test(epoch)
    print('\n경과 시간:', time.time()-start_time)

# Flush buffered events and release the log file once the run is finished.
writer.close()

#### ResNet20 with Full Pre-Activation

In [None]:
# Network definition for this experiment
def ResNet20():
    """Build a ResNet with three stages of three OriginalBlock_FullPreActivation units each."""
    # Every block holds two convolutions and each of the three stages has
    # three blocks.
    return ResNet(OriginalBlock_FullPreActivation, [3,3,3])

# Separate TensorBoard run directory for this experiment
writer = SummaryWriter('./runs/Full_PreActivation')

# Instantiate the network
net = ResNet20()

# Move the network to the selected device
net = net.to(device)

learning_rate = 0.1
file_name = 'resnet20_Fullpreactivation.pth'

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0001)

start_time = time.time()

for epoch in range(0,150):
    adjust_learning_rate(optimizer, epoch)
    train(epoch)
    test(epoch)
    print('\n경과 시간:', time.time()-start_time)

# Flush buffered events and release the log file once the run is finished.
writer.close()

#### ResNet20 with Conv Shortcut & Full Pre-Activation

In [None]:
class ConvShortcut_FullPA_Block(OriginalBlock):
    """Full pre-activation residual unit with a 1x1 convolution shortcut.

    Order of operations: BN -> ReLU -> Conv -> BN -> ReLU -> Conv, then the
    1x1-convolved input is added. No activation follows the addition, in
    line with the full pre-activation ordering.

    Args:
        in_channel: Number of channels of the incoming feature map.
        out_channel: Number of channels produced by the block.
        stride: Stride of the first convolution and of the down-sampling
            shortcut.
    """

    def __init__(self, in_channel, out_channel, stride):
        super().__init__(in_channel, out_channel, stride)
        # bn1 runs before conv1 here, so it normalises the input channels.
        self.bn1 = nn.BatchNorm2d(in_channel)
        # Shortcut used when stride == 1.
        self.conv_shortcut = nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=1)
        # Shortcut used when stride != 1; it applies the block stride.
        self.conv_shortcut_down = nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=stride)

    def forward(self, x):
        """Return residual(x) + conv1x1(x) with no post-addition activation."""
        out = F.relu(self.bn1(x))
        out = self.conv1(out)
        out = F.relu(self.bn2(out))
        out = self.conv2(out)

        if self.stride != 1:
            out = out + self.conv_shortcut_down(x)
        else:
            out = out + self.conv_shortcut(x)
        # A ReLU here would put an activation after the addition, which
        # full pre-activation does not have.
        return out

In [None]:
# Network definition for this experiment
def ResNet20():
    """Build a ResNet with three stages of three ConvShortcut_FullPA_Block units each."""
    # Every block holds two convolutions and each of the three stages has
    # three blocks.
    return ResNet(ConvShortcut_FullPA_Block, [3,3,3])

# Separate TensorBoard run directory for this experiment
writer = SummaryWriter('./runs/Conv&Full_PreActivation')

# Instantiate the network
net = ResNet20()

# Move the network to the selected device
net = net.to(device)

learning_rate = 0.1
# Distinct checkpoint name: 'resnet20_Fullpreactivation.pth' is already used
# by the Full Pre-Activation experiment and would be overwritten.
file_name = 'resnet20_ConvShortcut_Fullpreactivation.pth'

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0001)

start_time = time.time()

for epoch in range(0,150):
    adjust_learning_rate(optimizer, epoch)
    train(epoch)
    test(epoch)
    print('\n경과 시간:', time.time()-start_time)

# Flush buffered events and release the log file once the run is finished.
writer.close()