In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## 실험을 CPU에서? GPU에서?

In [2]:
# Use the first CUDA GPU when PyTorch reports one; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


## 비교대상: BN없는 모델

In [3]:
class MyNet(nn.Module):
    """Baseline 784-100-100-10 fully connected classifier without batch norm.

    Every ``nn.Linear`` weight is drawn with He (Kaiming) normal initialization
    and every bias is set to the constant 0.01.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 10)
        # nn.Module.apply visits every submodule, so all three linear layers
        # are re-initialized as soon as the model is constructed.
        self.apply(self._init_weights)

    def _init_weights(self, submodule):
        """Initialize one submodule; anything that is not ``nn.Linear`` is left untouched."""
        if isinstance(submodule, nn.Linear):
            # He initialization. 'relu' yields the same sqrt(2) gain as the
            # library default (leaky_relu with slope 0); it is spelled out here
            # because the layers are followed by ReLU.
            nn.init.kaiming_normal_(submodule.weight, nonlinearity='relu')
            if submodule.bias is not None:
                # nn.init fills the tensor under no_grad instead of going
                # through the discouraged `.data` attribute.
                nn.init.constant_(submodule.bias, 0.01)

    def forward(self, x):
        """Return per-class log-probabilities.

        Args:
            x: input whose last dimension is 784; it must have at least two
               dimensions because log_softmax is taken over dim=1.

        Returns:
            Tensor of log-probabilities with 10 entries along the last dimension.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        # F.cross_entropy = F.log_softmax + F.nll_loss, so this output must be
        # paired with F.nll_loss; drop the log_softmax here if cross_entropy is
        # used downstream instead.
        return F.log_softmax(x, dim=1)

## BN이 적용된 모델

In [4]:
class MyNet_BN(nn.Module):
    """784-100-100-10 fully connected classifier with batch norm on both hidden layers.

    Every ``nn.Linear`` weight is drawn with He (Kaiming) normal initialization
    and every bias is set to the constant 0.01. The batch-norm layers keep
    whatever initialization ``nn.BatchNorm1d`` gives them.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 10)
        self.bn1 = nn.BatchNorm1d(100)  # normalizes the output of fc1 (input -> hidden1)
        self.bn2 = nn.BatchNorm1d(100)  # normalizes the output of fc2 (hidden1 -> hidden2)
        # No batch norm on the output layer: per the original author's note it
        # is not commonly used there.
        # nn.Module.apply visits every submodule; only the linear layers are
        # touched by _init_weights.
        self.apply(self._init_weights)

    def _init_weights(self, submodule):
        """Initialize one submodule; anything that is not ``nn.Linear`` is left untouched."""
        if isinstance(submodule, nn.Linear):
            # He initialization. 'relu' yields the same sqrt(2) gain as the
            # library default (leaky_relu with slope 0); it is spelled out here
            # because the layers feed a ReLU.
            nn.init.kaiming_normal_(submodule.weight, nonlinearity='relu')
            if submodule.bias is not None:
                # nn.init fills the tensor under no_grad instead of going
                # through the discouraged `.data` attribute.
                nn.init.constant_(submodule.bias, 0.01)

    def forward(self, x):
        """Return per-class log-probabilities.

        Args:
            x: 2-D input of shape (batch, 784). In training mode
               ``nn.BatchNorm1d`` needs more than one sample per batch.

        Returns:
            Tensor of shape (batch, 10) holding log-probabilities.
        """
        # Batch norm goes after the affine (matrix-multiply) step and before
        # the activation.
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.fc3(x)
        # F.cross_entropy = F.log_softmax + F.nll_loss, so this output must be
        # paired with F.nll_loss; drop the log_softmax here if cross_entropy is
        # used downstream instead.
        return F.log_softmax(x, dim=1)

## model 생성

In [5]:
# Build the plain network first, then the batch-norm variant, and move both
# to the selected device.
model, model_bn = MyNet(), MyNet_BN()
model, model_bn = model.to(device), model_bn.to(device)

In [13]:
# Display the learnable scale parameter of the first batch-norm layer.
# NOTE(review): this cell's execution count (13) is later than the training
# cells (9 and 10), so the values shown below were presumably captured after
# training rather than right after construction — re-run in order to confirm.
model_bn.bn1.weight

Parameter containing:
tensor([1.0134, 0.9590, 1.0369, 1.0102, 1.0180, 1.0287, 1.0283, 0.9790, 0.9533,
        0.9470, 1.0323, 1.0334, 0.9725, 1.0355, 1.0009, 0.9471, 1.0617, 0.9284,
        0.9864, 1.0751, 0.9663, 1.0070, 1.0738, 0.9379, 1.0808, 0.9841, 0.9952,
        0.9498, 1.0096, 0.9415, 1.0185, 1.0375, 0.9310, 0.9785, 1.0189, 1.0098,
        0.9683, 0.9963, 0.9988, 0.9895, 0.9608, 1.0258, 1.0632, 0.9668, 1.0284,
        1.0814, 0.9274, 1.0098, 0.9484, 0.9672, 1.0444, 0.9983, 0.9513, 0.9975,
        1.0273, 0.9476, 0.9817, 0.9676, 0.9429, 1.0633, 0.9372, 0.9775, 0.9243,
        0.9977, 1.0380, 1.0500, 1.0208, 1.0039, 0.9781, 0.9932, 1.0152, 1.0636,
        0.9608, 0.9651, 0.9950, 1.0031, 1.0339, 1.0499, 0.9567, 0.9936, 1.0434,
        1.0182, 0.9872, 1.0502, 0.9399, 0.9832, 1.0120, 1.0093, 1.0263, 1.0493,
        1.0531, 0.9509, 0.9541, 1.0821, 0.9933, 0.9631, 0.9903, 1.0476, 0.9959,
        0.9461], requires_grad=True)

## optimizer 생성

In [6]:
# One Adam optimizer per network; both use the same learning rate (2e-4).
opt, opt_bn = (optim.Adam(net.parameters(), lr=2e-4) for net in (model, model_bn))

In [7]:
# Directory that holds the MNIST files.
data_path = 'data'
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of `if not os.path.exists(...)`.
os.makedirs(data_path, exist_ok=True)

# Preprocessing applied to every image: convert to a tensor, then normalize
# with mean 0.1307 and std 0.3081.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# The training split triggers the download. The test split is opened with
# download=False, so it relies on its files already being under data_path
# (presumably fetched by the call above — verify for the installed torchvision).
trn_dset = datasets.MNIST(root=data_path, train=True, transform=transform, download=True)
tst_dset = datasets.MNIST(root=data_path, train=False, transform=transform, download=False)

In [8]:
# Mini-batch size shared by both loaders (2**8 = 256).
batch_size = 256
# Settings common to both loaders; drop_last=False keeps the final partial batch.
loader_kwargs = dict(batch_size=batch_size, drop_last=False)
# Training batches are reshuffled every epoch; test batches keep dataset order.
trn_loader = DataLoader(trn_dset, shuffle=True, **loader_kwargs)
tst_loader = DataLoader(tst_dset, shuffle=False, **loader_kwargs)

# BN이 없는 모델

In [9]:
# Train the network without batch norm and evaluate it on the test set after
# every epoch.
n_epochs = 10
for epoch in range(n_epochs):
    # Training mode; this matters for layers such as batch norm and dropout
    # that behave differently between training and evaluation.
    model.train()
    for batch_idx, (x_batch, y_batch) in enumerate(trn_loader):
        # Flatten every image into a 784-long vector for the fully connected net.
        x_batch = x_batch.reshape(-1, 784).to(device)
        y_batch = y_batch.to(device)
        opt.zero_grad()
        y_batch_prob = model(x_batch)  # log-probabilities (the model ends in log_softmax)
        loss = F.nll_loss(y_batch_prob, y_batch)
        loss.backward()
        opt.step()
        if (batch_idx + 1) % 100 == 0:
            # Report progress including the batch that was just processed;
            # clamp in case the last batch is smaller than batch_size.
            n_batches_done = batch_idx + 1
            n_samples_done = min(n_batches_done * trn_loader.batch_size,
                                 len(trn_loader.dataset))
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch,
                n_samples_done,
                len(trn_loader.dataset),
                100 * n_batches_done / len(trn_loader),
                loss.item()))
    # End-of-epoch summary; the loss shown is that of the last training batch only.
    print('Train Epoch: {} [{}/{} (100%)]\tLoss: {:.6f}'.format(
        epoch,
        len(trn_loader.dataset),
        len(trn_loader.dataset),
        loss.item()))

    # Evaluation on the full test set.
    model.eval()
    y_pred_list = []
    y_real_list = []
    tst_loss = 0.0
    with torch.no_grad():
        for x_batch, y_batch in tst_loader:
            x_batch = x_batch.reshape(-1, 784).to(device)
            y_batch = y_batch.to(device)
            y_batch_prob = model(x_batch)
            # Predicted class = index of the largest log-probability.
            y_batch_pred = y_batch_prob.argmax(dim=1)
            # Sum (not mean) per batch so the division below gives the exact
            # per-sample average; .item() keeps the accumulator a Python float.
            loss = F.nll_loss(y_batch_prob, y_batch, reduction='sum')
            tst_loss += loss.item()
            # .cpu() is required before .numpy() when the tensors live on a GPU.
            y_pred_list.append(y_batch_pred.cpu().numpy())
            y_real_list.append(y_batch.cpu().numpy())

    y_real = np.concatenate(y_real_list, axis=0)
    y_pred = np.concatenate(y_pred_list, axis=0)
    tst_loss /= y_real.shape[0]
    correct = np.sum(y_real == y_pred)
    accuracy = 100 * correct / len(tst_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        tst_loss,
        correct,
        len(tst_loader.dataset),
        accuracy))


Test set: Average loss: 0.3011, Accuracy: 9160/10000 (92%)


Test set: Average loss: 0.2154, Accuracy: 9369/10000 (94%)


Test set: Average loss: 0.1789, Accuracy: 9484/10000 (95%)


Test set: Average loss: 0.1532, Accuracy: 9554/10000 (96%)


Test set: Average loss: 0.1428, Accuracy: 9570/10000 (96%)


Test set: Average loss: 0.1299, Accuracy: 9608/10000 (96%)


Test set: Average loss: 0.1222, Accuracy: 9633/10000 (96%)


Test set: Average loss: 0.1132, Accuracy: 9664/10000 (97%)


Test set: Average loss: 0.1052, Accuracy: 9683/10000 (97%)


Test set: Average loss: 0.1010, Accuracy: 9687/10000 (97%)



## BN이 있는 모델

In [10]:
# Train the batch-norm network and evaluate it on the test set after every epoch.
n_epochs = 10
for epoch in range(n_epochs):
    # Training mode: the batch-norm layers use per-batch statistics here.
    model_bn.train()
    for batch_idx, (x_batch, y_batch) in enumerate(trn_loader):
        # Flatten every image into a 784-long vector for the fully connected net.
        x_batch = x_batch.reshape(-1, 784).to(device)
        y_batch = y_batch.to(device)
        opt_bn.zero_grad()
        y_batch_prob = model_bn(x_batch)  # log-probabilities (the model ends in log_softmax)
        loss = F.nll_loss(y_batch_prob, y_batch)
        loss.backward()
        opt_bn.step()
        if (batch_idx + 1) % 100 == 0:
            # Report progress including the batch that was just processed;
            # clamp in case the last batch is smaller than batch_size.
            n_batches_done = batch_idx + 1
            n_samples_done = min(n_batches_done * trn_loader.batch_size,
                                 len(trn_loader.dataset))
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch,
                n_samples_done,
                len(trn_loader.dataset),
                100 * n_batches_done / len(trn_loader),
                loss.item()))
    # End-of-epoch summary; the loss shown is that of the last training batch only.
    print('Train Epoch: {} [{}/{} (100%)]\tLoss: {:.6f}'.format(
        epoch,
        len(trn_loader.dataset),
        len(trn_loader.dataset),
        loss.item()))

    # Evaluation mode: the batch-norm layers switch to their running statistics.
    model_bn.eval()
    y_pred_list = []
    y_real_list = []
    tst_loss = 0.0
    with torch.no_grad():
        for x_batch, y_batch in tst_loader:
            x_batch = x_batch.reshape(-1, 784).to(device)
            y_batch = y_batch.to(device)
            y_batch_prob = model_bn(x_batch)
            # Predicted class = index of the largest log-probability.
            y_batch_pred = y_batch_prob.argmax(dim=1)
            # Sum (not mean) per batch so the division below gives the exact
            # per-sample average; .item() keeps the accumulator a Python float.
            loss = F.nll_loss(y_batch_prob, y_batch, reduction='sum')
            tst_loss += loss.item()
            # .cpu() is required before .numpy() when the tensors live on a GPU.
            y_pred_list.append(y_batch_pred.cpu().numpy())
            y_real_list.append(y_batch.cpu().numpy())

    y_real = np.concatenate(y_real_list, axis=0)
    y_pred = np.concatenate(y_pred_list, axis=0)
    tst_loss /= y_real.shape[0]
    correct = np.sum(y_real == y_pred)
    accuracy = 100 * correct / len(tst_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        tst_loss,
        correct,
        len(tst_loader.dataset),
        accuracy))


Test set: Average loss: 0.3989, Accuracy: 9048/10000 (90%)


Test set: Average loss: 0.2535, Accuracy: 9346/10000 (93%)


Test set: Average loss: 0.1949, Accuracy: 9469/10000 (95%)


Test set: Average loss: 0.1631, Accuracy: 9547/10000 (95%)


Test set: Average loss: 0.1429, Accuracy: 9608/10000 (96%)


Test set: Average loss: 0.1282, Accuracy: 9633/10000 (96%)


Test set: Average loss: 0.1159, Accuracy: 9664/10000 (97%)


Test set: Average loss: 0.1088, Accuracy: 9686/10000 (97%)


Test set: Average loss: 0.1028, Accuracy: 9696/10000 (97%)


Test set: Average loss: 0.0959, Accuracy: 9712/10000 (97%)



# 과제

<span style = 'font-size:1.2em;line-height:1.5em'>1. BatchNorm을 하면 initialization에 크게 신경쓰지 않아도 되고, learning rate를 좀 크게 해도 관계 없다고 합니다. 실제로 그런지 확인해볼까요? Weight Initialization을 평균이 0, 표준편차가 0.2인 정규분포에서 random으로 추출하도록 하고, learning_rate를 0.01로 하고 실험을 해봅시다. BN을 했을 때와 하지 않았을 때를 비교해보세요. Epoch에 따라 training_error, test_error, test_accuracy를 모니터링한 결과를 알려주세요.</span>

### BN이 적용되지 않은 모델

In [11]:
class MyNet(nn.Module):
    """784-100-100-10 fully connected classifier without batch norm, normal-initialized.

    Every ``nn.Linear`` weight is drawn from a zero-mean normal distribution and
    every bias is set to the constant 0.01.

    Args:
        init_std: standard deviation of the normal distribution used for the
            linear weights. Defaults to 0.2, the value used in this experiment.
    """

    def __init__(self, init_std=0.2):
        super().__init__()
        # Must be stored before self.apply(...) because _init_weights reads it.
        self.init_std = init_std
        self.fc1 = nn.Linear(784, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 10)
        # nn.Module.apply visits every submodule, so all three linear layers
        # are re-initialized as soon as the model is constructed.
        self.apply(self._init_weights)

    def _init_weights(self, submodule):
        """Initialize one submodule; anything that is not ``nn.Linear`` is left untouched."""
        if isinstance(submodule, nn.Linear):
            nn.init.normal_(submodule.weight, mean=0.0, std=self.init_std)
            if submodule.bias is not None:
                # nn.init fills the tensor under no_grad instead of going
                # through the discouraged `.data` attribute.
                nn.init.constant_(submodule.bias, 0.01)

    def forward(self, x):
        """Return per-class log-probabilities.

        Args:
            x: input whose last dimension is 784; it must have at least two
               dimensions because log_softmax is taken over dim=1.

        Returns:
            Tensor of log-probabilities with 10 entries along the last dimension.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        # F.cross_entropy = F.log_softmax + F.nll_loss, so this output must be
        # paired with F.nll_loss; drop the log_softmax here if cross_entropy is
        # used downstream instead.
        return F.log_softmax(x, dim=1)

### BN이 적용된 모델 

In [14]:
class MyNet_BN(nn.Module):
    """784-100-100-10 fully connected classifier with batch norm, normal-initialized.

    Every ``nn.Linear`` weight is drawn from a zero-mean normal distribution and
    every bias is set to the constant 0.01. The batch-norm layers keep whatever
    initialization ``nn.BatchNorm1d`` gives them.

    Args:
        init_std: standard deviation of the normal distribution used for the
            linear weights. Defaults to 0.2, the value used in this experiment.
    """

    def __init__(self, init_std=0.2):
        super().__init__()
        # Must be stored before self.apply(...) because _init_weights reads it.
        self.init_std = init_std
        self.fc1 = nn.Linear(784, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 10)
        self.bn1 = nn.BatchNorm1d(100)  # normalizes the output of fc1
        self.bn2 = nn.BatchNorm1d(100)  # normalizes the output of fc2
        # No batch-norm layer is attached to the 10-unit output layer.
        # nn.Module.apply visits every submodule; only the linear layers are
        # touched by _init_weights.
        self.apply(self._init_weights)

    def _init_weights(self, submodule):
        """Initialize one submodule; anything that is not ``nn.Linear`` is left untouched."""
        if isinstance(submodule, nn.Linear):
            nn.init.normal_(submodule.weight, mean=0.0, std=self.init_std)
            if submodule.bias is not None:
                # nn.init fills the tensor under no_grad instead of going
                # through the discouraged `.data` attribute.
                nn.init.constant_(submodule.bias, 0.01)

    def forward(self, x):
        """Return per-class log-probabilities.

        Args:
            x: 2-D input of shape (batch, 784). In training mode
               ``nn.BatchNorm1d`` needs more than one sample per batch.

        Returns:
            Tensor of shape (batch, 10) holding log-probabilities.
        """
        # Batch norm sits between each affine layer and its ReLU.
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.fc3(x)
        # F.cross_entropy = F.log_softmax + F.nll_loss, so this output must be
        # paired with F.nll_loss; drop the log_softmax here if cross_entropy is
        # used downstream instead.
        return F.log_softmax(x, dim=1)

### 모델 생성

In [15]:
# Re-create both networks from the redefined classes (plain first, then the
# batch-norm variant) and move them to the selected device.
model, model_bn = MyNet(), MyNet_BN()
model, model_bn = model.to(device), model_bn.to(device)

### optimizer 생성

In [16]:
# Fresh Adam optimizers for the new networks, both with the larger learning
# rate (0.01) that the assignment asks for.
opt, opt_bn = (optim.Adam(net.parameters(), lr=0.01) for net in (model, model_bn))

### Train 

In [17]:
# Assignment run: train the network without batch norm (normal init, lr 0.01)
# and evaluate it on the test set after every epoch.
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    for batch_idx, (x_batch, y_batch) in enumerate(trn_loader):
        # Flatten every image into a 784-long vector for the fully connected net.
        x_batch = x_batch.reshape(-1, 784).to(device)
        y_batch = y_batch.to(device)
        opt.zero_grad()
        y_batch_prob = model(x_batch)  # log-probabilities (the model ends in log_softmax)
        loss = F.nll_loss(y_batch_prob, y_batch)
        loss.backward()
        opt.step()
        if (batch_idx + 1) % 100 == 0:
            # Report progress including the batch that was just processed;
            # clamp in case the last batch is smaller than batch_size.
            n_batches_done = batch_idx + 1
            n_samples_done = min(n_batches_done * trn_loader.batch_size,
                                 len(trn_loader.dataset))
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch,
                n_samples_done,
                len(trn_loader.dataset),
                100 * n_batches_done / len(trn_loader),
                loss.item()))
    # End-of-epoch summary; the loss shown is that of the last training batch only.
    print('Train Epoch: {} [{}/{} (100%)]\tLoss: {:.6f}'.format(
        epoch,
        len(trn_loader.dataset),
        len(trn_loader.dataset),
        loss.item()))

    # Evaluation on the full test set.
    model.eval()
    y_pred_list = []
    y_real_list = []
    tst_loss = 0.0
    with torch.no_grad():
        for x_batch, y_batch in tst_loader:
            x_batch = x_batch.reshape(-1, 784).to(device)
            y_batch = y_batch.to(device)
            y_batch_prob = model(x_batch)
            # Predicted class = index of the largest log-probability.
            y_batch_pred = y_batch_prob.argmax(dim=1)
            # Sum (not mean) per batch so the division below gives the exact
            # per-sample average; .item() keeps the accumulator a Python float.
            loss = F.nll_loss(y_batch_prob, y_batch, reduction='sum')
            tst_loss += loss.item()
            # .cpu() is required before .numpy() when the tensors live on a GPU.
            y_pred_list.append(y_batch_pred.cpu().numpy())
            y_real_list.append(y_batch.cpu().numpy())

    y_real = np.concatenate(y_real_list, axis=0)
    y_pred = np.concatenate(y_pred_list, axis=0)
    tst_loss /= y_real.shape[0]
    correct = np.sum(y_real == y_pred)
    accuracy = 100 * correct / len(tst_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        tst_loss,
        correct,
        len(tst_loader.dataset),
        accuracy))


Test set: Average loss: 0.1970, Accuracy: 9397/10000 (94%)


Test set: Average loss: 0.1805, Accuracy: 9453/10000 (95%)


Test set: Average loss: 0.1477, Accuracy: 9565/10000 (96%)


Test set: Average loss: 0.1491, Accuracy: 9598/10000 (96%)


Test set: Average loss: 0.1414, Accuracy: 9612/10000 (96%)


Test set: Average loss: 0.1514, Accuracy: 9607/10000 (96%)


Test set: Average loss: 0.1554, Accuracy: 9594/10000 (96%)


Test set: Average loss: 0.1396, Accuracy: 9627/10000 (96%)


Test set: Average loss: 0.1440, Accuracy: 9628/10000 (96%)


Test set: Average loss: 0.1233, Accuracy: 9691/10000 (97%)



In [18]:
# Assignment run: train the batch-norm network (normal init, lr 0.01) and
# evaluate it on the test set after every epoch.
n_epochs = 10
for epoch in range(n_epochs):
    # Training mode: the batch-norm layers use per-batch statistics here.
    model_bn.train()
    for batch_idx, (x_batch, y_batch) in enumerate(trn_loader):
        # Flatten every image into a 784-long vector for the fully connected net.
        x_batch = x_batch.reshape(-1, 784).to(device)
        y_batch = y_batch.to(device)
        opt_bn.zero_grad()
        y_batch_prob = model_bn(x_batch)  # log-probabilities (the model ends in log_softmax)
        loss = F.nll_loss(y_batch_prob, y_batch)
        loss.backward()
        opt_bn.step()
        if (batch_idx + 1) % 100 == 0:
            # Report progress including the batch that was just processed;
            # clamp in case the last batch is smaller than batch_size.
            n_batches_done = batch_idx + 1
            n_samples_done = min(n_batches_done * trn_loader.batch_size,
                                 len(trn_loader.dataset))
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch,
                n_samples_done,
                len(trn_loader.dataset),
                100 * n_batches_done / len(trn_loader),
                loss.item()))
    # End-of-epoch summary; the loss shown is that of the last training batch only.
    print('Train Epoch: {} [{}/{} (100%)]\tLoss: {:.6f}'.format(
        epoch,
        len(trn_loader.dataset),
        len(trn_loader.dataset),
        loss.item()))

    # Evaluation mode: the batch-norm layers switch to their running statistics.
    model_bn.eval()
    y_pred_list = []
    y_real_list = []
    tst_loss = 0.0
    with torch.no_grad():
        for x_batch, y_batch in tst_loader:
            x_batch = x_batch.reshape(-1, 784).to(device)
            y_batch = y_batch.to(device)
            y_batch_prob = model_bn(x_batch)
            # Predicted class = index of the largest log-probability.
            y_batch_pred = y_batch_prob.argmax(dim=1)
            # Sum (not mean) per batch so the division below gives the exact
            # per-sample average; .item() keeps the accumulator a Python float.
            loss = F.nll_loss(y_batch_prob, y_batch, reduction='sum')
            tst_loss += loss.item()
            # .cpu() is required before .numpy() when the tensors live on a GPU.
            y_pred_list.append(y_batch_pred.cpu().numpy())
            y_real_list.append(y_batch.cpu().numpy())

    y_real = np.concatenate(y_real_list, axis=0)
    y_pred = np.concatenate(y_pred_list, axis=0)
    tst_loss /= y_real.shape[0]
    correct = np.sum(y_real == y_pred)
    accuracy = 100 * correct / len(tst_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        tst_loss,
        correct,
        len(tst_loader.dataset),
        accuracy))


Test set: Average loss: 0.1086, Accuracy: 9656/10000 (97%)


Test set: Average loss: 0.0952, Accuracy: 9679/10000 (97%)


Test set: Average loss: 0.0950, Accuracy: 9703/10000 (97%)


Test set: Average loss: 0.0882, Accuracy: 9748/10000 (97%)


Test set: Average loss: 0.0825, Accuracy: 9765/10000 (98%)


Test set: Average loss: 0.0906, Accuracy: 9762/10000 (98%)


Test set: Average loss: 0.0858, Accuracy: 9764/10000 (98%)


Test set: Average loss: 0.0799, Accuracy: 9788/10000 (98%)


Test set: Average loss: 0.1156, Accuracy: 9723/10000 (97%)


Test set: Average loss: 0.0993, Accuracy: 9744/10000 (97%)

