In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Use the first CUDA device when CUDA is available; otherwise run on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [3]:
# Directory that holds the MNIST files; create it on first use.
data_path = 'data'
if not os.path.exists(data_path):
    os.makedirs(data_path)

# Preprocessing pipeline: convert each image to a tensor, then normalize it
# with mean 0.1307 and standard deviation 0.3081.
to_tensor = transforms.ToTensor()
normalize = transforms.Normalize((0.1307,), (0.3081,))
transform = transforms.Compose([to_tensor, normalize])

# Both splits share the root directory and the transform. Only the training
# split is requested with download=True; the test split uses download=False.
mnist_kwargs = dict(root=data_path, transform=transform)
trn_dset = datasets.MNIST(train=True, download=True, **mnist_kwargs)
tst_dset = datasets.MNIST(train=False, download=False, **mnist_kwargs)

# Dropout

<span style = 'font-size:1.4em;line-height:1.5em'>Dropout은 언제 써야될까요?</span>
- <span style = 'font-size:1.2em;line-height:1.5em'>(1) Affine(Matrix Multiplication) - Activation - Dropout?</span>
- <span style = 'font-size:1.2em;line-height:1.5em'>(2) Affine(Matrix Multiplication) - Dropout - Activation?</span>

<span style = 'font-size:1.2em;line-height:1.5em'>확실하게 정해진 건 없습니다. 다만, 일반적으로 ReLU를 activation function으로 쓸 때는 방법 (2)를, 나머지는 방법 (1)을 사용합니다. (ReLU의 경우 dropout은 0 이상의 값을 곱하는 연산이므로 두 순서의 계산 결과는 같습니다.)</span>

참고: https://sebastianraschka.com/faq/docs/dropout-activation.html

In [4]:
batch_size = 2**8  # 256 samples per mini-batch

# Settings shared by both loaders; drop_last=False keeps the final, smaller batch.
loader_kwargs = dict(batch_size=batch_size, drop_last=False)
# Training batches are reshuffled; test batches keep the dataset order.
trn_loader = DataLoader(trn_dset, shuffle=True, **loader_kwargs)
tst_loader = DataLoader(tst_dset, shuffle=False, **loader_kwargs)

In [5]:
class MyNet(nn.Module):
    """Fully connected classifier (784 -> 100 -> 100 -> 10) with dropout.

    Dropout is applied to the outputs of ``fc1`` and ``fc2`` before the ReLU.
    ``forward`` returns log-probabilities (``log_softmax`` over dim 1), so the
    output is meant to be fed to ``F.nll_loss``.

    Args:
        drop_rate: probability passed to ``nn.Dropout``. Defaults to 0.25,
            the value that used to be hard-coded here.
    """

    def __init__(self, drop_rate=0.25):
        super().__init__()
        self.fc1 = nn.Linear(784, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 10)
        self.dropout = nn.Dropout(drop_rate)  # one dropout layer shared by both hidden layers
        # Re-initialize the parameters of every submodule as soon as the model is built.
        self.apply(self._init_weights)

    def _init_weights(self, submodule):
        """Initialize one submodule; only ``nn.Linear`` layers are touched.

        Weights get He (Kaiming normal) initialization and biases, when
        present, are set to the constant 0.01.
        """
        if not isinstance(submodule, nn.Linear):
            return
        nn.init.kaiming_normal_(submodule.weight)
        if submodule.bias is not None:
            # nn.init.constant_ fills in place without going through `.data`.
            nn.init.constant_(submodule.bias, 0.01)

    def forward(self, x):
        """Return log-probabilities over the 10 classes for a (batch, 784) input."""
        x = self.fc1(x)
        x = self.dropout(x)  # with ReLU, dropout is applied before the activation
        x = F.relu(x)
        x = self.fc2(x)
        x = self.dropout(x)  # with ReLU, dropout is applied before the activation
        x = F.relu(x)
        x = self.fc3(x)
        # F.cross_entropy = F.log_softmax + F.nll_loss.
        # To use F.cross_entropy downstream instead, remove the log_softmax here.
        return F.log_softmax(x, dim=1)

In [6]:
# Move the model to `device` before building the optimizer. The training loop
# sends every batch to `device`, so parameters left on the CPU would cause a
# device mismatch whenever CUDA is selected.
model = MyNet().to(device)
my_opt = optim.Adam(params=model.parameters(), lr=2e-4)

In [7]:
n_epochs = 10
log_interval = 100  # report the running training loss every 100 mini-batches
for epoch in range(n_epochs):
    # ---- Training phase ----
    model.train()
    for batch_idx, (x_batch, y_batch) in enumerate(trn_loader):
        x_batch = x_batch.reshape(-1, 784).to(device)  # flatten each sample to 784 features
        y_batch = y_batch.to(device)
        my_opt.zero_grad()
        y_batch_prob = model(x_batch)  # log-probabilities (the model ends in log_softmax)
        loss = F.nll_loss(y_batch_prob, y_batch)
        loss.backward()
        my_opt.step()
        n_done = batch_idx + 1  # mini-batches processed so far in this epoch
        if n_done % log_interval == 0:
            # Report with n_done (not batch_idx) so the sample count and the
            # percentage include the batch that was just processed.
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch,
                                                                           n_done * len(x_batch),
                                                                           len(trn_loader.dataset),
                                                                           100 * n_done / len(trn_loader),
                                                                           loss.item()))
    # Print the result at the end of every epoch (loss of the last mini-batch).
    print('Train Epoch: {} [{}/{} (100%)]\tLoss: {:.6f}'.format(epoch,
                                                                   len(trn_loader.dataset),
                                                                   len(trn_loader.dataset),
                                                                   loss.item()))

    # ---- Evaluation phase ----
    model.eval()
    y_pred_list = []
    y_real_list = []
    tst_loss = 0.0
    with torch.no_grad():
        for batch_idx, (x_batch, y_batch) in enumerate(tst_loader):
            x_batch = x_batch.reshape(-1, 784).to(device)
            y_batch = y_batch.to(device)
            y_batch_prob = model(x_batch)
            # Use the tensor's own argmax rather than calling np.argmax on a torch tensor.
            y_batch_pred = y_batch_prob.argmax(dim=1)
            loss = F.nll_loss(y_batch_prob, y_batch, reduction='sum')
            tst_loss += loss.item()  # accumulate a Python float instead of a tensor

            # .cpu() is needed before .numpy() when the tensors live on a CUDA device.
            y_pred_list.append(y_batch_pred.cpu().numpy())
            y_real_list.append(y_batch.cpu().numpy())

    y_real = np.concatenate(y_real_list, axis=0)
    y_pred = np.concatenate(y_pred_list, axis=0)
    tst_loss /= y_real.shape[0]  # summed loss -> average loss per sample
    correct = np.sum(y_real == y_pred)
    accuracy = 100 * correct / len(tst_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(tst_loss,
                                                                                 correct,
                                                                                 len(tst_loader.dataset),
                                                                                 accuracy))


Test set: Average loss: 0.3731, Accuracy: 8958/10000 (90%)


Test set: Average loss: 0.2670, Accuracy: 9231/10000 (92%)


Test set: Average loss: 0.2238, Accuracy: 9351/10000 (94%)


Test set: Average loss: 0.1941, Accuracy: 9418/10000 (94%)


Test set: Average loss: 0.1736, Accuracy: 9488/10000 (95%)


Test set: Average loss: 0.1578, Accuracy: 9523/10000 (95%)


Test set: Average loss: 0.1474, Accuracy: 9558/10000 (96%)


Test set: Average loss: 0.1380, Accuracy: 9580/10000 (96%)


Test set: Average loss: 0.1285, Accuracy: 9606/10000 (96%)


Test set: Average loss: 0.1226, Accuracy: 9607/10000 (96%)



# Weight Decay

In [8]:

# Same network as before, with the dropout layer removed.
class MyNet(nn.Module):
    """Fully connected classifier (784 -> 100 -> 100 -> 10) without dropout.

    ``forward`` returns log-probabilities (``log_softmax`` over dim 1).
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 10)
        # Run _init_weights on every submodule right after construction.
        self.apply(self._init_weights)

    def _init_weights(self, submodule):
        """He-initialize the weight of an ``nn.Linear`` and set its bias to 0.01."""
        if not isinstance(submodule, nn.Linear):
            return  # only Linear layers are re-initialized
        nn.init.kaiming_normal_(submodule.weight)
        if submodule.bias is not None:
            nn.init.constant_(submodule.bias, 0.01)

    def forward(self, x):
        """Apply fc1 -> ReLU -> fc2 -> ReLU -> fc3 -> log_softmax."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        # F.cross_entropy = F.log_softmax + F.nll_loss.
        # To use F.cross_entropy downstream instead, remove the log_softmax here.
        return F.log_softmax(logits, dim=1)

In [9]:
# Fresh model on `device`, trained with Adam plus weight decay.
# weight_decay is the lambda coefficient of the L2 penalty.
model = MyNet().to(device)
my_opt = optim.Adam(model.parameters(), lr=2e-4, weight_decay=0.1)

In [10]:
n_epochs = 10
log_interval = 100  # report the running training loss every 100 mini-batches
for epoch in range(n_epochs):
    # ---- Training phase ----
    model.train()
    for batch_idx, (x_batch, y_batch) in enumerate(trn_loader):
        x_batch = x_batch.reshape(-1, 784).to(device)  # flatten each sample to 784 features
        y_batch = y_batch.to(device)
        my_opt.zero_grad()
        y_batch_prob = model(x_batch)  # log-probabilities (the model ends in log_softmax)
        loss = F.nll_loss(y_batch_prob, y_batch)
        loss.backward()
        my_opt.step()
        n_done = batch_idx + 1  # mini-batches processed so far in this epoch
        if n_done % log_interval == 0:
            # Report with n_done (not batch_idx) so the sample count and the
            # percentage include the batch that was just processed.
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch,
                                                                           n_done * len(x_batch),
                                                                           len(trn_loader.dataset),
                                                                           100 * n_done / len(trn_loader),
                                                                           loss.item()))
    # Print the result at the end of every epoch (loss of the last mini-batch).
    print('Train Epoch: {} [{}/{} (100%)]\tLoss: {:.6f}'.format(epoch,
                                                                   len(trn_loader.dataset),
                                                                   len(trn_loader.dataset),
                                                                   loss.item()))

    # ---- Evaluation phase ----
    model.eval()
    y_pred_list = []
    y_real_list = []
    tst_loss = 0.0
    with torch.no_grad():
        for batch_idx, (x_batch, y_batch) in enumerate(tst_loader):
            x_batch = x_batch.reshape(-1, 784).to(device)
            y_batch = y_batch.to(device)
            y_batch_prob = model(x_batch)
            # Use the tensor's own argmax rather than calling np.argmax on a torch tensor.
            y_batch_pred = y_batch_prob.argmax(dim=1)
            loss = F.nll_loss(y_batch_prob, y_batch, reduction='sum')
            tst_loss += loss.item()  # accumulate a Python float instead of a tensor

            # .cpu() is needed before .numpy() when the tensors live on a CUDA device.
            y_pred_list.append(y_batch_pred.cpu().numpy())
            y_real_list.append(y_batch.cpu().numpy())

    y_real = np.concatenate(y_real_list, axis=0)
    y_pred = np.concatenate(y_pred_list, axis=0)
    tst_loss /= y_real.shape[0]  # summed loss -> average loss per sample
    correct = np.sum(y_real == y_pred)
    accuracy = 100 * correct / len(tst_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(tst_loss,
                                                                                 correct,
                                                                                 len(tst_loader.dataset),
                                                                                 accuracy))


Test set: Average loss: 0.4202, Accuracy: 9019/10000 (90%)


Test set: Average loss: 0.3902, Accuracy: 9150/10000 (92%)


Test set: Average loss: 0.4091, Accuracy: 9150/10000 (92%)


Test set: Average loss: 0.4279, Accuracy: 9096/10000 (91%)


Test set: Average loss: 0.4499, Accuracy: 9071/10000 (91%)


Test set: Average loss: 0.4697, Accuracy: 9025/10000 (90%)


Test set: Average loss: 0.4782, Accuracy: 8994/10000 (90%)


Test set: Average loss: 0.4930, Accuracy: 8978/10000 (90%)


Test set: Average loss: 0.5003, Accuracy: 8939/10000 (89%)


Test set: Average loss: 0.5061, Accuracy: 8926/10000 (89%)



# Early Stopping

<span style = 'font-size:1.3em;line-height:1.5em'>Early Stopping은 다음과 같은 방식으로 진행됩니다.</span>
- <span style = 'font-size:1.2em;line-height:1.5em'>(1) 매 epoch마다 train을 진행합니다.</span>
- <span style = 'font-size:1.2em;line-height:1.5em'>(2) 한 epoch에 대해 train이 끝나면 validation set에서 현재까지 학습된 모델로 loss를 계산합니다.</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>val_loss = loss_func(y_val, y_val_est)</span>

- <span style = 'font-size:1.2em;line-height:1.5em'>(3) 현재의 validation loss가 이제까지의 validation loss의 최솟값보다 연속으로 n번 크거나 같으면 학습을 멈춥니다.</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>val_loss >= min_val_loss (n consecutive times) --> stop training</span>

## 구현해봅시다.
### 원래 validation set을 따로 만들어야 하지만, 여기선 편의상 test set을 validation set으로 사용하겠습니다.

In [11]:
batch_size = 2**8  # 256 samples per mini-batch

# Settings shared by both loaders; drop_last=False keeps the final, smaller batch.
loader_kwargs = dict(batch_size=batch_size, drop_last=False)
trn_loader = DataLoader(trn_dset, shuffle=True, **loader_kwargs)
# The validation loader is built from the test split (tst_dset) and is not shuffled.
val_loader = DataLoader(tst_dset, shuffle=False, **loader_kwargs)

In [12]:
# Fresh model on `device`. No weight_decay is passed to Adam here, so the
# optimizer runs without an L2 penalty.
model = MyNet().to(device)
my_opt = optim.Adam(model.parameters(), lr=2e-4)

In [13]:
min_val_loss = np.inf  # lowest validation loss seen so far
n_patience = 3         # stop after this many consecutive epochs without improvement
n_violence = 0         # consecutive epochs in which val_loss >= min_val_loss

n_epochs = 10
log_interval = 100  # report the running training loss every 100 mini-batches
for epoch in range(n_epochs):
    # ---- Training phase ----
    model.train()
    for batch_idx, (x_batch, y_batch) in enumerate(trn_loader):
        x_batch = x_batch.reshape(-1, 784).to(device)  # flatten each sample to 784 features
        y_batch = y_batch.to(device)
        my_opt.zero_grad()
        y_batch_prob = model(x_batch)  # log-probabilities (the model ends in log_softmax)
        loss = F.nll_loss(y_batch_prob, y_batch)
        loss.backward()
        my_opt.step()
        n_done = batch_idx + 1  # mini-batches processed so far in this epoch
        if n_done % log_interval == 0:
            # Report with n_done (not batch_idx) so the sample count and the
            # percentage include the batch that was just processed.
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch,
                                                                           n_done * len(x_batch),
                                                                           len(trn_loader.dataset),
                                                                           100 * n_done / len(trn_loader),
                                                                           loss.item()))
    # Print the result at the end of every epoch (loss of the last mini-batch).
    print('Train Epoch: {} [{}/{} (100%)]\tLoss: {:.6f}'.format(epoch,
                                                                   len(trn_loader.dataset),
                                                                   len(trn_loader.dataset),
                                                                   loss.item()))

    # ---- Validation phase ----
    model.eval()
    y_pred_list = []
    y_real_list = []
    val_loss = 0.0
    with torch.no_grad():
        for batch_idx, (x_batch, y_batch) in enumerate(val_loader):
            x_batch = x_batch.reshape(-1, 784).to(device)
            y_batch = y_batch.to(device)
            y_batch_prob = model(x_batch)
            # Use the tensor's own argmax rather than calling np.argmax on a torch tensor.
            y_batch_pred = y_batch_prob.argmax(dim=1)
            loss = F.nll_loss(y_batch_prob, y_batch, reduction='sum')
            val_loss += loss.item()  # keep val_loss (and min_val_loss) as Python floats

            # .cpu() is needed before .numpy() when the tensors live on a CUDA device.
            y_pred_list.append(y_batch_pred.cpu().numpy())
            y_real_list.append(y_batch.cpu().numpy())

    y_real = np.concatenate(y_real_list, axis=0)
    y_pred = np.concatenate(y_pred_list, axis=0)
    val_loss /= y_real.shape[0]  # summed loss -> average loss per sample
    correct = np.sum(y_real == y_pred)
    accuracy = 100 * correct / len(val_loader.dataset)

    # ---- Early-stopping bookkeeping ----
    if val_loss < min_val_loss:
        print(f'val_loss({val_loss:.4f}) < min_val_loss({min_val_loss:.4f})')
        print(f'>> keep training, min_val_loss is replaced to {val_loss:.4f}')
        min_val_loss = val_loss
        n_violence = 0  # an improvement resets the consecutive-violation counter
    else:
        # Increment first so the message reports the updated counter.
        n_violence += 1
        print(f'val_loss({val_loss:.4f}) >= min_val_loss({min_val_loss:.4f})')
        print(f'>> n_violence is increased. n_violence={n_violence}')

    print('val set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(val_loss,
                                                                            correct,
                                                                            len(val_loader.dataset),
                                                                            accuracy))

    if n_violence >= n_patience:
        # Report the counter itself rather than the patience threshold.
        print(f'>> n_violence={n_violence}. Stop training!\n')
        break

val_loss(0.2937) < min_val_loss(inf)
>> keep training, min_val_loss is replaced to 0.2937
val set: Average loss: 0.2937, Accuracy: 9115/10000 (91%)

val_loss(0.2129) < min_val_loss(0.2937)
>> keep training, min_val_loss is replaced to 0.2129
val set: Average loss: 0.2129, Accuracy: 9362/10000 (94%)

val_loss(0.1761) < min_val_loss(0.2129)
>> keep training, min_val_loss is replaced to 0.1761
val set: Average loss: 0.1761, Accuracy: 9481/10000 (95%)

val_loss(0.1527) < min_val_loss(0.1761)
>> keep training, min_val_loss is replaced to 0.1527
val set: Average loss: 0.1527, Accuracy: 9538/10000 (95%)

val_loss(0.1369) < min_val_loss(0.1527)
>> keep training, min_val_loss is replaced to 0.1369
val set: Average loss: 0.1369, Accuracy: 9575/10000 (96%)

val_loss(0.1259) < min_val_loss(0.1369)
>> keep training, min_val_loss is replaced to 0.1259
val set: Average loss: 0.1259, Accuracy: 9606/10000 (96%)

val_loss(0.1159) < min_val_loss(0.1259)
>> keep training, min_val_loss is replaced to 0.115

# 과제

<span style = 'font-size:1.3em;line-height:1.5em'>1. Dropout의 비율을 변화시키면서, train_loss, test_loss, test_accuracy가 어떻게 바뀌는지 서술하세요.</span>

#### MyNet에 drop_rate parameter추가
- dropout비율을 0.25, 0.5, 0.75, 1.0으로 각각 설정

In [4]:
batch_size = 2**8  # 256 samples per mini-batch

# Settings shared by both loaders; drop_last=False keeps the final, smaller batch.
loader_kwargs = dict(batch_size=batch_size, drop_last=False)
# Training batches are reshuffled; test batches keep the dataset order.
trn_loader = DataLoader(trn_dset, shuffle=True, **loader_kwargs)
tst_loader = DataLoader(tst_dset, shuffle=False, **loader_kwargs)

In [15]:
class MyNet(nn.Module):
    """Fully connected classifier (784 -> 100 -> 100 -> 10) with configurable dropout.

    Args:
        drop_rate: probability passed to ``nn.Dropout``.

    ``forward`` returns log-probabilities (``log_softmax`` over dim 1).
    """

    def __init__(self, drop_rate):
        super().__init__()
        self.fc1 = nn.Linear(784, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 10)
        self.dropout = nn.Dropout(drop_rate)
        # Run _init_weights on every submodule right after construction.
        self.apply(self._init_weights)

    def _init_weights(self, submodule):
        """He-initialize the weight of an ``nn.Linear`` and set its bias to 0.01."""
        if not isinstance(submodule, nn.Linear):
            return  # only Linear layers are re-initialized
        nn.init.kaiming_normal_(submodule.weight)
        if submodule.bias is not None:
            nn.init.constant_(submodule.bias, 0.01)

    def forward(self, x):
        """Apply (linear -> dropout -> ReLU) twice, then fc3 and log_softmax."""
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(self.dropout(hidden_layer(x)))
        logits = self.fc3(x)
        # F.cross_entropy = F.log_softmax + F.nll_loss.
        # To use F.cross_entropy downstream instead, remove the log_softmax here.
        return F.log_softmax(logits, dim=1)

In [16]:
# Dropout probabilities to compare.
# NOTE(review): with p=1.0 nn.Dropout zeroes every activation during training,
# so that model presumably cannot learn from its inputs — confirm this extreme
# setting is intended.
drop_rate_li = [0.25, 0.5, 0.75, 1.0]

# One model per drop rate, moved to `device` so it matches the batches that the
# training loop sends there, and one Adam optimizer bound to each model.
model_li = [MyNet(drop_rate).to(device) for drop_rate in drop_rate_li]
my_opt_li = [optim.Adam(params=net.parameters(), lr=2e-4) for net in model_li]

In [37]:
# Sanity check: print each model's dropout layer next to its optimizer settings.
# Note: the loop variables `model` and `my_opt` stay bound to the last pair
# after the loop finishes.
for model, my_opt in zip(model_li, my_opt_li):
    print(model.dropout, my_opt)

Dropout(p=0.25, inplace=False) Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.0002
    maximize: False
    weight_decay: 0
)
Dropout(p=0.5, inplace=False) Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.0002
    maximize: False
    weight_decay: 0
)
Dropout(p=0.75, inplace=False) Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.0002
    maximize: False
    weight_decay: 0
)
Dropout(p=1.0, inplace=False) Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.0002
    maximize: False
    weight_decay: 0
)


In [38]:
model_flag = 0  # number of models started so far (not read inside this cell)

n_epochs = 10
log_interval = 100  # report the running training loss every 100 mini-batches
for model, my_opt in zip(model_li, my_opt_li):
    print(f'------------Start model_{model.dropout}------------')
    model_flag += 1
    for epoch in range(n_epochs):
        # ---- Training phase: dropout is active ----
        model.train()
        for batch_idx, (x_batch, y_batch) in enumerate(trn_loader):
            x_batch = x_batch.reshape(-1, 784).to(device)  # flatten each sample to 784 features
            y_batch = y_batch.to(device)
            my_opt.zero_grad()
            y_batch_prob = model(x_batch)  # log-probabilities (the model ends in log_softmax)
            loss = F.nll_loss(y_batch_prob, y_batch)
            loss.backward()
            my_opt.step()
            n_done = batch_idx + 1  # mini-batches processed so far in this epoch
            if n_done % log_interval == 0:
                # Report with n_done (not batch_idx) so the sample count and the
                # percentage include the batch that was just processed.
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch,
                                                                               n_done * len(x_batch),
                                                                               len(trn_loader.dataset),
                                                                               100 * n_done / len(trn_loader),
                                                                               loss.item()))
        # Print the result at the end of every epoch (loss of the last mini-batch).
        print('Train Epoch: {} [{}/{} (100%)]\tLoss: {:.6f}'.format(epoch,
                                                                       len(trn_loader.dataset),
                                                                       len(trn_loader.dataset),
                                                                       loss.item()))

        # ---- Evaluation phase ----
        model.eval()
        y_pred_list = []
        y_real_list = []
        tst_loss = 0.0
        with torch.no_grad():
            for batch_idx, (x_batch, y_batch) in enumerate(tst_loader):
                x_batch = x_batch.reshape(-1, 784).to(device)
                y_batch = y_batch.to(device)
                y_batch_prob = model(x_batch)
                # Use the tensor's own argmax rather than calling np.argmax on a torch tensor.
                y_batch_pred = y_batch_prob.argmax(dim=1)
                loss = F.nll_loss(y_batch_prob, y_batch, reduction='sum')
                tst_loss += loss.item()  # accumulate a Python float instead of a tensor

                # .cpu() is needed before .numpy() when the tensors live on a CUDA device.
                y_pred_list.append(y_batch_pred.cpu().numpy())
                y_real_list.append(y_batch.cpu().numpy())

        y_real = np.concatenate(y_real_list, axis=0)
        y_pred = np.concatenate(y_pred_list, axis=0)
        tst_loss /= y_real.shape[0]  # summed loss -> average loss per sample
        correct = np.sum(y_real == y_pred)
        accuracy = 100 * correct / len(tst_loader.dataset)

        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(tst_loss,
                                                                                     correct,
                                                                                     len(tst_loader.dataset),
                                                                                     accuracy))

------------Start model_Dropout(p=0.25, inplace=False)------------

Test set: Average loss: 0.1070, Accuracy: 9684/10000 (97%)


Test set: Average loss: 0.1040, Accuracy: 9682/10000 (97%)


Test set: Average loss: 0.0999, Accuracy: 9696/10000 (97%)


Test set: Average loss: 0.0978, Accuracy: 9702/10000 (97%)


Test set: Average loss: 0.0934, Accuracy: 9719/10000 (97%)


Test set: Average loss: 0.0912, Accuracy: 9724/10000 (97%)


Test set: Average loss: 0.0894, Accuracy: 9727/10000 (97%)


Test set: Average loss: 0.0893, Accuracy: 9728/10000 (97%)


Test set: Average loss: 0.0878, Accuracy: 9743/10000 (97%)


Test set: Average loss: 0.0865, Accuracy: 9739/10000 (97%)

------------Start model_Dropout(p=0.5, inplace=False)------------

Test set: Average loss: 0.1814, Accuracy: 9465/10000 (95%)


Test set: Average loss: 0.1763, Accuracy: 9470/10000 (95%)


Test set: Average loss: 0.1687, Accuracy: 9498/10000 (95%)


Test set: Average loss: 0.1659, Accuracy: 9512/10000 (95%)


Test set: Av

#### 결과
- 본 모델은 inference 결과가 이미 충분히 좋은 모델이며, Dropout의 비율을 높일수록 규제가 강해지기 때문에 train loss가 올라갈 뿐만 아니라 test accuracy도 내려간다.
- 그러나 overfitting이 심한 model을 많은 epoch 동안 학습한다면, dropout의 비율을 올리는 것이 더 효과적일 것이다.