In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Initialization을 사용하는 방법: torch.nn.init

In [2]:
# Small fully connected layer (3 input features -> 2 output features) used below
# to demonstrate how parameters are stored and re-initialized.
fc1 = nn.Linear(3,2)

### 각 모듈의 weight, bias 속성에 실제 파라미터 값이 들어 있습니다.

In [3]:
# Materialize the parameter iterator so the notebook displays every parameter of the layer
list(fc1.parameters())

[Parameter containing:
 tensor([[ 0.3664, -0.2565, -0.5446],
         [-0.3872, -0.3495, -0.3003]], requires_grad=True),
 Parameter containing:
 tensor([-0.3566, -0.5169], requires_grad=True)]

In [4]:
# The weight matrix is exposed as the `weight` attribute of the layer
fc1.weight

Parameter containing:
tensor([[ 0.3664, -0.2565, -0.5446],
        [-0.3872, -0.3495, -0.3003]], requires_grad=True)

In [5]:
# The bias vector is exposed as the `bias` attribute of the layer
fc1.bias

Parameter containing:
tensor([-0.3566, -0.5169], requires_grad=True)

In [6]:
# Evaluate the function object itself (no call) so the notebook shows its signature
nn.init.constant_

<function torch.nn.init.constant_(tensor: torch.Tensor, val: float) -> torch.Tensor>

### nn.init에는 여러 가지 초기화 함수가 있습니다.

In [7]:
# Re-draw the weights from a standard normal distribution (in place)
nn.init.normal_(tensor=fc1.weight, mean=0.0, std=1.0)
# Reset the bias to all zeros (in place); being the last expression, its return value is displayed
nn.init.zeros_(tensor=fc1.bias)

Parameter containing:
tensor([0., 0.], requires_grad=True)

In [8]:
# Inspect the weights again after the nn.init.normal_ call in the previous cell
fc1.weight

Parameter containing:
tensor([[ 1.4745, -1.0326, -0.2501],
        [ 0.4300, -0.9808, -1.2280]], requires_grad=True)

In [9]:
# Inspect the bias again after the nn.init.zeros_ call
fc1.bias

Parameter containing:
tensor([0., 0.], requires_grad=True)

### 직접 값을 지정하는 것도 가능합니다.

In [10]:
# Hand-picked 2x3 values, matching the shape of fc1.weight (2 outputs x 3 inputs)
tmp_tensor = torch.tensor(
    [
        [1.0, 2.0, 3.0],
        [4.0, 5.0, 6.0],
    ]
)

In [11]:
# Copy the values INTO the existing parameter instead of rebinding `.data`.
# `fc1.weight.data = tmp_tensor` makes the parameter share storage with
# tmp_tensor, so the in-place initializers applied to fc1.weight in later cells
# (nn.init.xavier_normal_, nn.init.kaiming_normal_) would silently overwrite
# tmp_tensor as well. copy_ must run under no_grad because fc1.weight is a leaf
# tensor that requires grad.
with torch.no_grad():
    fc1.weight.copy_(tmp_tensor)
fc1.weight

Parameter containing:
tensor([[1., 2., 3.],
        [4., 5., 6.]], requires_grad=True)

## Xavier Initialization & He Initialization

In [12]:
# Xavier (Glorot) normal initialization, applied in place to the weight matrix;
# the call returns the tensor, so the new values are displayed.
nn.init.xavier_normal_(fc1.weight)

Parameter containing:
tensor([[-0.8362, -0.4360, -0.7508],
        [ 0.0969, -0.6915,  0.6707]], requires_grad=True)

In [13]:
# He (Kaiming) normal initialization, applied in place to the weight matrix;
# the call returns the tensor, so the new values are displayed.
nn.init.kaiming_normal_(fc1.weight)

Parameter containing:
tensor([[ 0.6311,  0.3224, -0.5526],
        [-2.4464,  0.1049, -0.8517]], requires_grad=True)

# 실제 모델에 적용해봅시다

In [14]:
class MyNet(nn.Module):
    """Fully connected classifier (784 -> 100 -> 100 -> 10) with He-initialized weights.

    forward() returns log-probabilities (log_softmax over dim 1), so the matching
    loss is F.nll_loss.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 10)
        # Run _init_weights on every submodule right after construction so the
        # default nn.Linear initialization is replaced.
        self.apply(self._init_weights)

    def _init_weights(self, submodule):
        """Initialize a single submodule; only nn.Linear instances are touched."""
        if not isinstance(submodule, nn.Linear):
            return
        # He initialization for the weights
        nn.init.kaiming_normal_(submodule.weight)
        if submodule.bias is not None:
            # Constant 0.01 for the bias
            nn.init.constant_(submodule.bias, 0.01)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of 784-feature inputs."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        # F.cross_entropy = F.log_softmax + F.nll_loss.
        # To use F.cross_entropy downstream, remove the log_softmax here.
        return F.log_softmax(logits, dim=1)

In [15]:
data_path = 'data'
# exist_ok=True makes the directory creation idempotent and removes the
# check-then-create race of os.path.exists() followed by os.makedirs().
os.makedirs(data_path, exist_ok=True)

# Convert each image to a tensor, then normalize it with Normalize(mean, std)
# using mean 0.1307 and std 0.3081.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

trn_dset = datasets.MNIST(root=data_path, train=True, transform=transform, download=True)
# NOTE(review): download=False presumably relies on the files fetched by the
# train=True call above — confirm on a fresh data directory.
tst_dset = datasets.MNIST(root=data_path, train=False, transform=transform, download=False)

In [16]:
# Mini-batch size shared by both loaders (2**8)
batch_size = 256
# Only the training data is shuffled; neither loader drops the final partial batch
trn_loader = DataLoader(dataset=trn_dset, batch_size=batch_size, shuffle=True, drop_last=False)
tst_loader = DataLoader(dataset=tst_dset, batch_size=batch_size, shuffle=False, drop_last=False)

In [17]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Build the network and move its parameters to the selected device
model = MyNet().to(device)

In [18]:
# Adam over all model parameters with learning rate 2e-4
my_opt = optim.Adam(model.parameters(), lr=2e-4)

In [19]:
n_epochs = 10
for epoch in range(n_epochs):
    # ---- Training phase ----
    model.train()
    for batch_idx, (x_batch, y_batch) in enumerate(trn_loader):
        # Flatten each image to a 784-dim vector and move the batch to the model's device
        x_batch = x_batch.reshape(-1, 784).to(device)
        y_batch = y_batch.to(device)
        my_opt.zero_grad()
        y_batch_prob = model(x_batch)
        loss = F.nll_loss(y_batch_prob, y_batch)
        loss.backward()
        my_opt.step()
        if (batch_idx + 1) % 100 == 0:
            # batch_idx is 0-based, so (batch_idx + 1) batches have been processed here
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch,
                (batch_idx + 1) * len(x_batch),
                len(trn_loader.dataset),
                100 * (batch_idx + 1) / len(trn_loader),
                loss.item()))
    # Report the loss of the last training batch at the end of every epoch
    print('Train Epoch: {} [{}/{} (100%)]\tLoss: {:.6f}'.format(
        epoch,
        len(trn_loader.dataset),
        len(trn_loader.dataset),
        loss.item()))

    # ---- Evaluation phase ----
    model.eval()
    y_pred_list = []
    y_real_list = []
    tst_loss = 0.0
    with torch.no_grad():
        for batch_idx, (x_batch, y_batch) in enumerate(tst_loader):
            x_batch = x_batch.reshape(-1, 784).to(device)
            y_batch = y_batch.to(device)
            y_batch_prob = model(x_batch)
            # Predicted class = index of the largest output along dim 1
            y_batch_pred = y_batch_prob.argmax(dim=1)
            # Sum (not mean) per batch so the total can be divided by the sample count below;
            # .item() keeps the running total a plain Python float.
            loss = F.nll_loss(y_batch_prob, y_batch, reduction='sum')
            tst_loss += loss.item()

            # Tensor.numpy() only works on CPU tensors, so move them off the GPU first
            y_pred_list.append(y_batch_pred.cpu().numpy())
            y_real_list.append(y_batch.cpu().numpy())

    y_real = np.concatenate(y_real_list, axis=0)
    y_pred = np.concatenate(y_pred_list, axis=0)
    tst_loss /= y_real.shape[0]
    correct = np.sum(y_real == y_pred)
    accuracy = 100 * correct / len(tst_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        tst_loss,
        correct,
        len(tst_loader.dataset),
        accuracy))


Test set: Average loss: 0.2863, Accuracy: 9159/10000 (92%)


Test set: Average loss: 0.2120, Accuracy: 9369/10000 (94%)


Test set: Average loss: 0.1824, Accuracy: 9449/10000 (94%)


Test set: Average loss: 0.1621, Accuracy: 9516/10000 (95%)


Test set: Average loss: 0.1426, Accuracy: 9567/10000 (96%)


Test set: Average loss: 0.1329, Accuracy: 9597/10000 (96%)


Test set: Average loss: 0.1247, Accuracy: 9621/10000 (96%)


Test set: Average loss: 0.1170, Accuracy: 9649/10000 (96%)


Test set: Average loss: 0.1115, Accuracy: 9667/10000 (97%)


Test set: Average loss: 0.1089, Accuracy: 9677/10000 (97%)



# 연습문제

<span style = 'font-size:1.2em;line-height:1.5em'>1. Weight와 bias를 전부 0으로 채워서 초기화했을 때 학습이 어떻게 진행되는지 살펴봅시다. 마찬가지로, 전부 1로 채웠을 때는 학습이 어떻게 진행되는지 살펴봅시다.</span>

1. Weight, bias 0으로 초기화

In [20]:
class MyNet(nn.Module):
    """Fully connected classifier (784 -> 100 -> 100 -> 10) with every weight and bias set to 0.

    forward() returns log-probabilities (log_softmax over dim 1), so the matching
    loss is F.nll_loss.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 10)
        # Run _init_weights on every submodule right after construction
        self.apply(self._init_weights)

    def _init_weights(self, submodule):
        """Zero the weight and bias of a single submodule; only nn.Linear instances are touched."""
        if not isinstance(submodule, nn.Linear):
            return
        nn.init.zeros_(submodule.weight)
        if submodule.bias is not None:
            nn.init.zeros_(submodule.bias)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of 784-feature inputs."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        # F.cross_entropy = F.log_softmax + F.nll_loss.
        # To use F.cross_entropy downstream, remove the log_softmax here.
        return F.log_softmax(logits, dim=1)

In [21]:
# Instantiate the MyNet class defined in the previous cell, then move it to the selected device
model = MyNet()
model = model.to(device)

In [22]:
# Fresh Adam optimizer bound to the new model's parameters (learning rate 2e-4)
my_opt = optim.Adam(model.parameters(), lr=2e-4)

In [23]:
n_epochs = 10
for epoch in range(n_epochs):
    # ---- Training phase ----
    model.train()
    for batch_idx, (x_batch, y_batch) in enumerate(trn_loader):
        # Flatten each image to a 784-dim vector and move the batch to the model's device
        x_batch = x_batch.reshape(-1, 784).to(device)
        y_batch = y_batch.to(device)
        my_opt.zero_grad()
        y_batch_prob = model(x_batch)
        loss = F.nll_loss(y_batch_prob, y_batch)
        loss.backward()
        my_opt.step()
        if (batch_idx + 1) % 100 == 0:
            # batch_idx is 0-based, so (batch_idx + 1) batches have been processed here
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch,
                (batch_idx + 1) * len(x_batch),
                len(trn_loader.dataset),
                100 * (batch_idx + 1) / len(trn_loader),
                loss.item()))
    # Report the loss of the last training batch at the end of every epoch
    print('Train Epoch: {} [{}/{} (100%)]\tLoss: {:.6f}'.format(
        epoch,
        len(trn_loader.dataset),
        len(trn_loader.dataset),
        loss.item()))

    # ---- Evaluation phase ----
    model.eval()
    y_pred_list = []
    y_real_list = []
    tst_loss = 0.0
    with torch.no_grad():
        for batch_idx, (x_batch, y_batch) in enumerate(tst_loader):
            x_batch = x_batch.reshape(-1, 784).to(device)
            y_batch = y_batch.to(device)
            y_batch_prob = model(x_batch)
            # Predicted class = index of the largest output along dim 1
            y_batch_pred = y_batch_prob.argmax(dim=1)
            # Sum (not mean) per batch so the total can be divided by the sample count below;
            # .item() keeps the running total a plain Python float.
            loss = F.nll_loss(y_batch_prob, y_batch, reduction='sum')
            tst_loss += loss.item()

            # Tensor.numpy() only works on CPU tensors, so move them off the GPU first
            y_pred_list.append(y_batch_pred.cpu().numpy())
            y_real_list.append(y_batch.cpu().numpy())

    y_real = np.concatenate(y_real_list, axis=0)
    y_pred = np.concatenate(y_pred_list, axis=0)
    tst_loss /= y_real.shape[0]
    correct = np.sum(y_real == y_pred)
    accuracy = 100 * correct / len(tst_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        tst_loss,
        correct,
        len(tst_loader.dataset),
        accuracy))


Test set: Average loss: 2.3020, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.3017, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.3015, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.3013, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.3012, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.3011, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.3011, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.3011, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.3011, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.3010, Accuracy: 1135/10000 (11%)



2. Weight, bias 1로 초기화

In [24]:
class MyNet(nn.Module):
    """Fully connected classifier (784 -> 100 -> 100 -> 10) with every weight and bias set to 1.

    forward() returns log-probabilities (log_softmax over dim 1), so the matching
    loss is F.nll_loss.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 10)
        # Run _init_weights on every submodule right after construction
        self.apply(self._init_weights)

    def _init_weights(self, submodule):
        """Fill the weight and bias of a single submodule with ones; only nn.Linear is touched."""
        if not isinstance(submodule, nn.Linear):
            return
        nn.init.ones_(submodule.weight)
        if submodule.bias is not None:
            nn.init.ones_(submodule.bias)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of 784-feature inputs."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        # F.cross_entropy = F.log_softmax + F.nll_loss.
        # To use F.cross_entropy downstream, remove the log_softmax here.
        return F.log_softmax(logits, dim=1)

In [25]:
# Instantiate the MyNet class defined in the previous cell, then move it to the selected device
model = MyNet()
model = model.to(device)

In [26]:
# Fresh Adam optimizer bound to the new model's parameters (learning rate 2e-4)
my_opt = optim.Adam(model.parameters(), lr=2e-4)

In [27]:
n_epochs = 10
for epoch in range(n_epochs):
    # ---- Training phase ----
    model.train()
    for batch_idx, (x_batch, y_batch) in enumerate(trn_loader):
        # Flatten each image to a 784-dim vector and move the batch to the model's device
        x_batch = x_batch.reshape(-1, 784).to(device)
        y_batch = y_batch.to(device)
        my_opt.zero_grad()
        y_batch_prob = model(x_batch)
        loss = F.nll_loss(y_batch_prob, y_batch)
        loss.backward()
        my_opt.step()
        if (batch_idx + 1) % 100 == 0:
            # batch_idx is 0-based, so (batch_idx + 1) batches have been processed here
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch,
                (batch_idx + 1) * len(x_batch),
                len(trn_loader.dataset),
                100 * (batch_idx + 1) / len(trn_loader),
                loss.item()))
    # Report the loss of the last training batch at the end of every epoch
    print('Train Epoch: {} [{}/{} (100%)]\tLoss: {:.6f}'.format(
        epoch,
        len(trn_loader.dataset),
        len(trn_loader.dataset),
        loss.item()))

    # ---- Evaluation phase ----
    model.eval()
    y_pred_list = []
    y_real_list = []
    tst_loss = 0.0
    with torch.no_grad():
        for batch_idx, (x_batch, y_batch) in enumerate(tst_loader):
            x_batch = x_batch.reshape(-1, 784).to(device)
            y_batch = y_batch.to(device)
            y_batch_prob = model(x_batch)
            # Predicted class = index of the largest output along dim 1
            y_batch_pred = y_batch_prob.argmax(dim=1)
            # Sum (not mean) per batch so the total can be divided by the sample count below;
            # .item() keeps the running total a plain Python float.
            loss = F.nll_loss(y_batch_prob, y_batch, reduction='sum')
            tst_loss += loss.item()

            # Tensor.numpy() only works on CPU tensors, so move them off the GPU first
            y_pred_list.append(y_batch_pred.cpu().numpy())
            y_real_list.append(y_batch.cpu().numpy())

    y_real = np.concatenate(y_real_list, axis=0)
    y_pred = np.concatenate(y_pred_list, axis=0)
    tst_loss /= y_real.shape[0]
    correct = np.sum(y_real == y_pred)
    accuracy = 100 * correct / len(tst_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        tst_loss,
        correct,
        len(tst_loader.dataset),
        accuracy))


Test set: Average loss: 54.1661, Accuracy: 1203/10000 (12%)


Test set: Average loss: 26.5224, Accuracy: 1859/10000 (19%)


Test set: Average loss: 18.7788, Accuracy: 1560/10000 (16%)


Test set: Average loss: 13.0406, Accuracy: 1190/10000 (12%)


Test set: Average loss: 11.9156, Accuracy: 1278/10000 (13%)


Test set: Average loss: 9.5895, Accuracy: 1265/10000 (13%)


Test set: Average loss: 6.3427, Accuracy: 1668/10000 (17%)


Test set: Average loss: 7.6751, Accuracy: 1633/10000 (16%)


Test set: Average loss: 3.4912, Accuracy: 1467/10000 (15%)


Test set: Average loss: 4.3004, Accuracy: 1346/10000 (13%)

