In [33]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np
import torch.optim as optim

### 搭建模型

In [34]:
class LinearBNAC(nn.Module):
    """A fully connected layer bundled with its follow-up layers.

    The layers are stored, in order, in the ``self.linear`` ``nn.Sequential``:

    * hidden block (``is_output=False``):
      ``Linear -> Dropout(dropout) -> BatchNorm1d -> LeakyReLU(inplace=True)``
    * output block with ``out_channels == 1``: ``Linear -> Sigmoid``
    * output block with any other ``out_channels``: ``Linear -> Softmax(dim=1)``

    Args:
        in_channels: Number of input features of the linear layer.
        out_channels: Number of output features of the linear layer.
        bias: Whether the linear layer has a bias term.
        dropout: Dropout probability; only used when ``is_output`` is False.
        is_output: Selects the output variant (Sigmoid/Softmax) instead of
            the hidden variant.
    """

    def __init__(self, in_channels, out_channels, bias=True, dropout=0.3, is_output=False):
        super().__init__()
        # The affine layer is always first, so it stays at index 0 of self.linear.
        affine = nn.Linear(in_channels, out_channels, bias=bias)

        if not is_output:
            # Hidden block: regularise, normalise, then apply the non-linearity.
            tail = [
                nn.Dropout(dropout),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(inplace=True),
            ]
        elif out_channels == 1:
            # Single output unit: squash to (0, 1).
            tail = [nn.Sigmoid()]
        else:
            # Several output units: normalise across dim 1.
            tail = [nn.Softmax(dim=1)]

        self.linear = nn.Sequential(affine, *tail)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.linear(x)

In [35]:
class Model(nn.Module):
    """Four-stage fully connected network built from ``LinearBNAC`` blocks.

    Three hidden blocks narrow the features ``input_dimention -> 128 -> 64 -> 32``
    and are registered as ``layer1``..``layer3``; an output block registered as
    ``output`` maps the 32 features to ``output_classes`` values.

    Args:
        input_dimention: Number of input features per sample.
        output_classes: Number of output units of the final block.
    """

    def __init__(self, input_dimention, output_classes=1):
        super().__init__()
        widths = (input_dimention, 128, 64, 32)
        # Register layer1..layer3 in order, pairing each width with the next one.
        for index, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, 'layer{}'.format(index), LinearBNAC(fan_in, fan_out))
        self.output = LinearBNAC(widths[-1], output_classes, is_output=True)

    def forward(self, x):
        """Apply the three hidden blocks and then the output block to ``x``."""
        for stage in (self.layer1, self.layer2, self.layer3, self.output):
            x = stage(x)
        return x

### 準備輸入資料、優化器、標籤資料、模型輸出

In [36]:
# Network for 256 input features and 10 output classes.
model = Model(
    input_dimention=256,
    output_classes=10,
)
# Adam (AMSGrad variant) over every model parameter.
# NOTE(review): weight_decay=0.3 looks unusually large -- confirm it is intended.
optimizer = optim.Adam(
    model.parameters(),
    lr=1e-3,
    weight_decay=0.3,
    amsgrad=True,
)

In [37]:
# Shape of the random batch fed to the model: 4 samples x 256 features.
batch_size, input_features = 4, 256
dummy_input = torch.randn(batch_size, input_features)

# One integer class label per sample, stored as a long tensor
# (the four entries line up with batch_size above).
target = torch.tensor([9, 5, 4, 4], dtype=torch.long)

In [38]:
# Forward pass on the random batch. With output_classes=10 the last block ends in
# Softmax(dim=1), so each row of `output` is a probability distribution.
# NOTE(review): model.eval() is never called in the visible cells, so Dropout is
# presumably active here and the values change from run to run.
output = model(dummy_input)
print(output)

tensor([[0.0586, 0.0987, 0.2047, 0.0624, 0.0742, 0.0750, 0.0976, 0.0858, 0.1048,
         0.1382],
        [0.0764, 0.1112, 0.0665, 0.0748, 0.0794, 0.1381, 0.2308, 0.0388, 0.0801,
         0.1039],
        [0.0816, 0.0807, 0.1878, 0.0731, 0.0591, 0.0736, 0.1021, 0.1279, 0.1566,
         0.0574],
        [0.0919, 0.0479, 0.1803, 0.0864, 0.0688, 0.0922, 0.1345, 0.1008, 0.1463,
         0.0508]], grad_fn=<SoftmaxBackward>)


### 計算 CrossEntropy Loss
* 請注意哪一個 Loss 最適合：模型的輸出層已經套用 softmax，因此應對輸出取 log 後使用 NLLLoss；CrossEntropyLoss 內部會再做一次 log-softmax，不應直接套用在 softmax 的輸出上
* 因為我們有使用 dropout，並隨機產生 dummy_input，所以各位學員得到的值會與解答不同，但步驟與原理應該相同

In [39]:
from torch.nn import NLLLoss, LogSoftmax, CrossEntropyLoss

In [43]:
# Negative log-likelihood loss with default settings; it consumes log-probabilities,
# which is why the softmax output is passed through torch.log before being scored.
criterion = nn.NLLLoss()

In [44]:
# NLLLoss expects log-probabilities, while `output` holds softmax probabilities,
# so take the log first. A probability that underflows to exactly 0 would give
# log(0) = -inf (infinite loss, NaN gradients); clamping to the smallest positive
# normal value of the dtype keeps the loss finite and leaves every other value
# unchanged.
loss = criterion(torch.log(output.clamp_min(torch.finfo(output.dtype).tiny)), target)

In [45]:
# Bare expression: the notebook displays the scalar loss tensor as the cell output.
loss

tensor(2.3656, grad_fn=<NllLossBackward>)

### 完成 back propagation 並更新權重

In [46]:
# Backpropagate the loss; afterwards each parameter's .grad is populated
# (the next cell prints the gradient of the first linear layer's weight).
loss.backward()

In [47]:
# After backward(): show the first linear layer's weight together with the
# gradient that backpropagation just stored on it.
layer1_weight = model.layer1.linear[0].weight
print('weight : {}'.format(layer1_weight))
print('\n')
print('grad : {}'.format(layer1_weight.grad))

weight : Parameter containing:
tensor([[-0.0171, -0.0324,  0.0133,  ...,  0.0465, -0.0202,  0.0432],
        [ 0.0230, -0.0523, -0.0493,  ..., -0.0475,  0.0181, -0.0511],
        [ 0.0577, -0.0273, -0.0089,  ..., -0.0175, -0.0148, -0.0354],
        ...,
        [ 0.0578,  0.0419,  0.0256,  ..., -0.0077,  0.0530,  0.0195],
        [ 0.0381, -0.0290, -0.0524,  ..., -0.0605, -0.0363, -0.0315],
        [-0.0609,  0.0047,  0.0139,  ...,  0.0434, -0.0317, -0.0072]],
       requires_grad=True)


grad : tensor([[ 1.7681e-02,  3.0425e-02,  2.6314e-02,  ...,  2.4183e-03,
          9.9028e-03, -1.4090e-02],
        [ 2.0626e-02,  2.1725e-02, -2.4732e-02,  ..., -3.5873e-02,
         -2.6912e-03, -5.1798e-02],
        [ 5.1775e-08, -3.3999e-07,  4.1840e-07,  ..., -1.6427e-07,
          2.1186e-07,  1.3377e-07],
        ...,
        [ 1.9677e-02,  2.0703e-02,  1.1523e-02,  ..., -7.4519e-03,
          4.9417e-02, -2.2105e-02],
        [-9.9166e-02, -8.6086e-02, -5.0260e-02,  ...,  8.8949e-02,
       

In [49]:
# Apply one Adam update: the parameters are modified in place using the gradients
# computed by loss.backward(); the gradients themselves are left untouched.
optimizer.step()

In [50]:
# After optimizer.step(): the weight values have moved, while the stored
# gradient is still the one computed by the earlier backward pass.
layer1_weight = model.layer1.linear[0].weight
print('weight : {}'.format(layer1_weight))
print('\n')
print('grad : {}'.format(layer1_weight.grad))

weight : Parameter containing:
tensor([[-0.0181, -0.0334,  0.0123,  ...,  0.0455, -0.0212,  0.0442],
        [ 0.0220, -0.0533, -0.0483,  ..., -0.0465,  0.0171, -0.0501],
        [ 0.0567, -0.0263, -0.0079,  ..., -0.0165, -0.0138, -0.0344],
        ...,
        [ 0.0568,  0.0409,  0.0246,  ..., -0.0067,  0.0520,  0.0205],
        [ 0.0391, -0.0280, -0.0514,  ..., -0.0615, -0.0353, -0.0325],
        [-0.0619,  0.0037,  0.0129,  ...,  0.0424, -0.0327, -0.0062]],
       requires_grad=True)


grad : tensor([[ 1.7681e-02,  3.0425e-02,  2.6314e-02,  ...,  2.4183e-03,
          9.9028e-03, -1.4090e-02],
        [ 2.0626e-02,  2.1725e-02, -2.4732e-02,  ..., -3.5873e-02,
         -2.6912e-03, -5.1798e-02],
        [ 5.1775e-08, -3.3999e-07,  4.1840e-07,  ..., -1.6427e-07,
          2.1186e-07,  1.3377e-07],
        ...,
        [ 1.9677e-02,  2.0703e-02,  1.1523e-02,  ..., -7.4519e-03,
          4.9417e-02, -2.2105e-02],
        [-9.9166e-02, -8.6086e-02, -5.0260e-02,  ...,  8.8949e-02,
       

### 清空 gradient

In [51]:
# Reset the gradients of every parameter managed by the optimizer so they do not
# accumulate into the next backward pass.
# NOTE(review): the recorded output below shows zero-filled tensors; newer PyTorch
# releases default to set_to_none=True, in which case .grad prints as None -- confirm
# against the installed version.
optimizer.zero_grad()

In [52]:
# After zero_grad(): the weight keeps its updated values and only the stored
# gradient has been reset.
layer1_weight = model.layer1.linear[0].weight
print('weight : {}'.format(layer1_weight))
print('\n')
print('grad : {}'.format(layer1_weight.grad))

weight : Parameter containing:
tensor([[-0.0181, -0.0334,  0.0123,  ...,  0.0455, -0.0212,  0.0442],
        [ 0.0220, -0.0533, -0.0483,  ..., -0.0465,  0.0171, -0.0501],
        [ 0.0567, -0.0263, -0.0079,  ..., -0.0165, -0.0138, -0.0344],
        ...,
        [ 0.0568,  0.0409,  0.0246,  ..., -0.0067,  0.0520,  0.0205],
        [ 0.0391, -0.0280, -0.0514,  ..., -0.0615, -0.0353, -0.0325],
        [-0.0619,  0.0037,  0.0129,  ...,  0.0424, -0.0327, -0.0062]],
       requires_grad=True)


grad : tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
