In [29]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np
import torch.optim as optim


### 搭建模型

In [30]:
class LinearBNAC(nn.Module):
    """A fully connected layer followed by a role-dependent tail.

    The block always starts with ``nn.Linear(in_channels, out_channels)``.
    What follows depends on the arguments:

    * ``is_output`` and ``out_channels == 1``: ``Sigmoid``.
    * ``is_output`` with any other width: ``Softmax`` over ``dim=1``.
    * otherwise (hidden layer): ``Dropout`` -> ``BatchNorm1d`` -> ``LeakyReLU``.

    Args:
        in_channels: Number of input features of the linear layer.
        out_channels: Number of output features of the linear layer.
        bias: Whether the linear layer has a bias term.
        dropout: Drop probability used by the hidden-layer variant only.
        is_output: Select the output-head variant instead of the hidden one.
    """

    def __init__(self, in_channels, out_channels, bias=True, dropout=0.3, is_output=False):
        super().__init__()
        # The linear projection is common to every variant and always comes first.
        stages = [nn.Linear(in_channels, out_channels, bias=bias)]
        if not is_output:
            stages.extend([
                nn.Dropout(dropout),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(inplace=True),
            ])
        elif out_channels == 1:
            stages.append(nn.Sigmoid())
        else:
            stages.append(nn.Softmax(dim=1))
        self.linear = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.linear(x)

In [31]:
class Model(nn.Module):
    """Stack of three hidden ``LinearBNAC`` blocks and one output block.

    Feature widths go ``input_dimention`` -> 128 -> 64 -> 32 -> ``output_classes``.
    The last block is built with ``is_output=True``.

    Args:
        input_dimention: Number of features in each input row (spelling kept
            because callers pass it by keyword).
        output_classes: Width of the output block.
    """

    def __init__(self, input_dimention, output_classes=1):
        super().__init__()
        self.layer1 = LinearBNAC(input_dimention, 128)
        self.layer2 = LinearBNAC(128, 64)
        self.layer3 = LinearBNAC(64, 32)
        self.output = LinearBNAC(32, output_classes, is_output=True)

    def forward(self, x):
        """Apply the four blocks in order and return the final activation."""
        for stage in (self.layer1, self.layer2, self.layer3, self.output):
            x = stage(x)
        return x
        

### 準備輸入資料、優化器、標籤資料、模型輸出

In [32]:
# Build the network for 256 input features and 10 output classes.
model = Model(input_dimention=256, output_classes=10)

# Adam over every model parameter, with a small learning rate and weight decay.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=0.0001,
    weight_decay=0.0001,
)

In [33]:
batch_size = 4
input_features = 256
# Random stand-in for a real mini-batch: batch_size rows, input_features columns.
dummy_input = torch.randn(batch_size, input_features)

# Class-index labels, one per row of dummy_input. They are written as integer
# literals so the long tensor is built without any float -> integer conversion.
# For random labels instead: torch.randint(0, 10, (batch_size,))
target = torch.tensor([9, 5, 4, 4], dtype=torch.long)

  target = torch.tensor([9., 5., 4., 4.], dtype=torch.long)


In [34]:
# Forward pass on the dummy batch. No cell shown here switches the model to
# eval mode, so Dropout is presumably active and the printed values vary
# from run to run.
output = model(dummy_input)
print(output)

tensor([[0.0968, 0.0797, 0.0657, 0.0561, 0.1106, 0.1306, 0.1013, 0.1498, 0.1224,
         0.0870],
        [0.0704, 0.1385, 0.0385, 0.0480, 0.0730, 0.1364, 0.0681, 0.1426, 0.1150,
         0.1696],
        [0.1311, 0.0732, 0.0636, 0.0429, 0.0639, 0.1458, 0.0634, 0.1151, 0.1528,
         0.1481],
        [0.0902, 0.0755, 0.0718, 0.0723, 0.0686, 0.2033, 0.0643, 0.0836, 0.1387,
         0.1316]], grad_fn=<SoftmaxBackward>)


### 計算 CrossEntropy Loss
* 請注意哪一個 Loss最適合：我們已經使用 softmax
* 因為我們有使用dropout，並隨機產生dummy_input，所以各位學員得到的值會與解答不同，然而步驟原理需要相同

In [35]:
from torch.nn import NLLLoss, LogSoftmax, CrossEntropyLoss

In [36]:
# Negative log-likelihood loss: it takes log-probabilities as input, so it is
# the one to pair with a model whose output layer already applies Softmax.
criterion = NLLLoss()

In [37]:
# NLLLoss takes log-probabilities and the model already ends in Softmax, so the
# log is applied here. The probabilities are first clamped to the smallest
# positive normal value of their dtype: a probability that underflows to
# exactly 0 would otherwise give log(0) = -inf and an inf/NaN loss and gradient.
min_probability = torch.finfo(output.dtype).tiny
loss = criterion(torch.log(output.clamp_min(min_probability)), target)

In [38]:
# Bare expression: the notebook displays the loss tensor as the cell result.
loss

tensor(2.4658, grad_fn=<NllLossBackward>)

### 完成 back propagation（計算梯度）並更新權重

In [39]:
# Back-propagate from the loss; this fills the .grad field of the parameters
# (printed in the next cell). The weights themselves are not changed here.
loss.backward()

In [40]:
# Inspect the weight matrix of the first Linear layer and the gradient that
# backward() stored on it.
layer1_weight = model.layer1.linear[0].weight
print('weight : {}'.format(layer1_weight))
print('\n')
print('grad : {}'.format(layer1_weight.grad))

weight : Parameter containing:
tensor([[ 0.0213,  0.0239,  0.0573,  ...,  0.0454,  0.0524,  0.0299],
        [-0.0435, -0.0203, -0.0312,  ..., -0.0117, -0.0064,  0.0396],
        [-0.0031,  0.0590,  0.0119,  ...,  0.0340, -0.0537, -0.0040],
        ...,
        [-0.0300, -0.0106,  0.0465,  ..., -0.0619,  0.0026, -0.0508],
        [ 0.0477,  0.0163, -0.0578,  ..., -0.0364,  0.0555,  0.0344],
        [ 0.0069, -0.0383, -0.0213,  ..., -0.0005,  0.0340,  0.0373]],
       requires_grad=True)


grad : tensor([[-0.0013, -0.0007,  0.0002,  ...,  0.0006,  0.0006,  0.0029],
        [ 0.0151, -0.0028, -0.0037,  ..., -0.0077, -0.0288, -0.0791],
        [-0.0057, -0.0225,  0.0098,  ...,  0.0090, -0.0262, -0.0244],
        ...,
        [-0.0152, -0.0291,  0.0116,  ...,  0.0199, -0.0271, -0.0048],
        [ 0.0157,  0.0250, -0.0181,  ...,  0.0069, -0.0009, -0.0269],
        [-0.0156,  0.0363, -0.0464,  ..., -0.0533,  0.0614,  0.0147]])


In [41]:
# Apply one Adam update using the gradients currently stored on the parameters.
# This changes the weights; the stored gradients are left as they are.
optimizer.step()

In [42]:
# Same inspection after optimizer.step(): the weights should have moved,
# while the stored gradient is still the one from backward().
layer1_weight = model.layer1.linear[0].weight
print('weight : {}'.format(layer1_weight))
print('\n')
print('grad : {}'.format(layer1_weight.grad))

weight : Parameter containing:
tensor([[ 0.0214,  0.0240,  0.0572,  ...,  0.0453,  0.0523,  0.0298],
        [-0.0436, -0.0202, -0.0311,  ..., -0.0116, -0.0063,  0.0397],
        [-0.0030,  0.0591,  0.0118,  ...,  0.0339, -0.0536, -0.0039],
        ...,
        [-0.0299, -0.0105,  0.0464,  ..., -0.0620,  0.0027, -0.0507],
        [ 0.0476,  0.0162, -0.0577,  ..., -0.0365,  0.0556,  0.0345],
        [ 0.0070, -0.0384, -0.0212,  ..., -0.0004,  0.0339,  0.0372]],
       requires_grad=True)


grad : tensor([[-0.0013, -0.0007,  0.0002,  ...,  0.0006,  0.0006,  0.0029],
        [ 0.0151, -0.0028, -0.0037,  ..., -0.0077, -0.0288, -0.0791],
        [-0.0057, -0.0225,  0.0098,  ...,  0.0090, -0.0262, -0.0244],
        ...,
        [-0.0152, -0.0291,  0.0116,  ...,  0.0199, -0.0271, -0.0048],
        [ 0.0157,  0.0250, -0.0181,  ...,  0.0069, -0.0009, -0.0269],
        [-0.0156,  0.0363, -0.0464,  ..., -0.0533,  0.0614,  0.0147]])


### 清空 gradient

In [43]:
# Reset the gradients of every parameter the optimizer manages, so the next
# backward() does not accumulate on top of this step's gradients.
# NOTE(review): recent PyTorch releases default to set_to_none=True, in which
# case .grad becomes None instead of a zero tensor; confirm against the
# installed version if the zero tensor shown below is expected.
optimizer.zero_grad()

In [44]:
# Same inspection after optimizer.zero_grad(): the weights are unchanged and
# the stored gradient has been cleared.
layer1_weight = model.layer1.linear[0].weight
print('weight : {}'.format(layer1_weight))
print('\n')
print('grad : {}'.format(layer1_weight.grad))

weight : Parameter containing:
tensor([[ 0.0214,  0.0240,  0.0572,  ...,  0.0453,  0.0523,  0.0298],
        [-0.0436, -0.0202, -0.0311,  ..., -0.0116, -0.0063,  0.0397],
        [-0.0030,  0.0591,  0.0118,  ...,  0.0339, -0.0536, -0.0039],
        ...,
        [-0.0299, -0.0105,  0.0464,  ..., -0.0620,  0.0027, -0.0507],
        [ 0.0476,  0.0162, -0.0577,  ..., -0.0365,  0.0556,  0.0345],
        [ 0.0070, -0.0384, -0.0212,  ..., -0.0004,  0.0339,  0.0372]],
       requires_grad=True)


grad : tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
