# ゼロから作るDeep Learning
## 5章 誤差逆伝播法
### 5.4 単純なレイヤの実装
#### 5.4.1 乗法レイヤの実装

In [1]:
# 乗法レイヤのクラスを実装する
class MulLayer:
    """Multiplication node of a computational graph.

    The forward pass caches both operands because the backward pass needs
    them: the gradient flowing to one input is the upstream gradient
    multiplied by the *other* input.
    """

    def __init__(self):
        # Operands of the most recent forward call (None until forward runs).
        self.x, self.y = None, None

    def forward(self, x, y):
        """Remember both inputs and return their product."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Return the pair (dx, dy) for the upstream gradient ``dout``."""
        grad_x = dout * self.y  # d(x*y)/dx = y
        grad_y = dout * self.x  # d(x*y)/dy = x
        return grad_x, grad_y

MulLayerクラスを使うと、図5-16の順伝播は次のように実装できる。

In [2]:
# Worked example: price of two apples after applying the tax multiplier
apple, apple_num = 100, 2
tax = 1.1

# One multiplication layer per multiply node in the graph
mul_apple_layer, mul_tax_layer = MulLayer(), MulLayer()

# Forward pass: unit price * count, then subtotal * tax
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

# Show the result
print('price =', price)

price = 220.00000000000003


各変数に関する微分は、backwardメソッドを使って次のように実装できる。

In [3]:
# Seed value fed into the last layer's backward call
dprice = 1

# Call backward on the layers in the reverse of the forward order; each call
# reads the operands its layer cached during the forward pass above
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

# Display the results
print('(dapple, dapple_num, dtax) =', (dapple, dapple_num, dtax))

(dapple, dapple_num, dtax) = (2.2, 110.00000000000001, 200)


#### 5.4.2 加算レイヤの実装

In [4]:
class AddLayer(object):
    """Addition node of a computational graph.

    Addition passes the upstream gradient through unchanged to both inputs,
    so the backward pass does not actually need the cached operands.
    """

    def __init__(self):
        # Kept (beyond what backprop requires) so the most recent inputs
        # can be inspected afterwards.
        self.x = self.y = None

    def forward(self, x, y):
        """Remember both inputs and return their sum."""
        self.x, self.y = x, y
        return x + y

    def backward(self, dout):
        """Return (dx, dy); both equal ``dout`` since d(x+y)/dx = d(x+y)/dy = 1."""
        return dout * 1, dout * 1

MulLayerとAddLayerを使って図5-17を解いてみる。

In [5]:
# Inputs: unit price and count for each fruit, plus the tax multiplier
apple, apple_num = 100, 2
orange, orange_num = 150, 3
tax = 1.1

# Layers: one per node of the computational graph
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# Forward pass
apple_price = mul_apple_layer.forward(apple, apple_num)
orange_price = mul_orange_layer.forward(orange, orange_num)
all_price = add_apple_orange_layer.forward(apple_price, orange_price)
price = mul_tax_layer.forward(all_price, tax)

# Backward pass: visit the layers in the reverse of the forward order
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)
dorange, dorange_num = mul_orange_layer.backward(dorange_price)

# Show the results
print('price =', price)
print('(dapple_num, dapple, dorange_num, dorange, dtax) =', (dapple_num, dapple, dorange_num, dorange, dtax))

price = 715.0000000000001
(dapple_num, dapple, dorange_num, dorange, dtax) = (110.00000000000001, 2.2, 165.0, 3.3000000000000003, 650)


### 5.5 活性化関数レイヤの実装
#### 5.5.1 ReLUレイヤ

ReLU(Rectified Linear Unit)は
\begin{equation}
y = \begin{cases}
    x & (x > 0) \\
    0 & (x \le 0)
\end{cases}
\end{equation}
この微分は
\begin{equation}
\frac{\partial y}{\partial x} = \begin{cases}
    1 & (x > 0) \\
    0 & (x \le 0)
\end{cases}
\end{equation}
となる。  
（本来は$x = 0$では微分は定義できないはずなので、便宜上$\frac{\partial y}{\partial x} = 0$と定義しているのか？）

In [6]:
class Relu(object):
    """ReLU activation layer: passes positive inputs, zeroes the rest.

    The boolean mask of non-positive input positions is cached in forward
    and reused in backward, where the gradient is zeroed at the same
    positions.
    """

    def __init__(self):
        # Boolean mask of positions where the last input was <= 0.
        self.mask = None
        # Same mask under the name the original forward/backward used.
        self.is_negative = None

    def forward(self, x):
        """Return a copy of ``x`` with every element <= 0 replaced by 0.

        ``x`` must support elementwise comparison, ``.copy()`` and boolean
        mask assignment (e.g. a NumPy array). The input is not modified.
        """
        self.mask = self.is_negative = (x <= 0)

        out = x.copy()
        # Index with the stored mask; the bare name `is_negative` was never
        # defined and raised NameError.
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Return a copy of ``dout`` zeroed where the forward input was <= 0."""
        dx = dout.copy()
        dx[self.mask] = 0

        return dx

#### 5.5.2 Sigmoidレイヤ
シグモイド関数は
\begin{equation}
y = \frac{1}{1 + \exp(-x)}.
\end{equation}
図5-21のSigmoidレイヤの逆伝播計算は
\begin{equation}
\frac{\partial L}{\partial x} = \frac{\partial L}{\partial y}y(1 - y)
\end{equation}
と表せるので、上式を利用してSigmoidレイヤを実装する。

In [7]:
class Sigmoid(object):
    """Sigmoid activation layer: y = 1 / (1 + exp(-x)).

    The forward output is cached because the backward pass can be written
    purely in terms of it: dL/dx = dL/dy * y * (1 - y).
    """

    def __init__(self):
        # Output of the most recent forward call (None until forward runs).
        self.out = None

    def forward(self, x):
        """Apply the sigmoid elementwise to ``x`` and cache the result."""
        # The denominator is 1 + exp(-x); the earlier `1 / exp(-x)` made the
        # whole expression collapse to exp(-x).
        self.out = 1 / (1 + np.exp(-x))

        return self.out

    def backward(self, dout):
        """Return ``dout * y * (1 - y)`` using the cached forward output ``y``."""
        dx = dout * self.out * (1.0 - self.out)

        return dx

### 5.6 Affine/Softmaxレイヤの実装
#### 5.6.2 バッチ版Affineレイヤ

In [8]:
class Affine(object):
    """Batch affine (fully connected) layer computing ``x . W + b``.

    ``backward`` returns the gradient with respect to the input and stores
    the parameter gradients on the instance as ``dW`` and ``db``.
    """

    def __init__(self, W, b):
        # Parameters are stored as given (no copy is made here).
        self.W, self.b = W, b
        # Input of the most recent forward call; needed to compute dW.
        self.x = None
        # Parameter gradients, filled in by backward.
        self.dW = self.db = None

    def forward(self, x):
        """Cache ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        """Store ``dW``/``db`` and return the gradient with respect to the input."""
        self.dW = np.dot(self.x.T, dout)
        # The bias is added to every row in forward, so its gradient is the
        # sum of the upstream gradient over axis 0.
        self.db = np.sum(dout, axis=0)
        return np.dot(dout, self.W.T)


#### 5.6.3 Softmax-with-Lossレイヤ
Softmaxレイヤを損失関数である交差エントロピー誤差(Cross Entropy Error)を含めて、Softmax-with-Lossレイヤという名前のレイヤで実装する。

In [9]:
class SoftmaxWithLoss(object):
    """Softmax followed by cross-entropy loss, fused into a single layer.

    ``softmax`` and ``cross_entropy_error`` are expected to be available at
    module level; they are not defined in this class.
    """

    def __init__(self):
        self.loss = None  # loss value from the most recent forward call
        self.y = None     # softmax output from the most recent forward call
        self.t = None     # targets passed to the most recent forward call

    def forward(self, x, t):
        """Cache the targets and softmax output, and return the loss."""
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the forward input.

        Handles both target encodings: when ``t`` has as many elements as
        ``y`` it is treated as one-hot; otherwise it is treated as an array
        of integer class indices, one per row of ``y``. The result is
        averaged over the batch and scaled by the upstream gradient ``dout``.
        """
        batch_size = self.t.shape[0]

        if self.t.size == self.y.size:
            # One-hot targets: the gradient is simply (y - t) / N.
            dx = (self.y - self.t) / batch_size
        else:
            # Class-index targets: subtract 1 at each row's true class,
            # which equals y minus the implied one-hot matrix.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size

        return dx * dout

### 5.7 誤差逆伝播法の実装
#### 5.7.2 誤差逆伝播法に対応したニューラルネットワークの実装

In [10]:
import sys
import os
from collections import OrderedDict

import numpy as np

from common.layers import *
from common.gradient import numerical_gradient

class TwoLayerNet(object):
    """Two-layer network: Affine -> ReLU -> Affine, with a softmax loss on top.

    Parameters live in ``self.params`` under the keys 'W1', 'b1', 'W2', 'b2'.
    The same array objects are handed to the Affine layers at construction.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Weights: Gaussian noise scaled by weight_init_std; biases: zeros.
        self.params = {
            'W1': weight_init_std * np.random.randn(input_size, hidden_size),
            'b1': np.zeros(hidden_size),
            'W2': weight_init_std * np.random.randn(hidden_size, output_size),
            'b2': np.zeros(output_size),
        }

        # Layers in forward order; the ordering is what predict/gradient rely on.
        self.layers = OrderedDict([
            ('Affine1', Affine(self.params['W1'], self.params['b1'])),
            ('Relu1', Relu()),
            ('Affine2', Affine(self.params['W2'], self.params['b2'])),
        ])

        # The loss layer is kept separate so predict() can stop before it.
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer except the loss layer."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        """Return the loss-layer output for inputs ``x`` and targets ``t``."""
        return self.lastLayer.forward(self.predict(x), t)

    def accuracy(self, x, t):
        """Return the fraction of rows whose argmax prediction matches ``t``."""
        predicted = np.argmax(self.predict(x), axis=1)
        # Targets that are not 1-D are reduced to class indices via argmax.
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        return np.sum(predicted == labels) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return per-parameter gradients computed by the module-level
        ``numerical_gradient`` helper."""

        def loss_W(W):
            # The argument is ignored: the loss is re-evaluated through
            # self.loss on every call.
            return self.loss(x, t)

        return {
            name: numerical_gradient(loss_W, self.params[name])
            for name in ('W1', 'b1', 'W2', 'b2')
        }

    def gradient(self, x, t):
        """Return per-parameter gradients computed by backpropagation."""
        # Forward pass, so every layer caches what its backward needs.
        self.loss(x, t)

        # Backward pass: loss layer first, then the layers in reverse order.
        dout = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect the gradients the Affine layers stored on themselves.
        first_affine = self.layers['Affine1']
        second_affine = self.layers['Affine2']
        return {
            'W1': first_affine.dW,
            'b1': first_affine.db,
            'W2': second_affine.dW,
            'b2': second_affine.db,
        }

#### 5.7.3 誤差逆伝播法の勾配確認
数値微分で勾配を求めた結果と、誤差逆伝播法で求めた勾配の結果が一致することを確認する作業を勾配確認(gradient check)という。

In [11]:
import sys
import os

import numpy as np

from dataset.mnist import load_mnist
from common.two_layer_net import TwoLayerNet

# Load MNIST with normalize=True (all other loader options left at their defaults)
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Use only the first three samples for the check
x_batch, t_batch = x_train[:3], t_train[:3]

# The same batch goes through both gradient implementations
grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

# Report the mean absolute difference for each parameter
for key in grad_numerical:
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print('{0}:{1}'.format(key, diff))


W1:2.2394887826816718e-13
W2:7.635732876404831e-13
b1:7.097339849073542e-13
b2:1.1990407833284423e-10


#### 5.7.4 誤差逆伝播法を使った学習

In [12]:
import sys
import os
import numpy as np

from dataset.mnist import load_mnist
from common.two_layer_net import TwoLayerNet

# Load the data (one-hot labels)
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Hyperparameters
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

# History, for inspection after training
train_loss_list = []
train_acc_list = []
test_acc_list = []

# Whole number of iterations per epoch. Floor division keeps this an int, so
# the `i % iter_per_epoch == 0` check below fires once per epoch even when
# train_size is not an exact multiple of batch_size.
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch of indices
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Compute the gradients by backpropagation
    grad = network.gradient(x_batch, t_batch)

    # Gradient-descent update, applied in place to the arrays in network.params
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Record the loss on this mini-batch after the update
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch, evaluate accuracy on the full training and test sets
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)

        print('{0:.5f} {1:.5f}'.format(train_acc, test_acc))


0.09437 0.08910
0.90650 0.90890
0.92793 0.93090
0.93757 0.93630
0.94653 0.94500
0.95187 0.95140
0.95663 0.95290
0.96027 0.95800
0.96385 0.95870
0.96708 0.96360
0.96935 0.96450
0.97218 0.96700
0.97215 0.96830
0.97340 0.96770
0.97550 0.96800
0.97605 0.96810
0.97847 0.96990
