# Backward propagation of errors

In [1]:
# imports
import numpy as np
from common.functions import *
from common.gradient import numerical_gradient
from common.util import *
import sys, os
sys.path.append(os.pardir)
from collections import OrderedDict
from dataset.mnist import load_mnist

# fundamental classes, functions
class MulLayer:
    """Multiplication node of a computational graph.

    The forward pass caches both operands, because each operand is the
    local gradient of the product with respect to the other one.
    """

    def __init__(self):
        # Operands of the most recent forward() call; None until it runs.
        self.x = self.y = None

    def forward(self, x, y):
        """Remember both operands and return their product ``x * y``."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Return ``(dx, dy)``: ``dout`` scaled by the opposite operand."""
        return dout * self.y, dout * self.x
class AddLayer:
    """Addition node of a computational graph."""

    def __init__(self):
        # Nothing to cache: the backward pass does not use the inputs.
        pass

    def forward(self, x, y):
        """Return the sum ``x + y``."""
        return x + y

    def backward(self, dout):
        """Pass the upstream gradient unchanged to both inputs."""
        dx = dy = dout
        return dx, dy
class Relu:
    """ReLU activation layer: ``max(x, 0)`` element-wise.

    ``forward`` records a boolean mask of the positions where ``x <= 0``;
    ``backward`` blocks the gradient at exactly those positions.
    """

    def __init__(self):
        # Boolean array marking non-positive inputs of the last forward().
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every non-positive entry set to 0."""
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        """Return ``dout`` with the masked positions zeroed.

        The result is a new array: ``dout`` itself is left untouched so the
        caller's gradient buffer is not silently modified.
        """
        dx = dout.copy()
        dx[self.mask] = 0
        return dx
class Sigmoid:
    """Sigmoid activation layer: ``y = 1 / (1 + exp(-x))``."""

    def __init__(self):
        # Output of the last forward(); reused by backward().
        self.out = None

    def forward(self, x):
        """Compute the sigmoid of ``x``, cache it and return it."""
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dout):
        """Return ``dout * y * (1 - y)`` using the cached output ``y``."""
        y = self.out
        return dout * (y * (1. - y))
class Affine:
    """Affine (fully connected) layer ``out = x . W + b``.

    Inputs with more than two dimensions are flattened to
    ``(x.shape[0], -1)`` on the way in, and the input gradient is restored
    to the original shape on the way out.
    """

    def __init__(self, W, b):
        self.W, self.b = W, b
        # Cached by forward() for use in backward().
        self.x = None
        self.original_x_shape = None
        # Parameter gradients, filled in by backward().
        self.dW = None
        self.db = None

    def forward(self, x):
        """Flatten ``x`` to 2-D, cache it, and return ``x . W + b``."""
        self.original_x_shape = x.shape
        self.x = x.reshape(x.shape[0], -1)
        return np.dot(self.x, self.W) + self.b

    def backward(self, dout):
        """Store ``dW`` and ``db``; return the gradient w.r.t. the input."""
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        flat_dx = np.dot(dout, self.W.T)
        # Undo the flattening done in forward() (tensor inputs).
        return flat_dx.reshape(*self.original_x_shape)

## 1. Computational graph

## 2. Chain Rule

## 3. Backward propagation

## 4. Implementing simple layer classes

In [2]:
class MulLayer:
    """Layer that multiplies two inputs and backpropagates through the product."""

    def __init__(self):
        # Inputs seen by the latest forward(); needed again in backward().
        self.x, self.y = None, None

    def forward(self, x, y):
        """Cache ``x`` and ``y`` and return ``x * y``."""
        self.x = x
        self.y = y
        return self.x * self.y

    def backward(self, dout):
        """Return the gradients ``(dx, dy)`` for upstream gradient ``dout``.

        The gradient for one input is ``dout`` times the *other* input.
        """
        grad_x = dout * self.y
        grad_y = dout * self.x
        return grad_x, grad_y

class AddLayer:
    """Layer that adds two inputs; its backward pass is the identity."""

    def __init__(self):
        """Create the layer; addition keeps no state between passes."""

    def forward(self, x, y):
        """Return ``x + y``."""
        total = x + y
        return total

    def backward(self, dout):
        """Return ``(dout, dout)``: both inputs receive the upstream gradient."""
        return (dout, dout)

In [3]:
# Inputs: unit price, quantity and tax multiplier
apple, apple_num, tax = 100, 2, 1.1

# One multiplication node per product in the graph
mul_apple_layer, mul_tax_layer = MulLayer(), MulLayer()

# Forward propagation: (apple * apple_num) * tax
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

# Backward propagation: walk the nodes in reverse, seeded with dprice = 1
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

print(price)  # approximately 220
print(dapple, dapple_num, dtax)  # approximately 2.2 110 200

220.00000000000003
2.2 110.00000000000001 200


In [4]:
# Inputs: unit prices, quantities and tax multiplier
apple, apple_num = 100, 2
orange, orange_num = 150, 3
tax = 1.1

# One node per operation in the graph
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# Forward propagation: (apple * apple_num + orange * orange_num) * tax
apple_price = mul_apple_layer.forward(apple, apple_num)
orange_price = mul_orange_layer.forward(orange, orange_num)
all_price = add_apple_orange_layer.forward(apple_price, orange_price)
price = mul_tax_layer.forward(all_price, tax)

print(price)  # approximately 715

# Backward propagation: walk the nodes in reverse, seeded with dprice = 1
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)
dorange, dorange_num = mul_orange_layer.backward(dorange_price)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

# approximately 2.2 110 3.3 165 650
print(dapple, dapple_num, dorange, dorange_num, dtax)

715.0000000000001
2.2 110.00000000000001 3.3000000000000003 165.0 650


## 5. Implementing activation function layers

### 1) ReLU

$
y =
  \begin{cases}
    x  & \quad (x > 0)\\
    0  & \quad (x \leq 0)
  \end{cases}
$

$
\displaystyle\frac{\partial y}{\partial x} =
 \begin{cases}
  1 & \quad (x > 0)\\
  0 & \quad (x \leq 0)
 \end{cases}
$

In [5]:
class Relu:
    """ReLU activation layer.

    Forward: ``y = x`` where ``x > 0`` and ``y = 0`` where ``x <= 0``.
    Backward: the gradient passes through where ``x > 0`` and is 0 elsewhere.
    """

    def __init__(self):
        # Boolean array, True where the last forward() input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with non-positive entries replaced by 0."""
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Return the upstream gradient with masked positions set to 0.

        A copy is modified rather than ``dout`` itself, so the array passed
        in by the caller keeps its original values.
        """
        dx = dout.copy()
        dx[self.mask] = 0

        return dx

In [6]:
# Small example of the boolean mask used by Relu: True where x <= 0
x = np.array([[1.0, -0.5], [-2.0, 3.0]])
print(x)
mask = x <= 0
print(mask)

[[ 1.  -0.5]
 [-2.   3. ]]
[[False  True]
 [ True False]]


### 2) Sigmoid

$
\displaystyle y = \frac{1}{1 + \exp(-x)}
$

<br/>
<br/>

>**Process**\
>$
\times \rightleftharpoons \exp \rightleftharpoons + \rightleftharpoons \div
$

<br/>
<br/>

#### Partial derivative of the `/` node

$
\displaystyle y = \frac{1}{x}
$

$
\displaystyle \frac{\partial y}{\partial x} = - \frac{1}{x^2}\\
\displaystyle \quad \text{ }  = - y^2
$

#### Partial derivative of the `exp` node

$
\displaystyle y = \exp{(x)}
$

$
\displaystyle \frac{\partial y}{\partial x} = \exp{(x)}
$

<br/>

>**In summary**, the whole sigmoid node backpropagates\
>$\displaystyle\frac{\partial L}{\partial y}y^2\exp{(-x)} \quad \leftarrow \quad \frac{\partial L}{\partial y}$\
>$\displaystyle\left(= \frac{\partial L}{\partial y}\,y(1-y)\right)$

In [7]:
class Sigmoid:
    """Sigmoid activation layer.

    Forward: ``y = 1 / (1 + exp(-x))``.
    Backward: ``dx = dout * y * (1 - y)``, computed from the cached output.
    """

    def __init__(self):
        # Forward output; backward() needs only this, not the input.
        self.out = None

    def forward(self, x):
        """Return the sigmoid of ``x`` and cache it in ``self.out``."""
        denominator = 1 + np.exp(-x)
        self.out = 1 / denominator

        return self.out

    def backward(self, dout):
        """Return the gradient with respect to the forward input."""
        local_grad = self.out * (1. - self.out)

        return dout * local_grad

## 6. Implementing the Affine and Softmax layers

### 1) Affine

Backpropagation through the matrix product (`np.dot`):
<br/>

$
\displaystyle\frac{\partial L}{\partial X} = \frac{\partial L}{\partial Y} \cdot W^T\\
\displaystyle\frac{\partial L}{\partial W} = X^T \cdot \frac{\partial L}{\partial Y}
$

Here $\displaystyle\frac{\partial L}{\partial X}$ has the same shape as $X$:

$\displaystyle
X = (x_0,x_1,\dots,x_n)\\
\displaystyle\frac{\partial L}{\partial X} = (\frac{\partial L}{\partial x_0},\frac{\partial L}{\partial x_1},\dots,\frac{\partial L}{\partial x_n})
$

### 2) Classes for batch affine

In [8]:
# 2-D inputs only: no reshaping is done, so tensor input is not supported
class AffineBase:
    """Affine layer ``out = x . W + b`` without input reshaping."""

    def __init__(self, W, b):
        self.W, self.b = W, b
        # Input cached by forward() for the weight gradient.
        self.x = None
        # Parameter gradients, filled in by backward().
        self.dW, self.db = None, None

    def forward(self, x):
        """Cache ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x

        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        """Store ``dW`` and ``db``; return the gradient w.r.t. the input."""
        self.dW = np.dot(self.x.T, dout)
        # The bias is added to every row, so its gradient sums over axis 0.
        self.db = np.sum(dout, axis=0)

        return np.dot(dout, self.W.T)

In [9]:
class Affine:
    """Affine layer ``out = x . W + b`` that also accepts tensor input.

    ``forward`` flattens the input to ``(x.shape[0], -1)`` before the
    matrix product; ``backward`` reshapes the input gradient back to the
    shape the input originally had.
    """

    def __init__(self, W, b):
        self.W = W
        self.b = b

        # State cached by forward().
        self.x, self.original_x_shape = None, None
        # Parameter gradients, filled in by backward().
        self.dW, self.db = None, None

    def forward(self, x):
        """Return ``x . W + b`` for the flattened input."""
        self.original_x_shape = x.shape
        flattened = x.reshape(x.shape[0], -1)
        self.x = flattened

        return np.dot(flattened, self.W) + self.b

    def backward(self, dout):
        """Store ``dW`` and ``db``; return ``dx`` in the original input shape."""
        grad_input = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)

        # Restore the pre-flattening shape (matters for tensor input).
        return grad_input.reshape(*self.original_x_shape)

In [10]:
tempA = np.array([[1, 2, 3, 4], [3, 4, 5, 6], [5, 6, 7, 8]])

# Transpose: rows become columns
print(tempA.transpose())

# Reshape keeping the first axis; here the shape stays (3, 4)
print(len(tempA))
print(tempA.reshape((tempA.shape[0], -1)))

# -1 lets NumPy infer the remaining dimension: 12 elements / 2 rows = 6 columns
print(np.reshape(tempA, (2, -1)))

[[1 3 5]
 [2 4 6]
 [3 5 7]
 [4 6 8]]
3
[[1 2 3 4]
 [3 4 5 6]
 [5 6 7 8]]
[[1 2 3 4 3 4]
 [5 6 5 6 7 8]]


### 3) Softmax-with-loss classes 

In [11]:
class SoftmaxWithLoss:
    """Softmax followed by cross-entropy loss, as a single output layer.

    Relies on ``softmax`` and ``cross_entropy_error`` being in scope
    (presumably provided by ``from common.functions import *``; confirm).
    """

    def __init__(self):
        # Loss value returned by the last forward()
        self.loss = None
        # Softmax output of the last forward()
        self.y = None
        # Targets of the last forward(): one-hot rows or integer labels
        self.t = None

    def forward(self, x, t):
        """Cache the targets and softmax output; return the loss."""
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the forward input.

        The gradient is ``(y - t) / batch_size`` scaled by the upstream
        gradient ``dout``. With the default ``dout=1`` the result is the
        same as before; previously any other ``dout`` was silently ignored.
        """
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:
            # Targets are one-hot: same number of elements as the output.
            dx = (self.y - self.t) / batch_size
        else:
            # Targets are class indices: subtract 1 at each row's label.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size

        return dx * dout
    

## 7. Implementing backpropagation

### 2) Implementing the two-layer network

In [12]:
"""
import sys, os
sys.path.append(os.pardir)
import numpy as np
from collections import OrderedDict
from common.layers import *
from common.gradient import *
"""

class TwoLayerNet:
    """Two-layer classifier: Affine -> Relu -> Affine, then SoftmaxWithLoss.

    Parameters are kept in ``self.params`` under the keys ``'W1'``,
    ``'b1'``, ``'W2'`` and ``'b2'``. The Affine layers are constructed from
    those same array objects.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameters and assemble the layer stack.

        Weights are ``np.random.randn`` samples scaled by
        ``weight_init_std``; biases start at zero.
        """
        # W1 is drawn before W2 so the random stream is consumed in a fixed order.
        w1 = weight_init_std * np.random.randn(input_size, hidden_size)
        b1 = np.zeros(hidden_size)
        w2 = weight_init_std * np.random.randn(hidden_size, output_size)
        b2 = np.zeros(output_size)
        self.params = {'W1': w1, 'b1': b1, 'W2': w2, 'b2': b2}

        # Layers are applied in insertion order by predict().
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(w1, b1)
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(w2, b2)

        # The loss layer is kept separate because it also needs the targets.
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer in ``self.layers`` and return the result."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        """Return the loss-layer output for inputs ``x`` and targets ``t``."""
        return self.lastLayer.forward(self.predict(x), t)

    def accuracy(self, x, t):
        """Return the fraction of rows whose arg-max prediction equals the target."""
        predicted = np.argmax(self.predict(x), axis=1)
        # Targets that are not 1-D are reduced to class indices first.
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        return np.sum(predicted == labels) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return gradients for all parameters from the ``numerical_gradient`` helper."""
        # NOTE(review): the callback ignores its argument and always evaluates
        # the current network, so this presumably relies on the helper
        # perturbing the parameter array in place -- confirm in common.gradient.
        def loss_W(W):
            return self.loss(x, t)

        return {
            name: numerical_gradient(loss_W, self.params[name])
            for name in ('W1', 'b1', 'W2', 'b2')
        }

    def gradient(self, x, t):
        """Return gradients for all parameters computed by backpropagation."""
        # Forward pass first, so every layer caches what backward() needs.
        self.loss(x, t)

        # Backward pass: loss layer first, then the stack in reverse order.
        dout = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        first, second = self.layers['Affine1'], self.layers['Affine2']
        return {
            'W1': first.dW,
            'b1': first.db,
            'W2': second.dW,
            'b2': second.db,
        }
        

### 3) Gradient Check
 

In [13]:
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Only the first three samples are used for the check
x_batch, t_batch = x_train[:3], t_train[:3]

grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

# Mean absolute difference between the two gradients, per parameter
for key in grad_numerical:
    diff = np.mean(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(f"{key}:{diff!s}")

W1:5.023791315118663e-10
b1:3.162654864371844e-09
W2:6.200175352626935e-09
b2:1.4046025118008565e-07


In [19]:
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Hyperparameters
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

# History: loss after every iteration, accuracies once per epoch
train_loss_list = []
train_acc_list = []
test_acc_list = []

# Iterations per epoch. Floor division keeps this an integer, so the
# `i % iter_per_epoch == 0` test below lines up with epoch boundaries even
# when train_size is not an exact multiple of batch_size (true division
# would give a fractional period in that case).
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Random mini-batch of row indices (np.random.choice default: with replacement)
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradients by backpropagation
    grad = network.gradient(x_batch, t_batch)

    # Gradient descent step. `-=` updates each array in place, and the
    # Affine layers were built from these same arrays, so they see the
    # new values too.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Loss on the same mini-batch, measured after the update
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch: accuracy on the full training and test sets
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)



0.07778333333333333 0.0763
0.8989 0.8997
0.9225166666666667 0.925
0.93735 0.9371
0.9453333333333334 0.9446
0.9523833333333334 0.9504
0.9568666666666666 0.9553
0.96225 0.9583
0.96515 0.9597
0.9669333333333333 0.9607
0.96985 0.965
0.9711 0.9642
0.9727 0.9657
0.9741333333333333 0.9665
0.9759 0.9652
0.9759833333333333 0.9674
0.9778333333333333 0.9669
