## Import modules

In [1]:
import time
import numpy as np

## Utility Functions

In [2]:
# Perturbation size used for the forward-difference numerical gradient,
# i.e. (f(p + epsilon) - f(p)) / epsilon.
epsilon = 1e-4

def _t(x):
    """Return the transpose of ``x`` (thin wrapper around ``np.transpose``)."""
    transposed = np.transpose(x)
    return transposed

def _m(A, B):
    """Return the matrix product of ``A`` and ``B`` as computed by ``np.matmul``."""
    product = np.matmul(A, B)
    return product

def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    The naive formula overflows in ``exp(-x)`` for large negative ``x``
    (emitting a RuntimeWarning). This version only ever exponentiates a
    non-positive number, so it is safe over the whole floating-point range.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array with the shape
        of ``x``; every value lies in [0, 1].
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        # Integer/bool inputs are promoted, matching what np.exp would do.
        x = x.astype(float)

    # z = exp(-|x|) is in (0, 1], so it can underflow to 0 but never overflow.
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same function.
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays untouched.
    return out[()]

def mean_squared_error(h, y):
    """Return half of the mean squared difference between ``h`` and ``y``.

    Args:
        h: Predicted values.
        y: Target values (must support ``h - y``).

    Returns:
        ``0.5 * mean((h - y) ** 2)`` taken over all elements.
    """
    residual = h - y
    return 0.5 * np.mean(np.square(residual))

## Implement the Dense (fully connected) layer

In [3]:
class Dense:
    """Fully connected layer computing ``a(W^T x + b)``.

    Attributes:
        W: Weight matrix; the forward pass transposes it, so it is expected
            to be laid out as (num_inputs, num_outputs).
        b: Bias added after the matrix product.
        a: Activation function applied to the pre-activation.
        dW: Gradient buffer with the same shape/dtype as ``W`` (starts at zero).
        db: Gradient buffer with the same shape/dtype as ``b`` (starts at zero).
    """

    def __init__(self, W, b, a):
        self.W, self.b, self.a = W, b, a

        # Gradient buffers start at zero; this class never writes to them itself.
        self.dW, self.db = np.zeros_like(W), np.zeros_like(b)

    def __call__(self, x):
        """Run the forward pass for input ``x`` and return the activated output."""
        pre_activation = _m(_t(self.W), x) + self.b
        return self.a(pre_activation)

## DNN without Backpropagation (numerical gradients)

In [4]:
class DNN:
    """Feed-forward stack of ``Dense`` layers with numerically estimated gradients.

    Layout: one input-to-hidden layer, ``hidden_depth - 1`` hidden-to-hidden
    layers and one hidden-to-output layer. Every layer, including the output
    layer, uses the same ``activation``.
    """

    def __init__(self, hidden_depth, num_neuron, num_input, num_output, activation=sigmoid):
        def init_var(fan_in, fan_out):
            """Return random initial weights (mean 0, std 0.01) and a zero bias."""
            weights = np.random.normal(0.0, 0.01, (fan_in, fan_out))
            return weights, np.zeros((fan_out,))

        # Width at every layer boundary; each consecutive pair is the
        # (input, output) size of one Dense layer.
        widths = [num_input, num_neuron] + [num_neuron] * (hidden_depth - 1) + [num_output]

        self.sequence = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            W, b = init_var(fan_in, fan_out)
            self.sequence.append(Dense(W, b, activation))

    def __call__(self, x):
        """Forward ``x`` through every layer in ``self.sequence`` and return the output."""
        out = x
        for layer in self.sequence:
            out = layer(out)
        return out

    def calc_gradient(self, x, y, loss_func):
        """Estimate the gradient of the loss w.r.t. every weight and bias.

        Each parameter is nudged by the module-level ``epsilon`` in a copy of
        its layer, the network is re-evaluated, and the forward difference
        ``(loss(p + epsilon) - loss(p)) / epsilon`` is stored in the owning
        layer's ``dW`` / ``db``. The layers' actual parameters are not modified.

        Args:
            x: Network input.
            y: Target passed to ``loss_func``.
            loss_func: Callable ``loss_func(prediction, y)`` returning a scalar.

        Returns:
            The loss at the current (unperturbed) parameters.
        """

        def loss_with(layer_index, replacement):
            """Loss of the network when layer ``layer_index`` is swapped for ``replacement``."""
            out = x
            for i, current in enumerate(self.sequence):
                out = (replacement if i == layer_index else current)(out)
            return loss_func(out, y)

        # Baseline loss with the current parameters; self(x) runs __call__.
        loss = loss_func(self(x), y)

        for layer_id, layer in enumerate(self.sequence):
            # Weights: visit every (row, column) entry in row-major order.
            for idx in np.ndindex(layer.W.shape):
                W = np.copy(layer.W)
                W[idx] = layer.W[idx] + epsilon

                shifted_loss = loss_with(layer_id, Dense(W, layer.b, layer.a))
                layer.dW[idx] = (shifted_loss - loss) / epsilon

            # Biases: same procedure, one entry at a time.
            for b_i, bias_value in enumerate(layer.b):
                b = np.copy(layer.b)
                b[b_i] = bias_value + epsilon

                shifted_loss = loss_with(layer_id, Dense(layer.W, b, layer.a))
                layer.db[b_i] = (shifted_loss - loss) / epsilon

        return loss

## Gradient Descent

In [5]:
def gradient_descent(network, x, y, loss_obj, alpha=0.01):
    """Perform one gradient-descent step on ``network`` in place.

    Args:
        network: Object exposing ``calc_gradient(x, y, loss_obj)`` and a
            ``sequence`` of layers with ``W``, ``b``, ``dW`` and ``db``.
        x: Input sample forwarded to ``calc_gradient``.
        y: Target forwarded to ``calc_gradient``.
        loss_obj: Loss function forwarded to ``calc_gradient``.
        alpha: Learning rate.

    Returns:
        The value returned by ``calc_gradient``, obtained before the
        parameters are updated.
    """
    # Fills dW / db of every layer and reports the current loss.
    current_loss = network.calc_gradient(x, y, loss_obj)

    # Step every parameter against its gradient.
    for dense in network.sequence:
        dense.W -= alpha * dense.dW
        dense.b -= alpha * dense.db

    return current_loss

## Test

In [6]:
# Smoke test: fit the network to a single random (input, target) pair.
x = np.random.normal(0.0, 1.0, (10, ))  # one input vector with 10 features
y = np.random.normal(0.0, 1.0, (2, ))   # one target vector with 2 values
# NOTE(review): y is unbounded while the network's output layer also applies
# the sigmoid activation, so the loss cannot reach zero in general.

# hidden_depth=5 yields 6 Dense layers in total (5 hidden-width layers + output).
dnn = DNN(hidden_depth=5, num_neuron=32, num_input=10, num_output=2, activation=sigmoid)

t = time.time()  # wall-clock start, used for the elapsed-time report below
for epoch in range(100):
    # One numerical-gradient step; `loss` is the loss measured before the update.
    loss = gradient_descent(dnn, x, y, mean_squared_error, 0.01)
    if epoch % 10 == 1:  # report on epochs 1, 11, 21, ...
        # The printed value is the loss on the single training sample above.
        print('Epoch {}: Test loss {}'.format(epoch, loss))
print('{} seconds elapsed.'.format(time.time() - t))

Epoch 1: Test loss 1.0238182679908203
Epoch 11: Test loss 0.9680340202563503
Epoch 21: Test loss 0.9168151196164134
Epoch 31: Test loss 0.8709576028775313
Epoch 41: Test loss 0.8306145139363748
Epoch 51: Test loss 0.7954961156707366
Epoch 61: Test loss 0.7650786348562719
Epoch 71: Test loss 0.7387589991737111
Epoch 81: Test loss 0.7159467342022199
Epoch 91: Test loss 0.696108198465763
66.88063025474548 seconds elapsed.
