In [3]:
# -*- coding: utf-8 -*-

import numpy as np
import pickle
import sys, os 
import matplotlib.pyplot as plt
from IPython.core.pylabtools import figsize
sys.path.append(os.pardir)
from dataset.mnist import load_mnist

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_grad(x):
    return (1.0 - sigmoid(x)) * sigmoid(x)

def relu(x):
    return np.maximum(0, x)

class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout

        return dx

def relu_grad(x):
    grad = np.zeros(x)
    grad[x>0] = 1
    return grad

def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T 

    x = x - np.max(x) # 오버플로 대책
    return np.exp(x) / np.sum(np.exp(x))

def mean_squared_error(y, t):
    return 0.5 * np.sum((y-t)**2)

def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
        
    # 훈련 데이터가 원-핫 벡터라면 정답 레이블의 인덱스로 반환
    if t.size == y.size:
        t = t.argmax(axis=1)
             
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size

def get_data():
    (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, flatten=True, one_hot_label=False)
    return x_test, t_test


def init_network():
    with open("sample_weight.pkl", 'rb') as f:
        network = pickle.load(f)
    return network

def numerical_gradient(f, x):
    h = 1e-4 # 0.0001
    grad = np.zeros_like(x)
    
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x) # f(x+h)
        
        x[idx] = tmp_val - h 
        fxh2 = f(x) # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2*h)
        
        x[idx] = tmp_val # 값 복원
        it.iternext()   
        
    return grad


class ThreeLayerNet:

    def __init__(self, input_size, hidden_size_1, hidden_size_2, hidden_size_3, output_size, weight_init_std=0.01):
        # 가중치 초기화
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size_1)
        self.params['b1'] = np.zeros(hidden_size_1)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size_1, hidden_size_2)
        self.params['b2'] = np.zeros(hidden_size_2)
        self.params['W3'] = weight_init_std * np.random.randn(hidden_size_2, hidden_size_3)
        self.params['b3'] = np.zeros(hidden_size_3)
        self.params['W4'] = weight_init_std * np.random.randn(hidden_size_3, output_size)
        self.params['b4'] = np.zeros(output_size)

    def predict(self, x):
        W1, W2, W3, W4 = self.params['W1'], self.params['W2'], self.params['W3'], self.params['W4']
        b1, b2, b3, b4 = self.params['b1'], self.params['b2'], self.params['b3'], self.params['b4']
    
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        z2 = sigmoid(a2)
        a3 = np.dot(z2, W3) + b3
        z3 = sigmoid(a3)
        a4 = np.dot(z3, W4) + b4
        y = softmax(a4)
        
        return y
    
    # x : 입력 데이터, t : 정답 레이블
    def loss(self, x, t):
        y = self.predict(x)
        
        return cross_entropy_error(y, t)
    
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)
          
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy    
        
    # x : 입력 데이터, t : 정답 레이블
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        grads['W3'] = numerical_gradient(loss_W, self.params['W3'])
        grads['b3'] = numerical_gradient(loss_W, self.params['b3'])
        grads['W4'] = numerical_gradient(loss_W, self.params['W4'])
        grads['b4'] = numerical_gradient(loss_W, self.params['b4'])
        
        return grads
    
    def gradient(self, x, t):
        W1, W2, W3, W4 = self.params['W1'], self.params['W2'], self.params['W3'],  self.params['W4']
        b1, b2, b3, b4 = self.params['b1'], self.params['b2'], self.params['b3'],  self.params['b4']
        grads = {}
        
        batch_num = x.shape[0]
        
        # forward
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        z2 = sigmoid(a2)
        a3 = np.dot(z2, W3) + b3
        z3 = sigmoid(a3)
        a4 = np.dot(z3, W4) + b4
        y = softmax(a4)
        
        # backward
        dz4 = (y - t) / batch_num
        grads['W4'] = np.dot(z3.T, dz4)
        grads['b4'] = np.sum(dz4, axis=0)
        
        da3 = np.dot(dz4, W4.T)
        dz3 = sigmoid_grad(a3) * da3
        grads['W3'] = np.dot(z2.T, dz3)
        grads['b3'] = np.sum(dz3, axis=0)
        
        da2 = np.dot(da3, W3.T)
        dz2 = sigmoid_grad(a2) *da2
        grads['W2'] = np.dot(z1.T, dz2)
        grads['b2'] = np.sum(dz2, axis=0)
        
        da1 = np.dot(da2, W2.T)
        dz1 = sigmoid_grad(a1) * da1
        grads['W1'] = np.dot(x.T, dz1)
        grads['b1'] = np.sum(dz1, axis=0)

        return grads
    
    
    # 데이터 읽기
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = ThreeLayerNet(input_size=784, hidden_size_1 = 50,hidden_size_2 = 10, hidden_size_3= 40, output_size=10)

# 하이퍼파라미터
iters_num = 10000
train_size = x_train.shape[0]
print(train_size)
test_size = x_test.shape[0]
batch_size = 153
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list= []
test_loss_list=[]

#1에폭당 반복 수
iter_per_epoch = max(train_size // batch_size, 1)   

loss =1
step = 0
while(loss > 0.01):
        step+=1
        
        # 미니배치 획득
        batch_mask = np.random.choice(train_size, batch_size)
        x_batch = x_train[batch_mask]
        t_batch = t_train[batch_mask]
        
        # 기울기 계산
        #grad = network.numerical_gradient(x_batch, t_batch)
        grad = network.gradient(x_batch, t_batch)
        
        # 매개변수 갱신
        for key in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3', 'W4', 'b4'):
            network.params[key] -= learning_rate * grad[key]
        
        # 학습 경과 기록
        loss = network.loss(x_batch, t_batch)
        train_loss_list.append(loss)
       
        # 1에폭당 손실 계산
        if (step+1) % iter_per_epoch == 0:
            #if(loss<0.1): learning_rate =0.01
            train_acc = network.accuracy(x_train, t_train)
            test_acc = network.accuracy(x_test, t_test)
            train_acc_list.append(train_acc)
            test_acc_list.append(test_acc)
            
            test_batch_mask = np.random.choice(test_size, batch_size)
            test_loss_list.append(network.loss(x_test[test_batch_mask],t_test[test_batch_mask]))
            print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))
            print("loss : " + str(loss))
    # 그래프 그리기
fig = plt.figure()
train_loss = np.arange(len(train_loss_list))
test_loss = np.arange(len(test_loss_list))
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2)

ax1.plot(train_loss, train_loss_list)
ax1.set_xlabel("iteration")
ax1.set_ylabel("train loss")

ax2.plot(test_loss, test_loss_list)
ax2.set_xlabel("iteration")
ax2.set_ylabel("test loss")
plt.show() 

60000
train acc, test acc | 0.10441666666666667, 0.1028
loss : 2.2967233160369704
train acc, test acc | 0.09736666666666667, 0.0982
loss : 2.298435848026948
train acc, test acc | 0.10441666666666667, 0.1028
loss : 2.2886881868107913


KeyboardInterrupt: 