In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from google.colab import drive
drive.mount("/content/drive")
%cd drive/MyDrive/Colab\ Notebooks

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks


In [10]:
from dataset.mnist import load_mnist
from sklearn.datasets import fetch_openml
# Download MNIST (70,000 flattened 28x28 digits) from OpenML.
# `as_frame=True` is requested explicitly because later cells read
# `mnist.data.values` / `mnist.target.values`, which only exist on pandas objects;
# `version=1` pins the dataset so a re-run always fetches the same data.
mnist = fetch_openml("mnist_784", version=1, as_frame=True)

In [4]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated element-wise.

    Args:
        x: Scalar or NumPy array.

    Returns:
        Value(s) in the open interval (0, 1) with the same shape as ``x``.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator


def sigmoid_grad(x):
    """Derivative of the logistic function at ``x``: ``(1 - s) * s`` with ``s = sigmoid(x)``.

    Args:
        x: Scalar or NumPy array.

    Returns:
        The element-wise derivative, same shape as ``x``.
    """
    s = sigmoid(x)
    return (1.0 - s) * s

def softmax(x):
    """Softmax over the last axis of ``x``.

    A 1-D input is treated as a single score vector; for inputs with two or
    more dimensions every vector along the last axis is normalised
    independently, so each of them sums to 1.

    Args:
        x: Array-like of scores.

    Returns:
        NumPy array of probabilities with the same shape as ``x``.
    """
    x = np.asarray(x)
    # A 0-d input has no last axis; reduce over everything in that case.
    axis = -1 if x.ndim > 0 else None

    # Overflow guard: subtracting the per-vector max leaves the result unchanged.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    # Exponentiate once and reuse it for numerator and denominator.
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

def cross_entropy_error(y, t):
    """Mean cross-entropy between predicted probabilities and targets.

    Args:
        y: Predicted probabilities, either one vector (1-D) or one row per
            sample (2-D).
        t: Targets, either one-hot encoded (same number of elements as ``y``)
            or class indices.

    Returns:
        The negative log-probability of the target class, averaged over the
        batch. ``1e-7`` is added inside the log so a zero probability does not
        produce ``-inf``.
    """
    is_single_sample = (y.ndim == 1)
    if is_single_sample:
        # Promote a lone sample to a batch of one.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    # One-hot targets are converted to the index of the correct class.
    if t.size == y.size:
        t = t.argmax(axis=1)

    n_samples = y.shape[0]
    target_probs = y[np.arange(n_samples), t]
    return -np.sum(np.log(target_probs + 1e-7)) / n_samples


def softmax_loss(X, t):
    """Cross-entropy loss of the softmax of raw scores ``X`` against targets ``t``.

    Args:
        X: Raw scores, passed straight to ``softmax``.
        t: Targets, passed straight to ``cross_entropy_error``.

    Returns:
        The value returned by ``cross_entropy_error`` for the softmax output.
    """
    return cross_entropy_error(softmax(X), t)

In [5]:
class Relu:
    """ReLU activation layer: passes positive inputs through, zeroes the rest."""

    def __init__(self):
        # Boolean array marking the inputs that were <= 0 in the last forward pass.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every non-positive entry set to 0.

        The positions of the zeroed entries are remembered for ``backward``.
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Return the upstream gradient with entries blocked where the input was <= 0.

        Works on a copy so the caller's ``dout`` array is left untouched.
        """
        dx = dout.copy()
        dx[self.mask] = 0

        return dx


class Sigmoid:
    """Sigmoid activation layer that caches its output for the backward pass."""

    def __init__(self):
        # Output of the most recent forward pass.
        self.out = None

    def forward(self, x):
        """Apply ``sigmoid`` to ``x``, store the result and return it."""
        self.out = sigmoid(x)
        return self.out

    def backward(self, dout):
        """Scale the upstream gradient by ``(1 - out) * out`` from the cached output."""
        return dout * (1.0 - self.out) * self.out


class Affine:
    """Fully connected layer computing ``dot(x, W) + b``.

    Inputs with more than two dimensions are flattened to
    ``(x.shape[0], -1)`` on the way in, and the input gradient is reshaped
    back to the original input shape on the way out.
    """

    def __init__(self, W, b):
        # Parameters are kept by reference (not copied).
        self.W = W
        self.b = b

        # Gradients of the weight and bias parameters, filled in by backward().
        self.dW = None
        self.db = None

        # Cached by forward(): the flattened input and its original shape.
        self.x = None
        self.original_x_shape = None

    def forward(self, x):
        """Flatten ``x`` to 2-D, cache it, and return ``dot(x, W) + b``."""
        # Tensor support: remember the incoming shape before flattening.
        self.original_x_shape = x.shape
        self.x = x.reshape(x.shape[0], -1)

        return np.dot(self.x, self.W) + self.b

    def backward(self, dout):
        """Store ``dW`` / ``db`` and return the gradient with respect to the input."""
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)

        flat_dx = np.dot(dout, self.W.T)
        # Restore the shape of the original input (tensor support).
        return flat_dx.reshape(*self.original_x_shape)


class SoftmaxWithLoss:
    """Output layer combining softmax with the cross-entropy loss."""

    def __init__(self):
        self.loss = None # loss value from the last forward pass
        self.y = None    # softmax output
        self.t = None    # targets (one-hot encoded or class indices)

    def forward(self, x, t):
        """Compute and return the cross-entropy loss of ``softmax(x)`` against ``t``.

        The softmax output and the targets are cached for ``backward``.
        """
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the scores given to ``forward``.

        Args:
            dout: Upstream gradient of the loss value. It scales the result, so
                the default of 1 yields the plain ``(y - t) / batch_size``.
        """
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size: # targets are one-hot encoded
            dx = (self.y - self.t) / batch_size
        else:
            # Targets are class indices: subtract 1 at each correct class.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size

        # Apply the upstream gradient instead of silently dropping it.
        return dx * dout

def numerical_gradient(f, x):
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    Each element of ``x`` is nudged in place by ``+h`` and ``-h`` (h = 1e-4),
    ``f(x)`` is evaluated for both, and the element is restored afterwards.

    Args:
        f: Callable taking ``x`` and returning a scalar.
        x: NumPy array; modified temporarily during the call.

    Returns:
        Array shaped like ``x`` holding ``(f(x+h) - f(x-h)) / (2h)`` per element.
    """
    step = 1e-4 # 0.0001
    estimate = np.zeros_like(x)

    cursor = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not cursor.finished:
        pos = cursor.multi_index
        saved = x[pos]

        x[pos] = float(saved) + step
        f_plus = f(x) # f(x+h)

        x[pos] = saved - step
        f_minus = f(x) # f(x-h)

        estimate[pos] = (f_plus - f_minus) / (2*step)

        x[pos] = saved # restore the original value
        cursor.iternext()

    return estimate

In [6]:
from collections import OrderedDict

class TwoLayerNet:
    """Two-stage fully connected classifier trained with backpropagation.

    The forward stack is Affine1 -> Relu1 -> Affine2 -> Relu2; a
    SoftmaxWithLoss layer is applied on top when the loss is computed.

    NOTE(review): Relu2 is applied to the output scores before softmax, so
    negative scores are clipped to zero -- confirm this is intended.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std = 0.01):
        # Weight initialisation: small Gaussian weights, zero biases.
        self.params = {
            'W1': weight_init_std * np.random.randn(input_size, hidden_size),
            'b1': np.zeros(hidden_size),
            'W2': weight_init_std * np.random.randn(hidden_size, output_size),
            'b2': np.zeros(output_size),
        }

        # Layer construction: the Affine layers hold references to the arrays in
        # self.params, so in-place updates of the params reach the layers.
        self.layers = OrderedDict([
            ('Affine1', Affine(self.params['W1'], self.params['b1'])),
            ('Relu1', Relu()),
            ('Affine2', Affine(self.params['W2'], self.params['b2'])),
            ('Relu2', Relu()),
        ])

        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer in order and return the final scores."""
        out = x
        for stage in self.layers.values():
            out = stage.forward(out)
        return out

    def loss(self, x, t):
        """Return the loss for input data ``x`` and targets ``t``."""
        return self.lastLayer.forward(self.predict(x), t)

    def accuracy(self, x, t):
        """Return the fraction of samples in ``x`` whose arg-max score matches ``t``.

        ``t`` may be class indices (1-D) or one-hot rows.
        """
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)

        return np.sum(predicted == labels) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Gradients of the loss for input ``x`` / targets ``t`` via numerical differentiation."""
        def loss_of(_):
            # The argument is ignored: the parameters are perturbed in place
            # and the loss is re-evaluated through the layers.
            return self.loss(x, t)

        return {
            key: numerical_gradient(loss_of, self.params[key])
            for key in ('W1', 'b1', 'W2', 'b2')
        }

    def gradient(self, x, t):
        """Gradients of the loss for input ``x`` / targets ``t`` via backpropagation."""
        # Forward pass populates the caches every layer needs.
        self.loss(x, t)

        # Backward pass: start at the loss layer, then walk the stack in reverse.
        dout = self.lastLayer.backward(1)
        for stage in reversed(list(self.layers.values())):
            dout = stage.backward(dout)

        # Collect the gradients stored on the Affine layers.
        first, second = self.layers['Affine1'], self.layers['Affine2']
        return {
            'W1': first.dW,
            'b1': first.db,
            'W2': second.dW,
            'b2': second.db,
        }

In [7]:
#(X_train, t_train), (X_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

In [14]:
# Pull the features and the labels out of the fetched dataset as array values.
mnist_X, mnist_y = mnist.data.values, mnist.target.values
# Cast the labels to integers. `np.int` was only an alias of the builtin `int`
# and was removed in NumPy 1.24, so the builtin is used directly.
mnist_y = mnist_y.astype(int)
# One-hot encode: row i gets a 1 in column mnist_y[i], zeros elsewhere.
y_ = np.zeros((mnist_y.shape[0], 10), dtype=int)
y_[np.arange(y_.shape[0]), mnist_y] = 1

In [15]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split performed in a later cell, so test-set statistics influence the
# scaling -- confirm this is acceptable.
mnist_X_scaled = StandardScaler().fit_transform(mnist_X)

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for testing. A fixed random_state makes the
# split identical on every run so accuracy numbers can be compared.
X_train, X_test, t_train, t_test = train_test_split(mnist_X_scaled, y_, test_size=0.1, random_state=42)

In [17]:
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000
train_size = X_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# Number of mini-batches per epoch. Floor division keeps this an integer, so
# the `i % iter_per_epoch == 0` check below still fires once per epoch when
# train_size is not an exact multiple of batch_size.
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch (indices are sampled with replacement).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = X_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    # Compute the gradients
    #grad = network.numerical_gradient(x_batch, t_batch) # numerical differentiation
    grad = network.gradient(x_batch, t_batch) # backpropagation (much faster)
    
    # Update: in-place so the layers, which share these arrays, see the new values
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
    
    # Record the loss on the same batch after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    
    # Once per epoch, evaluate accuracy on the full train and test sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(X_train, t_train)
        test_acc = network.accuracy(X_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

0.17714285714285713 0.17514285714285716
0.9405396825396826 0.9351428571428572
0.9557301587301588 0.9461428571428572
0.9668571428571429 0.958
0.9726507936507937 0.9612857142857143
0.9766031746031746 0.964
0.9797619047619047 0.964
0.9824444444444445 0.9657142857142857
0.984920634920635 0.9675714285714285
0.9862063492063492 0.9667142857142857
0.9873015873015873 0.9681428571428572
0.9895238095238095 0.968
0.989984126984127 0.9658571428571429
0.9914603174603175 0.9685714285714285
0.9921587301587301 0.9687142857142857
0.9934126984126984 0.9697142857142858


In [18]:
from collections import OrderedDict

class LayerNet:
    """Fully connected classifier with a configurable number of Affine+ReLU stages.

    Stage ``n`` (1-based) maps ``layer_infos[n-1]`` units to ``layer_infos[n]``
    units; its parameters are stored as ``params['W<n>']`` / ``params['b<n>']``.
    A SoftmaxWithLoss layer is applied on top when the loss is computed.

    NOTE(review): a ReLU follows every Affine layer, including the last one,
    so the output scores are clipped at zero before softmax -- confirm this
    is intended.

    Args:
        layer_size: Number of Affine+ReLU stages.
        layer_infos: Sequence of ``layer_size + 1`` unit counts, input size
            first and output size last. Required despite the ``None`` default.
        weight_init_std: Standard deviation of the Gaussian weight initialisation.

    Raises:
        TypeError: If ``layer_infos`` is not given.
        ValueError: If ``layer_infos`` does not have ``layer_size + 1`` entries.
    """

    def __init__(self, layer_size=3, layer_infos=None, weight_init_std = 0.01):
        # Validate up front so a bad configuration fails with a clear message
        # instead of an opaque error (or a silently truncated network) later.
        if layer_infos is None:
            raise TypeError(
                "layer_infos is required: pass the unit count of every layer, "
                "input first and output last"
            )
        if len(layer_infos) != layer_size + 1:
            raise ValueError(
                "layer_infos must have layer_size + 1 = {} entries, got {}".format(
                    layer_size + 1, len(layer_infos)
                )
            )

        self.layer_size = layer_size
        self.layer_infos = layer_infos

        # Weight initialisation: small Gaussian weights, zero biases.
        self.params = {}
        for n in range(1, self.layer_size + 1):
            fan_in, fan_out = layer_infos[n - 1], layer_infos[n]
            self.params['W' + str(n)] = weight_init_std * np.random.randn(fan_in, fan_out)
            self.params['b' + str(n)] = np.zeros(fan_out)

        # Layer construction: the Affine layers hold references to the arrays in
        # self.params, so in-place updates of the params reach the layers.
        self.layers = OrderedDict()
        for n in range(1, self.layer_size + 1):
            self.layers['Affine' + str(n)] = Affine(self.params['W' + str(n)], self.params['b' + str(n)])
            self.layers['Relu' + str(n)] = Relu()
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer in order and return the final scores."""
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    def loss(self, x, t):
        """Return the loss for input data ``x`` and targets ``t``."""
        y = self.predict(x)
        return self.lastLayer.forward(y, t)

    def accuracy(self, x, t):
        """Return the fraction of samples in ``x`` whose arg-max score matches ``t``.

        ``t`` may be class indices (1-D) or one-hot rows.
        """
        y = np.argmax(self.predict(x), axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)

        return np.sum(y == t) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Gradients of the loss for input ``x`` / targets ``t`` via numerical differentiation."""
        # The argument is ignored: parameters are perturbed in place and the
        # loss is re-evaluated through the layers.
        loss_W = lambda W: self.loss(x, t)

        grads = {}
        for n in range(1, self.layer_size + 1):
            grads['W' + str(n)] = numerical_gradient(loss_W, self.params['W' + str(n)])
            grads['b' + str(n)] = numerical_gradient(loss_W, self.params['b' + str(n)])
        return grads

    def gradient(self, x, t):
        """Gradients of the loss for input ``x`` / targets ``t`` via backpropagation."""
        # Forward pass populates the caches every layer needs.
        self.loss(x, t)

        # Backward pass: start at the loss layer, then walk the stack in reverse.
        dout = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect the gradients stored on the Affine layers.
        grads = {}
        for n in range(1, self.layer_size + 1):
            affine = self.layers['Affine' + str(n)]
            grads['W' + str(n)], grads['b' + str(n)] = affine.dW, affine.db

        return grads

In [19]:
# Same architecture as the two-layer network above, built through LayerNet.
network = LayerNet(layer_size=2, layer_infos=[784, 50, 10])

iters_num = 10000
train_size = X_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list, train_acc_list, test_acc_list = [], [], []

# Number of mini-batches per epoch (at least one).
iter_per_epoch = max(train_size // batch_size, 1)
print(iter_per_epoch)
for i in range(iters_num):
    # Draw a random mini-batch (indices are sampled with replacement).
    batch_mask = np.random.choice(train_size, batch_size)
    X_batch, t_batch = X_train[batch_mask], t_train[batch_mask]

    # Gradients via backpropagation (much faster than numerical differentiation)
    grad = network.gradient(X_batch, t_batch)

    # Update: in-place so the layers, which share these arrays, see the new values
    for ii in range(network.layer_size):
        network.params['W{}'.format(ii+1)] -= learning_rate * grad['W{}'.format(ii+1)]
        network.params['b{}'.format(ii+1)] -= learning_rate * grad['b{}'.format(ii+1)]

    # Record the loss on the same batch after the update.
    loss = network.loss(X_batch, t_batch)
    train_loss_list.append(loss)

    # Only evaluate accuracy once per epoch.
    if i % iter_per_epoch != 0:
        continue

    train_acc = network.accuracy(X_train, t_train)
    test_acc = network.accuracy(X_test, t_test)
    train_acc_list.append(train_acc)
    test_acc_list.append(test_acc)
    print(train_acc, test_acc)

630
0.1179047619047619 0.11828571428571429
0.9385079365079365 0.9318571428571428
0.9558253968253968 0.949
0.9658888888888889 0.9561428571428572
0.9718888888888889 0.9607142857142857
0.9757619047619047 0.961
0.9788571428571429 0.9634285714285714
0.9821587301587301 0.965
0.9844285714285714 0.965
0.9865873015873016 0.9675714285714285
0.9872380952380952 0.9665714285714285
0.9892380952380952 0.9685714285714285
0.9906349206349206 0.9687142857142857
0.9914444444444445 0.969
0.9924603174603175 0.968
0.9934603174603175 0.9694285714285714
