In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict

In [2]:
"""
from sklearn.datasets import fetch_openml
mnist = fetch_openml("mnist_784")
"""
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd drive/MyDrive/Colab\ Notebooks

/content/drive/MyDrive/Colab Notebooks


In [4]:
from dataset.mnist import load_mnist

In [5]:
class Affine:
  def __init__(self, W, b):
    self.W = W 
    self.b = b
    self.X = None
    self.dW = None
    self.db = None

  def forward(self, X):
    self.X = X
    out = np.dot(X, self.W) + self.b
    return out

  def backward(self, dout):
    dx = np.dot(dout, self.W.T)
    self.dW = np.dot(self.X.T, dout)
    self.db = np.sum(dout, axis=0)

    return dx

In [6]:
class ReLU:

  def __init__(self):
    self.mask = None

  def forward(self, y):
    self.mask = (y <= 0)
    out = y.copy()
    out[self.mask] = 0
    return out
  
  def backward(self, dout):
    dout[self.mask] = 0
    dy = dout
    return dy

In [7]:
class SoftmaxWithLoss:
  def __init__(self):
    self.loss = None
    self.y = None
    self.t = None
  
  def forward(self, a, t):
    self.t = t
    self.y = SoftmaxWithLoss.softmax(a)
    self.loss = SoftmaxWithLoss.cross_entropy_error(self.y, self.t)
    return self.loss

  def backward(self, dout=1):
    batch_size = self.t.shape[0]
    dx = (self.y - self.t) / batch_size
    return dx

  def softmax(a):
    C = np.max(a, axis=1)
    exp_a = np.exp(a - C.reshape(-1, 1))
    if a.ndim == 1:
      sum_exp_a = np.sum(exp_a)
    else:
      sum_exp_a = np.sum(exp_a, axis=1)
    return exp_a / sum_exp_a.reshape(-1, 1)

  # def cross_entropy_error(y, t):
  #   if y.ndim == 1:
  #     t = t.reshape(1, t.size)
  #     y = y.reshape(1, y.size)
  #   batch_size = y.shape[0]
  #   return -np.sum(np.log(y[np.arange(batch_size),t] + 1e-7)) / batch_size

  def cross_entropy_error(y, t): # t: one-hot-encoded
    if y.ndim == 1:
      t = t.reshape(1, t.size)
      y = y.reshape(1, y.size)
    return -np.sum(t * np.log(y + 1e-7)) / y.shape[0]

In [8]:
class TwoLayerNet:
  def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
    self.params = {}
    self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
    self.params['b1'] = np.zeros(hidden_size)
    self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
    self.params['b2'] = np.zeros(output_size)

    self.layers = OrderedDict()
    self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
    self.layers['Relu1'] = ReLU()
    self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
    self.layers['Relu2'] = ReLU()
    self.lastLayer = SoftmaxWithLoss()

  def predict(self, X):
    for layer in self.layers.values():
      X = layer.forward(X)
    return X

  def loss(self, X, t):
    y = self.predict(X)
    return self.lastLayer.forward(y, t)

  def accuracy(self, X ,t):
    y = self.predict(X)
    y = np.argmax(y, axis=1)
    if t.ndim != 1 :
      t = np.argmax(t, axis=1) 
    accuracy = np.sum(y == t) / float(X.shape[0])
    return accuracy

  def gradient(self, X, t):
    self.loss(X, t)
    dout = 1
    dout = self.lastLayer.backward(dout)

    layers = list(self.layers.values())
    layers.reverse()
    for layer in layers:
      dout = layer.backward(dout)
    
    grads = {}
    grads['W1'] = self.layers['Affine1'].dW
    grads['b1'] = self.layers['Affine1'].db
    grads['W2'] = self.layers['Affine2'].dW
    grads['b2'] = self.layers['Affine2'].db

    return grads

In [9]:
(X_train, t_train), (X_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

In [None]:
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000
train_size = X_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

iter_per_epoch = max(train_size / batch_size, 1)

for i in range(iters_num):
  if i % 100 == 0:
    print("index: "+str(i)) 
  batch_mask = np.random.choice(train_size, batch_size)
  X_batch = X_train[batch_mask]
  t_batch = t_train[batch_mask]

  grad = network.gradient(X_batch, t_batch)

  for key in ('W1', 'b1', 'W2', 'b2'):
    network.params[key] -= learning_rate * grad[key]
  
  loss = network.loss(X_batch, t_batch)
  train_loss_list.append(loss)

  if i % iter_per_epoch == 0:
    train_acc = network.accuracy(X_train, t_train)
    test_acc = network.accuracy(X_test, t_test)
    train_acc_list.append(train_acc)
    test_acc_list.append(test_acc)
    print(train_acc, test_acc)


index: 0
0.1033 0.1006
index: 100
index: 200
index: 300
index: 400
index: 500
index: 600
0.7308833333333333 0.7387
index: 700
index: 800
index: 900
index: 1000
index: 1100
index: 1200
0.7430666666666667 0.7466
index: 1300
index: 1400
index: 1500
index: 1600
index: 1700
index: 1800
0.7490333333333333 0.7526
index: 1900
index: 2000
index: 2100
index: 2200
index: 2300
index: 2400
0.7557 0.7577
index: 2500
index: 2600
index: 2700
index: 2800
index: 2900
index: 3000
0.93645 0.9352
index: 3100
index: 3200
index: 3300
index: 3400
index: 3500
index: 3600
0.9452666666666667 0.9432
index: 3700
index: 3800
index: 3900
index: 4000
index: 4100
index: 4200
0.9531 0.9501
index: 4300
index: 4400
index: 4500
index: 4600
index: 4700
index: 4800
0.9592 0.9566
index: 4900
index: 5000
index: 5100
index: 5200
index: 5300
index: 5400
0.9620333333333333 0.9596
index: 5500
index: 5600
index: 5700
index: 5800
index: 5900
index: 6000
0.9654666666666667 0.9609
index: 6100
index: 6200
index: 6300
index: 6400
index