### 情報工学工房 第3回レポート課題  


#### 課題：課題１～３を以下のとおり実施した。
##### 感想：動画等を活用して誤差逆伝播法の概念を理解するのにはそれほど苦労しなかったが、実装の理解には相当苦労した。今のレベルだとサンプルコードのようなスマートで無駄のない配列の受け渡し処理を独力で組むのはほとんど不可能なように思う。調べたり試行錯誤したりするうちに理解が深まり、力がついているのは実感している。
##### 参考文献：
- Snow Tree in June, NumPy♪nditerを使うと様々な次元数に対応できる, https://snowtree-injune.com/2020/06/29/nditer-z009/, 2022-06-08
- 予備校のノリで学ぶ「大学の数学・物理」, 絶対に理解させる誤差逆伝播法【深層学習】, https://www.youtube.com/watch?v=0itH0iDO8BE, 2022-06-04

#### 課題１　Softmax with lossレイヤーを実装

In [1]:
import numpy as np
def cross_entropy_error(y, t):
    """Return the cross-entropy loss of ``y`` against ``t``, averaged per sample.

    Args:
        y: Predicted probabilities as a NumPy array, either a single sample
            (1-D) or a batch (2-D, one row per sample).
        t: Targets as a NumPy array, either one-hot encoded (same number of
            elements as ``y``) or integer class indices.

    Returns:
        The negative log-probability assigned to the correct class, summed
        over the batch and divided by the batch size.
    """
    # Promote a single sample to a batch of one so the row indexing below works.
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    # One-hot targets are converted to the index of the correct label.
    if t.size == y.size:
        t = t.argmax(axis=1)
    batch_size = y.shape[0]
    # A tiny offset keeps log() finite when the predicted probability of the
    # correct class is exactly 0 (otherwise the loss becomes inf).
    delta = 1e-7
    picked = y[np.arange(batch_size), t]
    return -np.sum(np.log(picked + delta)) / batch_size

In [2]:
class SoftmaxWithLoss:
    """Softmax output layer combined with a cross-entropy loss.

    ``forward`` caches the softmax output and the targets so that
    ``backward`` can compute the gradient of the loss with respect to the
    input scores.
    """

    def __init__(self):
        self.loss = None
        self.y = None  # softmax output
        self.t = None  # target (teacher) data

    def forward(self, x, t):
        """Compute, cache and return the loss for scores ``x`` and targets ``t``.

        Args:
            x: Input scores passed to ``softmax``.
            t: Targets, one-hot encoded or integer class indices.

        Returns:
            The loss returned by ``cross_entropy_error``.
        """
        self.t = t
        # NOTE(review): ``softmax`` is not defined in this block; it must be
        # provided by the surrounding module.
        self.y = softmax(x)
        # Forward formula: -sum(t * log(y))
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the input scores.

        Args:
            dout: Upstream gradient that scales the result (default 1).

        Returns:
            An array shaped like the cached softmax output, holding
            ``(y - t) / batch_size`` scaled by ``dout``.
        """
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:
            # One-hot targets: the gradient is y_i - t_i (i is the class index).
            dx = (self.y - self.t) / batch_size
        else:
            # Integer label targets: subtracting 1 at the correct class of each
            # row is equivalent to subtracting the one-hot encoding.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        return dx * dout

#### 課題２　Two layer netにおける勾配の確認 
Two layer netは教科書の写経を流用した。
結果：重み、バイアスいずれも数値微分との差は小さくなった。ただし1層目と2層目で差があり、複数回実施していずれも1層目の方が大きな差となったが、原因特定に至らなかった。

In [56]:
import sys, os
import numpy as np
from dataset.mnist import load_mnist
sys.path.append(os.pardir)  # 親ディレクトリのファイルをインポートするための設定
from common.layers import *
from common.gradient import numerical_gradient
from collections import OrderedDict

class TwoLayerNet:
    """Two-layer network: Affine1 -> Relu1 -> Affine2, followed by SoftmaxWithLoss.

    Parameters are stored in ``self.params`` under the keys ``'W1'``,
    ``'b1'``, ``'W2'`` and ``'b2'``; the same array objects are handed to the
    Affine layers.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameters and the layer stack.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output classes.
            weight_init_std: Scale applied to the standard-normal weight samples.
        """
        # Weights are scaled random-normal samples, biases start at zero.
        w1 = weight_init_std * np.random.randn(input_size, hidden_size)
        b1 = np.zeros(hidden_size)
        w2 = weight_init_std * np.random.randn(hidden_size, output_size)
        b2 = np.zeros(output_size)
        self.params = {'W1': w1, 'b1': b1, 'W2': w2, 'b2': b2}

        # Layers are kept in forward order; backward walks them in reverse.
        self.layers = OrderedDict([
            ('Affine1', Affine(w1, b1)),
            ('Relu1', Relu()),
            ('Affine2', Affine(w2, b2)),
        ])
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer in order and return the final output."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        """Return the loss of the network on input ``x`` with targets ``t``."""
        return self.lastLayer.forward(self.predict(x), t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose arg-max prediction matches ``t``.

        ``t`` may be 1-D class indices; any other rank is reduced with
        ``argmax`` along axis 1 first.
        """
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        return np.sum(predicted == labels) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return gradients of the loss for all parameters via numerical differentiation.

        Returns:
            A dict with the keys ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.
        """
        # The callback ignores its argument and always evaluates the current loss.
        # NOTE(review): presumably the imported numerical_gradient perturbs the
        # parameter array in place -- confirm against common.gradient.
        def loss_W(W):
            return self.loss(x, t)

        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ('W1', 'b1', 'W2', 'b2')
        }

    def gradient(self, x, t):
        """Return gradients of the loss for all parameters via backpropagation.

        Returns:
            A dict with the keys ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.
        """
        # Forward pass so every layer holds the state needed by backward().
        self.loss(x, t)

        # Backward pass: start at the loss layer, then walk the stack in reverse.
        upstream = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            upstream = layer.backward(upstream)

        # Collect the gradients the Affine layers stored during backward().
        affine1 = self.layers['Affine1']
        affine2 = self.layers['Affine2']
        return {
            'W1': affine1.dW,
            'b1': affine1.db,
            'W2': affine2.dW,
            'b2': affine2.db,
        }
# Load the data (called with normalize=True and one_hot_label=True)
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Use a small slice of the training set (samples 5..29) for the gradient check.
x_batch, t_batch = x_train[5:30], t_train[5:30]

# Gradients from numerical differentiation
grad_numerical = network.numerical_gradient(x_batch, t_batch)
# Gradients from backpropagation
grad_backprop = network.gradient(x_batch, t_batch)

# Print the mean absolute difference between the two results per parameter.
for key, numeric in grad_numerical.items():
    diff = np.average(np.abs(grad_backprop[key] - numeric))
    print(key + ":" + str(diff))

W1:1.1326125727475538e-07
b1:9.505142234304499e-07
W2:9.788660434246944e-13
b2:1.201261340399995e-10


   #### Softmax with loss 使用

In [28]:
def gradient(network, x, t):
    """Compute parameter gradients by backpropagation using ``SoftmaxWithLoss``.

    A fresh ``SoftmaxWithLoss`` instance is used as the output layer instead
    of the one stored on ``network``.

    Args:
        network: Object providing ``predict(x)`` and an ordered ``layers``
            mapping that contains ``'Affine1'`` and ``'Affine2'`` entries
            exposing ``dW`` and ``db`` after their ``backward`` call.
        x: Input data.
        t: Target (teacher) data.

    Returns:
        A dict with the keys ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.
    """
    # Use the self-implemented Softmax-with-loss class as the output layer.
    lastLayer = SoftmaxWithLoss()

    # forward: the loss layer itself must see the scores and targets,
    # otherwise its cached y/t are still None when backward() runs.
    lastLayer.forward(network.predict(x), t)

    # backward: start at the loss layer, then walk the layers in reverse.
    dout = lastLayer.backward(1)
    for layer in reversed(list(network.layers.values())):
        dout = layer.backward(dout)

    # Collect the gradients stored on the Affine layers of ``network``
    # (there is no ``self`` in a module-level function).
    grads = {}
    grads['W1'], grads['b1'] = network.layers['Affine1'].dW, network.layers['Affine1'].db
    grads['W2'], grads['b2'] = network.layers['Affine2'].dW, network.layers['Affine2'].db
    return grads

#### 課題３　Two layer net の学習結果
学習率（lr）を0.1～3の範囲で複数試した結果、lr=0.3のときの学習速度が最も速かった。 

In [50]:
# import numpy as np
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet
# Load the data (called with normalize=True and one_hot_label=True)
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.3
train_loss_list = []
train_acc_list = []
test_acc_list = []
# Number of iterations per epoch. Floor division keeps this an integer so the
# `i % iter_per_epoch == 0` check below fires once per epoch even when
# train_size is not an exact multiple of batch_size.
iter_per_epoch = max(train_size // batch_size, 1)
for i in range(iters_num):
    # Draw a random mini-batch (indices sampled by np.random.choice).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    # Gradients via backpropagation
    grad = network.gradient(x_batch, t_batch)
    # Gradient-descent update, applied in place to the network's parameters
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
    # Record the loss on the same mini-batch after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    # Once per epoch, evaluate accuracy on the full training and test sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

0.10683333333333334 0.1135
0.9305833333333333 0.9289
0.9488333333333333 0.9455
0.96445 0.9632
0.97145 0.965
0.9750833333333333 0.9672
0.9765333333333334 0.9682
0.9798666666666667 0.9709
0.9835 0.9726
0.9860666666666666 0.9734
0.9842166666666666 0.9711
0.9856 0.9734
0.98775 0.9761
0.9875833333333334 0.9724
0.9895333333333334 0.9741
0.98875 0.9733
0.9921333333333333 0.9755
