# ニューラルネットワークの学習

In [2]:
from typing import *
import os
from glob import glob
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nptyping import NDArray
from IPython.display import display
from rich import print as rprint

# Notebook-wide plotting setup: seaborn grid style plus color and marker lists.
sns.set_style('whitegrid')
colors = ['#de3838', '#007bc3', '#ffd12a']
markers = ['o', 'x', ',']
# IPython magic (not plain Python): emit inline figures as SVG.
%config InlineBackend.figure_formats = ['svg']

# Raise pandas' display limits so wide/long frames are not truncated as early.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 100)

cmap = sns.diverging_palette(255, 0, as_cmap=True)  # define the diverging color palette as a colormap

## 損失関数

In [2]:
def sum_squared_error(y: NDArray[float], t: NDArray[int]) -> float:
    """Return half the sum of squared differences between ``y`` and ``t``.

    Parameters
    ----------
    y : array-like
        Predicted values.
    t : array-like
        Target values; must broadcast against ``y``.

    Returns
    -------
    float
        ``0.5 * sum((y - t) ** 2)`` over every element (a scalar, not an array).
    """
    # np.asarray lets plain Python sequences be passed as well as ndarrays.
    residual = np.asarray(y) - np.asarray(t)
    return 0.5 * np.sum(residual**2)

# One-hot target whose correct class is index 2.
t = np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0])
for y in (
    # Largest score (0.6) on the correct class.
    np.array([0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]),
    # Largest score (0.6) on class 7 instead.
    np.array([0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0]),
):
    print(sum_squared_error(y, t))

0.09750000000000003
0.5975


In [3]:
def cross_entropy_error(y: NDArray[float], t: NDArray[int]) -> float:
    """Return the cross-entropy ``-sum(t * log(y))`` for a single sample.

    Parameters
    ----------
    y : array-like
        Predicted scores/probabilities.
    t : array-like
        Target weights (e.g. a one-hot vector) with the same shape as ``y``.

    Returns
    -------
    float
        The summed cross-entropy (a scalar, not an array).
    """
    delta = 1e-7  # keeps log() finite when an entry of y is exactly 0
    # np.asarray lets plain Python sequences be passed as well as ndarrays.
    log_y = np.log(np.asarray(y) + delta)
    return -np.sum(np.asarray(t) * log_y)

# One-hot target whose correct class is index 2.
t = np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0])
for y in (
    # Largest score (0.6) on the correct class.
    np.array([0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]),
    # Largest score (0.6) on class 7 instead.
    np.array([0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0]),
):
    print(cross_entropy_error(y, t))

0.510825457099338
2.3025840929945454


# ミニバッチ

In [8]:
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical


# Load the MNIST train/test splits through Keras.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Convert the labels with to_categorical (the printed shape below is (60000, 10)).
train_labels, test_labels = to_categorical(train_labels), to_categorical(test_labels)

# Flatten every image to a 784-element row and divide the pixel values by 255.
train_images = train_images.reshape(60000, 784) / 255
test_images = test_images.reshape(10000, 784) / 255

print(train_labels.shape)

(60000, 10)


In [9]:
batch_size = 10
train_size = len(train_images)
# Draw batch_size random row indices in [0, train_size).
batch_mask : NDArray[(batch_size,), int] = np.random.choice(train_size, size=batch_size)
print(batch_mask)

[41942 59761 23465 25385 45278 13608 57452 14447 46652 24498]


In [22]:
def cross_entropy_error(y: NDArray[int], t: NDArray[int]) -> float:
    """Mean cross-entropy error over a batch.

    ``y`` holds the predicted scores, one row per sample (a 1-D ``y`` is
    treated as a batch of one).  ``t`` is either one-hot encoded (same number
    of elements as ``y``) or an array of correct-class indices.

    UnitTests
    ---------
    >>> cross_entropy_error(np.array([1, 0, 0]), np.array([0, 1, 0]))
    16.118095650958317
    >>> cross_entropy_error(np.array([[1, 0, 0], [0, 0, 1]]), np.array([[0, 1, 0], [0, 0, 1]]))
    8.059047775479161
    >>> cross_entropy_error(np.array([[1, 0, 0], [0, 0, 1]]), np.array([1, 2]))
    8.059047775479161
    """
    if y.ndim == 1:
        # Promote a single sample to a batch containing one row.
        y = y.reshape(1, y.size)
        t = t.reshape(1, t.size)

    # One-hot targets are converted to the index of the correct label.
    if y.size == t.size:
        t = t.argmax(axis=1)

    n_samples = y.shape[0]
    # Pick, for every row, the score assigned to its correct class.
    picked = y[np.arange(n_samples), t]
    return -np.sum(np.log(picked + 1e-7)) / n_samples

## 勾配法

In [3]:
def numerical_gradient_1d(
        func: Callable[[NDArray[float]], float], 
        x: NDArray[float]) -> NDArray[float]:
    """Numerical gradient by central differences (1-D arrays only).

    Each entry of ``x`` is perturbed *in place* by ``+/-1e-4``, ``func`` is
    evaluated on ``x``, and the entry is restored afterwards (also when
    ``func`` raises).

    Parameters
    ----------
    func : callable
        Scalar-valued function of a 1-D array.
    x : ndarray
        Point at which the gradient is evaluated.  A non-floating array
        (e.g. integers) cannot hold the perturbed values, so it is converted
        to a float copy first; in that case the caller's array is not touched.

    Returns
    -------
    ndarray
        Array shaped like ``x`` with the approximate partial derivatives.

    UnitTests
    ---------
    >>> numerical_gradient_1d(lambda x: x[0]**2+x[1]**2, np.array([3.0, 4.0]))
    array([6., 8.])
    """
    delta_x : float = 1e-4
    if not np.issubdtype(x.dtype, np.inexact):
        # Writing x + delta into an integer array would truncate it away.
        x = x.astype(float)
    grad : NDArray[float] = np.zeros_like(x)
    for idx in range(x.size):
        original = x[idx]
        try:
            x[idx] = original + delta_x
            func_plus : float = func(x)   # f(x + dx)
            x[idx] = original - delta_x
            func_minus : float = func(x)  # f(x - dx)
        finally:
            x[idx] = original  # always undo the perturbation
        grad[idx] = (func_plus - func_minus) / (2 * delta_x)
    return grad


def numerical_gradient(
        func: Callable[[NDArray[float]], float], 
        x: NDArray[float]) -> NDArray[float]:
    """Numerical gradient by central differences for an array of any rank.

    Each entry of ``x`` is perturbed *in place* by ``+/-1e-4``, ``func`` is
    evaluated on ``x``, and the entry is restored afterwards (also when
    ``func`` raises).  Because the perturbation is in place, ``func`` may
    ignore its argument and read ``x`` through another reference instead.

    Parameters
    ----------
    func : callable
        Scalar-valued function of the array.
    x : ndarray
        Point at which the gradient is evaluated.  A non-floating array
        (e.g. integers) cannot hold the perturbed values, so it is converted
        to a float copy first; in that case the caller's array is not touched.

    Returns
    -------
    ndarray
        Array shaped like ``x`` with the approximate partial derivatives.
    """
    delta_x : float = 1e-4
    if not np.issubdtype(x.dtype, np.inexact):
        # Writing x + delta into an integer array would truncate it away.
        x = x.astype(float)
    grad : NDArray[float] = np.zeros_like(x)
    # np.ndindex yields every index tuple of x, whatever its rank.
    for idx in np.ndindex(x.shape):
        original = x[idx]
        try:
            x[idx] = original + delta_x
            func_plus : float = func(x)   # f(x + dx)
            x[idx] = original - delta_x
            func_minus : float = func(x)  # f(x - dx)
        finally:
            x[idx] = original  # always undo the perturbation
        grad[idx] = (func_plus - func_minus) / (2 * delta_x)
    return grad


def gradient_descent(
        func: Callable[[NDArray[float]], float],
        init_x: NDArray[float], 
        lr: float = 0.01, 
        num_steps : int = 1000) -> NDArray[float]:
    """Minimise ``func`` by plain gradient descent.

    Parameters
    ----------
    func : callable
        Scalar-valued function to minimise.
    init_x : ndarray
        Starting point.  It is copied, so the caller's array is left
        unchanged; non-floating input is converted to float so the update
        step can be applied.
    lr : float
        Learning rate (step size multiplier).
    num_steps : int
        Number of update steps.

    Returns
    -------
    ndarray
        The point reached after ``num_steps`` updates.

    UnitTests
    ---------
    >>> gradient_descent(lambda x: x[0]**2+x[1]**2, np.array([-3.0, 4.0]))
    array([-5.04890207e-09,  6.73186943e-09])
    """
    # np.array copies by default, so the in-place update below never
    # overwrites the caller's init_x.
    x : NDArray[float] = np.array(init_x)
    if not np.issubdtype(x.dtype, np.inexact):
        x = x.astype(float)
    for _ in range(num_steps):
        x -= lr * numerical_gradient(func=func, x=x)
    return x

# ニューラルネットワークに対する勾配

In [6]:
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated element-wise by NumPy."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

def softmax(x: NDArray) -> NDArray:
    """Softmax along the last axis.

    The maximum of each row is subtracted before exponentiating so that
    large inputs do not overflow ``exp``.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)

def cross_entropy_error(y: NDArray, t: NDArray) -> float:
    """Mean cross-entropy error over a batch.

    Parameters
    ----------
    y : ndarray
        Predicted probabilities, shape ``(batch, classes)``; a 1-D array is
        treated as a batch of one.
    t : ndarray
        Targets, either one-hot encoded (same shape as ``y``) or an array of
        correct-class indices (one integer per sample).

    Returns
    -------
    float
        ``-mean(log(y[i, label_i] + 1e-7))`` over the batch.
    """
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    if t.shape == y.shape:
        # One-hot targets: convert to the index of the correct label.
        t = t.argmax(axis=1)
    else:
        # Label indices.  Comparing ndim alone is not enough here: after the
        # reshape above a single label would look 2-D and be argmax'ed to 0.
        # Flattening also accepts an (N, 1) column of labels.
        t = t.reshape(-1)

    batch_size : int = y.shape[0]
    epsilon = 1e-7  # keeps log() finite when a probability is exactly 0
    return -np.sum(np.log(y[np.arange(batch_size), t] + epsilon)) / batch_size

# Dimension constants; SimpleNN below refers to them in its shape annotations.
SampleSize, InputSize, OutputSize = 1, 2, 3


class SimpleNN:
    """Bias-free single-layer network holding one 2x3 weight matrix ``W``."""

    def __init__(self) -> None:
        # Seeding NumPy's global RNG makes the drawn weights reproducible.
        np.random.seed(seed=42)
        # Weights are drawn from a standard normal (Gaussian) distribution.
        self.W : NDArray[(InputSize, OutputSize), float] = np.random.randn(2, 3)

    def predict(self, x: NDArray[(1, InputSize), float]) -> NDArray[(1, OutputSize), float]:
        """Return the raw scores ``x @ W`` (no activation is applied)."""
        return x @ self.W

    def loss(self, x: NDArray[(1, InputSize), float], t: NDArray[(OutputSize), float]) -> float:
        """Return the cross-entropy between ``softmax(predict(x))`` and ``t``.

        Also prints the shape of the scores and the shape/ndim of the softmax
        output as debugging aids.
        """
        scores = self.predict(x)
        print('Z shape', scores.shape)
        probs = softmax(scores)
        print('softmax(Z) shape', probs.shape)
        print('softmax(Z) shape', probs.ndim)
        return cross_entropy_error(probs, t)


snn = SimpleNN()
rprint('重みパラメータ\n', snn.W)

# A single sample passed as a 1-D vector, with its one-hot target (class 2).
x, t = np.array([0.6, 0.9]), np.array([0, 0, 1])
p : NDArray[(OutputSize), float] = snn.predict(x=x)
rprint('予測値:', p)
print('loss:', snn.loss(x, t))

Z shape (3,)
softmax(Z) shape (3,)
softmax(Z) shape 1
loss: 1.802525525796413


## 2層ニューラルネットワーク

In [4]:
# Placeholder dimensions: they appear only inside the NDArray shape
# annotations below; the real array sizes come from the constructor arguments.
SampleSize = 0
InputSize = 0
HiddenSize = 0
OutputSize = 0


class TowLayerNN:
    """Two-layer fully connected network: sigmoid hidden layer, softmax output.

    Parameters live in ``self.params`` under the keys ``'W1'``, ``'b1'``,
    ``'W2'`` and ``'b2'``.

    NOTE: the class name is a typo of "TwoLayerNN".  It is kept so existing
    callers keep working; a correctly spelled alias is defined after the class.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int, weight_init_std: float = 0.01) -> None:
        """Initialise weights with scaled normal random values and biases with zeros.

        Parameters
        ----------
        input_size, hidden_size, output_size : int
            Widths of the input, hidden and output layers.
        weight_init_std : float
            Factor applied to the standard-normal samples used for the weights.
        """
        self.params : Dict[str, NDArray] = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def predict(self, x: NDArray[(SampleSize, InputSize)]) -> NDArray[(SampleSize, OutputSize)]:
        """Return class probabilities for every row of ``x``."""
        W1 : NDArray[(InputSize, HiddenSize)] = self.params['W1']
        W2 : NDArray[(HiddenSize, OutputSize)] = self.params['W2']
        b1 : NDArray[(HiddenSize)] = self.params['b1']
        b2 : NDArray[(OutputSize)] = self.params['b2']

        Z1 : NDArray[(SampleSize, HiddenSize)] = sigmoid(x @ W1 + b1)
        # The output layer uses softmax (not sigmoid): cross_entropy_error only
        # looks at the probability of the correct class, so the outputs must
        # form a distribution over classes for wrong classes to be penalised.
        Y : NDArray[(SampleSize, OutputSize)] = softmax(Z1 @ W2 + b2)
        return Y

    def loss(self, x: NDArray[(SampleSize, InputSize)], t) -> float:
        """Return the mean cross-entropy between ``predict(x)`` and targets ``t``."""
        y : NDArray[(SampleSize, OutputSize)] = self.predict(x)
        return cross_entropy_error(y, t)

    def accuracy(self, x: NDArray[(SampleSize, InputSize)], t) -> float:
        """Return the fraction of rows of ``x`` whose arg-max prediction matches ``t``.

        ``t`` may be one-hot encoded (2-D) or an array of class indices (1-D).
        """
        predicted : NDArray[(SampleSize)] = np.argmax(self.predict(x), axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        return np.sum(predicted == t) / float(x.shape[0])

    def numerical_gradient(self, x: NDArray[(SampleSize, InputSize)], t) -> Dict[str, NDArray]:
        """Return the numerical gradient of the loss w.r.t. every parameter.

        The result is a dict with the same keys as ``self.params``.
        """
        def loss_W(_W):
            # The argument is ignored on purpose: the module-level
            # numerical_gradient perturbs the parameter array in place, so
            # self.loss already sees the perturbed value through self.params.
            return self.loss(x, t)

        # Inside a method the bare name resolves to the module-level function,
        # not to this method.
        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ('W1', 'b1', 'W2', 'b2')
        }


# Correctly spelled alias for the class above.
TwoLayerNN = TowLayerNN

In [21]:
net = TowLayerNN(input_size=784, hidden_size=100, output_size=10)
# Show the shape of every parameter array, layer by layer.
for name in ('W1', 'b1', 'W2', 'b2'):
    print(net.params[name].shape)

(784, 100)
(100,)
(100, 10)
(10,)


In [24]:
# Random stand-in batch: 10 inputs of width 784 and 10 target rows of width 10.
x = np.random.rand(10, 784)
t = np.random.rand(10, 10)
grads = net.numerical_gradient(x, t)

# Each gradient has the same shape as the parameter it belongs to.
for name in ('W1', 'b1', 'W2', 'b2'):
    print(grads[name].shape)

(784, 100)
(100,)
(100, 10)
(10,)


## ミニバッチ学習の実装

In [7]:
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

# Load MNIST, convert the labels with to_categorical, flatten the images
# to 784-element rows and divide the pixel values by 255.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_labels, test_labels = to_categorical(train_labels), to_categorical(test_labels)

train_images = train_images.reshape(60000, 784) / 255
test_images = test_images.reshape(10000, 784) / 255

# Histories collected during training.
train_loss_list : List[float] = []
train_acc_list : List[float] = []
test_acc_list : List[float] = []

# Hyper-parameters.
iters_num = 2
train_size = train_images.shape[0]
batch_size = 100
learning_rate = 0.1
# Number of iterations that make up one pass over the training set (at least 1).
iter_per_epoch : int = max(train_size // batch_size, 1)

network = TowLayerNN(input_size=784, hidden_size=50, output_size=10)
for i in tqdm(range(iters_num)):
    # Sample a mini-batch of row indices (np.random.choice draws with replacement).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = train_images[batch_mask]
    t_batch = train_labels[batch_mask]

    # One gradient-descent step on every parameter.
    grad : Dict[str, NDArray] = network.numerical_gradient(x_batch, t_batch)
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Record the loss on the batch after the update.
    train_loss_list.append(network.loss(x_batch, t_batch))

    if i % iter_per_epoch != 0:
        continue
    # Once per epoch: record accuracy on the full train and test sets.
    train_acc_list.append(network.accuracy(train_images, train_labels))
    test_acc_list.append(network.accuracy(test_images, test_labels))


100%|██████████| 2/2 [02:54<00:00, 87.43s/it]
