# 神经网络的学习

“学习”就是从训练数据中自动获取最优权重参数的过程。为此需要引入损失函数这一指标，学习的目的就是找到使损失函数的值达到最小的权重参数。

## 从数据中学习

泛化能力：是指处理未被观察过（训练过）数据的能力，泛化能力是机器学习的最终目标。

过度拟合（过拟合）：过度拟合是机器学习中的一个常见问题，它是指模型在训练数据上表现非常好，但是在新的、未见过的数据上表现不佳的现象。

## 损失函数

神经网络学习中所使用的指标称为损失函数（loss function）。

损失函数一般使用均方误差（mean squared error）和交叉熵误差（cross entropy error）。

将正确的标签标识为1，其他标签标识为0的表示方法称为one-hot表示

In [29]:
import numpy as np

def mean_squared_error(y, t):
    """Return half the sum of squared differences between `y` and `t`.

    Args:
        y: Network output (NumPy array).
        t: Target values, same shape as `y`.

    Returns:
        The scalar ``sum((y - t) ** 2) / 2``.
    """
    residual = y - t
    squared_total = np.sum(residual ** 2)
    return squared_total / 2

# One-hot target: index 2 is the correct class.
t = np.array([0, 0, 1, 0, 0, 0])
# Example network output; the largest value (0.6) is at index 2.
y = np.array([0.1, 0.05, 0.6, 0.0, 0.05, 0.1])

# The recorded notebook output for this call is approximately 0.0925.
mean_squared_error(y, t)

0.09250000000000003

交叉熵误差（cross entropy error）

In [30]:
def cross_entropy_error(y, t):
    delta = 1e-7
    return -np.sum(t * np.log(y + delta))

# One-hot target: index 2 is the correct class.
t = [0, 0, 1, 0]
# Case 1: the output for the correct class is 0.6.
y = [0.1, 0.05, 0.6, 0.1]

cross_entropy_error(np.array(y), np.array(t))

# Case 2: the output for the correct class drops to 0.3, so the error is
# larger; the recorded notebook output belongs to this second call.
y = [0.1, 0.05, 0.3, 0.2]
cross_entropy_error(np.array(y), np.array(t))

1.2039724709926583

神经网络的学习也是从训练数据中选出一批数据（mini-batch，小批量），然后对每一个mini-batch进行学习。

In [36]:
import sys, os
sys.path.append(os.pardir)

from dataset.mnist import load_mnist

# Load the MNIST train/test split through the project's loader.
# NOTE(review): presumably normalize=True rescales pixel values and
# one_hot_label=True returns one-hot label rows -- confirm in dataset/mnist.py.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# The recorded run printed (60000, 784) and (60000, 10).
print(x_train.shape)
print(t_train.shape)

Converting train-images-idx3-ubyte.gz to NumPy Array ...
Done
Converting train-labels-idx1-ubyte.gz to NumPy Array ...
Done
Converting t10k-images-idx3-ubyte.gz to NumPy Array ...
Done
Converting t10k-labels-idx1-ubyte.gz to NumPy Array ...
Done
Creating pickle file ...
Done!
(60000, 784)
(60000, 10)


In [None]:
# Pick a random mini-batch of `batch_size` samples from the training data.
train_size = x_train.shape[0]
batch_size = 10

# np.random.choice samples with replacement by default, so an index can
# appear more than once in the mask.
batch_mask = np.random.choice(train_size, batch_size)

# Index inputs and labels with the same mask so that they stay paired.
x_batch = x_train[batch_mask]
t_batch = t_train[batch_mask]

print(x_batch)

In [None]:
# The sampled indices may contain duplicates (sampling is with replacement
# by default).
np.random.choice(10, 4)

In [None]:
def cross_entropy_error(y, t):
    """Return the mean cross entropy error over a mini-batch.

    Args:
        y: Network output as a NumPy array, either a single sample of shape
            ``(classes,)`` or a batch of shape ``(batch, classes)``.
        t: Targets as a NumPy array. Either the same number of elements as
            `y` (one-hot / per-class weights), or one integer class index
            per sample.

    Returns:
        The cross entropy error averaged over the batch.
    """
    if y.ndim == 1:
        # Promote a single sample to a batch of one. `y` is reshaped by its
        # own size so that a scalar class-index target also works.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    batch_size = y.shape[0]
    # Small offset so that log(0) = -inf can never occur.
    log_outputs = np.log(y + 1e-7)

    if t.size == y.size:
        # Per-class targets: weight every log-output by its target value.
        return -np.sum(t * log_outputs) / batch_size

    # Class-index targets: pick the log-output of the correct class per row.
    labels = t.reshape(batch_size)
    return -np.sum(log_outputs[np.arange(batch_size), labels]) / batch_size

## 数值微分

In [None]:
def numerical_diff(f, x):
    """Approximate the derivative of `f` at `x` with a central difference.

    Args:
        f: Function of one variable.
        x: Point at which to differentiate.

    Returns:
        ``(f(x + h) - f(x - h)) / (2 * h)`` with ``h = 1e-4``.
    """
    step = 1e-4
    ahead = f(x + step)
    behind = f(x - step)
    return (ahead - behind) / (2 * step)

In [None]:
import matplotlib.pylab as plt

def function_1(x):
    """Evaluate the sample quadratic ``0.01 * x**2 + 0.1 * x``."""
    quadratic_term = 0.01 * x ** 2
    linear_term = 0.1 * x
    return quadratic_term + linear_term

# Sample function_1 on [0, 20) in steps of 0.1 and plot the curve.
x = np.arange(0.0, 20.0, 0.1)
y = function_1(x)

plt.xlabel("x")
plt.ylabel("f(x)")

plt.plot(x, y)
plt.show()

In [None]:
# The analytic derivative of function_1 is 0.02 * x + 0.1, i.e. 0.2 at x = 5.
numerical_diff(function_1, 5)

In [None]:
# The analytic derivative of function_1 is 0.02 * x + 0.1, i.e. 0.3 at x = 10.
numerical_diff(function_1, 10)

## 梯度

In [None]:
def numerical_gradient(f, x):
    """Approximate the gradient of `f` at `x` with central differences.

    Every element of `x` is perturbed in place by ``+h`` and ``-h`` in turn
    and restored afterwards, so `f` may either use the array it is passed or
    read the same array through a closure.

    Args:
        f: Function that returns a scalar for the array `x`.
        x: Floating-point NumPy array of any shape. It is modified
            temporarily but holds its original values on return.

    Returns:
        Array with the same shape and dtype as `x` holding the partial
        derivative of `f` with respect to each element.
    """
    h = 1e-4
    grad = np.zeros_like(x)

    # np.ndindex visits every index tuple, so weight matrices (2-D and
    # higher) are handled as well as plain vectors.
    for idx in np.ndindex(x.shape):
        original = x[idx]
        try:
            x[idx] = original + h
            fx_plus = f(x)

            x[idx] = original - h
            fx_minus = f(x)
        finally:
            # Always put the element back, even if `f` raises.
            x[idx] = original

        grad[idx] = (fx_plus - fx_minus) / (2 * h)

    return grad

def function_2(x):
    """Return the sum of the squared elements of `x`."""
    squared = x ** 2
    return np.sum(squared)

# Gradient of function_2 (sum of squares) at (3, 4); analytically 2 * x = (6, 8).
numerical_gradient(function_2, np.array([3.0, 4.0]))

In [None]:
def gradient_descent(f, init_x, lr = 0.01, step_num = 100):
    """Minimize `f` by plain gradient descent from `init_x`.

    Args:
        f: Function that returns a scalar for an array argument.
        init_x: Starting point (array-like). It is copied, so the caller's
            data is left untouched.
        lr: Learning rate, i.e. the step size applied to each gradient.
        step_num: Number of update steps to perform.

    Returns:
        A new float array holding the position after `step_num` updates.
    """
    # Work on a float copy: the in-place update below would otherwise
    # overwrite the caller's array and fail for list or integer input.
    x = np.array(init_x, dtype=float)

    for _ in range(step_num):
        grad = numerical_gradient(f, x)
        x -= lr * grad

    return x

# Minimize function_2 starting from (100, 200) with lr=0.01 for 1000 steps;
# the minimum of the sum of squares is at the origin.
gradient_descent(function_2, np.array([100., 200.]), 0.01, 1000)

## 学习算法的实现

In [38]:
import sys, os
sys.path.append(os.pardir)

from common.functions import *
from common.gradient import numerical_gradient

class TwoLayerNet:
    """Fully connected network with a single hidden layer.

    All parameters live in the ``params`` dict under the keys ``'W1'``,
    ``'b1'``, ``'W2'`` and ``'b2'``.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std = 0.01):
        """Create the parameters.

        Weights are drawn from ``np.random.randn`` and scaled by
        `weight_init_std`; biases start at zero.
        """
        self.params = {
            'W1': weight_init_std * np.random.randn(input_size, hidden_size),
            'b1': np.zeros(hidden_size),
            'W2': weight_init_std * np.random.randn(hidden_size, output_size),
            'b2': np.zeros(output_size),
        }

    def predict(self, x):
        """Run a forward pass on `x` and return the output of `softmax`."""
        params = self.params

        # Hidden layer: affine transform followed by `sigmoid`.
        hidden_in = np.dot(x, params['W1']) + params['b1']
        hidden_out = sigmoid(hidden_in)

        # Output layer: affine transform followed by `softmax`.
        scores = np.dot(hidden_out, params['W2']) + params['b2']
        return softmax(scores)

    def loss(self, x, t):
        """Return `cross_entropy_error` of the prediction for `x` against `t`."""
        prediction = self.predict(x)
        return cross_entropy_error(prediction, t)

    def accuracy(self, x, t):
        """Return the fraction of rows whose argmax prediction matches `t`.

        `t` is reduced with ``argmax(axis=1)`` too, so it must be 2-D
        (one-hot in this notebook).
        """
        predicted = np.argmax(self.predict(x), axis=1)
        expected = np.argmax(t, axis=1)
        hits = np.sum(predicted == expected)
        return hits / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return a dict of numerical gradients of the loss per parameter."""

        def loss_W(W):
            # The argument is ignored on purpose: the loss reads self.params,
            # so this relies on the module-level numerical_gradient perturbing
            # the parameter array in place.
            return self.loss(x, t)

        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ('W1', 'b1', 'W2', 'b2')
        }
        
       

In [41]:
import numpy as np
from dataset.mnist import load_mnist

# Train TwoLayerNet on MNIST with mini-batch gradient descent, recording the
# loss after every update and the train/test accuracy once per epoch.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

train_loss_list = []
train_acc_list = []
test_acc_list = []

# Hyperparameters
iters_num = 10000
batch_size = 100
learning_rate = 0.1
# Number of iterations per epoch. Integer division keeps the modulo test in
# the loop exact even when train_size is not a multiple of batch_size.
train_size = x_train.shape[0]
iter_per_epoch = max(train_size // batch_size, 1)


network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

for i in range(iters_num):
    # Draw a mini-batch (indices are sampled with replacement).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Compute the gradient.
    # NOTE: numerical differentiation is very slow here -- the recorded run
    # was interrupted after a single accuracy line. A faster `gradient`
    # method would be preferable, but TwoLayerNet as defined in this file
    # does not provide one.
    grad = network.numerical_gradient(x_batch, t_batch)

    # Update the parameters.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    # Evaluate the recognition accuracy once per epoch.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))

train acc, test acc | 0.10218333333333333, 0.101


KeyboardInterrupt: 