In [None]:
import re
import sys
import os
import numpy as np
import matplotlib.pyplot as plt

## Parameter Updates

In [None]:
class SGD:
    """Plain stochastic gradient descent: ``param <- param - lr * grad``."""

    def __init__(self, lr=0.01):
        # Step size applied to every gradient.
        self.lr = lr

    def update(self, params, grads):
        """Apply one descent step to every entry of ``params``.

        Args:
            params: Mapping from parameter name to its current value. Each
                entry is updated with ``-=``.
            grads: Mapping holding a gradient for every key of ``params``.
        """
        for name in params:
            step = self.lr * grads[name]
            params[name] -= step

In [None]:
class Momentum:
    """SGD with a momentum term.

    Each step computes ``v <- momentum * v - lr * grad`` and then adds ``v``
    to the parameter.
    """

    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        # Per-parameter velocity; built lazily on the first update() call.
        self.v = None

    def update(self, params, grads):
        """Advance every parameter by one momentum step.

        Args:
            params: Mapping from parameter name to its current value. Each
                entry is updated with ``+=``.
            grads: Mapping holding a gradient for every key of ``params``.
        """
        if self.v is None:
            # Start every velocity at zero, shaped like its parameter.
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            velocity = self.momentum * self.v[name] - self.lr * grads[name]
            self.v[name] = velocity
            params[name] += velocity

In [None]:
class AdaGrad:
    """AdaGrad: scales each step by the root of the accumulated squared gradients."""

    def __init__(self, lr=0.01):
        self.lr = lr
        # Per-parameter running sum of squared gradients; built lazily on
        # the first update() call.
        self.h = None

    def update(self, params, grads):
        """Accumulate squared gradients and apply the scaled step.

        Args:
            params: Mapping from parameter name to its current value. Each
                entry is updated with ``-=``.
            grads: Mapping holding a gradient for every key of ``params``.
        """
        if self.h is None:
            self.h = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            grad = grads[name]
            self.h[name] += grad * grad
            # The 1e-7 keeps the division finite while the accumulator is zero.
            scale = np.sqrt(self.h[name]) + 1e-7
            params[name] -= self.lr * grad / scale

In [None]:
#!cat deep_learning_from_scratch/ch06/optimizer_compare_naive.py

In [None]:
#!cd deep_learning_from_scratch/ch06 && python3 optimizer_compare_naive.py

In [None]:
#!cd deep_learning_from_scratch/ch06 && python3 weight_init_activation_histogram.py

## Weight Initialization

In [None]:
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated element-wise.

    Evaluated as ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp`` so that
    very negative inputs do not overflow ``exp`` (the direct formula emits a
    RuntimeWarning there) and the lower tail keeps its precision.

    Args:
        x: Scalar or NumPy array.

    Returns:
        Values in ``[0, 1]`` with the same shape as ``x``.
    """
    return np.exp(-np.logaddexp(0, -x))

In [None]:
def sigmoid_alpha(x):
    """Sigmoid rescaled to ``(-1, 1)``: ``2 / (1 + exp(-x)) - 1``.

    That expression is algebraically ``tanh(x / 2)``; using ``np.tanh``
    avoids the ``exp`` overflow warning for very negative inputs and the
    cancellation in ``2 * s - 1`` near zero.

    Args:
        x: Scalar or NumPy array.

    Returns:
        Values in ``[-1, 1]`` with the same shape as ``x``.
    """
    return np.tanh(0.5 * x)

In [None]:
def tanh(x):
    """Hyperbolic tangent, evaluated element-wise.

    Delegates to ``np.tanh``. The explicit quotient
    ``(e^x - e^-x) / (e^x + e^-x)`` overflows to ``inf / inf = nan`` once
    ``|x|`` exceeds roughly 710, whereas ``np.tanh`` saturates at +/-1.

    Args:
        x: Scalar or NumPy array.

    Returns:
        Values in ``[-1, 1]`` with the same shape as ``x``.
    """
    return np.tanh(x)

In [None]:
def relu(x):
    """Rectified linear unit: element-wise ``max(x, 0)``.

    Args:
        x: Scalar or NumPy array.

    Returns:
        ``x`` with every negative entry replaced by zero.
    """
    floor = 0
    return np.maximum(x, floor)

In [None]:
# Push random data through a stack of identical fully connected layers and
# record each layer's activations, to see how the weight scale shapes them.
node_num = 100  # units per hidden layer; also the width of the input
hidden_layer_size = 5  # number of hidden layers
# Input width is tied to node_num so that changing node_num cannot break the
# matrix product below.
x = np.random.randn(1000, node_num)  # 1000 random input samples
activations = {}  # layer index -> activation output of that layer

# Feed each layer from a separate variable so the input batch `x` is kept.
layer_input = x
for i in range(hidden_layer_size):
    # Alternative weight scales to try (enable exactly one assignment of w):
    # w = np.random.randn(node_num, node_num) * 1  # std 1
    # w = np.random.randn(node_num, node_num) * 0.01  # std 0.01
    # w = np.random.randn(node_num, node_num) / np.sqrt(node_num)  # std 1/sqrt(n), "Xavier"
    w = np.random.randn(node_num, node_num) * (np.sqrt(2 / node_num))  # std sqrt(2/n), "He"

    z = np.dot(layer_input, w)
    a = relu(z)
    activations[i] = a
    layer_input = a

In [None]:
# One histogram panel per recorded layer, laid out side by side.
fig = plt.figure(figsize=(12, 4))
panel_count = len(activations)
for i, a in activations.items():
    ax = fig.add_subplot(1, panel_count, i + 1)
    ax.set_title(f"{i + 1}-layer")
    # 30 bins over [0, 1]; values outside that interval are not counted.
    ax.hist(a.flatten(), bins=30, range=(0, 1))
    # ax.set_ylim(0, 40000)
    if i != 0:
        # Only the left-most panel keeps its y tick labels.
        ax.tick_params(labelleft=False)
plt.show()

In [None]:
#!cd deep_learning_from_scratch/ch06 && python3 weight_init_compare.py

## Batch Normalization

In [None]:
#!cd deep_learning_from_scratch/ch06 && python3 batch_norm_test.py

## Regularization

In [None]:
from deep_learning_from_scratch.dataset.mnist import load_mnist

In [None]:
# The source of layers.py is exec'd into this namespace below with every line
# mentioning "common" removed, so names those lines would have provided must
# already be in scope here -- presumably that is why a star import is used
# (verify against layers.py).
from deep_learning_from_scratch.common.functions import *
from deep_learning_from_scratch.common.util import im2col, col2im
# Python source files are UTF-8 by default; pass the encoding explicitly so
# that reading does not depend on the platform's locale encoding.
with open("deep_learning_from_scratch/common/layers.py", encoding="utf-8") as f:
    content = f.read()
# Show the lines that are about to be dropped, so it is visible what was removed.
print(re.findall(r".*common.*", content))
# NOTE(review): exec runs the file's contents in the notebook's globals; this
# is only acceptable because the file is part of this repository.
exec(re.sub(r".*common.*", "", content))

In [None]:
# multi_layer_net.py is exec'd below with every line mentioning "common"
# removed, so numerical_gradient is imported here first -- presumably one of
# the removed lines would otherwise have provided it (verify against the file).
from deep_learning_from_scratch.common.gradient import numerical_gradient
# Python source files are UTF-8 by default; pass the encoding explicitly so
# that reading does not depend on the platform's locale encoding.
with open('deep_learning_from_scratch/common/multi_layer_net.py', 'r', encoding='utf-8') as f:
    content = f.read()
# Show the lines that are about to be dropped, so it is visible what was removed.
print(re.findall(r".*common.*", content))
# NOTE(review): exec runs the file's contents in the notebook's globals; this
# is only acceptable because the file is part of this repository.
exec(re.sub(r".*common.*", "", content))

### Weight decay

In [None]:
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)
# Shrink the training set so that overfitting is easy to reproduce.
x_train = x_train[:300]
t_train = t_train[:300]

# NOTE(review): no weight-decay argument is passed here although this section
# is titled "Weight decay" -- presumably this is the unregularised baseline;
# confirm against MultiLayerNet's signature.
network = MultiLayerNet(input_size=784, hidden_size_list=[100]*6, output_size=10)
optimizer = SGD(lr=0.01)

max_epochs = 201
train_size = x_train.shape[0]
batch_size = 100

train_loss_list = []  # never appended to in this cell
train_acc_list = []
test_acc_list = []

# Integer division keeps the epoch boundary an integer. With true division a
# train_size that is not a multiple of batch_size gives a fractional value
# (e.g. 2.5), and `i % 2.5 == 0` only fires every 5th iteration.
iter_per_epoch = max(train_size // batch_size, 1)
epoch_cnt = 0

# Effectively unbounded; the loop exits through the break once max_epochs
# accuracy measurements have been recorded.
for i in range(int(1e+9)):
    # Mini-batch indices, drawn with replacement (np.random.choice default).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    grads = network.gradient(x_batch, t_batch)
    optimizer.update(network.params, grads)

    # Once per epoch (including iteration 0), record train and test accuracy.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)

        epoch_cnt += 1
        if epoch_cnt >= max_epochs:
            break

In [None]:
# Train vs. test accuracy per epoch.
markers = {'train': 'o', 'test': 's'}
# Derive the x axis from the recorded data rather than a hard-coded epoch
# count, so the plot stays valid if the number of recorded epochs changes.
epochs = range(1, len(train_acc_list) + 1)
plt.plot(epochs, train_acc_list, marker=markers['train'], label='train', markevery=10)
plt.plot(epochs, test_acc_list, marker=markers['test'], label='test', markevery=10)
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.xlim(0, len(epochs))
plt.ylim(0, 1)
plt.legend(loc='lower right')
plt.show()

In [None]:
#!cd deep_learning_from_scratch/ch06 && python3 overfit_weight_decay.py

### Dropout

In [None]:
class Dropout:
    """Dropout layer that scales outputs at inference time.

    In training mode a fresh random boolean mask zeroes some elements; in
    inference mode the input is multiplied by ``1 - dropout_ratio`` instead.
    """

    def __init__(self, dropout_ratio=0.5):
        self.dropout_ratio = dropout_ratio
        # Boolean keep-mask from the most recent training-mode forward pass.
        self.mask = None

    def forward(self, x, train_flg=True):
        """Apply dropout to ``x``.

        Args:
            x: NumPy array of activations.
            train_flg: When truthy, sample a new mask and apply it; otherwise
                scale ``x`` by ``1 - dropout_ratio`` and leave the stored
                mask untouched.

        Returns:
            Array with the same shape as ``x``.
        """
        if not train_flg:
            return x * (1.0 - self.dropout_ratio)

        # An element is kept where its uniform [0, 1) draw exceeds the ratio.
        keep = np.random.rand(*x.shape) > self.dropout_ratio
        self.mask = keep
        return x * keep

    def backward(self, dout):
        """Multiply the upstream gradient ``dout`` by the stored mask."""
        return dout * self.mask

In [None]:
#!cd deep_learning_from_scratch/ch06 && python3 overfit_dropout.py

## Hyperparameters

In [None]:
from deep_learning_from_scratch.common.util import shuffle_dataset

In [None]:
(x_train, t_train), (x_test, t_test) = load_mnist()

# Shuffle the training data before splitting.
x_train, t_train = shuffle_dataset(x_train, t_train)

# Carve the validation set off the front of the shuffled training data.
validation_rate = 0.20
validation_num = int(x_train.shape[0] * validation_rate)

# The first validation_num samples become validation data; the remainder
# stays as training data.
x_val, x_train = x_train[:validation_num], x_train[validation_num:]
t_val, t_train = t_train[:validation_num], t_train[validation_num:]

In [None]:
# Sample both hyperparameters log-uniformly: draw the exponent uniformly and
# raise 10 to it.
# Weight decay lies in [1e-8, 1e-4). The upper exponent must be negative;
# +4 would allow decay coefficients up to 10**4.
weight_decay = 10 ** np.random.uniform(-8, -4)
# Learning rate lies in [1e-6, 1e-2).
lr = 10 ** np.random.uniform(-6, -2)

In [None]:
#!cd deep_learning_from_scratch/ch06 && python3 hyperparameter_optimization.py