In [17]:
import copy
import math
import time
import struct

import numpy as np                  # 矩阵
import matplotlib.pyplot as plt     # 绘图

from numba import njit              # 加速 cpu
from numba import cuda              # 加速 gpu(cuda)
from pathlib import Path            # 处理路径

from Activation import Activation
from ProgressBar import ProgressBar 

In [18]:
def bypass(x):
    """Identity activation: hand the input back unchanged."""
    return x
def tanh(x):
    """Hyperbolic-tangent activation, applied elementwise via NumPy."""
    activated = np.tanh(x)
    return activated
def softmax(x):
    """Softmax over every element of the array `x`.

    The maximum of `x` is subtracted before exponentiating so the largest
    exponent is exp(0); this leaves the result unchanged mathematically
    while avoiding overflow for large inputs.
    """
    shifted = x - x.max()
    weights = np.exp(shifted)
    return weights / weights.sum()

def d_softmax(data):
    """Derivative matrix of softmax evaluated at `data`.

    With p = softmax(data), the result is diag(p) - outer(p, p), a
    symmetric square matrix with one row/column per element of `data`.
    """
    probs = softmax(data)
    diagonal_part = np.diag(probs)
    coupling_part = np.outer(probs, probs)
    return diagonal_part - coupling_part

def d_tanh(data):
    """Elementwise derivative of tanh, computed as 1 / cosh(data)**2."""
    cosh_squared = np.cosh(data) ** 2
    return 1 / cosh_squared

def d_bypass(x):
    """Derivative of the identity activation: the constant 1, whatever `x` is."""
    return 1

# Lookup from each activation function to the function computing its derivative.
differential = {
    softmax: d_softmax,
    tanh: d_tanh,
    bypass: d_bypass,
}
# How grad_parameters combines that derivative with the incoming gradient:
# 'times' -> elementwise product, 'dot' -> matrix-vector product (np.dot).
d_type = {
    bypass: 'times',
    softmax: 'dot',
    tanh: 'times',
}

In [19]:
# Layer widths: 28*28 inputs, 100 hidden units, 10 outputs.
dimensions = [28 * 28, 100, 10]
# Activation applied at each layer, index-aligned with `dimensions`.
activation = [bypass, tanh, softmax]
# Sampling ranges [low, high] per layer for biases ('b') and weights ('w').
# Entry 0 belongs to the input layer and must stay empty (no parameters are
# created for it). Biases start at exactly 0; weights are drawn from
# +/- sqrt(6 / (fan_in + fan_out)), i.e. the Glorot/Xavier uniform bound.
distribution = [{}] + [
    {
        'b': [0, 0],
        'w': [
            -math.sqrt(6 / (fan_in + fan_out)),
            math.sqrt(6 / (fan_in + fan_out)),
        ],
    }
    for fan_in, fan_out in zip(dimensions, dimensions[1:])
]

In [20]:
def init_parameters_b(layer):
    """Draw the bias vector of `layer`.

    Samples dimensions[layer] values from np.random.rand and rescales them
    into the [low, high] range stored in distribution[layer]['b'].
    """
    bounds = distribution[layer]['b']
    low, high = bounds[0], bounds[1]
    unit_samples = np.random.rand(dimensions[layer])
    return unit_samples * (high - low) + low
def init_parameters_w(layer):
    """Draw the weight matrix feeding `layer`.

    The matrix has shape (dimensions[layer-1], dimensions[layer]); entries
    come from np.random.rand rescaled into the [low, high] range stored in
    distribution[layer]['w'].
    """
    bounds = distribution[layer]['w']
    low, high = bounds[0], bounds[1]
    unit_samples = np.random.rand(dimensions[layer - 1], dimensions[layer])
    return unit_samples * (high - low) + low
def init_parameters():
    """Create freshly randomised parameters for every layer.

    Returns a list with one dict per entry of `distribution`. Each dict holds
    a 'b' and/or 'w' array for whichever of those keys the entry declares;
    any other key is ignored, so an empty entry yields an empty dict.
    """
    initializers = {'b': init_parameters_b, 'w': init_parameters_w}
    parameter = []
    for layer, layer_spec in enumerate(distribution):
        # Walk the keys in their stored order so the random draws happen in
        # the same sequence the spec lists them.
        parameter.append({
            name: initializers[name](layer)
            for name in layer_spec.keys()
            if name in initializers
        })
    return parameter

In [31]:
def predict(img,parameters):
    """Run a forward pass and return the activation of the last layer.

    `img` first goes through activation[0]; every later layer multiplies the
    previous output by its 'w' matrix, adds its 'b' vector and applies that
    layer's activation. parameters[0] is never read.
    """
    signal = activation[0](img)
    for layer in range(1, len(dimensions)):
        layer_params = parameters[layer]
        pre_activation = np.dot(signal, layer_params['w']) + layer_params['b']
        signal = activation[layer](pre_activation)
    return signal


3


In [22]:
# Split sizes. Only train_num is used below: the first train_num samples of
# the training files become the training set, everything after them the
# validation set.
train_num=50000
valid_num=10000
test_num=10000

# Locations of the four MNIST IDX files.
dataset_path=Path('./MNIST')
train_img_path=dataset_path/'train-images.idx3-ubyte'
train_lab_path=dataset_path/'train-labels.idx1-ubyte'
test_img_path=dataset_path/'t10k-images.idx3-ubyte'
test_lab_path=dataset_path/'t10k-labels.idx1-ubyte'

# Image files: consume the 16-byte header (four big-endian int32 values), then
# read the rest as uint8 pixels, one flattened 28*28 row per image, divided by
# 255 so values lie in [0, 1].
with open(train_img_path,'rb') as f:
    struct.unpack('>4i',f.read(16))
    tmp_img=np.fromfile(f,dtype=np.uint8).reshape(-1,28*28)/255
train_img,valid_img=tmp_img[:train_num],tmp_img[train_num:]

with open(test_img_path,'rb') as f:
    struct.unpack('>4i',f.read(16))
    test_img=np.fromfile(f,dtype=np.uint8).reshape(-1,28*28)/255

# Label files: consume the 8-byte header (two big-endian int32 values), then
# read one uint8 label per sample.
with open(train_lab_path,'rb') as f:
    struct.unpack('>2i',f.read(8))
    tmp_lab=np.fromfile(f,dtype=np.uint8)
train_lab,valid_lab=tmp_lab[:train_num],tmp_lab[train_num:]

with open(test_lab_path,'rb') as f:
    struct.unpack('>2i',f.read(8))
    test_lab=np.fromfile(f,dtype=np.uint8)

In [23]:
def show_train(index):
    """Plot training image `index` in grayscale and print its label."""
    picture = train_img[index].reshape(28, 28)
    plt.imshow(picture, cmap='gray')
    label = train_lab[index]
    print('label : {}'.format(label))
def show_valid(index):
    """Plot validation image `index` in grayscale and print its label."""
    picture = valid_img[index].reshape(28, 28)
    plt.imshow(picture, cmap='gray')
    label = valid_lab[index]
    print('label : {}'.format(label))
def show_test(index):
    """Plot test image `index` in grayscale and print its label."""
    picture = test_img[index].reshape(28, 28)
    plt.imshow(picture, cmap='gray')
    label = test_lab[index]
    print('label : {}'.format(label))

In [24]:
# Finite-difference sanity check of the analytic softmax derivative.
# For each input position i a fresh random vector is drawn, the analytic
# derivative matrix is evaluated there, component i is nudged by h, and the
# gap between row i of the analytic matrix and the forward difference of all
# outputs is printed. Values close to zero mean the two agree.
h=0.001            # finite-difference step size
func=softmax       # activation under test; its derivative comes from `differential`
input_len=4        # length of the random test vector
for i in range(input_len):
    test_input=np.random.rand(input_len)
    # Analytic derivative and baseline output must be captured BEFORE the
    # in-place perturbation below.
    derivative=differential[func](test_input)
    value1=func(test_input)
    test_input[i]+=h   # perturb only component i (mutates test_input in place)
    value2=func(test_input)
#     print((value2-value1)/h)
#     print(derivative[i])
    # (value2-value1)/h approximates the change of every output w.r.t. input i;
    # comparing it with row i works because diag(p) - outer(p, p) is symmetric.
    print(derivative[i]-(value2-value1)/h)

[-4.38398683e-08  1.43363794e-08  1.17142974e-08  1.78447027e-08]
[ 1.15414990e-08 -4.61081562e-08  1.44046132e-08  2.04395997e-08]
[ 1.12630559e-08  1.98613416e-08 -4.69250933e-08  1.58006958e-08]
[ 1.97459042e-08  1.44457042e-08  1.38065307e-08 -4.81091614e-08]


In [26]:
# Finite-difference sanity check of the analytic tanh derivative.
# tanh acts elementwise, so only component i of the output changes when
# component i of the input is nudged; the printed scalar is the gap between
# the analytic derivative at i and the forward difference at i.
h=0.0001           # finite-difference step size
func=tanh          # activation under test; its derivative comes from `differential`
input_len=4        # length of the random test vector
for i in range(input_len):
    test_input=np.random.rand(input_len)
    # Analytic derivative and baseline output must be captured BEFORE the
    # in-place perturbation below.
    derivative=differential[func](test_input)
    value1=func(test_input)
    test_input[i]+=h   # perturb only component i (mutates test_input in place)
    value2=func(test_input)
#     print((value2-value1)/h)
#     print(derivative[i])
    # Compare the i-th analytic derivative with the i-th forward difference.
    print(derivative[i]-((value2-value1)/h)[i])

3.646287178737939e-05
3.353441710435279e-05
3.555732824811475e-05
3.6677830799525246e-05


In [28]:
# Row k is the one-hot target vector for class label k (one row per output unit).
onehot = np.eye(dimensions[-1])
print(onehot)

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [30]:
def sqr_loss(img, lab, parameters):
    """Squared-error loss of the network on one sample.

    Runs `predict` on `img`, subtracts the prediction from the one-hot row
    selected by `lab`, and returns the dot product of that residual with
    itself (the sum of squared differences).
    """
    prediction = predict(img, parameters)
    residual = onehot[lab] - prediction
    return residual.dot(residual)

In [29]:
# Compute gradients
def grad_parameters(img, lab, parameters):
    """Back-propagate the squared-error loss for a single sample.

    Parameters
    ----------
    img : input vector fed to layer 0 (same input that `predict` takes).
    lab : label used to select the target row of `onehot`.
    parameters : per-layer list of dicts with 'w' and 'b' entries; index 0
        is never read.

    Returns
    -------
    list with len(dimensions) entries. Entry 0 is None (the input layer has
    no parameters); every other entry is a dict holding the gradient of the
    loss with respect to that layer's bias ('b') and weights ('w'), with the
    same shapes as the corresponding parameters.
    """
    n_layers = len(dimensions)

    # Forward pass, remembering every layer's pre-activation input and
    # post-activation output for use in the backward pass.
    l_in_list = [img]
    l_out_list = [activation[0](img)]
    for layer in range(1, n_layers):
        l_in = np.dot(l_out_list[-1], parameters[layer]['w']) + parameters[layer]['b']
        l_in_list.append(l_in)
        l_out_list.append(activation[layer](l_in))

    # Derivative of (y - out) . (y - out) with respect to the network output.
    d_layer = -2 * (onehot[lab] - l_out_list[-1])

    grad_result = [None] * n_layers
    for layer in range(n_layers - 1, 0, -1):
        act = activation[layer]
        combine = d_type[act]
        # Pull the gradient back through the activation: elementwise for
        # 'times' activations, matrix-vector product for 'dot' activations.
        if combine == 'times':
            d_layer = differential[act](l_in_list[layer]) * d_layer
        elif combine == 'dot':
            d_layer = np.dot(differential[act](l_in_list[layer]), d_layer)

        grad_result[layer] = {
            'b': d_layer,
            'w': np.outer(l_out_list[layer - 1], d_layer),
        }

        # Propagate to the previous layer's output. Layer 1 is the last one
        # processed, so the product would be discarded there; skipping it
        # avoids a full weight-matrix multiplication per sample.
        if layer > 1:
            d_layer = np.dot(parameters[layer]['w'], d_layer)

    return grad_result