# Neural Networks Learning

In [1]:
import numpy as np
import pandas as pd
import scipy.io as scio

In [2]:
data = scio.loadmat('ex4data1.mat')
data

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct 16 13:09:09 2011',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 'y': array([[10],
        [10],
        [10],
        ...,
        [ 9],
        [ 9],
        [ 9]], dtype=uint8)}

In [144]:
X = data['X']
y = data['y']
X.shape,y.shape

((5000, 400), (5000, 1))

我们也需要对我们的y标签进行一次one-hot 编码。 one-hot 编码将类标签n（k类）转换为长度为k的向量，其中索引n为“hot”（1），而其余为0。 Scikitlearn有一个内置的实用程序，我们可以使用这个。

In [6]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
y_onehot = encoder.fit_transform(y)
y_onehot,y_onehot.shape

(array([[0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.],
        ...,
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 1., 0.]]), (5000, 10))

In [7]:
def sigmoidFunc(x):
    return 1 / (1 + np.exp(-x))

## 前向传播

In [81]:
def forwardPropagate(X, theta1, theta2):
    rows = X.shape[0]
    a1 = np.insert(X, 0, np.ones(rows), axis = 1)
    z2 = a1 * theta1.T
    a2 = np.insert(sigmoidFunc(z2), 0, np.ones(rows), axis = 1)
    z3 = a2 * theta2.T
    a3 = sigmoidFunc(z3)

    return a1,z2,a2,z3,a3

## 代价函数

In [82]:
def costFunc(params, input_size, hidden_size, k_nums, X, y):
    rows = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)

    theta1 = params[:hidden_size * (input_size + 1)]
    theta1 = np.matrix(np.reshape(theta1, (hidden_size, input_size + 1)))
    theta2 = params[hidden_size * (input_size + 1):]
    theta2 = np.matrix(np.reshape(theta2, (k_nums, hidden_size + 1)))

    a1,z2,a2,z3,h = forwardPropagate(X, theta1, theta2)

    J = 0
    for i in range(rows):   # sum i from 1 to m
        J += np.sum(np.multiply(-y[i,:], np.log(h[i,:])) - np.multiply(1-y[i,:], np.log(1-h[i,:])))     # sum k from 1 to K
    J = J/rows  # div m

    return J

In [136]:
input_size = 400
hidden_size = 25
k_nums = 10
lamb = 1

params = (np.random.random(size=hidden_size * (input_size + 1) + k_nums * (hidden_size + 1)) - 0.5) * 0.25

rows = X.shape[0]
X = np.matrix(X)
y = np.matrix(y_onehot)

theta1 = np.matrix(np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, input_size + 1)))
theta2 = np.matrix(np.reshape(params[hidden_size * (input_size + 1):], (k_nums, hidden_size + 1)))

a1,z2,a2,z3,h = forwardPropagate(X, theta1, theta2)

h.shape,theta1.shape,theta2.shape

((5000, 10), (25, 401), (10, 26))

In [58]:
cost = costFunc(params, input_size, hidden_size, k_nums, X, y)
cost

7.225827627437019

## 正则化代价函数

In [84]:
def costFuncReg(params, input_size, hidden_size, k_nums, X, y, lamb):
    rows = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)

    theta1 = params[:hidden_size * (input_size + 1)]
    theta1 = np.matrix(np.reshape(theta1, (hidden_size, input_size + 1)))
    theta2 = params[hidden_size * (input_size + 1):]
    theta2 = np.matrix(np.reshape(theta2, (k_nums, hidden_size + 1)))

    a1,z2,a2,z3,h = forwardPropagate(X, theta1, theta2)

    J = 0
    for i in range(rows):   # sum i from 1 to m
        J += np.sum(np.multiply(-y[i,:], np.log(h[i,:])) - np.multiply(1-y[i,:], np.log(1-h[i,:])))     # sum k from 1 to K
    J = J/rows  # div m):

    # theta1[:,1:] 25 * 400 \ theta2[:,1:] 10 * 25 \ np.sum 对整个矩阵求和
    R = np.sum(np.power(theta1[:,1:],2)) + np.sum(np.power(theta2[:,1:],2))     # sum k from 1 to 400 \ j from 1 to 25
    R = lamb / (2 * rows) * R

    cost = J+R
    return cost 

In [79]:
t = np.matrix(np.array([[1,3,5],[2,4,6]]))
a = np.power(t,2)
b = np.sum(a)
b

91

In [85]:
cost = costFuncReg(params, input_size, hidden_size, k_nums, X, y, lamb)
cost

6.985988849486904

## 反向传播算法

In [99]:
def sigmoidGradient(z):
    return np.multiply(sigmoidFunc(z), (1 - sigmoidFunc(z))) 

In [118]:
def backPropagation(params, input_size, hidden_size, k_nums, X, y, lamb):
    rows = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)

    theta1 = params[:hidden_size * (input_size + 1)]
    theta1 = np.matrix(np.reshape(theta1, (hidden_size, input_size + 1)))
    theta2 = params[hidden_size * (input_size + 1):]
    theta2 = np.matrix(np.reshape(theta2, (k_nums, hidden_size + 1)))

    a1,z2,a2,z3,h = forwardPropagate(X, theta1, theta2)

    J = 0 
    delta1 = np.zeros(theta1.shape)
    delta2 = np.zeros(theta2.shape)

    for t in range(rows):
        a1t = a1[t,:]       # (1,401)
        z2t = z2[t,:]       # (1,25)
        a2t = a2[t,:]       # (1,26)
        ht = h[t,:]
        yt = y[t,:]
        d3t = ht - yt       # (1,10)
        
        z2t = np.insert(z2t, 0, values = np.ones(1))        # (1,26)
        d2t = np.multiply((theta2.T * d3t.T).T,sigmoidGradient(z2t))        
        # theta2 (10,26) d3t (1,10)  theta2.T * d3t.T).T (1,26) d2t (1,26)

        delta1 = delta1 + (d2t[:,1:]).T * a1t
        delta2 = delta2 + d3t.T * a2t

    delta1 = delta1 / rows      # (25,401)
    delta2 = delta2 / rows      # (10,26)

    delta1[:,1:] = delta1[:,1:] + lamb / rows * (delta1[:,1:])
    delta2[:,1:] = delta2[:,1:] + lamb / rows * (delta2[:,1:])

    grad = np.concatenate((np.ravel(delta1),np.ravel(delta2)))

    return grad

In [119]:
grad = backPropagation(params, input_size, hidden_size, k_nums, X, y, lamb)
grad.shape,params.shape

((10285,), array([-0.01563594,  0.        ,  0.        , ...,  0.22241588,
         0.16556693,  0.26344854]))

In [137]:
from scipy.optimize import minimize
fmin = minimize(fun = costFuncReg, x0 = params, args = (input_size, hidden_size, k_nums, X, y, lamb), method='TNC', jac=backPropagation)
fmin

     fun: 0.46053137784748743
     jac: array([-0.0025787 ,  0.        ,  0.        , ...,  0.0004536 ,
        0.00061774,  0.00103578])
 message: 'Converged (|f_n-f_(n-1)| ~= 0)'
    nfev: 353
     nit: 21
  status: 1
 success: True
       x: array([ 0.72667445, -0.11096635, -0.00726166, ..., -1.44639989,
        4.52283045,  0.94603657])

训练过程和Mr.Huang有些不同，不到99%

In [151]:
X = np.matrix(X)
theta1 = np.matrix(np.reshape(fmin.x[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
theta2 = np.matrix(np.reshape(fmin.x[hidden_size * (input_size + 1):], (k_nums, (hidden_size + 1))))

a1, z2, a2, z3, h = forwardPropagate(X, theta1, theta2)
y_pred = np.array(np.argmax(h, axis=1) + 1)

correct = [1 if a == b else 0 for (a, b) in zip(y_pred, y)]
accuracy = (sum(map(int, correct)) / float(len(correct)))
print ('accuracy = {0}%'.format(accuracy * 100))

accuracy = 98.74000000000001%
