In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io as sio
from scipy.optimize import minimize
%matplotlib inline

In [2]:
data = sio.loadmat('data/ex3data1.mat')
weights = sio.loadmat('data/ex3weights.mat')

In [3]:
data.keys()

dict_keys(['y', '__globals__', '__header__', 'X', '__version__'])

In [4]:
weights.keys()

dict_keys(['__globals__', '__header__', 'Theta2', '__version__', 'Theta1'])

In [5]:
X = data['X']
y = data['y'].flatten()
theta1 = weights['Theta1']
theta2 = weights['Theta2']
para = np.append(theta1.flatten(), theta2.flatten())

In [6]:
temp = np.linspace(0, 9, 10)

# Feedforward and cost function

#### Cost function for the neural network

\begin{equation}
J(\theta) = \frac{1}{m}\sum_{i=1}^m \sum_{k=1}^K [-y_k^{(i)}\log((h_{\theta}(x^{(i)}))_k) -(1 - y_k^{(i)})\log(1-(h_{\theta}(x^{(i)}))_k) ]
\end{equation}

In [7]:
#sigmoid function
def sigm(x):
    return(1/(1+np.exp(-x)))

In [8]:
def sigmoidGradient(x):
    return(sigm(x)*(1-sigm(x)))

In [9]:
def randInitializeWeights(length, epsilon_init = 0.12):
    return(np.random.uniform(-epsilon_init, epsilon_init, length))    

In [10]:
def predict(Theta1, Theta2, X, m):
    a1 = np.column_stack((np.ones(m), X)) #5000x401
    z2 = a1.dot(Theta1.T) # 5000x401 * 401x25 = 5000x25 
    a2 = np.column_stack((np.ones(z2.shape[0]), sigm(z2))) # 5000x26
    z3 = a2.dot(Theta2.T) # 5000x26 * 26x10 = 5000x10
    a3 = sigm(z3) #5000x10
    return(a1, a2, a3, z2, z3)

In [11]:
#cost function
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lam):
    Theta1 = np.reshape(nn_params[:((input_layer_size+1)*hidden_layer_size)], (hidden_layer_size, input_layer_size+1))
    Theta2 = np.reshape(nn_params[((input_layer_size+1)*hidden_layer_size):], (num_labels, hidden_layer_size+1))
    
    m = X.shape[0]
    J = 0
    
    #recode y
    y_new = pd.get_dummies(y).as_matrix() # 5000*10
    
    #feedforwad
    a1, a2, a3, z2, z3 = predict(Theta1, Theta2, X, m)
    
    #cost
    J = 1/m*(-np.sum((y_new * np.log(a3)))-np.sum((1-y_new)*np.log(1-a3)))\
    + lam/(2*m) * (np.sum(np.square(Theta1[:,1:])) + np.sum(np.square(Theta2[:,1:])))
    
    return(J)

In [12]:
nnCostFunction(para, 400, 25, 10, X, y, 0)

0.28762916516131887

In [13]:
#gradient function
def nnGradientFun(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lam):
    Theta1 = np.reshape(nn_params[:((input_layer_size+1)*hidden_layer_size)], (hidden_layer_size, input_layer_size+1))
    Theta2 = np.reshape(nn_params[((input_layer_size+1)*hidden_layer_size):], (num_labels, hidden_layer_size+1))
    
    Theta1_grad = np.zeros(Theta1.size) # 25x401
    Theta2_grad = np.zeros(Theta2.size) # 10x26
    m = X.shape[0]
    #recode y
    y_new = pd.get_dummies(y).as_matrix() # 5000*10
    
    a1, a2, a3, z2, z3 = predict(Theta1, Theta2, X, m)
    #gradient
    d3 = a3 - y_new # 5000x10
    d2 = d3.dot(Theta2[:,1:])*sigmoidGradient(z2) # 5000x10*10x25 = 5000x25
    
    Theta1_grad = 1/m*d2.T.dot(a1) + lam/m*np.column_stack((np.zeros(hidden_layer_size), Theta1[:, 1:])) # 25x5000*5000x401
    Theta2_grad = 1/m*d3.T.dot(a2) + lam/m*np.column_stack((np.zeros(num_labels), Theta2[:,1:]))# 10x5000*5000x26 
    
    return(np.append(Theta1_grad.flatten(), Theta2_grad.flatten()))

#### Gradient checking

\begin{equation}
f_i(\theta) \approx \frac{J(\theta^{(i+)})-J(\theta^{(i-)})}{2\epsilon}
\end{equation}

In [14]:
def checkNNGradients(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lam, eps=10^(-4)):
    n = nn_params.size
    numGrad = np.zeros(n)
    
    paraPlus = np.tile(nn_params, (n, 1)) + np.diag(np.ones(n))*eps
    paraMinus = np.tile(nn_params, (n, 1)) - np.diag(np.ones(n))*eps
    
    for i in range(0, 5):
        costPlus = nnCostFunction(paraPlus[i,:], input_layer_size, hidden_layer_size, num_labels, X, y, lam)
        costMinus = nnCostFunction(paraMinus[i, :], input_layer_size, hidden_layer_size, num_labels, X, y, lam)
        numGrad[i] = (costPlus - costMinus)/2/eps
    return(numGrad)

In [15]:
checkNNGradients(para, 400, 25, 10, X, y, 1)

array([  1.80912511e-03,  -2.11248241e-12,   4.38829528e-13, ...,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00])

In [16]:
nnGradientFun(para, 400, 25, 10, X, y, 1)

array([  6.18712766e-05,  -2.11248326e-12,   4.38829369e-13, ...,
         4.70513145e-05,  -5.01718610e-04,   5.07825789e-04])

In [17]:
%time reg = minimize(nnCostFunction, randInitializeWeights(para.size), args=(400, 25, 10, X, y, 1), jac=nnGradientFun, options={'maxiter':200})

Wall time: 3h 38min 42s


In [18]:
paraPrime=reg.x

In [19]:
theta1Prime = np.reshape(paraPrime[:(401*25)], (25, 401))
theta2Prime = np.reshape(paraPrime[(401*25):], (10, 26))

In [20]:
predPrime = np.argmax(predict(theta1Prime, theta2Prime, X, X.shape[0])[4], axis = 1) + 1

In [22]:
print('The accuracy is: {}%'.format(sum(np.isclose(predPrime, y, atol=0))/len(y)*100))

The accuracy is: 98.9%
