In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io as sio
from scipy.optimize import minimize
%matplotlib inline

In [2]:
data = sio.loadmat('data/ex3data1.mat')
weights = sio.loadmat('data/ex3weights.mat')

In [3]:
data.keys()

dict_keys(['__version__', 'y', 'X', '__globals__', '__header__'])

In [4]:
weights.keys()

dict_keys(['Theta2', '__version__', 'Theta1', '__globals__', '__header__'])

In [5]:
X = data['X']
y = data['y'].flatten()
theta1 = weights['Theta1']
theta2 = weights['Theta2']
para = np.append(theta1.flatten(), theta2.flatten())

In [6]:
temp = np.linspace(0, 9, 10)

# Feedforward and cost function

#### Cost function for the neural network

\begin{equation}
J(\theta) = \frac{1}{m}\sum_{i=1}^m \sum_{k=1}^K [-y_k^{(i)}\log((h_{\theta}(x^{(i)}))_k) -(1 - y_k^{(i)})\log(1-(h_{\theta}(x^{(i)}))_k) ]
\end{equation}

In [71]:
#sigmoid function
def sigm(x):
    return(1/(1+np.exp(-x)))

In [73]:
def sigmoidGradient(x):
    return(sigm(x)*(1-sigm(x)))

In [74]:
def randInitializeWeights(L_in, L_out, epsilon_init = 0.12):
    return(np.random.uniform(-epsilon_init, epsilon_init, (L_out, 1 + L_in)))    

In [65]:
#cost and gradient function
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lam):
    Theta1 = np.reshape(nn_params[:((input_layer_size+1)*hidden_layer_size)], (hidden_layer_size, input_layer_size+1))
    Theta2 = np.reshape(nn_params[((input_layer_size+1)*hidden_layer_size):], (num_labels, hidden_layer_size+1))
    
    m = X.shape[0]
    J = 0
    Theta1_grad = np.zeros(Theta1.size) # 25x401
    Theta2_grad = np.zeros(Theta2.size) # 10x26
    
    #recode y
    y_new = pd.get_dummies(y).as_matrix() # 5000*10
    
    #predict
    a1 = np.column_stack((np.ones(m), X)) #5000x401
    z2 = a1.dot(Theta1.T) # 5000x401 * 401x25 = 5000x25 
    a2 = np.column_stack((np.ones(z2.shape[0]), sigm(z2))) # 5000x26
    z3 = a2.dot(Theta2.T) # 5000x26 * 26x10 = 5000x10
    y_hat = sigm(z3) #5000x10
    
    #cost
    J = 1/m*(-np.sum((y_new * np.log(y_hat)))-np.sum((1-y_new)*np.log(1-y_hat)))\
    + lam/(2*m) * (np.sum(np.square(Theta1[:,1:])) + np.sum(np.square(Theta2[:,1:])))
    
    #gradient
    d3 = y_hat - y # 5000x10
    d2 = delta3.T.dot(Theta2)*sigmoidGradient(z2) # 5000x10*10x26 = 5000x26
    
    Theta2_grad = 1/m*d3.T.dot(a2) # 10x5000*5000x26 = 10x26
    Theta1_grad = 1/m*d2.T.dot(a1) # 26x5000*5000x401 = 26x401
    
    return(J, Theta1_grad, Theta2_grad)

In [67]:
nnCostFunction(para, 400, 25, 10, X, y, 1)

(0.38376985909092359,
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,