In [5]:
import os
import numpy as np


In [6]:
def sigmoid(x):
    """
    Compute the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    Parameters
    ----------
    x : array_like
        Scalar, sequence or array of real values. Integer and boolean
        input is converted to float before the computation.

    Returns
    -------
    ndarray or numpy scalar
        Sigmoid of every element, with the same shape as ``x``. A scalar
        input yields a scalar.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.inexact):
        # Negating an unsigned integer array wraps around, and np.exp on
        # small integer dtypes yields low-precision floats, so go to float.
        x = x.astype(float)

    # exp(-|x|) never exceeds 1, so unlike exp(-x) it cannot overflow (and
    # emit a RuntimeWarning) for large negative inputs.
    decay = np.exp(-np.abs(x))

    # x >= 0: 1 / (1 + exp(-x));  x < 0: the algebraically equal
    # exp(x) / (1 + exp(x)).
    out = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

    # Indexing with () turns a 0-d result back into a scalar and returns
    # arrays of any other rank unchanged.
    return out[()]

## Backpropagation:

In [3]:
def nnCostFunction(nn_params,
                   input_layer_size,
                   hidden_layer_size,
                   num_labels,
                   X, y, lambda_=0.0):
    """
    Regularized cross-entropy cost and gradient of a neural network with
    one hidden layer and sigmoid activations.

    Parameters
    ----------
    nn_params : array_like
        Flat parameter vector: Theta1 unrolled row by row, followed by
        Theta2 unrolled row by row.

    input_layer_size : int
        Number of input features (bias unit not counted).

    hidden_layer_size : int
        Number of hidden units (bias unit not counted).

    num_labels : int
        Number of output units / classes.

    X : array_like
        Inputs of shape (m, input_layer_size), one example per row.

    y : array_like
        The m class indices, each in 0 .. num_labels - 1. Any shape and
        any real dtype holding whole numbers is accepted; the values are
        flattened and cast to int.

    lambda_ : float, optional
        Regularization strength. Bias weights (first column of each Theta)
        are not regularized.

    Returns
    -------
    J : float
        Value of the regularized cost.

    grad : ndarray
        Gradient of J with respect to ``nn_params``, unrolled in the same
        order as ``nn_params``.

    Raises
    ------
    ValueError
        If X and y do not describe the same number of examples.
    ZeroDivisionError
        If there are no examples at all.
    """
    nn_params = np.asarray(nn_params)
    X = np.asarray(X)
    labels = np.asarray(y).ravel().astype(int)

    m = labels.size
    if X.shape[0] != m:
        # Without this check a single-row X would silently broadcast
        # against many labels.
        raise ValueError(
            "X has %d rows but y has %d labels" % (X.shape[0], m))
    inv_m = 1 / m

    # Reshape nn_params back into the weight matrices Theta1 and Theta2.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = np.reshape(nn_params[:split],
                        (hidden_layer_size, input_layer_size + 1))
    Theta2 = np.reshape(nn_params[split:],
                        (num_labels, hidden_layer_size + 1))

    # One-hot targets: row i has a single 1 in column labels[i].
    Y = np.zeros((m, num_labels))
    Y[np.arange(m), labels] = 1

    # Forward pass with one example per row; a bias column of ones is
    # prepended to the input and to the hidden activations.
    bias = np.ones((m, 1))
    a1 = np.hstack([bias, X])                        # (m, input + 1)
    a2 = np.hstack([bias, sigmoid(a1 @ Theta1.T)])   # (m, hidden + 1)
    a3 = sigmoid(a2 @ Theta2.T)                      # (m, num_labels)

    # Keep the outputs strictly inside (0, 1) for the logarithms only, so a
    # saturated unit gives a large finite cost instead of nan / inf.
    eps = np.finfo(a3.dtype).eps
    h = np.clip(a3, eps, 1 - eps)

    J = -inv_m * np.sum(Y * np.log(h) + (1 - Y) * np.log(1 - h))
    J += lambda_ * inv_m / 2 * (np.sum(Theta1[:, 1:] ** 2)
                                + np.sum(Theta2[:, 1:] ** 2))

    # Backpropagation. The hidden bias unit has no incoming weights, so its
    # column is left out when the error is propagated back through Theta2.
    delta3 = a3 - Y                                  # (m, num_labels)
    hidden = a2[:, 1:]
    delta2 = (delta3 @ Theta2[:, 1:]) * hidden * (1 - hidden)   # (m, hidden)

    Theta1_grad = inv_m * (delta2.T @ a1)
    Theta2_grad = inv_m * (delta3.T @ a2)

    # Regularization gradient; column 0 (bias weights) is skipped.
    Theta1_grad[:, 1:] += lambda_ * inv_m * Theta1[:, 1:]
    Theta2_grad[:, 1:] += lambda_ * inv_m * Theta2[:, 1:]

    # Unroll gradients in the same order as nn_params.
    grad = np.concatenate([Theta1_grad.ravel(), Theta2_grad.ravel()])

    return J, grad



##  Random Initialization

Each weight is drawn uniformly at random from $[-\epsilon_{init}, \epsilon_{init}]$. One effective strategy for choosing $\epsilon_{init}$ is to base it on the number of units in the network. A good choice is $\epsilon_{init} = \frac{\sqrt{6}}{\sqrt{L_{in} + L_{out}}}$, where $L_{in} = s_l$ and $L_{out} = s_{l+1}$ are the numbers of units in the layers adjacent to $\Theta^{(l)}$.


In [9]:
def randInitializeWeights(L_in, L_out, epsilon_init=0.12, rng=None):
    """
    Randomly initialize the weights of a layer in a neural network.

    Parameters
    ----------
    L_in : int
        Number of incoming connections.

    L_out : int
        Number of outgoing connections.

    epsilon_init : float or None, optional
        Half-width of the uniform range the weights are drawn from.
        Pass None to use sqrt(6) / sqrt(L_in + L_out).

    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, so ``np.random.seed`` keeps controlling the result.

    Returns
    -------
    W : ndarray
        Weights of shape (L_out, 1 + L_in) with values in
        (-epsilon_init, epsilon_init]. The first column of W holds the
        "bias" terms.

    Notes
    -----
    Random (rather than constant) values are used so that the symmetry
    between units is broken while training the neural network.
    """
    if epsilon_init is None:
        epsilon_init = np.sqrt(6) / np.sqrt(L_in + L_out)

    source = np.random if rng is None else rng
    # random() samples [0, 1); the affine map below turns that into
    # (-epsilon_init, epsilon_init].
    return epsilon_init - 2 * epsilon_init * source.random((L_out, 1 + L_in))

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

## Main takeaways about the language:

In [7]:
# Unrolling demo: ravel() flattens each matrix row by row, and concatenate
# joins the two flat vectors end to end into a single 1-D array.
Theta1 = np.zeros(shape=(3, 3))
Theta2 = np.ones(shape=(3, 3))
np.concatenate((Theta1.ravel(), Theta2.ravel()))

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.])