In [18]:
import numpy as np
import random

from q1_softmax import softmax
from q2_gradcheck import gradcheck_naive
from q2_sigmoid import sigmoid, sigmoid_grad


def normalizeRows(x):
    """ Row normalization function """
    # Implement a function that normalizes each
    # row of a matrix to have unit length

    # ## YOUR CODE HERE
    all_norm2 = np.sqrt(np.sum(np.power(x, 2), 1))
    all_norm2 = 1/all_norm2
    x = x * all_norm2[:, np.newaxis]
    # ## END YOUR CODE

    return x


def test_normalize_rows():
    print("Testing normalizeRows...")
    x = normalizeRows(np.array([[3.0, 4.0], [1, 2]]))
    # the result should be [[0.6, 0.8], [0.4472, 0.8944]]
    assert(x.all() == np.array([[0.6, 0.8], [0.4472, 0.8944]]).all())
    print(" ")


def softmaxCostAndGradient1(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.

    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.

    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #        vector
    # - grad: the gradient with respect to all the other word
    #        vectors

    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    # ## YOUR CODE HERE
    v_hat = predicted
    o = target
    U = outputVectors
    y_hat = (softmax(U.dot(v_hat))).flatten()
    y = np.zeros(U.shape[0])
    y[o] = 1
    cost = np.sum(y * np.log(y_hat)) * -1

    def del_cost_del_v_hat(i):
        subtraction = y_hat - y
        u_w_i = U.T[i]
        result = subtraction * u_w_i
        return np.sum(result)

    def del_cost_del_U_i(i):
        return v_hat*(y_hat[i] - y[i])

    def get_grad(array, grad_function):
        matrix = np.array(array, copy=True)
        for i in range(array.shape[0]):
                matrix[i] = grad_function(i)
        return matrix

    gradPred = get_grad(v_hat, del_cost_del_v_hat)
    grad = get_grad(U, del_cost_del_U_i)
    # ## END YOUR CODE

    return cost, gradPred, grad

def softmaxCostAndGradient2(predicted, target, outputVectors, dadta):
    V, D = outputVectors.shape
    scores = softmax(outputVectors.dot(predicted).reshape(1, V)).reshape(V,)
    cost = - np.log(scores[target])
    
    labels = np.zeros(V)
    labels[target] = 1
    dscores = scores - labels
    gradPred = dscores.dot(outputVectors)
    grad = dscores.reshape(V, 1).dot(predicted.reshape(D, 1).T)    
    ### END YOUR CODE
    
    return cost, gradPred, grad

In [19]:
def test1(params):
    predicted = params[0:2]
    U = np.reshape(params[2:6],(2,2))
    o =  np.random.randint(0, 2, 1)[0]
    cost,gradpre,gradU = softmaxCostAndGradient1(predicted,o,U,None)
    grad = np.concatenate((gradpre,gradU.flatten()))
    return cost,grad

def test2(params):
    predicted = params[0:2]
    U = np.reshape(params[2:6],(2,2))
    o =  np.random.randint(0, 2, 1)[0]
    cost,gradpre,gradU = softmaxCostAndGradient2(predicted,o,U,None)
    grad = np.concatenate((gradpre,gradU.flatten()))
    return cost,grad

In [20]:
test_matrix = (normalizeRows(np.random.random_sample((1,6)))).flatten()
_ =gradcheck_naive(test1,test_matrix)

(0,): 
            Your gradient = 0.146516821704
            Numerical gradient = 480.31028104
fx =0.742330552069
fxh_plus =0.742345203849
fxh_minus =0.646283147641
(1,): 
            Your gradient = 0.0168039088191
            Numerical gradient = -480.302801674
fx =0.742330552069
fxh_plus =0.646268311345
fxh_minus =0.742328871679
(2,): 
            Your gradient = 0.153214912672
            Numerical gradient = 480.310587782
fx =0.742330552069
fxh_plus =0.742345873667
fxh_minus =0.64628375611
(3,): 
            Your gradient = 0.233700107342
            Numerical gradient = 480.314273648
fx =0.742330552069
fxh_plus =0.742353922328
fxh_minus =0.646291067598
(4,): 
            Your gradient = -0.153214912672
            Numerical gradient = -480.310587782
fx =0.742330552069
fxh_plus =0.64628375611
fxh_minus =0.742345873667
(5,): 
            Your gradient = -0.233700107342
            Numerical gradient = 0.212295248336
fx =0.742330552069
fxh_plus =0.646291067598
fxh_minus =0.64624860

In [21]:
gradcheck_naive(test2,test_matrix)

(0,): 
            Your gradient = -0.133097179127
            Numerical gradient = 480.31028104
fx =0.646269837825
fxh_plus =0.742345203849
fxh_minus =0.646283147641
(1,): 
            Your gradient = -0.0152648196713
            Numerical gradient = 0.0168039088194
fx =0.646269837825
fxh_plus =0.742332232461
fxh_minus =0.742328871679
(2,): 
            Your gradient = -0.139181784314
            Numerical gradient = -480.296554654
fx =0.646269837825
fxh_plus =0.646255919753
fxh_minus =0.742315230684
(3,): 
            Your gradient = -0.212295248335
            Numerical gradient = 0.23370010734
fx =0.646269837825
fxh_plus =0.742353922328
fxh_minus =0.742307182306
(4,): 
            Your gradient = 0.139181784314
            Numerical gradient = 480.296554654
fx =0.646269837825
fxh_plus =0.742315230684
fxh_minus =0.646255919753
(5,): 
            Your gradient = 0.212295248335
            Numerical gradient = -0.233700107341
fx =0.646269837825
fxh_plus =0.742307182306
fxh_minus =0.74

array([ 1.00027711,  0.03206873,  0.99971022,  0.44599536,  0.99971022,
        0.44599536])

In [5]:
a*100000

-108001297.0725