In [1]:
import pickle
import numpy as np
from check_gradient import check_gradient

In [2]:
# Load the samples, their labels and the pre-trained parameters from CSV files.
data = np.loadtxt('data.csv', delimiter=',')
# The builtin `int` replaces the `np.int` alias, which was deprecated in NumPy 1.20 and
# removed in NumPy 1.24 (it was always just an alias for `int`, so the dtype is unchanged).
# Labels are shifted down by one after loading -- presumably the file stores 1-based
# classes; confirm against the data set.
labels = np.loadtxt('data-labels.csv', delimiter=',', dtype=int) - 1
# Transposed so that row 0 holds the biases and the remaining rows the weights,
# which is the layout `unpack` slices.
theta1 = np.loadtxt('theta1.csv', delimiter=',').T
theta2 = np.loadtxt('theta2.csv', delimiter=',').T

In [3]:
def sigmoid(X):
    '''Logistic function 1 / (1 + exp(-X)), applied element-wise to X.'''
    decay = np.exp(-X)
    return 1 / (1 + decay)
def unpack(theta):
    '''
    Split a packed parameter array into (weight, bias).

    The first row of theta is the bias; every following row belongs to the weight.
    '''
    bias = theta[0]
    weight = theta[1:]
    return weight, bias
def neuron_forward(data, theta, activate=sigmoid):
    '''
    Forward pass through one set of packed parameters: activate(data . weight + bias).

    Present only to fulfill the requirement of question 1. It is not going to be
    called, because the computation of the neural network can be vectorized further
    (see network_forward).
    '''
    weight, bias = unpack(theta)
    pre_activation = np.dot(data, weight) + bias
    return activate(pre_activation)
def network_forward(data, theta1, theta2, activate=sigmoid):
    '''
    Forward pass of the two-layer network, producing raw class scores.

    theta1 and theta2 are the packed parameters of the hidden and output layer
    (split with unpack). Only the hidden layer goes through `activate`; the output
    layer is left linear.

    Returns (scores, cache) where cache = (data, activations, weight2) holds the
    values network_backward needs for back-propagation.
    '''
    hidden_weight, hidden_bias = unpack(theta1)
    hidden = activate(np.dot(data, hidden_weight) + hidden_bias)
    output_weight, output_bias = unpack(theta2)
    class_scores = np.dot(hidden, output_weight) + output_bias
    return class_scores, (data, hidden, output_weight)
def predict(scores):
    '''Classify each row of scores as the index of its highest-scoring class.'''
    winners = np.argmax(scores, axis=1)
    return winners
def errors(predictions, labels):
    '''Count the samples whose predicted class differs from its label.'''
    # a non-zero difference marks a misclassified sample
    mismatch = predictions - labels
    return np.count_nonzero(mismatch)
def error_rate(predictions, labels):
    '''Fraction of samples that are misclassified: errors divided by the number of labels.'''
    wrong = errors(predictions, labels)
    return wrong / len(labels)

In [4]:
# Classification error rate of the loaded parameters (theta1, theta2) on `data`.
# `scores` and `network_cache` stay as notebook-level names: later cells pass them
# to nll_loss and network_backward respectively.
scores, network_cache = network_forward(data, theta1, theta2)
predictions = predict(scores)
# Bare last expression, so the notebook shows the error rate as the cell output.
error_rate(predictions, labels)

0.0248

In [5]:
# negative log likelihood loss (convert class scores to probability via softmax function)
def nll_loss(scores, labels):
    '''
    Mean negative log-likelihood of the true labels under softmax(scores).

    scores: 2-D array with one row of class scores per sample.
    labels: integer class index of each sample (used to index the columns of scores).

    Returns (loss, cache) where cache = (scores, labels, exponents, total) is what
    d_nll_loss needs for back-propagation.
    '''
    N, _ = scores.shape
    # Shift every row so its maximum is 0: softmax is unchanged by the shift and
    # exp() can no longer overflow.
    normalized_scores = scores - np.max(scores, axis=1, keepdims=True)
    exponents = np.exp(normalized_scores)
    total = np.sum(exponents, axis=1)
    # log p(label) = shifted_score[label] - log(sum of exponents). Working in log space
    # keeps the loss finite even when the true-class exponent underflows to 0, where
    # log(exponent / total) would evaluate log(0) = -inf. `total` is at least 1 because
    # the row maximum contributes exp(0).
    log_probabilities = normalized_scores[np.arange(N), labels] - np.log(total)
    loss = np.mean(-log_probabilities)
    cache = (scores, labels, exponents, total) # for back-propagation
    return loss, cache

In [6]:
# Compute the loss of the unperturbed network on the scores from the forward pass above.
# `loss_cache` is kept for the backward pass through the loss (d_nll_loss).
loss, loss_cache = nll_loss(scores, labels)
# Bare last expression, so the notebook shows the loss as the cell output.
loss

0.086888560374750276

In [7]:
def d_nll_loss(cache):
    '''
    Gradient of the mean softmax negative log-likelihood w.r.t. the class scores.

    cache is the tuple (scores, labels, exponents, total) produced by nll_loss.
    Returns an array with the same shape as scores.
    '''
    scores, labels, exponents, total = cache
    N, _ = scores.shape
    rows = np.arange(N)
    # softmax probabilities, already scaled by 1/N for the mean over samples
    d_scores = exponents / (N * total.reshape(N, 1))
    # subtract the (equally scaled) one-hot target
    d_scores[rows, labels] -= 1 / N
    # nll_loss subtracts each row's maximum before exponentiating, so the gradient that
    # flows through that maximum is routed back to the arg-max entry. Mathematically
    # each row sums to zero, so this term only carries rounding error.
    row_sums = np.sum(d_scores, axis=1)
    d_scores[rows, np.argmax(scores, axis=1)] -= row_sums
    return d_scores
def network_backward(d_scores, cache):
    '''
    Back-propagate the gradient of the class scores to all network parameters.

    d_scores: gradient of the loss w.r.t. the class scores.
    cache: the tuple (data, activations, weight2) produced by network_forward.

    The hidden layer derivative is hard-coded as a * (1 - a), i.e. it is only valid
    when the forward pass used the logistic sigmoid as activation.

    Returns (d_weight1, d_bias1, d_weight2, d_bias2).
    '''
    data, activations, weight2 = cache

    # output (linear) layer
    grad_weight2 = np.dot(activations.T, d_scores)
    grad_bias2 = np.sum(d_scores, axis=0)

    # back through the hidden activation
    grad_activations = np.dot(d_scores, weight2.T)
    sigmoid_slope = activations * (1 - activations)
    grad_pre_activations = sigmoid_slope * grad_activations

    # hidden layer
    grad_weight1 = np.dot(data.T, grad_pre_activations)
    grad_bias1 = np.sum(grad_pre_activations, axis=0)
    return grad_weight1, grad_bias1, grad_weight2, grad_bias2

In [8]:
# Back-propagate: first through the loss to the class scores, then through the
# network to the weights and biases of both layers.
d_scores = d_nll_loss(loss_cache)
d_weight1, d_bias1, d_weight2, d_bias2 = network_backward(d_scores, network_cache)

To check the gradient calculation, monitor the maximum relative difference between the numerical gradient and the analytical gradient. The numerical gradient is computed by numerical approximation, while the analytical gradient is computed via back-propagation. The relative difference between two gradient values p and q is measured as $|p - q| / (|p| + |q|)$; a relative difference close to 0 indicates good agreement.

In [9]:
# check gradient w.r.t class scores
def f(scores):
    '''Loss as a function of the class scores alone (labels are held fixed).'''
    return nll_loss(scores, labels)[0]
# Report the largest entry among the values returned by check_gradient. The trailing
# 100 is passed through unchanged -- presumably the number of sampled entries; confirm
# against check_gradient.
print(max(check_gradient(f, scores, d_scores, 100).values()))

0.00161544264488


In [10]:
def pack(weight, bias):
    '''Stack bias on top of weight, i.e. the inverse of unpack: the bias becomes row 0.'''
    rows = (bias, weight)
    return np.vstack(rows)
# Re-assemble the gradients into the same packed layout as theta1 / theta2.
d_theta1, d_theta2 = pack(d_weight1, d_bias1), pack(d_weight2, d_bias2)

In [11]:
# check gradient w.r.t. theta 1
def f(theta1):
    '''Loss as a function of theta1 alone (data, theta2 and labels are held fixed).'''
    class_scores, _ = network_forward(data, theta1, theta2)
    return nll_loss(class_scores, labels)[0]
# Report the largest entry among the values returned by check_gradient. The trailing
# 10 is passed through unchanged -- presumably the number of sampled entries; confirm
# against check_gradient.
print(max(check_gradient(f, theta1, d_theta1, 10).values()))

2.61388111324e-05


In [12]:
# check gradient w.r.t. theta 2
def f(theta2):
    '''Loss as a function of theta2 alone (data, theta1 and labels are held fixed).'''
    class_scores, _ = network_forward(data, theta1, theta2)
    return nll_loss(class_scores, labels)[0]
# Report the largest entry among the values returned by check_gradient. The trailing
# 10 is passed through unchanged -- presumably the number of sampled entries; confirm
# against check_gradient.
print(max(check_gradient(f, theta2, d_theta2, 10).values()))

1.25883441278e-09


In [13]:
# statistics w.r.t. theta 1
# Summarise the magnitude (absolute value) of every entry of the theta1 gradient.
abs_d_theta1 = np.absolute(d_theta1)
mean, median, std = abs_d_theta1.mean(), np.median(abs_d_theta1), abs_d_theta1.std()
print('mean = %f' % mean, 'median = %f' % median, 'standard deviation = %f' % std, sep='\n')

mean = 0.000115
median = 0.000033
standard deviation = 0.000188


In [14]:
# statistics w.r.t. theta 2
# Summarise the magnitude (absolute value) of every entry of the theta2 gradient.
abs_d_theta2 = np.absolute(d_theta2)
mean, median, std = abs_d_theta2.mean(), np.median(abs_d_theta2), abs_d_theta2.std()
print('mean = %f' % mean, 'median = %f' % median, 'standard deviation = %f' % std, sep='\n')

mean = 0.000514
median = 0.000379
standard deviation = 0.000443


In [15]:
# Perturb every parameter by an independent factor drawn uniformly from [0.9, 1.1).
# The factors come from a dedicated generator with a fixed seed, so the perturbation --
# and every number derived from it in the following cells -- is reproducible on
# Restart & Run All, without touching NumPy's global random state.
perturbation_rng = np.random.RandomState(0)
randomized_theta1 = perturbation_rng.uniform(0.9, 1.1, theta1.shape) * theta1
randomized_theta2 = perturbation_rng.uniform(0.9, 1.1, theta2.shape) * theta2

In [16]:
# Forward pass and error rate with the perturbed parameters.
# Note: this rebinds `scores`, `network_cache` and `predictions`, replacing the values
# computed earlier from the unperturbed theta1 / theta2.
scores, network_cache = network_forward(data, randomized_theta1, randomized_theta2)
predictions = predict(scores)
# Bare last expression, so the notebook shows the error rate as the cell output.
error_rate(predictions, labels)

0.025

In [17]:
# Loss of the perturbed network; rebinds `loss` and `loss_cache` from the earlier run.
loss, loss_cache = nll_loss(scores, labels)
# Bare last expression, so the notebook shows the loss as the cell output.
loss

0.088757157114511237

In [18]:
# Back-propagate through the loss and the network for the perturbed parameters.
# Rebinds `d_scores` and the four parameter gradients from the earlier run.
d_scores = d_nll_loss(loss_cache)
d_weight1, d_bias1, d_weight2, d_bias2 = network_backward(d_scores, network_cache)

In [19]:
# statistics w.r.t. randomized theta 1
# Re-pack the gradient of the perturbed network (this rebinds d_theta1), then summarise
# the magnitude of its entries.
d_theta1 = pack(d_weight1, d_bias1)
abs_d_theta1 = np.absolute(d_theta1)
mean, median, std = abs_d_theta1.mean(), np.median(abs_d_theta1), abs_d_theta1.std()
print('mean = %f' % mean, 'median = %f' % median, 'standard deviation = %f' % std, sep='\n')

mean = 0.000178
median = 0.000039
standard deviation = 0.000306


In [20]:
# statistics w.r.t. randomized theta 2
# Re-pack the gradient of the perturbed network (this rebinds d_theta2), then summarise
# the magnitude of its entries.
d_theta2 = pack(d_weight2, d_bias2)
abs_d_theta2 = np.absolute(d_theta2)
mean, median, std = abs_d_theta2.mean(), np.median(abs_d_theta2), abs_d_theta2.std()
print('mean = %f' % mean, 'median = %f' % median, 'standard deviation = %f' % std, sep='\n')

mean = 0.000822
median = 0.000598
standard deviation = 0.000727


In [21]:
# dump internal results (for checking the non-vectorized implementation)
# NOTE: at this point scores, predictions, d_scores, d_theta1 and d_theta2 hold the
# values of the run with the randomized parameters, not of the original theta1 / theta2.
cache = (scores, predictions, d_scores, d_theta1, d_theta2)
# The context manager closes (and thereby flushes) the file even if pickling fails;
# the bare open() call left that to the garbage collector.
with open('internal', 'wb') as internal_file:
    pickle.dump(cache, internal_file)