In [1]:
import sys
sys.path.append("modules/")

import numpy as np
import pandas
import mnist_loader
import random
import network2 as network_basic
import network2_L1 as network_L1
import time
import matplotlib.pyplot as plt



In [2]:
# Read the MNIST splits once and unpack them into training / validation / test sets
mnist_splits = mnist_loader.load_data_wrapper()
training_data, validation_data, test_data = mnist_splits

In [3]:
# Baseline: train the plain network with regularization switched off (lmbda = 0)
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
net = network_basic.Network([784, 30, 10], cost=network_basic.CrossEntropyCost)

# Turn on all four monitoring flags used in this notebook, so the run reports
# cost and accuracy on both the training and the evaluation data after each epoch.
monitor_flags = dict(
    monitor_evaluation_cost=True,
    monitor_evaluation_accuracy=True,
    monitor_training_cost=True,
    monitor_training_accuracy=True,
)

# Positional arguments are presumably epochs=30, mini-batch size=10 and
# learning rate=0.5 -- TODO confirm against network2.Network.SGD.
unreg_run = net.SGD(
    training_data, 30, 10, 0.5,
    lmbda=0.,
    evaluation_data=validation_data,
    **monitor_flags
)

Epoch 0 training complete
Cost on training data: 0.378541756755
Accuracy on training data: 47171 / 50000
Cost on evaluation data: 0.367653999405
Accuracy on evaluation data: 9468 / 10000

Epoch 1 training complete
Cost on training data: 0.348681503234
Accuracy on training data: 47351 / 50000
Cost on evaluation data: 0.373106784191
Accuracy on evaluation data: 9454 / 10000

Epoch 2 training complete
Cost on training data: 0.286029317927
Accuracy on training data: 47984 / 50000
Cost on evaluation data: 0.326421274874
Accuracy on evaluation data: 9554 / 10000

Epoch 3 training complete
Cost on training data: 0.260231676439
Accuracy on training data: 48134 / 50000
Cost on evaluation data: 0.317277681569
Accuracy on evaluation data: 9573 / 10000

Epoch 4 training complete
Cost on training data: 0.246123257217
Accuracy on training data: 48274 / 50000
Cost on evaluation data: 0.306364225505
Accuracy on evaluation data: 9561 / 10000

Epoch 5 training complete
Cost on training data: 0.226486431

In [4]:
# Let's see if we can outperform that with L1 (note that I'm using the network_L1 module)
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

# Every lambda gets its OWN freshly initialised network.  Re-using a single
# `net` for all three SGD calls would (unless SGD re-initialises the weights,
# which is not expected) make the lmbda = 2 run start from the weights already
# trained with lmbda = 1, and the lmbda = 3 run from both -- the runs would then
# not be comparable with each other or with the baseline, which starts from
# scratch.
L1_runs = {}
for lmbda in (1., 2., 3.):
    net = network_L1.Network([784, 30, 10], cost=network_L1.CrossEntropyCost)
    # Same hyper-parameters as the unregularized run; only lmbda changes.
    L1_runs[lmbda] = net.SGD(
        training_data, 30, 10, 0.5,
        lmbda=lmbda,
        evaluation_data=validation_data,
        monitor_evaluation_accuracy=True,
        monitor_evaluation_cost=True,
        monitor_training_accuracy=True,
        monitor_training_cost=True
    )

# Keep the individual names that the comparison cell reads.
L1_run_lmbda_1 = L1_runs[1.]
L1_run_lmbda_2 = L1_runs[2.]
L1_run_lmbda_3 = L1_runs[3.]

Epoch 0 training complete
Cost on training data: 0.421899607467
Accuracy on training data: 46946 / 50000
Cost on evaluation data: 0.410431075387
Accuracy on evaluation data: 9412 / 10000

Epoch 1 training complete
Cost on training data: 0.321460277838
Accuracy on training data: 47732 / 50000
Cost on evaluation data: 0.346499753037
Accuracy on evaluation data: 9525 / 10000

Epoch 2 training complete
Cost on training data: 0.335817019534
Accuracy on training data: 47719 / 50000
Cost on evaluation data: 0.365840536448
Accuracy on evaluation data: 9503 / 10000

Epoch 3 training complete
Cost on training data: 0.298230475407
Accuracy on training data: 47948 / 50000
Cost on evaluation data: 0.33409734593
Accuracy on evaluation data: 9554 / 10000

Epoch 4 training complete
Cost on training data: 0.276524124615
Accuracy on training data: 48058 / 50000
Cost on evaluation data: 0.334651753482
Accuracy on evaluation data: 9543 / 10000

Epoch 5 training complete
Cost on training data: 0.2531486766

In [7]:
# Now let's compare basic and L1

# Number of examples in the evaluation (validation) set; matches the
# "/ 10000" shown in the training logs above.
N_EVALUATION = 10000.

# Index 1 of each run is presumably the per-epoch count of correctly classified
# evaluation examples (network2-style SGD return order) -- TODO confirm.
# Dividing by the set size turns the counts into accuracy fractions.
L1_acc_lmbda_1 = np.asarray(L1_run_lmbda_1[1]) / N_EVALUATION
L1_acc_lmbda_2 = np.asarray(L1_run_lmbda_2[1]) / N_EVALUATION
L1_acc_lmbda_3 = np.asarray(L1_run_lmbda_3[1]) / N_EVALUATION
unreg_acc = np.asarray(unreg_run[1]) / N_EVALUATION

# One labelled curve per run so the figure can be read without the code.
fig, ax = plt.subplots()
ax.plot(L1_acc_lmbda_1, color="red", label="L1, lmbda = 1")
ax.plot(L1_acc_lmbda_2, color="green", label="L1, lmbda = 2")
ax.plot(L1_acc_lmbda_3, color="violet", label="L1, lmbda = 3")
ax.plot(unreg_acc, color="black", label="unregularized (baseline)")
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy on evaluation data")
ax.set_title("L1 regularization vs. unregularized baseline")
ax.legend(loc="lower right")
plt.show()

# It seems all the lambdas are doing better than baseline (black line)

In [6]:
# The problem with attaching the cost derivative to the Network class is that it is not really part of the network.
# The network's behavior depends on the cost function, which in turn has its own derivative.
# Hence we cannot use a different cost function without modifying the network's code, which is not elegant.

# A simpler problem has to do with the cross-entropy function itself.
# network.py uses the formula delta = C'(a) * sigma'(z), where a = sigma(z).
# This is the piece of code:
# delta = self.cost_derivative(activations[-1], y) * \
#     sigmoid_prime(zs[-1])
# While this is mathematically correct for cross-entropy, it relies on Python's floating-point arithmetic
# to cancel the sigma'(z) factor, whereas delta for cross-entropy has a clean analytical solution (delta = a - y).

# The new class solves this problem by directly providing the analytical solution for delta (for each cost function).