In [1]:
import sys
sys.path.append("modules/")

import numpy as np
import pandas
import mnist_loader
import random
import network2 as network_basic
import network2_L1 as network_L1
import network2_earlyStop as network_earlyStop
import network2_earlyStop_modif as network_earlyStop_modif
import network2_learningSchedule as network_learningSchedule
import network2_momentumGradient as network_momentumGradient
import time
import matplotlib.pyplot as plt



In [2]:
# Load the MNIST splits; the loader's result is unpacked into the
# training, validation and test sets used by the cells below.
(training_data,
 validation_data,
 test_data) = mnist_loader.load_data_wrapper()

In [None]:
# Baseline: a [784, 30, 10] network with the cross-entropy cost and no
# regularization (lmbda = 0), monitoring cost and accuracy on both the
# training data and the validation data.
# (Positional SGD arguments are presumably epochs, mini-batch size and eta,
# as in the stock network2.py -- confirm against the module.)
net = network_basic.Network(
    [784, 30, 10],
    cost=network_basic.CrossEntropyCost,
)
unreg_run = net.SGD(
    training_data, 30, 10, 0.5,
    evaluation_data=validation_data,
    lmbda=0.,
    monitor_training_cost=True,
    monitor_training_accuracy=True,
    monitor_evaluation_cost=True,
    monitor_evaluation_accuracy=True,
)

In [None]:
# Let's see if we can outperform that with L1 (note that I'm using the network_L1 module)
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

# Train one network per lambda value.
# A new Network is constructed for every run so that each run starts from
# freshly initialised weights. Calling SGD three times on a single network
# would (assuming SGD continues from the network's current weights, as in the
# stock network2.py -- confirm for network_L1) start the lmbda=2 and lmbda=3
# runs from an already-trained model, which makes the curves incomparable
# with each other and with the unregularized baseline.
L1_runs = {}
for L1_lmbda in (1., 2., 3.):
    net = network_L1.Network([784, 30, 10], cost=network_L1.CrossEntropyCost)
    L1_runs[L1_lmbda] = net.SGD(
        training_data, 30, 10, 0.5,
        lmbda=L1_lmbda,
        evaluation_data=validation_data,
        monitor_evaluation_accuracy=True,
        monitor_evaluation_cost=True,
        monitor_training_accuracy=True,
        monitor_training_cost=True,
    )

# Keep the per-lambda names that the comparison cell below reads.
L1_run_lmbda_1 = L1_runs[1.]
L1_run_lmbda_2 = L1_runs[2.]
L1_run_lmbda_3 = L1_runs[3.]

In [None]:
# Now let's compare basic and L1: accuracy on the validation data per epoch.

# Element [1] of each SGD result is divided by the evaluation-set size to turn
# it into a fraction (presumably it holds per-epoch counts of correctly
# classified validation images out of 10000 -- confirm against the SGD return).
N_VALIDATION = 10000.

L1_acc_lmbda_1 = np.asarray(L1_run_lmbda_1[1]) / N_VALIDATION
L1_acc_lmbda_2 = np.asarray(L1_run_lmbda_2[1]) / N_VALIDATION
L1_acc_lmbda_3 = np.asarray(L1_run_lmbda_3[1]) / N_VALIDATION
unreg_acc = np.asarray(unreg_run[1]) / N_VALIDATION

# All four curves go on one figure; labels and legend make it readable on its own.
plt.plot(L1_acc_lmbda_1, color="red", label="L1, lmbda = 1")
plt.plot(L1_acc_lmbda_2, color="green", label="L1, lmbda = 2")  # the green one is the winner
plt.plot(L1_acc_lmbda_3, color="violet", label="L1, lmbda = 3")
plt.plot(unreg_acc, color="black", label="unregularized (baseline)")
plt.xlabel("epoch")
plt.ylabel("validation accuracy")
plt.legend(loc="lower right")
plt.show()

# It seems lambda=2 is consistently better than baseline

In [None]:
# The problem with attaching the cost derivative to the Network class is that it is not really a property of the network.
# The network's behavior depends on the cost function, and it is the cost function that has a derivative.
# Hence we cannot use a different cost function without modifying the network's code, which is not cute.

# A simpler problem has to do with cross-entropy function itself. 
# network.py uses C'(sigma)* sigma'(z) formula
# This piece of code: 
# delta = self.cost_derivative(activations[-1], y) * \
#     sigmoid_prime(zs[-1])
# which, while mathematically correct for cross-entropy, relies on floating-point arithmetic
# (very small numbers multiplied by very large numbers, etc.),
# even though delta for cross-entropy has a clean analytical solution.
# The new "cost" classes solve this problem by giving the analytical solution for delta directly (for each cost function).

In [None]:
# Early stopping, using the network_earlyStop module.
net = network_earlyStop.Network(
    [784, 30, 10],
    cost=network_earlyStop.CrossEntropyCost,
)

# Stop rule with early_stop_n = 10 (the run name carries the same number).
earlyStop_run_10 = net.SGD(
    training_data, 30, 10, 0.5,
    early_stop_n=10,
    lmbda=5.,
    evaluation_data=validation_data,
    monitor_training_cost=True,
    monitor_training_accuracy=True,
    monitor_evaluation_cost=True,
    monitor_evaluation_accuracy=True,
)

In [None]:
# Why is early stopping based on maximum a bad idea?
# The goal of early stopping is to answer a question: "Has the network stopped consistently improving on validation?"
# Early stopping based on "no-improvement in n" answers this question by comparing two maximums: 
# 1) Maximum t-n epochs
# 2) Maximum during last n epochs
# These maximums are drawn from two non-independent and complex random distributions. 
# A common sense about random distributions is that the more extreme the metric we want to measure, 
# the more samples we need. In our case number of observations is number of epochs, which is rather low.

# Imagine this case: during the first 7 epochs the quality increases. Then on the 8th epoch we get a surprise maximum.
# After that the quality keeps growing on average, but never exceeds that maximum.
# Thus we would stop the model at a suboptimal place.

# A better idea seems to be to stop when the average performance over some period becomes lower than the average performance over the previous period.
# This allows us to use more "samples" from our ever-changing quality distribution.
# So let's try average-by-epochs.

In [None]:
# Try out the averaging approach to early stopping, using the
# network_earlyStop_modif module.
net = network_earlyStop_modif.Network(
    [784, 30, 10],
    cost=network_earlyStop_modif.CrossEntropyCost,
)

# Stop rule with early_stop_n = 5.
# NOTE(review): the run name ends in "_4" while early_stop_n is 5 -- confirm
# which value was intended.
earlyStop_modif_run_4 = net.SGD(
    training_data, 30, 10, 0.5,
    early_stop_n=5,
    lmbda=5.,
    evaluation_data=validation_data,
    monitor_training_cost=True,
    monitor_training_accuracy=True,
    monitor_evaluation_cost=True,
    monitor_evaluation_accuracy=True,
)

In [None]:
# I'll leave this part for some further experimentation (let's do other coding first)

In [None]:
# Now let's test the learning schedule (network_learningSchedule module).
net = network_learningSchedule.Network([784, 30, 10], cost=network_learningSchedule.CrossEntropyCost)

# The result gets its own name. It was previously stored in
# `earlyStop_modif_run_4`, a copy-paste leftover that silently overwrote the
# result of the modified early-stopping run.
# eta_modif_n = 10 is the schedule parameter of this module's SGD
# (presumably the number of epochs after which eta is modified -- confirm).
learningSchedule_run = net.SGD(
    training_data, 30, 10, 0.5,
    lmbda=5.,
    eta_modif_n=10,
    evaluation_data=validation_data,
    monitor_evaluation_accuracy=True,
    monitor_evaluation_cost=True,
    monitor_training_accuracy=True,
    monitor_training_cost=True,
)

In [None]:
# Why not use gradient descent on eta or lambda?
# First things first: when searching for good hyperparameter value, that's precisely what you do.
# Of course you do that manually, but the procedure is still the same:
# 1) Change parameter
# 2) See how validation quality changed.
# 3) Guess the next best value

# The problem with lambda is actually logical, not performance-based:
# We add regularization to punish the model for large weights. The best way to decrease the cost
# (given that the regularization term is strictly non-negative) by changing lambda is to make lambda as small as possible.
# Mathematically it's just one more derivative in the fundamental equations, and it is always positive, so lambda will always decrease.
# It's pretty obvious that won't help us much :)

# Now for eta.
# A simple description of eta is "the parameter that regulates how far we move based on the current gradient estimate".
# If you look at all the cost functions, eta is not present in any of them, as it has nothing to do with the current value
# of the cost function - only with the "travelling process" itself.
# Is there some other, cute way to apply the gradient descent technique to eta?
# Maybe, but within the scope of this exercise, I'd say nothing simple/obvious.

In [None]:
# What could go wrong for momentum gradient if mu > 1?
# This would mean that each update is larger than the previous one, so we would speed up indefinitely.
# Steps are likely to get bigger and bigger, so no happy gradient descent for you - you are likely to run far, far away.

# What could go wrong if mu < 0?
# This means that every next step is reduced by the previous step, whichever direction we are trying to go.
# So at every step the update would try to go against the previous step.
# Not very productive :) Moreover, we are likely to get stuck: if the "first" step is large and the next one is small,
# we could circle the same spot on the cost surface forever.

In [None]:
# Let's check how momentum gradient works (network_momentumGradient module).
net = network_momentumGradient.Network([784, 30, 10], cost=network_momentumGradient.CrossEntropyCost)

# The result gets its own name. It was previously stored in
# `earlyStop_modif_run_4`, a copy-paste leftover that overwrote the results
# of earlier, unrelated experiments.
# mu = 0.7 is the momentum setting passed to this module's SGD.
momentumGradient_run = net.SGD(
    training_data, 30, 10, 0.5,
    lmbda=5.,
    mu=0.7,
    evaluation_data=validation_data,
    monitor_evaluation_accuracy=True,
    monitor_evaluation_cost=True,
    monitor_training_accuracy=True,
    monitor_training_cost=True,
)

Epoch 0 training complete
Cost on training data: 0.658974605222
Accuracy on training data: 46674 / 50000
Cost on evaluation data: 1.50337854875
Accuracy on evaluation data: 9386 / 10000

Epoch 1 training complete
Cost on training data: 0.677066438982
Accuracy on training data: 46982 / 50000
Cost on evaluation data: 1.77734659878
Accuracy on evaluation data: 9429 / 10000

Epoch 2 training complete
Cost on training data: 0.678816373362
Accuracy on training data: 47156 / 50000
Cost on evaluation data: 1.91102272231
Accuracy on evaluation data: 9464 / 10000

Epoch 3 training complete
Cost on training data: 0.657624192624
Accuracy on training data: 47549 / 50000
Cost on evaluation data: 1.9744890776
Accuracy on evaluation data: 9511 / 10000

In [None]:
# The last exercise is theoretical, so look in theory solutions for it