In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

def _plot_stat_curves(stats, keys, stat_names, stats_interval, y_label):
    """Plot the named statistics against epoch number on a new figure.

    Args:
        stats: 2D array of statistics as returned by `Optimiser.train`;
            column `keys[name]` holds the values of statistic `name`.
        keys: Mapping from statistic name to column index in `stats`.
        stat_names: Names of the statistics to draw, one line each.
        stats_interval: Number of epochs between successive rows of `stats`.
        y_label: Label for the y-axis.

    Returns:
        Tuple `(fig, ax)` of the created matplotlib figure and axes.
    """
    fig = plt.figure(figsize=(8, 4))
    ax = fig.add_subplot(111)
    # The first row of `stats` is not plotted; presumably it holds the
    # statistics recorded before any training was done - TODO confirm.
    epochs = np.arange(1, stats.shape[0]) * stats_interval
    for name in stat_names:
        ax.plot(epochs, stats[1:, keys[name]], label=name)
    ax.legend(loc=0)
    ax.set_xlabel('Epoch number')
    ax.set_ylabel(y_label)
    return fig, ax


def train_model_and_plot_stats(
        model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, notebook=True,scheduler=None):
    """Train `model` with an `Optimiser` and plot error and accuracy curves.

    Relies on `np` (numpy) and `Optimiser` being available as globals when
    the function is called; they are not imported alongside this definition.

    Args:
        model: Model to train.
        error: Error function passed to the optimiser.
        learning_rule: Learning rule passed to the optimiser.
        train_data: Training data provider.
        valid_data: Validation data provider.
        num_epochs: Number of epochs to train for.
        stats_interval: Number of epochs between recorded statistics.
        notebook: Forwarded to `Optimiser` unchanged.
        scheduler: Optional learning rate scheduler, forwarded to
            `Optimiser` unchanged.

    Returns:
        Tuple `(stats, keys, run_time, fig_1, ax_1, fig_2, ax_2)`: the three
        values returned by `Optimiser.train`, followed by the figure/axes of
        the error plot and the figure/axes of the accuracy plot.
    """
    # As well as monitoring the error over training also monitor classification
    # accuracy i.e. proportion of most-probable predicted classes being equal to targets
    data_monitors = {'acc': lambda y, t: (y.argmax(-1) == t.argmax(-1)).mean()}

    # Use the created objects to initialise a new Optimiser instance.
    optimiser = Optimiser(
        model, error, learning_rule, train_data, valid_data, data_monitors,
        notebook=notebook, scheduler=scheduler)

    # Run the optimiser for `num_epochs` epochs, recording statistics
    # every `stats_interval` epochs.
    stats, keys, run_time = optimiser.train(num_epochs=num_epochs, stats_interval=stats_interval)

    # Plot the change in the validation and training set error over training.
    fig_1, ax_1 = _plot_stat_curves(
        stats, keys, ['error(train)', 'error(valid)'], stats_interval, 'Error')

    # Plot the change in the validation and training set accuracy over training.
    fig_2, ax_2 = _plot_stat_curves(
        stats, keys, ['acc(train)', 'acc(valid)'], stats_interval, 'Accuracy')

    return stats, keys, run_time, fig_1, ax_1, fig_2, ax_2

In [2]:
# Build the data providers, random number generator and logger objects
# used by the training runs below. Loading the data from file takes a
# little while, so you will generally not want to re-create the data
# providers on every training run. To reset their state, use the
# .reset() method of the data providers instead.
import numpy as np
import logging
from mlp.data_providers import MNISTDataProvider, EMNISTDataProvider

# Seeded random number generator, shared by all three data providers
seed = 11102018
rng = np.random.RandomState(seed)
batch_size = 100

# Root logger reporting INFO-level messages about the training run
# through a single freshly created stream handler
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [logging.StreamHandler()]

# Create data provider objects for the EMNIST train / valid / test splits
# (constructed in that order)
train_data, valid_data, test_data = [
    EMNISTDataProvider(which_set, batch_size=batch_size, rng=rng)
    for which_set in ('train', 'valid', 'test')
]

KeysView(<numpy.lib.npyio.NpzFile object at 0x7f9ef01d2d68>)
KeysView(<numpy.lib.npyio.NpzFile object at 0x7f9ef01d2d68>)
KeysView(<numpy.lib.npyio.NpzFile object at 0x7f9ef01d23c8>)


In [3]:
############## SGD with scheduler with no restarts

from mlp.layers import AffineLayer, SoftmaxLayer, SigmoidLayer, ReluLayer, LeakyReluLayer
from mlp.errors import CrossEntropySoftmaxError
from mlp.models import MultipleLayerModel
from mlp.initialisers import ConstantInit, GlorotUniformInit
from mlp.learning_rules import AdamLearningRule,GradientDescentLearningRule
from mlp.optimisers import Optimiser
from mlp.schedulers import CosineAnnealingWithWarmRestarts

# Set up hyperparameters
learning_rate = 0.0075
# Factor by which the scheduler's min / max learning rates sit below / above
# `learning_rate`. Deliberately not called `range`: that would rebind the
# builtin `range` for every cell executed afterwards in this kernel.
learning_range = 10
num_epochs = 100
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 100

# Initialisers shared by every affine layer
weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.)

# Three ReLU hidden layers of `hidden_dim` units followed by an affine output layer
model = MultipleLayerModel([
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init),
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init),
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init),
    ReluLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
])

# The period length (100) equals `num_epochs`, so no restart should be reached
# during this run - assuming the scheduler advances once per epoch; TODO confirm.
sgd_scheduler_no_restart = CosineAnnealingWithWarmRestarts(
    min_learning_rate=learning_rate / learning_range,
    max_learning_rate=learning_rate * learning_range,
    total_iters_per_period=100,
    max_learning_rate_discount_factor=0.9,
    period_iteration_expansion_factor=1.0)

error = CrossEntropySoftmaxError()

# Use a basic gradient descent learning rule
learning_rule = GradientDescentLearningRule(learning_rate=learning_rate)

# Remember to use notebook=False when you write a script to be run in a terminal
_ = train_model_and_plot_stats(
    model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval,
    notebook=True, scheduler=sgd_scheduler_no_restart)

HBox(children=(IntProgress(value=0), HTML(value='')))

learning_rate 0.075
epoch 1


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 1: 10.0s to complete
    error(train)=9.91e-01, acc(train)=6.98e-01, error(valid)=1.01e+00, acc(valid)=6.91e-01


learning_rate 0.07498168105357779
epoch 2


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 2: 10.7s to complete
    error(train)=7.40e-01, acc(train)=7.66e-01, error(valid)=7.64e-01, acc(valid)=7.60e-01


learning_rate 0.07492674229289958
epoch 3


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 3: 9.9s to complete
    error(train)=6.21e-01, acc(train)=8.01e-01, error(valid)=6.56e-01, acc(valid)=7.89e-01


learning_rate 0.07483523793588934
epoch 4


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 4: 8.2s to complete
    error(train)=5.65e-01, acc(train)=8.16e-01, error(valid)=6.13e-01, acc(valid)=8.02e-01


learning_rate 0.0747072582863
epoch 5


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 5: 11.8s to complete
    error(train)=5.38e-01, acc(train)=8.20e-01, error(valid)=5.90e-01, acc(valid)=8.06e-01


learning_rate 0.07454292964459448
epoch 6


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 6: 10.2s to complete
    error(train)=5.20e-01, acc(train)=8.24e-01, error(valid)=5.88e-01, acc(valid)=8.04e-01


learning_rate 0.07434241418330258
epoch 7


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 7: 7.1s to complete
    error(train)=4.72e-01, acc(train)=8.38e-01, error(valid)=5.47e-01, acc(valid)=8.17e-01


learning_rate 0.074105909786976
epoch 8


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 8: 4.6s to complete
    error(train)=4.47e-01, acc(train)=8.48e-01, error(valid)=5.31e-01, acc(valid)=8.23e-01


learning_rate 0.07383364985690043
epoch 9


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 9: 5.6s to complete
    error(train)=4.34e-01, acc(train)=8.51e-01, error(valid)=5.30e-01, acc(valid)=8.24e-01


learning_rate 0.0735259030807565
epoch 10


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 10: 4.9s to complete
    error(train)=4.16e-01, acc(train)=8.56e-01, error(valid)=5.14e-01, acc(valid)=8.28e-01


learning_rate 0.07318297316745757
epoch 11


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 11: 4.4s to complete
    error(train)=3.98e-01, acc(train)=8.60e-01, error(valid)=5.08e-01, acc(valid)=8.31e-01


learning_rate 0.07280519854742562
epoch 12


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 12: 4.3s to complete
    error(train)=3.83e-01, acc(train)=8.66e-01, error(valid)=5.01e-01, acc(valid)=8.34e-01


learning_rate 0.07239295203860133
epoch 13


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 13: 4.4s to complete
    error(train)=3.75e-01, acc(train)=8.68e-01, error(valid)=5.01e-01, acc(valid)=8.35e-01


learning_rate 0.07194664047851779
epoch 14


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 14: 4.4s to complete
    error(train)=3.62e-01, acc(train)=8.70e-01, error(valid)=4.90e-01, acc(valid)=8.39e-01


learning_rate 0.07146670432280097
epoch 15


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 15: 4.5s to complete
    error(train)=3.65e-01, acc(train)=8.70e-01, error(valid)=5.11e-01, acc(valid)=8.34e-01


learning_rate 0.07095361721049315
epoch 16


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 16: 4.5s to complete
    error(train)=3.47e-01, acc(train)=8.75e-01, error(valid)=4.94e-01, acc(valid)=8.36e-01


learning_rate 0.07040788549662842
epoch 17


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 17: 5.1s to complete
    error(train)=3.44e-01, acc(train)=8.75e-01, error(valid)=5.04e-01, acc(valid)=8.32e-01


learning_rate 0.0698300477525214
epoch 18


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 18: 4.9s to complete
    error(train)=3.34e-01, acc(train)=8.80e-01, error(valid)=4.93e-01, acc(valid)=8.39e-01


learning_rate 0.06922067423426231
epoch 19


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 19: 5.3s to complete
    error(train)=3.27e-01, acc(train)=8.83e-01, error(valid)=4.97e-01, acc(valid)=8.40e-01


learning_rate 0.0685803663199431
epoch 20


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 20: 4.6s to complete
    error(train)=3.25e-01, acc(train)=8.83e-01, error(valid)=5.07e-01, acc(valid)=8.35e-01


learning_rate 0.06790975591616992
epoch 21


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 21: 4.6s to complete
    error(train)=3.27e-01, acc(train)=8.76e-01, error(valid)=5.11e-01, acc(valid)=8.28e-01


learning_rate 0.0672095048344475
epoch 22


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 22: 4.5s to complete
    error(train)=3.13e-01, acc(train)=8.85e-01, error(valid)=5.06e-01, acc(valid)=8.38e-01


learning_rate 0.06648030413805117
epoch 23


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 23: 5.0s to complete
    error(train)=3.11e-01, acc(train)=8.86e-01, error(valid)=5.12e-01, acc(valid)=8.38e-01


learning_rate 0.0657228734600308
epoch 24


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 24: 5.2s to complete
    error(train)=3.06e-01, acc(train)=8.87e-01, error(valid)=5.08e-01, acc(valid)=8.36e-01


learning_rate 0.0649379602930199
epoch 25


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 25: 4.7s to complete
    error(train)=2.85e-01, acc(train)=8.96e-01, error(valid)=5.04e-01, acc(valid)=8.38e-01


learning_rate 0.06412633925155058
epoch 26


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 26: 4.7s to complete
    error(train)=3.00e-01, acc(train)=8.87e-01, error(valid)=5.15e-01, acc(valid)=8.32e-01


learning_rate 0.06328881130760257
epoch 27


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 27: 4.7s to complete
    error(train)=2.73e-01, acc(train)=8.98e-01, error(valid)=5.02e-01, acc(valid)=8.39e-01


learning_rate 0.062426203000140575
epoch 28


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 28: 5.4s to complete
    error(train)=2.67e-01, acc(train)=9.01e-01, error(valid)=5.05e-01, acc(valid)=8.42e-01


learning_rate 0.061539365619420104
epoch 29


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 29: 5.3s to complete
    error(train)=2.67e-01, acc(train)=9.00e-01, error(valid)=5.09e-01, acc(valid)=8.39e-01


learning_rate 0.06062917436686675
epoch 30


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 30: 4.8s to complete
    error(train)=2.61e-01, acc(train)=9.03e-01, error(valid)=5.12e-01, acc(valid)=8.40e-01


learning_rate 0.059696527491358065
epoch 31


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 31: 5.1s to complete
    error(train)=2.64e-01, acc(train)=9.00e-01, error(valid)=5.24e-01, acc(valid)=8.37e-01


learning_rate 0.05874234540276035
epoch 32


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 32: 7.5s to complete
    error(train)=2.54e-01, acc(train)=9.05e-01, error(valid)=5.22e-01, acc(valid)=8.39e-01


learning_rate 0.05776756976359525
epoch 33


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 33: 9.6s to complete
    error(train)=2.49e-01, acc(train)=9.06e-01, error(valid)=5.28e-01, acc(valid)=8.40e-01


learning_rate 0.05677316255973253
epoch 34


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 34: 11.6s to complete
    error(train)=2.38e-01, acc(train)=9.11e-01, error(valid)=5.21e-01, acc(valid)=8.40e-01


learning_rate 0.055760105151026175
epoch 35


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 35: 9.3s to complete
    error(train)=2.40e-01, acc(train)=9.10e-01, error(valid)=5.36e-01, acc(valid)=8.36e-01


learning_rate 0.05472939730283067
epoch 36


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 36: 9.3s to complete
    error(train)=2.41e-01, acc(train)=9.10e-01, error(valid)=5.40e-01, acc(valid)=8.37e-01


learning_rate 0.05368205619935332
epoch 37


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 37: 9.0s to complete
    error(train)=2.31e-01, acc(train)=9.14e-01, error(valid)=5.42e-01, acc(valid)=8.37e-01


learning_rate 0.052619115439816226
epoch 38


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




KeyboardInterrupt: 

In [None]:
############## SGD with scheduler with special restarts

from mlp.layers import AffineLayer, SoftmaxLayer, SigmoidLayer, ReluLayer, LeakyReluLayer
from mlp.errors import CrossEntropySoftmaxError
from mlp.models import MultipleLayerModel
from mlp.initialisers import ConstantInit, GlorotUniformInit
from mlp.learning_rules import AdamLearningRule,GradientDescentLearningRule
from mlp.optimisers import Optimiser
from mlp.schedulers import CosineAnnealingWithWarmRestarts

# Set up hyperparameters
learning_rate = 0.0075
# Factor by which the scheduler's min / max learning rates sit below / above
# `learning_rate`. Deliberately not called `range`: that would rebind the
# builtin `range` for every cell executed afterwards in this kernel.
learning_range = 10
num_epochs = 100
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 100

# Initialisers shared by every affine layer
weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.)

# Three ReLU hidden layers of `hidden_dim` units followed by an affine output layer
model = MultipleLayerModel([
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init),
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init),
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init),
    ReluLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
])

# Restart variant: a first period of 25 iterations, with a period expansion
# factor of 3.0 and a max learning rate discount factor of 0.9.
sgd_scheduler_special = CosineAnnealingWithWarmRestarts(
    min_learning_rate=learning_rate / learning_range,
    max_learning_rate=learning_rate * learning_range,
    total_iters_per_period=25,
    max_learning_rate_discount_factor=0.9,
    period_iteration_expansion_factor=3.0)

error = CrossEntropySoftmaxError()

# Use a basic gradient descent learning rule
learning_rule = GradientDescentLearningRule(learning_rate=learning_rate)

# Remember to use notebook=False when you write a script to be run in a terminal
_ = train_model_and_plot_stats(
    model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval,
    notebook=True, scheduler=sgd_scheduler_special)

In [None]:
############# Adam baseline with no-restart scheduler

from mlp.layers import AffineLayer, SoftmaxLayer, SigmoidLayer, ReluLayer, LeakyReluLayer
from mlp.errors import CrossEntropySoftmaxError
from mlp.models import MultipleLayerModel
from mlp.initialisers import ConstantInit, GlorotUniformInit
from mlp.learning_rules import AdamLearningRule,GradientDescentLearningRule,RMSPropLearningRule
from mlp.optimisers import Optimiser
from mlp.schedulers import CosineAnnealingWithWarmRestarts

# Hyperparameters for this run
learning_rate = 5e-5
learning_range = 10
num_epochs = 100
stats_interval = 1
input_dim = 784
output_dim = 47
hidden_dim = 100

# Initialisers shared by every affine layer
weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.0)

# Three ReLU hidden layers of `hidden_dim` units followed by an affine output layer
model = MultipleLayerModel([
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init),
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init),
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init),
    ReluLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init),
])

# Scheduler bounds are `learning_rate` divided / multiplied by `learning_range`;
# the period length (100) equals `num_epochs`.
adam_scheduler_no_restart = CosineAnnealingWithWarmRestarts(
    min_learning_rate=learning_rate / learning_range,
    max_learning_rate=learning_rate * learning_range,
    total_iters_per_period=100,
    max_learning_rate_discount_factor=0.9,
    period_iteration_expansion_factor=1.0,
)

# Error function used for training
error = CrossEntropySoftmaxError()

# Use the Adam learning rule
learning_rule = AdamLearningRule(learning_rate=learning_rate)

# Remember to use notebook=False when you write a script to be run in a terminal
_ = train_model_and_plot_stats(
    model,
    error,
    learning_rule,
    train_data,
    valid_data,
    num_epochs,
    stats_interval,
    notebook=True,
    scheduler=adam_scheduler_no_restart,
)

In [None]:
############## Adam with scheduler with specified restarts

from mlp.layers import AffineLayer, SoftmaxLayer, SigmoidLayer, ReluLayer, LeakyReluLayer
from mlp.errors import CrossEntropySoftmaxError
from mlp.models import MultipleLayerModel
from mlp.initialisers import ConstantInit, GlorotUniformInit
from mlp.learning_rules import AdamLearningRule,GradientDescentLearningRule,RMSPropLearningRule
from mlp.optimisers import Optimiser
from mlp.schedulers import CosineAnnealingWithWarmRestarts

# Hyperparameters for this run
learning_rate = 5e-5
learning_range = 10
num_epochs = 100
stats_interval = 1
input_dim = 784
output_dim = 47
hidden_dim = 100

# Initialisers shared by every affine layer
weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.0)

# Three ReLU hidden layers of `hidden_dim` units followed by an affine output layer
model = MultipleLayerModel([
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init),
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init),
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init),
    ReluLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init),
])

# Restart variant: a first period of 25 iterations, with a period expansion
# factor of 3.0 and a max learning rate discount factor of 0.9.
adam_scheduler_special = CosineAnnealingWithWarmRestarts(
    min_learning_rate=learning_rate / learning_range,
    max_learning_rate=learning_rate * learning_range,
    total_iters_per_period=25,
    max_learning_rate_discount_factor=0.9,
    period_iteration_expansion_factor=3.0,
)

# Error function used for training
error = CrossEntropySoftmaxError()

# Use the Adam learning rule
learning_rule = AdamLearningRule(learning_rate=learning_rate)

# Remember to use notebook=False when you write a script to be run in a terminal
_ = train_model_and_plot_stats(
    model,
    error,
    learning_rule,
    train_data,
    valid_data,
    num_epochs,
    stats_interval,
    notebook=True,
    scheduler=adam_scheduler_special,
)