In [1]:
#!/usr/bin/env python3
from argparse import ArgumentParser
import numpy as np
import torch
from data import PermutedMNIST
from train import train
from model import MLP
import utils
import random
import matplotlib.pyplot as plt



# Global experiment settings shared by every cell below.

# Seed every RNG in scope so the task permutations (random.shuffle in
# get_permute_mnist) and any torch/NumPy randomness are repeatable across
# Restart & Run All.
random_seed = 0
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Mini-batch size handed to every DataLoader and forwarded to train(...).
batch_size = 128
# Forwarded to train(...); presumably the number of samples used for the
# Fisher estimate -- confirm in train.py.
fisher_estimation_sample_size = 2048
# Forwarded to train(...).
weight_decay = 0
# When True the first model is moved to the GPU; the flag is also forwarded to train(...).
cuda=False
# Number of permuted-MNIST tasks; also read as a global by the plotting helpers.
task_number = 5





In [2]:
def imshow(img,labels):
    """Show up to the first six images of a batch in a 2x3 grid.

    Args:
        img: indexable batch whose items support ``.view(28, 28)``
            (each item is reshaped to 28x28 before display).
        labels: indexable collection; ``labels[i]`` is formatted into the
            title of the i-th panel.

    Batches with fewer than six images are drawn as far as they go instead of
    raising an IndexError.
    """
    plt.figure()
    panel_count = min(6, len(img), len(labels))
    for i in range(panel_count):
        plt.subplot(2,3,i+1)
        plt.imshow(img[i].view(28,28), cmap='gray', interpolation='none')
        plt.title("Ground Truth: {}".format(labels[i]))
        # Hide the axis ticks; they carry no information for image panels.
        plt.xticks([])
        plt.yticks([])
    # One layout pass after all panels exist is enough.
    plt.tight_layout()
            
            
def get_permute_mnist(task_number):
    """Build one train and one test DataLoader per permuted-MNIST task.

    Task 0 uses the identity pixel order (``range(28 * 28)``); each later task
    uses the order obtained by shuffling the previous one with
    ``random.shuffle``. Train and test loader of a task share the same order.

    Args:
        task_number: number of tasks to create.

    Returns:
        ``(train_loader, test_loader)``: two dicts keyed ``0 .. task_number - 1``.
        Batch size comes from the module-level ``batch_size``.
    """
    train_loader = {}
    test_loader = {}
    idx = list(range(28 * 28))
    for i in range(task_number):
        # Hand each task its own snapshot of the permutation: `idx` is shuffled
        # in place below, and a dataset that kept a reference to the shared list
        # would otherwise silently end up with a later task's pixel order.
        permutation = list(idx)
        train_loader[i] = torch.utils.data.DataLoader(
            PermutedMNIST(train=True, permute_idx=permutation),
            batch_size=batch_size,
            num_workers=4)
        test_loader[i] = torch.utils.data.DataLoader(
            PermutedMNIST(train=False, permute_idx=permutation),
            batch_size=batch_size)
        random.shuffle(idx)
    return train_loader, test_loader


train_loader, test_loader = get_permute_mnist(task_number)

# Fetch a single batch per loader with next(iter(...)). Keeping an enumerate()
# object in a global would hold the DataLoader iterator (created with
# num_workers=4) alive for the rest of the session, and unpacking into `_`
# would shadow IPython's last-output variable.

#unpermuted data:
example_data_unpermuted, example_targets_unpermuted = next(iter(train_loader[0]))
imshow(example_data_unpermuted[:6], example_targets_unpermuted[:6])

#permuted data:
example_data_permuted, example_targets_permuted = next(iter(train_loader[1]))
imshow(example_data_permuted[:6], example_targets_permuted[:6])

NameError: name 'task_number' is not defined

In [None]:
# Possible refinement: drop the last 5 entries of each series, which early
# stopping ignores.
def loss_plot(x):
    """Plot the per-task series of ``x`` one after another on a running epoch axis.

    ``x`` is indexed by task number ``1 .. task_number`` (module-level global);
    each entry is a sequence with one value per epoch. Task k starts on the
    epoch right after task k-1 ended.
    """
    epochs_so_far = 0
    for task in range(1, task_number + 1):
        series = x[task]
        first_epoch = epochs_so_far + 1
        plt.plot(range(first_epoch, first_epoch + len(series)), series)
        epochs_so_far += len(series)
        
def accuracy_plot(x):
    """Plot every task's series of ``x`` right-aligned to the end of task 1's series.

    ``x`` is indexed by task number ``1 .. task_number`` (module-level global).
    The length of ``x[1]`` defines the last epoch on the x-axis; a shorter
    series for a later task therefore starts further to the right.
    """
    end_epoch = len(x[1]) + 1
    for task in range(1, task_number + 1):
        series = x[task]
        plt.plot(range(end_epoch - len(series), end_epoch), series)



In [None]:
def avg_precision_plot(precisions, labels=None):
    """Plot, per run, the precision averaged over all tasks seen so far.

    Args:
        precisions: iterable of runs; each run is indexed by task number
            (starting at 1) and holds one precision value per epoch, i.e. the
            shape of the first value returned by ``train``.
        labels: optional legend labels, used only when there is exactly one
            label per run. ``None`` (the default) means no labels.

    Reads the module-level ``task_number`` and ``epochs_per_task``. For epoch
    ``e`` the average covers tasks ``1 .. e // epochs_per_task + 1``; task ``i``
    contributes its value at offset ``e - (i - 1) * epochs_per_task``.
    """
    # Avoid a mutable default argument; an absent label list behaves like [].
    if labels is None:
        labels = []
    use_labels = len(labels) == len(precisions)
    total_epochs = task_number * epochs_per_task

    for num, precision in enumerate(precisions):
        avg_precisions = []
        for epoch in range(total_epochs):
            # 1 while the first task is training, 2 during the second, ...
            tasks_considered = epoch // epochs_per_task + 1
            summed = sum(
                precision[i][epoch - (i - 1) * epochs_per_task]
                for i in range(1, tasks_considered + 1)
            )
            avg_precisions.append(summed / tasks_considered)
        if use_labels:
            plt.plot(range(total_epochs), avg_precisions, label=labels[num])
        else:
            plt.plot(range(total_epochs), avg_precisions)

    plt.ylim(0.88, 0.94)
    # Only draw a legend when labelled curves exist; otherwise matplotlib
    # warns that there is nothing to put in it.
    if use_labels and len(labels) > 0:
        plt.legend()
    

In [None]:
#high learning rate, no dropout, no early stopping
# Hyper-parameters for the first configuration. They are plain globals that the
# model/training cells below read by name, so later cells that rebind them
# change every subsequent run.

# Forwarded to MLP(...); presumably the widths of the two hidden layers -- confirm in model.py.
hidden_size1=512
hidden_size2 = 256
# Last argument of MLP(...); presumably the EWC penalty strength -- confirm in model.py.
lamda=1000
# Learning rate forwarded to train(...).
lr=5.e-2
# Dropout probabilities forwarded to MLP(...); 0 here, matching the "no dropout" label above.
hidden_dropout_prob=0
input_dropout_prob=0
# Flags forwarded to train(...).
early_stopping = False
consolidate = False
# Forwarded to train(...); also read as a global by avg_precision_plot.
epochs_per_task =50

In [None]:
# prepare the model (no dropout, no early stopping configuration).
mlp_no_dropout_no_earlystopping = MLP(
    28*28, 10,
    hidden_size1,
    hidden_size2,
    hidden_dropout_prob,
    input_dropout_prob,
    lamda,
)

# initialize the weights.
utils.gaussian_initialize(mlp_no_dropout_no_earlystopping)

# prepare the cuda if needed.
# The model must be referenced by the name bound above; the previous
# `mlp_no_dropout` was never defined and raised NameError when cuda=True.
if cuda:
    mlp_no_dropout_no_earlystopping.cuda()


In [None]:
# run the standard experiment.
# Baseline run: `consolidate` is still False here (set in the hyper-parameter cell).
# train(...) returns four values, bound below as precision, total loss, CE loss
# and EWC loss; the first two are later indexed by task number (from 1) by
# accuracy_plot / loss_plot.
(standard_prec_no_dropout_no_earlystopping, 
 standard_total_loss_no_dropout_no_earlystopping,
 standard_ce_loss_no_dropout_no_earlystopping,
 standard_ewc_loss_no_dropout_no_earlystopping) = train(
    mlp_no_dropout_no_earlystopping, train_loader, test_loader,
    epochs_per_task,
    batch_size,
    consolidate,
    fisher_estimation_sample_size,
    lr,
    weight_decay,
    early_stopping,
    cuda
)

In [None]:
# Total loss of the baseline run (no dropout, no early stopping), one curve per
# task laid end-to-end on a running epoch axis.
loss_plot(standard_total_loss_no_dropout_no_earlystopping)

In [None]:
# Precision of the same baseline run, one curve per task, right-aligned to the
# end of task 1's series.
accuracy_plot(standard_prec_no_dropout_no_earlystopping)

In [None]:
# Fresh model with the same hyper-parameters as the baseline, for the run with
# consolidation switched on.
mlp_consolidation_no_dropout_no_earlystopping = MLP( 28*28, 10,
    hidden_size1,
    hidden_size2,
    hidden_dropout_prob,
    input_dropout_prob,
    lamda,
)

utils.gaussian_initialize(mlp_consolidation_no_dropout_no_earlystopping)

# run the consolidated ("ewc") experiment.
# NOTE: this rebinds the global `consolidate`; it stays True until a later cell resets it.
consolidate = True
(ewc_prec_no_dropout_no_earlystopping, 
 ewc_total_loss_no_dropout_no_earlystopping, 
 ewc_ce_loss_no_dropout_no_earlystopping, 
 ewc_ewc_loss_no_dropout_no_earlystopping) = train(
    mlp_consolidation_no_dropout_no_earlystopping, train_loader, test_loader,
    epochs_per_task,
    batch_size,
    consolidate,
    fisher_estimation_sample_size,
    lr,
    weight_decay,
    early_stopping,
    cuda
)

In [None]:
# Total loss of the consolidated run (no dropout, no early stopping), one curve per task.
loss_plot(ewc_total_loss_no_dropout_no_earlystopping)

In [None]:
# Per-task precision of the consolidated run (no dropout, no early stopping).
accuracy_plot(ewc_prec_no_dropout_no_earlystopping)

In [None]:
#high learning rate, dropout, no early stopping
# Rebinds the dropout globals; MLP(...) calls in later cells read these names,
# so every model built after this cell receives the new values.
hidden_dropout_prob = 0.5
input_dropout_prob = 0.2

In [None]:
# prepare the model (dropout enabled, no early stopping).
mlp_dropout_no_earlystopping = MLP(28*28, 10,
    hidden_size1,
    hidden_size2,
    hidden_dropout_prob,
    input_dropout_prob,
    lamda,
)

# initialize the weights.
utils.gaussian_initialize(mlp_dropout_no_earlystopping)

# run the standard experiment.
# `consolidate` was left True by the previous consolidated run, so it is reset
# here before the baseline call.
consolidate = False
(standard_prec_dropout_no_earlystopping,
 standard_total_loss_dropout_no_earlystopping,
 standard_ce_loss_dropout_no_earlystopping,
 standard_ewc_loss_dropout_no_earlystopping) = train(
    mlp_dropout_no_earlystopping, train_loader, test_loader,
    epochs_per_task,
    batch_size,
    consolidate,
    fisher_estimation_sample_size,
    lr,
    weight_decay,
    early_stopping,
    cuda
)

In [None]:
# Total loss of the baseline run with dropout (no early stopping), one curve per task.
loss_plot(standard_total_loss_dropout_no_earlystopping)

In [None]:
# Per-task precision of the baseline run with dropout (no early stopping).
accuracy_plot(standard_prec_dropout_no_earlystopping)

In [None]:
# Fresh model with the dropout hyper-parameters, for the consolidated run.
mlp_consolidation_dropout_no_earlystopping = MLP(28*28, 10,
    hidden_size1,
    hidden_size2,
    hidden_dropout_prob,
    input_dropout_prob,
    lamda,
)

utils.gaussian_initialize(mlp_consolidation_dropout_no_earlystopping)

# run the consolidated ("ewc") experiment.
# NOTE: rebinds the global `consolidate`; later cells see True until they reset it.
consolidate = True
(ewc_prec_dropout_no_earlystopping, 
 ewc_total_loss_dropout_no_earlystopping, 
 ewc_ce_loss_dropout_no_earlystopping, 
 ewc_ewc_loss_dropout_no_earlystopping) =train(
    mlp_consolidation_dropout_no_earlystopping, train_loader, test_loader,
    epochs_per_task,
    batch_size,
    consolidate,
    fisher_estimation_sample_size,
    lr,
    weight_decay,
    early_stopping,
    cuda
)

In [None]:
# Total loss of the consolidated run with dropout (no early stopping), one curve per task.
loss_plot(ewc_total_loss_dropout_no_earlystopping)

In [None]:
# Per-task precision of the consolidated run with dropout (no early stopping).
accuracy_plot(ewc_prec_dropout_no_earlystopping)

In [None]:
#dropout and early stopping
# Rebinds the global flag that the train(...) calls in later cells forward as
# their early_stopping argument; dropout settings are unchanged.
early_stopping = True

In [None]:
# Fresh model for the baseline run with dropout and early stopping.
mlp_dropout_earlystopping = MLP(28*28, 10,
    hidden_size1,
    hidden_size2,
    hidden_dropout_prob,
    input_dropout_prob,
    lamda,
)

utils.gaussian_initialize(mlp_dropout_earlystopping)

# run the standard experiment.
# `consolidate` was left True by the previous consolidated run; reset it for the baseline.
consolidate = False
(standard_prec_dropout_earlystopping, 
 standard_total_loss_dropout_earlystopping,
 standard_ce_loss_dropout_earlystopping,
 standard_ewc_loss_dropout_earlystopping) = train(
    mlp_dropout_earlystopping, train_loader, test_loader,
    epochs_per_task,
    batch_size,
    consolidate,
    fisher_estimation_sample_size,
    lr,
    weight_decay,
    early_stopping,
    cuda
)

In [None]:
# Total loss of the baseline run with dropout and early stopping, one curve per task.
loss_plot(standard_total_loss_dropout_earlystopping)

In [None]:
# Per-task precision of the baseline run with dropout and early stopping.
accuracy_plot(standard_prec_dropout_earlystopping)

In [None]:
# Fresh model for the consolidated run with dropout and early stopping.
mlp_consolidation_dropout_earlystopping = MLP(28*28, 10,
    hidden_size1,
    hidden_size2,
    hidden_dropout_prob,
    input_dropout_prob,
    lamda,
)

utils.gaussian_initialize(mlp_consolidation_dropout_earlystopping)

# run the consolidated ("ewc") experiment.
# NOTE: rebinds the global `consolidate`; later cells see True until they reset it.
consolidate = True
(ewc_prec_dropout_earlystopping,
 ewc_total_loss_dropout_earlystopping, 
 ewc_ce_loss_dropout_earlystopping,
 ewc_ewc_loss_dropout_earlystopping) =train(
    mlp_consolidation_dropout_earlystopping, train_loader, test_loader,
    epochs_per_task,
    batch_size,
    consolidate,
    fisher_estimation_sample_size,
    lr,
    weight_decay,
    early_stopping,
    cuda
)

In [None]:
# Total loss of the consolidated run with dropout and early stopping, one curve per task.
loss_plot(ewc_total_loss_dropout_earlystopping)

In [None]:
# Per-task precision of the consolidated run with dropout and early stopping.
accuracy_plot(ewc_prec_dropout_earlystopping)

In [None]:
#dropout, early stopping, wider layers, higher lambda
# Also lowers the learning rate from 5.e-2 to 1.e-2.
lr = 1.e-2
hidden_size1 = 1600
hidden_size2 = 1400
lamda = 2000
# NOTE: task_number and epochs_per_task are read as globals by loss_plot,
# accuracy_plot and avg_precision_plot. After this cell those helpers iterate
# over 10 tasks / 75 epochs per task, so they no longer match the 5-task
# results produced by the cells above.
task_number = 10
epochs_per_task = 75
# Rebuild the loaders for 10 tasks. This replaces the earlier loaders and draws
# new permutations via random.shuffle.
train_loader, test_loader = get_permute_mnist(task_number)

In [None]:
# Question for this configuration: do a higher lambda, a larger network and
# more epochs per task give better long-term results?
mlp_optimised = MLP(28*28, 10,
    hidden_size1,
    hidden_size2,
    hidden_dropout_prob,
    input_dropout_prob,
    lamda,
)

utils.gaussian_initialize(mlp_optimised)

# run the standard experiment.
# `consolidate` was left True by the previous consolidated run; reset it for the baseline.
consolidate = False
(standard_prec_optimised,
 standard_total_loss_optimised,
 standard_ce_loss_optimised, 
 standard_ewc_loss_optimised) =train(
    mlp_optimised, train_loader, test_loader,
    epochs_per_task,
    batch_size,
    consolidate,
    fisher_estimation_sample_size,
    lr,
    weight_decay,
    early_stopping,
    cuda
)

In [None]:
# Total loss of the baseline run in the "optimised" configuration, one curve per task.
loss_plot(standard_total_loss_optimised)

In [None]:
# Per-task precision of the baseline run in the "optimised" configuration.
accuracy_plot(standard_prec_optimised)

In [None]:
# Same question as the baseline cell for this configuration (higher lambda,
# larger network, more epochs), now with consolidation switched on.
mlp_consolidation_optimised = MLP(28*28, 10,
    hidden_size1,
    hidden_size2,
    hidden_dropout_prob,
    input_dropout_prob,
    lamda,
)

utils.gaussian_initialize(mlp_consolidation_optimised)

# run the consolidated ("ewc") experiment.
# NOTE: rebinds the global `consolidate`; later cells see True until they reset it.
consolidate = True
(ewc_prec_optimised, 
 ewc_total_loss_optimised,
 ewc_ce_loss_optimised, 
 ewc_ewc_loss_optimised) =train(
    mlp_consolidation_optimised, train_loader, test_loader,
    epochs_per_task,
    batch_size,
    consolidate,
    fisher_estimation_sample_size,
    lr,
    weight_decay,
    early_stopping,
    cuda
)

In [None]:
# Total loss of the consolidated run in the "optimised" configuration, one curve per task.
loss_plot(ewc_total_loss_optimised)

In [None]:
# Per-task precision of the consolidated run in the "optimised" configuration.
accuracy_plot(ewc_prec_optimised)

In [None]:
#compare performance on first task:
# Every *_prec_* result is indexed by task number starting at 1 (accuracy_plot
# reads x[1] as the first task's series), so entry [1] is plotted here rather
# than the whole container. This also fixes the misspelled
# `standard_prec_dropopout_earlystopping`, which raised NameError.
first_task_runs = [
    (standard_prec_no_dropout_no_earlystopping, 'std_no_drop_no_stop'),
    (ewc_prec_no_dropout_no_earlystopping, 'ewc_no_drop_no_stop'),
    (standard_prec_dropout_no_earlystopping, 'std_drop_no_stop'),
    (ewc_prec_dropout_no_earlystopping, 'ewc_drop_no_stop'),
    (standard_prec_dropout_earlystopping, 'std_drop_stop'),
    (ewc_prec_dropout_earlystopping, 'ewc_drop_stop'),
    (standard_prec_optimised, 'std_opt'),
    (ewc_prec_optimised, 'ewc_opt'),
]

fig = plt.figure(figsize = (20,10))
plt.ylim(0.65,1)
for run_prec, run_label in first_task_runs:
    first_task_prec = run_prec[1]
    # x-axis counts evaluation points of task 1, starting at 1.
    plt.plot(range(1, len(first_task_prec)+1), first_task_prec, label = run_label)
plt.legend()

In [None]:
# Raise lambda further (2000 -> 3000); lr keeps the value 1.e-2 from the previous configuration.
lamda = 3000
lr = 1.e-2
# Question: does an even higher lambda (with the larger network and more
# epochs) give better long-term results?
mlp_consolidation_optimised2 = MLP(28*28, 10,
    hidden_size1,
    hidden_size2,
    hidden_dropout_prob,
    input_dropout_prob,
    lamda,
)

utils.gaussian_initialize(mlp_consolidation_optimised2)

# run the consolidated ("ewc") experiment.
consolidate = True
(ewc_prec_optimised2, 
 ewc_total_loss_optimised2,
 ewc_ce_loss_optimised2, 
 ewc_ewc_loss_optimised2) =train(
    mlp_consolidation_optimised2, train_loader, test_loader,
    epochs_per_task,
    batch_size,
    consolidate,
    fisher_estimation_sample_size,
    lr,
    weight_decay,
    early_stopping,
    cuda
)

In [None]:
# Total loss of the consolidated run with lamda = 3000, one curve per task.
loss_plot(ewc_total_loss_optimised2)

In [None]:
# Per-task precision of the consolidated run with lamda = 3000.
accuracy_plot(ewc_prec_optimised2)

In [None]:
# This variant goes the other way: lambda is lowered to 400 and the learning
# rate is raised back to 5.e-2, with the same large network and epoch count.
lamda = 400
lr = 5.e-2
# NOTE(review): this cell reuses the `*_optimised2` names, so the model and
# results of the lamda = 3000 run above are overwritten once it executes.
mlp_consolidation_optimised2 = MLP(28*28, 10,
    hidden_size1,
    hidden_size2,
    hidden_dropout_prob,
    input_dropout_prob,
    lamda,
)

utils.gaussian_initialize(mlp_consolidation_optimised2)

# run the consolidated ("ewc") experiment.
consolidate = True
(ewc_prec_optimised2, 
 ewc_total_loss_optimised2,
 ewc_ce_loss_optimised2, 
 ewc_ewc_loss_optimised2) =train(
    mlp_consolidation_optimised2, train_loader, test_loader,
    epochs_per_task,
    batch_size,
    consolidate,
    fisher_estimation_sample_size,
    lr,
    weight_decay,
    early_stopping,
    cuda
)

In [None]:
# Total loss of the most recent `*_optimised2` run (lamda = 400 when cells are run in order).
loss_plot(ewc_total_loss_optimised2)

In [None]:
# Per-task precision of the most recent `*_optimised2` run (lamda = 400 when cells are run in order).
accuracy_plot(ewc_prec_optimised2)