# Experiment with MLP via Various Non-VCL Approaches (e.g. MAP, LP, EWC, SI) on Split-MNIST Task

The models are configured in almost the same way (in terms of widths and depths) as in the VCL experiments, namely MLPs.

This notebook is mainly meant for **replicaing the experiments from the VARIATIONAL CONTINUAL LEARNING paper.**

## Model Definition and Data Preparation

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import copy

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
# Define non-vcl MLP with task heads 
class MLP(nn.Module):
    def __init__(self, input_dim=1*28*28, num_tasks=5, num_classes_per_task=2):
        super(MLP, self).__init__()
        # Shared layers
        self.shared_layers = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
        )

        # Task-specific heads
        self.task_heads = nn.ModuleList([
            nn.Sequential(
                nn.Linear(128, 64),
                nn.ReLU(),
                nn.Linear(64, num_classes_per_task)
            ) for _ in range(num_tasks)
        ])
        
    def forward(self, x, task_idx):
        x = x.view(x.size(0), -1)  # Flatten the input
        shared_output = self.shared_layers(x)
        task_output = self.task_heads[task_idx](shared_output)
        return F.log_softmax(task_output, dim=1)


# For justifying that CNN-3 can also perform well on Split MNIST, without VCL. 
class CNN(nn.Module):
    def __init__(self, in_channels, num_tasks=5, num_classes_per_task=2):
        super(CNN, self).__init__()
        self.shared_conv_layers = nn.Sequential(
            nn.Conv2d(in_channels, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )
        
        self.fc_input_dim = 256 * 3 * 3
        
        self.task_heads = nn.ModuleList([
            nn.Sequential(
                nn.Linear(self.fc_input_dim, 128),
                nn.ReLU(),
                nn.Linear(128, num_classes_per_task)
            ) for _ in range(num_tasks)
        ])
        
    def forward(self, x, task_idx):
        x = self.shared_conv_layers(x)
        x = x.view(-1, self.fc_input_dim)  # Flatten
        task_output = self.task_heads[task_idx](x)
        return F.log_softmax(task_output, dim=1)

In [3]:
# Get split MNIST dataset
from torch.utils.data import Subset
from torchvision.datasets import MNIST
from torchvision.transforms import Compose, ToTensor, Normalize

from util.transforms import Flatten, Scale

# transform = Compose([ToTensor(), Flatten(), Scale()])
transform = Compose([ToTensor(), Scale()])

# download dataset
mnist_train = MNIST(root="data", train=True, download=False, transform=transform)
mnist_test = MNIST(root="data", train=False, download=False, transform=transform)

label_to_task_mapping = {
    0: 0, 1: 0,
    2: 1, 3: 1,
    4: 2, 5: 2,
    6: 3, 7: 3,
    8: 4, 9: 4,
}

train_task_ids = torch.Tensor([label_to_task_mapping[y] for _, y in mnist_train])
test_task_ids = torch.Tensor([label_to_task_mapping[y] for _, y in mnist_test])

## MLE / MAP for Split MNIST with MLP

In [13]:
# Train on Split MNIST without any CL strategies.
import os
import json
import torch.optim as optim

from datetime import datetime
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter
from util.operations import task_subset

binarize_y = lambda y, task: (y == (2 * task + 1)).long()


def test_model(model, dataloader, device, task_idx):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in dataloader:
            images, labels = data[0].to(device), binarize_y(data[1], task_idx).to(device)
            outputs = model(images, task_idx)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    return accuracy


def train_mnist_split(model, log_name, mnist_train, mnist_test, train_task_ids, test_task_ids, device):
    """
    Trains a given model on a split MNIST dataset without continual learning strategies.

    """
    # Setup TensorBoard writer
    summary_logdir = os.path.join("logs", log_name, datetime.now().strftime('%b%d_%H-%M-%S'))
    summary_writer = SummaryWriter(summary_logdir)
    os.makedirs("out/models/", exist_ok=True)  # Ensure output directory exists
    experiment_path = f"out/experiments/{log_name}"
    os.makedirs(experiment_path, exist_ok=True)  # Ensure output directory exists
    accuracies = {}

    num_tasks = 5
    for task_idx in range(num_tasks):
        print(f"Training on task {task_idx}")
        task_dataset = task_subset(mnist_train, train_task_ids, task_idx)
        task_dataloader = DataLoader(task_dataset, batch_size=256, shuffle=True)

        test_dataset = task_subset(mnist_test, test_task_ids, task_idx)
        test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=False)

        optimizer = optim.Adam(model.parameters(), lr=0.001)

        model.train()
        for epoch in tqdm(range(100), desc=f"Epoch: "):
            for images, labels in task_dataloader:
                images, labels = images.to(device), binarize_y(labels, task_idx).to(device)
                optimizer.zero_grad()
                outputs = model(images, task_idx)
                loss = F.nll_loss(outputs, labels)
                loss.backward()
                optimizer.step()

            summary_writer.add_scalar(f'Task_{task_idx}/Train_Loss', loss.item(), epoch)

        task_accuracy = test_model(model, test_dataloader, device, task_idx)
        print(f"Test accuracy on task {task_idx}: {task_accuracy}%")
        summary_writer.add_scalar(f"Accuracy/task_{task_idx}", task_accuracy, global_step=task_idx)
        accuracies[f"TASK {task_idx}"] = task_accuracy

        for previous_task_idx in range(task_idx + 1):
            test_dataset = task_subset(mnist_test, test_task_ids, previous_task_idx)
            test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=False)
            
            accuracy = test_model(model, test_dataloader, device, previous_task_idx)
            print(f"Test accuracy on previous task {previous_task_idx}: {accuracy}%")
            summary_writer.add_scalar(f"Cross_Task_Accuracy/task_{task_idx}_on_{previous_task_idx}", accuracy, global_step=task_idx)
            accuracies[f"TASK {previous_task_idx}"] = task_accuracy
            
    # Save model state
    model_save_path = os.path.join("out/models/", f"{log_name}_model_final.pth")
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

    # Save accuracies to file
    accuracies_file = os.path.join(experiment_path, "final_accuracies.json")
    with open(accuracies_file, 'w') as f:
        json.dump(accuracies, f)
    print(f"Accuracies saved to {accuracies_file}")

    summary_writer.close()

## EWC Method for Split MNIST with MLP

In [42]:
# Test models with EWC method on split cifar10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def compute_fisher_information(model, dataloader, task_idx, device):
    model.eval()
    fisher_information = {}
    with torch.no_grad():
        for name, param in model.named_parameters():
            fisher_information[name] = torch.zeros_like(param.data)

        for data, target in dataloader:
            data, target = data.to(device), binarize_y(target, task_idx).to(device)
            with torch.enable_grad():  # Enables gradient calculation in this block
                model.zero_grad()
                output = model(data, task_idx)
                loss = F.nll_loss(output, target)
                loss.backward()
                for name, param in model.named_parameters():
                    if param.grad is not None:
                        fisher_information[name] += param.grad.data ** 2 / len(dataloader.dataset)
    return fisher_information

def modify_loss_function(original_loss, model, lambda_ewc, fisher_matrices, optimal_params):
    ewc_loss = 0
    for name, param in model.named_parameters():
        if name in fisher_matrices:
            fisher_matrix = fisher_matrices[name]
            optimal_param = optimal_params[name].to(device)
            ewc_loss += (fisher_matrix * (param - optimal_param) ** 2).sum()
    return original_loss + lambda_ewc / 2 * ewc_loss

def run_task_ewc(model, log_name, mnist_train, mnist_test, train_task_ids, test_task_ids, device, epochs, batch_size, lambda_ewc):
    """
    Trains a given model on MNIST split tasks using the Elastic Weight Consolidation (EWC) method.

    """    
    summary_writer = SummaryWriter(log_dir=os.path.join("logs", log_name, datetime.now().strftime('%Y%m%d_%H%M%S')))
    experiment_path = f"out/experiments/{log_name}"
    os.makedirs(experiment_path, exist_ok=True)  # Ensure output directory exists
    accuracies = {}
    
    previous_fisher_matrices = {}
    previous_optimal_params = {}

    for task_idx in range(5):
        print(f"Training on task {task_idx}")
        task_dataset = task_subset(mnist_train, train_task_ids, task_idx)
        train_loader = DataLoader(task_dataset, batch_size=batch_size, shuffle=True)
    
        test_dataset = task_subset(mnist_test, test_task_ids, task_idx)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        model.train()
        for epoch in tqdm(range(epochs), desc=f"Training Task {task_idx}"):
            for data, target in train_loader:
                data, target = data.to(device), binarize_y(target, task_idx).to(device)
                optimizer.zero_grad()
                output = model(data, task_idx)
                loss = F.nll_loss(output, target)
                if task_idx > 0:
                    ewc_loss = modify_loss_function(loss, model, lambda_ewc, previous_fisher_matrices, previous_optimal_params)
                    # print(f"Task {task_idx}'s ewc_loss: ", ewc_loss)
                    loss = ewc_loss
                loss.backward()
                optimizer.step()
        
            summary_writer.add_scalar(f'Task_{task_idx}/Train_Loss', loss.item(), epoch)

        test_accuracy = test_model(model, test_loader, device, task_idx)
        print(f"Test accuracy on task {task_idx}: {test_accuracy}%")
        summary_writer.add_scalar(f'Task_{task_idx}/Test_Accuracy', test_accuracy, epoch)
        accuracies[f"TASK {task_idx}"] = test_accuracy

        for previous_task_idx in range(task_idx + 1):
            test_dataset = task_subset(mnist_test, test_task_ids, previous_task_idx)
            test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=False)
            
            accuracy = test_model(model, test_dataloader, device, previous_task_idx)
            print(f"Test accuracy on previous task {previous_task_idx}: {accuracy}%")
            summary_writer.add_scalar(f"Cross_Task_Accuracy/task_{task_idx}_on_{previous_task_idx}", accuracy, global_step=task_idx)
            accuracies[f"TASK {previous_task_idx}"] = accuracy

        # Update for EWC (after model.eval())
        model.eval()
        fisher_information = compute_fisher_information(model, train_loader, task_idx, device)
        optimal_params = copy.deepcopy(model.state_dict())

        if task_idx == 0:
            previous_fisher_matrices = fisher_information
            previous_optimal_params = optimal_params
        else:
            for name in fisher_information:
                previous_fisher_matrices[name] += fisher_information[name]

    # Save accuracies to file
    import json
    accuracies_file = os.path.join(experiment_path, "final_accuracies.json")
    with open(accuracies_file, 'w') as f:
        json.dump(accuracies, f)
    print(f"Accuracies saved to {accuracies_file}")

    summary_writer.close()

In [43]:
lambda_ewc = 1  # follow the paper

model = MLP().to(device)
run_task_ewc(model, f"ewc_disc_s_mnist_lambda_ewc_{lambda_ewc}", mnist_train, mnist_test, train_task_ids, test_task_ids, device, epochs=100, batch_size=256, lambda_ewc=lambda_ewc)

Training on task 0


Training Task 0:   0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy on task 0: 99.95271867612293%
Test accuracy on previous task 0: 99.95271867612293%
Training on task 1


Training Task 1:   0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy on task 1: 99.36336924583742%
Test accuracy on previous task 0: 99.90543735224587%
Test accuracy on previous task 1: 99.36336924583742%
Training on task 2


Training Task 2:   0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy on task 2: 99.94663820704376%
Test accuracy on previous task 0: 98.39243498817967%
Test accuracy on previous task 1: 83.34965719882469%
Test accuracy on previous task 2: 99.94663820704376%
Training on task 3


Training Task 3:   0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy on task 3: 99.8489425981873%
Test accuracy on previous task 0: 99.66903073286052%
Test accuracy on previous task 1: 43.780607247796276%
Test accuracy on previous task 2: 82.23052294557097%
Test accuracy on previous task 3: 99.8489425981873%
Training on task 4


Training Task 4:   0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy on task 4: 99.3948562783661%
Test accuracy on previous task 0: 93.61702127659575%
Test accuracy on previous task 1: 38.00195886385896%
Test accuracy on previous task 2: 81.59018143009605%
Test accuracy on previous task 3: 89.82880161127895%
Test accuracy on previous task 4: 99.3948562783661%
Accuracies saved to out/experiments/ewc_disc_s_mnist_lambda_ewc_5000/final_accuracies.json


## Synaptic Intelligence (SI) for Split MNIST with MLP

In [44]:
def run_task_si(model, log_name, mnist_train, mnist_test, train_task_ids, test_task_ids, device, epochs, batch_size, c_si):
    """
    Trains a given model on MNIST split tasks using the Synaptic Intelligence (SI) method.
    
    """
    summary_writer = SummaryWriter(log_dir=os.path.join("logs", log_name, datetime.now().strftime('%Y%m%d_%H%M%S')))
    experiment_path = f"out/experiments/{log_name}"
    os.makedirs(experiment_path, exist_ok=True)  # Ensure output directory exists
    accuracies = {}
    
    # Initialize importance and previous parameters dictionaries
    importance = {}
    prev_params = {}
    
    for name, param in model.named_parameters():
        importance[name] = torch.zeros_like(param, device=device)
        prev_params[name] = param.clone().detach()

    for task_idx in range(5):
        print(f"Training on task {task_idx}")
        task_dataset = task_subset(mnist_train, train_task_ids, task_idx)
        train_loader = DataLoader(task_dataset, batch_size=batch_size, shuffle=True)

        test_dataset = task_subset(mnist_test, test_task_ids, task_idx)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        for epoch in tqdm(range(epochs), desc=f"Training Task {task_idx}"):
            model.train()
            for data, target in train_loader:
                data, target = data.to(device), binarize_y(target, task_idx).to(device)
                optimizer.zero_grad()
                output = model(data, task_idx)
                loss = F.nll_loss(output, target)
                loss.backward()
                optimizer.step()
                
                # Update SI term based on the parameter changes after the step
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        delta_param = param.detach() - prev_params[name]
                        importance[name] += (param.grad.detach() ** 2) * delta_param if param.grad is not None else 0.0
                
                # SI regularization
                si_term = sum((importance[name] * (param - prev_params[name]) ** 2).sum() for name, param in model.named_parameters())
                loss += c_si * si_term

            # Update previous parameters after each epoch
            prev_params = {name: param.clone().detach() for name, param in model.named_parameters()}

            summary_writer.add_scalar(f'Task_{task_idx}/Train_Loss', loss.item(), epoch)
            
        # Evaluate the model on the current task's test set
        test_accuracy = test_model(model, test_loader, device, task_idx)
        print(f"Test accuracy on task {task_idx}: {test_accuracy}%")
        summary_writer.add_scalar(f'Task_{task_idx}/Test_Accuracy', test_accuracy, epoch)
        accuracies[f"TASK {task_idx}"] = test_accuracy
        
        # Evaluate the model on all previous tasks' test sets to measure forgetting
        for previous_task_idx in range(task_idx + 1):
            test_dataset = task_subset(mnist_test, test_task_ids, previous_task_idx)
            test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=False)
            accuracy = test_model(model, test_dataloader, device, previous_task_idx)
            print(f"Test accuracy on previous task {previous_task_idx}: {accuracy}%")
            summary_writer.add_scalar(f"Cross_Task_Accuracy/task_{task_idx}_on_{previous_task_idx}", accuracy, global_step=task_idx)
            accuracies[f"TASK {previous_task_idx}"] = accuracy

    # Save accuracies to file
    accuracies_file = os.path.join(experiment_path, "final_accuracies.json")
    with open(accuracies_file, 'w') as f:
        json.dump(accuracies, f)
    print(f"Accuracies saved to {accuracies_file}")

    summary_writer.close()

In [45]:
epochs = 100
batch_size = 256
c_si = 1
log_name = f"si_disc_s_mnist_c_si_{c_si}" 
model = MLP().to(device)

run_task_si(
    model=model,
    log_name=log_name,
    mnist_train=mnist_train,
    mnist_test=mnist_test,
    train_task_ids=train_task_ids,
    test_task_ids=test_task_ids,
    device=device,
    epochs=epochs,
    batch_size=batch_size,
    c_si=c_si
)

Training on task 0


Training Task 0:   0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy on task 0: 99.95271867612293%
Test accuracy on previous task 0: 99.95271867612293%
Training on task 1


Training Task 1:   0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy on task 1: 99.60822722820764%
Test accuracy on previous task 0: 99.76359338061465%
Test accuracy on previous task 1: 99.60822722820764%
Training on task 2


Training Task 2:   0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy on task 2: 99.89327641408751%
Test accuracy on previous task 0: 94.51536643026004%
Test accuracy on previous task 1: 81.29285014691479%
Test accuracy on previous task 2: 99.89327641408751%
Training on task 3


Training Task 3:   0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy on task 3: 99.8489425981873%
Test accuracy on previous task 0: 98.91252955082743%
Test accuracy on previous task 1: 38.05093046033301%
Test accuracy on previous task 2: 77.00106723585913%
Test accuracy on previous task 3: 99.8489425981873%
Training on task 4


Training Task 4:   0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy on task 4: 99.24357034795764%
Test accuracy on previous task 0: 87.94326241134752%
Test accuracy on previous task 1: 42.6052889324192%
Test accuracy on previous task 2: 84.63180362860192%
Test accuracy on previous task 3: 89.72809667673717%
Test accuracy on previous task 4: 99.24357034795764%
Accuracies saved to out/experiments/si_disc_s_mnist_c_si_1/final_accuracies.json


## Laplace Propagation Method for Split MNIST Task with MLP

In [46]:
def compute_hessian_diag(model, dataloader, device, task_idx):
    model.eval()
    hessian_diag = {}
    for name, param in model.named_parameters():
        hessian_diag[name] = torch.zeros_like(param)

    for data, target in dataloader:
        data, target = data.to(device), binarize_y(target, task_idx).to(device)
        model.zero_grad()
        output = model(data, task_idx)
        loss = F.nll_loss(output, target)
        # Set allow_unused=True to handle parameters not used in the graph
        grad_params = torch.autograd.grad(loss, model.parameters(), create_graph=True, allow_unused=True)

        for grad, (name, param) in zip(grad_params, model.named_parameters()):
            if grad is not None:  # Only proceed if the gradient is not None
                grad2 = torch.autograd.grad(grad.sum(), param, retain_graph=True, allow_unused=True)[0]
                if grad2 is not None:  # Check if the second derivative is not None
                    hessian_diag[name] += grad2.data / len(dataloader.dataset)

    return hessian_diag


def run_task_lp(model, log_name, mnist_train, mnist_test, train_task_ids, test_task_ids, device, epochs, batch_size, gamma_lp):
    """
    Trains a given model on MNIST split tasks using a simplified Laplace Propagation method, approximated with second-order Taylor Expansion.
    
    """
    summary_writer = SummaryWriter(log_dir=os.path.join("logs", log_name, datetime.now().strftime('%Y%m%d_%H%M%S')))
    experiment_path = f"out/experiments/{log_name}"
    os.makedirs(experiment_path, exist_ok=True) 
    accuracies = {}
    
    # Initialize Hessian approximation (diagonal) and previous parameters
    hessian_diag = {}
    prev_params = {}

    for task_idx in range(5):
        print(f"Training on task {task_idx}")
        task_dataset = task_subset(mnist_train, train_task_ids, task_idx)
        train_loader = DataLoader(task_dataset, batch_size=batch_size, shuffle=True)

        test_dataset = task_subset(mnist_test, test_task_ids, task_idx)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        for epoch in tqdm(range(epochs), desc=f"Training Task {task_idx}"):
            model.train()
            for data, target in train_loader:
                data, target = data.to(device), binarize_y(target, task_idx).to(device)
                optimizer.zero_grad()
                output = model(data, task_idx)
                loss = F.nll_loss(output, target)
                
                if task_idx > 0:
                    # Calculate LP regularization term
                    lp_loss = 0
                    for name, param in model.named_parameters():
                        if name in hessian_diag:
                            lp_loss += (hessian_diag[name] * (param - prev_params[name]) ** 2).sum()
                    loss += gamma_lp * lp_loss
                
                loss.backward()
                optimizer.step()
            
            # Update Hessian approximation and previous parameters after each epoch
            if task_idx > 0:
                hessian_diag = compute_hessian_diag(model, train_loader, device, task_idx)
            prev_params = {name: param.clone().detach() for name, param in model.named_parameters()}

            summary_writer.add_scalar(f'Task_{task_idx}/Train_Loss', loss.item(), epoch)
            
        # Evaluate the model on the current task's test set
        test_accuracy = test_model(model, test_loader, device, task_idx)
        print(f"Test accuracy on task {task_idx}: {test_accuracy}%")
        summary_writer.add_scalar(f'Task_{task_idx}/Test_Accuracy', test_accuracy, epoch)
        accuracies[f"TASK {task_idx}"] = test_accuracy

        # Evaluate the model on all previous tasks' test sets to measure forgetting
        for previous_task_idx in range(task_idx + 1):
            test_dataset = task_subset(mnist_test, test_task_ids, previous_task_idx)
            test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=False)
            accuracy = test_model(model, test_dataloader, device, previous_task_idx)
            print(f"Test accuracy on previous task {previous_task_idx}: {accuracy}%")
            summary_writer.add_scalar(f"Cross_Task_Accuracy/task_{task_idx}_on_{previous_task_idx}", accuracy, global_step=task_idx)
            accuracies[f"TASK {previous_task_idx}"] = accuracy

    # Save accuracies to file
    import json
    accuracies_file = os.path.join(experiment_path, "final_accuracies.json")
    with open(accuracies_file, 'w') as f:
        json.dump(accuracies, f)
    print(f"Accuracies saved to {accuracies_file}")
    
    summary_writer.close()

In [47]:
epochs = 100 
batch_size = 256 
gamma_lp = 0.05
log_name = f"lp_s_MNIST_gamma_lp_{gamma_lp}" 

model = MLP().to(device)

run_task_lp(
    model=model,
    log_name=log_name,
    mnist_train=mnist_train,
    mnist_test=mnist_test,
    train_task_ids=train_task_ids,
    test_task_ids=test_task_ids,
    device=device,
    epochs=epochs,
    batch_size=batch_size,
    gamma_lp=gamma_lp
)

Training on task 0


Training Task 0:   0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy on task 0: 99.95271867612293%
Test accuracy on previous task 0: 99.95271867612293%
Training on task 1


Training Task 1:   0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy on task 1: 99.4613124387855%
Test accuracy on previous task 0: 96.97399527186761%
Test accuracy on previous task 1: 99.4613124387855%
Training on task 2


Training Task 2:   0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy on task 2: 99.73319103521878%
Test accuracy on previous task 0: 58.156028368794324%
Test accuracy on previous task 1: 82.85994123408423%
Test accuracy on previous task 2: 99.73319103521878%
Training on task 3


Training Task 3:   0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy on task 3: 99.8489425981873%
Test accuracy on previous task 0: 56.92671394799054%
Test accuracy on previous task 1: 40.00979431929481%
Test accuracy on previous task 2: 82.17716115261473%
Test accuracy on previous task 3: 99.8489425981873%
Training on task 4


Training Task 4:   0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy on task 4: 99.29399899142713%
Test accuracy on previous task 0: 88.60520094562648%
Test accuracy on previous task 1: 41.57688540646425%
Test accuracy on previous task 2: 91.40875133404482%
Test accuracy on previous task 3: 93.20241691842901%
Test accuracy on previous task 4: 99.29399899142713%
Accuracies saved to out/experiments/lp_s_MNIST_gamma_lp_0.05/final_accuracies.json
