In [110]:
from math import floor
import numpy as np

from torchvision import datasets
from torchvision.transforms import ToTensor

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [34]:
# MNIST training split, stored under 'data' (download=True fetches it if needed).
# ToTensor() is attached as the per-sample transform; note that the training cells
# below read `train_data.data` directly and normalize the pixel values by hand.
train_data = datasets.MNIST(
    root='data',
    train=True,
    download=True,
    transform=ToTensor(),
)

# MNIST test split, configured the same way as the training split
test_data = datasets.MNIST(
    root='data',
    train=False,
    download=True,
    transform=ToTensor(),
)

In [35]:
# DEBUG CELLS
# Sanity check of the raw training set: container type of the images,
# then the shapes of the image tensor and of the label tensor (one per line).
print(
    type(train_data.data),
    train_data.data.shape,
    train_data.targets.shape,
    sep='\n',
)

<class 'torch.Tensor'>
torch.Size([60000, 28, 28])
torch.Size([60000])


In [36]:
# define the network structure
class Network(nn.Module):
    """Fully connected classifier: input -> 500 -> 300 -> 2 with ReLU after each hidden layer.

    Args:
        input_shape: number of features in each (already flattened) input vector.
    """

    def __init__(self, input_shape):
        super().__init__()
        # Two hidden layers followed by a two-unit output layer.
        self.fc1 = nn.Linear(in_features=input_shape, out_features=500)
        self.fc2 = nn.Linear(in_features=500, out_features=300)
        self.output = nn.Linear(in_features=300, out_features=2)

    def forward(self, x):
        """Return the raw output scores (no softmax applied) for the batch ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.output(hidden)

In [88]:
# length of one flattened 28x28 MNIST image; used as the input width of Network
input_shape = 784

# training hyperparameters
# number of passes over a task's training samples
n_epoch = 2
# step size given to the SGD optimizer
learning_rate = 0.001
# number of samples in each minibatch
minibatch_sz = 64

In [176]:
# Build the three objects the training loops rely on: loss, model and optimizer.

# cross-entropy loss applied to the network's output scores
criterion = nn.CrossEntropyLoss()

# freshly initialised model
network = Network(input_shape)

# plain SGD over this network's parameters
optimizer = optim.SGD(params=network.parameters(), lr=learning_rate)

##### We will perform task-wise training. A single task consists of two classes from the MNIST dataset.

In [39]:
# Each task is the pair of MNIST digit labels that belong to it.
# The five tasks together cover the ten digits 0-9 without overlap.
task1 = [0, 1]
task2 = [2, 3]
task3 = [4, 5]
task4 = [6, 7]
task5 = [8, 9]

##### Separate training and testing samples from each task. This is easier to work with.

In [46]:
# For every task, collect the positions in the training set whose label is one of
# the task's two digits. torch.where on a boolean mask returns a tuple of index
# tensors; element [0] holds the indices along the only dimension.
task1_tr_samples = torch.where(
    (train_data.targets == task1[0]) | (train_data.targets == task1[1])
)[0]

task2_tr_samples = torch.where(
    (train_data.targets == task2[0]) | (train_data.targets == task2[1])
)[0]

task3_tr_samples = torch.where(
    (train_data.targets == task3[0]) | (train_data.targets == task3[1])
)[0]

task4_tr_samples = torch.where(
    (train_data.targets == task4[0]) | (train_data.targets == task4[1])
)[0]

task5_tr_samples = torch.where(
    (train_data.targets == task5[0]) | (train_data.targets == task5[1])
)[0]

In [47]:
# Same selection as for the training set, applied to the test set: for every task,
# the positions whose label is one of the task's two digits.
task1_ts_samples = torch.where(
    (test_data.targets == task1[0]) | (test_data.targets == task1[1])
)[0]

task2_ts_samples = torch.where(
    (test_data.targets == task2[0]) | (test_data.targets == task2[1])
)[0]

task3_ts_samples = torch.where(
    (test_data.targets == task3[0]) | (test_data.targets == task3[1])
)[0]

task4_ts_samples = torch.where(
    (test_data.targets == task4[0]) | (test_data.targets == task4[1])
)[0]

task5_ts_samples = torch.where(
    (test_data.targets == task5[0]) | (test_data.targets == task5[1])
)[0]

### **Question 1**: The purpose of this question is to demonstrate the problem of catastrophic forgetting. For this purpose, we will train a single network on two different tasks in sequence. After training, evaluate the performance of the trained network on both tasks. What do you observe?

In [None]:
# train on task 1
# This code has been done for you as an example. Use the following code as a reference for subsequent codes you write.

# Number of full minibatches per epoch (the trailing partial batch is dropped).
# It does not change between epochs, so it is computed once.
n_batch = floor(task1_tr_samples.shape[0] / minibatch_sz)

for e in range(n_epoch):
    for b in range(n_batch):
        # dataset indices of the samples that make up minibatch b
        batch_idx = task1_tr_samples[(b*minibatch_sz):((b+1)*minibatch_sz)]
        x_batch = train_data.data[batch_idx]
        y_batch = train_data.targets[batch_idx]

        # flatten image before presenting to the network and normalize intensities to the range [0, 1]
        x_batch = torch.flatten(x_batch / 255, start_dim=1)

        # convert label to one hot; the width is fixed explicitly because F.one_hot
        # otherwise infers it from the largest label present in the batch, so a batch
        # holding only label 0 would produce a single column
        y_batch = F.one_hot(y_batch, num_classes=len(task1)).float()

        # backward() adds into the existing .grad buffers, so they must be cleared
        # before every step; otherwise gradients from all earlier batches pile up
        optimizer.zero_grad()

        y_hat_batch = network(x_batch)
        loss = criterion(y_hat_batch, y_batch)
        loss.backward()
        optimizer.step()

        print(f'Epoch {e}: {loss.item()}')

In [None]:
# test on Task 1 and compute the model's classification accuracy. This cell is for debugging. It will let you know whether your model has trained well or not.

# ADD YOUR CODE HERE

Accuracy = 99.57446808510639


In [None]:
# train on task 2

# ADD YOUR CODE HERE

In [182]:
# test on Task 2

# ADD YOUR CODE HERE

Accuracy = 90.26004728132388


In [183]:
# test on Task 1

# ADD YOUR CODE HERE

Accuracy = 97.58865248226951


### **Question 2**: The purpose of this question is to study the effect of replay on catastrophic forgetting. In this question, we will again train the network on two tasks in sequence. When we train the network on the second task, we will also use some samples from the first task for replay. To keep things simple, select a random proportion (say 50%) of the samples from the first task for replay. After training, evaluate the performance of the trained network on both tasks. What do you observe?

In [179]:
# Save some samples from previous tasks for replay
prop_saved = 0.5 # fraction of a task's training samples that is kept for replay

In [None]:
# create a new network
network = Network(input_shape)

# The optimizer keeps references to the parameters it was constructed with, so it
# has to be rebuilt for the new network; otherwise optimizer.step() would keep
# updating the previous model and this one would never be trained.
optimizer = optim.SGD(network.parameters(), lr=learning_rate)

In [None]:
# train on task 1. This is basically a copy of the code from above.
# NOTE: `optimizer` must have been constructed from the parameters of the current
# `network`; an optimizer built for an earlier network object will not train this one.

# Number of full minibatches per epoch (the trailing partial batch is dropped).
# It does not change between epochs, so it is computed once.
n_batch = floor(task1_tr_samples.shape[0] / minibatch_sz)

for e in range(n_epoch):
    for b in range(n_batch):
        # dataset indices of the samples that make up minibatch b
        batch_idx = task1_tr_samples[(b*minibatch_sz):((b+1)*minibatch_sz)]
        x_batch = train_data.data[batch_idx]
        y_batch = train_data.targets[batch_idx]

        # flatten image before presenting to the network and normalize intensities to the range [0, 1]
        x_batch = torch.flatten(x_batch / 255, start_dim=1)

        # convert label to one hot; the width is fixed explicitly because F.one_hot
        # otherwise infers it from the largest label present in the batch, so a batch
        # holding only label 0 would produce a single column
        y_batch = F.one_hot(y_batch, num_classes=len(task1)).float()

        # backward() adds into the existing .grad buffers, so they must be cleared
        # before every step; otherwise gradients from all earlier batches pile up
        optimizer.zero_grad()

        y_hat_batch = network(x_batch)
        loss = criterion(y_hat_batch, y_batch)
        loss.backward()
        optimizer.step()

        print(f'Epoch {e}: {loss.item()}')

In [180]:
# select random samples from task 1 for replay.

# number of task-1 training samples to keep
n_replay = int(prop_saved * task1_tr_samples.shape[0])

# Sample WITHOUT replacement: np.random.choice draws with replacement by default,
# which would store duplicates and leave fewer than `n_replay` distinct samples.
task1_replay = np.random.choice(task1_tr_samples.numpy(), n_replay, replace=False)

# from_numpy keeps the integer dtype of the index array instead of passing the
# indices through a float tensor and narrowing them to int32
task1_replay_samples = torch.from_numpy(task1_replay)

In [None]:
# train on task 2 with replay

# ADD YOUR CODE HERE

In [None]:
# Evaluate your model's performance on both task 1 and 2.

# ADD YOUR CODE HERE

# **Directions for further exploration**
We will not share solutions for these questions.

**Q1**: How does the proportion of samples saved for replay affect the model's performance?

**Q2**: Use replay to train the network on more than two tasks. What is the impact of replay on the memory used by your model? Note that a replay-based approach requires that you save the replay samples from previous tasks forever. This implies that the memory required to store these samples contributes to your model's memory footprint.

**Q3**: Can we choose replay samples more smartly so that we generate maximal impact while using minimal memory? For instance, can you use the network's predictions on a given task to identify which samples to store for replay?
