# Incremental learning on image classification
**Ablation studies**

## Libraries and packages


In [1]:
# Print the status of the GPU attached to this runtime
!nvidia-smi

Tue Jun 23 15:50:32 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install the deep-learning stack with pinned versions
!pip3 install 'torch==1.4.0'
!pip3 install 'torchvision==0.5.0'
# NOTE(review): the two packages below are installed without a version pin
!pip3 install 'Pillow-SIMD'
!pip3 install 'tqdm'

In [2]:
import os
import urllib
import logging

import numpy as np

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import Dataset, Subset, DataLoader, ConcatDataset
from torch.backends import cudnn

import torchvision
from torchvision import transforms
from torchvision.models import resnet34

from PIL import Image
from tqdm import tqdm

from copy import deepcopy

from sklearn.metrics import confusion_matrix

In [3]:
# GitHub credentials for cloning private repository
username = ''
password = ''

# Download packages from repository
password = urllib.parse.quote(password)
!git clone https://$username:$password@github.com/manuelemacchia/incremental-learning-image-classification.git
password = ''

!mv -v incremental-learning-image-classification/* .
!rm -rf incremental-learning-image-classification README.md

Cloning into 'incremental-learning-image-classification'...
remote: Enumerating objects: 61, done.[K
remote: Counting objects: 100% (61/61), done.[K
remote: Compressing objects: 100% (56/56), done.[K
remote: Total 735 (delta 36), reused 11 (delta 5), pack-reused 674[K
Receiving objects: 100% (735/735), 3.37 MiB | 10.50 MiB/s, done.
Resolving deltas: 100% (395/395), done.
renamed 'incremental-learning-image-classification/data' -> './data'
renamed 'incremental-learning-image-classification/dist_targets_analisys_notebook.ipynb' -> './dist_targets_analisys_notebook.ipynb'
renamed 'incremental-learning-image-classification/icarlSVM.ipynb' -> './icarlSVM.ipynb'
renamed 'incremental-learning-image-classification/joint_training.ipynb' -> './joint_training.ipynb'
renamed 'incremental-learning-image-classification/losses' -> './losses'
renamed 'incremental-learning-image-classification/model' -> './model'
renamed 'incremental-learning-image-classification/notebook.ipynb' -> './notebook.ipyn

In [4]:
from data.cifar100 import Cifar100
from model.resnet_cifar import resnet32
from model.manager import Manager
from model.icarl import Exemplars
from model.icarl import iCaRL
from utils import plot

  import pandas.util.testing as tm


In [13]:
import pickle
import time
from google.colab import files

# Make sure the directory that holds the pickled objects exists.
# exist_ok=True makes the cell safe to re-run.
os.makedirs('obj', exist_ok=True)

def obj_save(obj, name):
    """Pickle `obj` to obj/<name>.pkl and start a Colab download of that file.

    Args:
        obj: object to serialize
        name: file name without directory and without the .pkl extension
    """
    pkl_path = 'obj/' + name + '.pkl'

    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(obj, pkl_file, pickle.HIGHEST_PROTOCOL)

    # Pause for five seconds between writing the file and requesting the download
    time.sleep(5)

    files.download(pkl_path)

def obj_load(name):
    """Load and return the object pickled in obj/<name>.pkl.

    Args:
        name: file name without directory and without the .pkl extension
    Returns:
        the unpickled object
    """
    pkl_path = 'obj/' + name + '.pkl'

    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # produced by this notebook.
    with open(pkl_path, 'rb') as pkl_file:
        loaded = pickle.load(pkl_file)

    return loaded

## Arguments

In [5]:
# --- Directories ---
DATA_DIR = 'data'                   # Directory where the dataset will be downloaded

# --- Settings ---
DEVICE = 'cuda'

# --- Dataset ---
RANDOM_STATE = None

# One random state per run, for reproducibility of results.
# Note: different random states give very different splits
# and therefore very different results.
RANDOM_STATES = [658, 423, 422]

NUM_CLASSES = 100                   # Total number of classes
CLASS_BATCH_SIZE = 10               # Size of batch of classes for incremental learning
NUM_BATCHES = NUM_CLASSES // CLASS_BATCH_SIZE   # Number of class batches (10)

VAL_SIZE = 0.1                      # Proportion of validation set with respect to
                                    # training set (between 0 and 1)

# --- Training ---
BATCH_SIZE = 64                     # Batch size (iCaRL sets this to 128)
LR = 2                              # Initial learning rate
MOMENTUM = 0.9                      # Momentum for stochastic gradient descent (SGD)
WEIGHT_DECAY = 1e-5                 # Weight decay from iCaRL

# Number of runs of every method.
# Note: this should be at least 3 to have a fair benchmark.
NUM_RUNS = 3

NUM_EPOCHS = 70                     # Total number of training epochs

# Step down policy from iCaRL (MultiStepLR):
# decrease the learning rate by gamma at each milestone.
MILESTONES = [49, 63]
GAMMA = 0.2                         # Gamma factor from iCaRL

## Data preparation

In [6]:
# Transformations for Learning Without Forgetting
train_transform = transforms.Compose([transforms.RandomCrop(32, padding=4),
                                      transforms.RandomHorizontalFlip(),
                                      transforms.ToTensor(), # Turn PIL Image to torch.Tensor
                                      transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

test_transform = transforms.Compose([transforms.ToTensor(),
                                     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))                                    
])

In [7]:
# One list of per-split datasets for every run:
# <name>_subsets[run_i][split_i] is the dataset of split `split_i` in run `run_i`.
train_subsets = [[] for _ in range(NUM_RUNS)]
val_subsets = [[] for _ in range(NUM_RUNS)]
test_subsets = [[] for _ in range(NUM_RUNS)]

# Every run needs its own random state
assert NUM_RUNS <= len(RANDOM_STATES), "define one random state per run"

for run_i in range(NUM_RUNS):
    run_random_state = RANDOM_STATES[run_i]

    # Iterate over the class batches (splits). The number of splits is
    # NUM_BATCHES, not CLASS_BATCH_SIZE (the number of classes per split).
    for split_i in range(NUM_BATCHES):
        # Download dataset only at first instantiation
        download = (run_i == 0 and split_i == 0)

        # Create CIFAR100 dataset
        train_dataset = Cifar100(DATA_DIR, train=True, download=download, random_state=run_random_state, transform=train_transform)
        test_dataset = Cifar100(DATA_DIR, train=False, download=False, random_state=run_random_state, transform=test_transform)

        # Training data: only the classes of the current split
        train_dataset.set_classes_batch(train_dataset.batch_splits[split_i])
        # Test data: the classes of every split up to and including the current one
        test_dataset.set_classes_batch([test_dataset.batch_splits[i] for i in range(0, split_i+1)])

        # Define train and validation indices
        train_indices, val_indices = train_dataset.train_val_split(VAL_SIZE, run_random_state)

        # Define subsets
        # NOTE(review): the validation subset wraps train_dataset, so it is
        # read through train_transform (random crop/flip) — confirm this is intended.
        train_subsets[run_i].append(Subset(train_dataset, train_indices))
        val_subsets[run_i].append(Subset(train_dataset, val_indices))
        test_subsets[run_i].append(test_dataset)

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to data/cifar-100-python.tar.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting data/cifar-100-python.tar.gz to data


## Classifiers

### K-nearest neighbors

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection  import ParameterGrid
from copy import deepcopy

class iCaRLwithKNN(iCaRL):
    """iCaRL variant whose final classification step is a scikit-learn
    k-nearest-neighbours classifier fitted on L2-normalized network features."""

    def _normalized_features(self, X):
        """Extract network features for X and scale each sample to unit L2 norm.

        Args:
            X (np.ndarray): samples in the format returned by dataset_to_numpy
        Returns:
            np.ndarray: one normalized feature vector per sample, on CPU
        """
        features = self.extract_features(torch.tensor(X, dtype=torch.float))

        # Vectorized form of dividing every sample by its own norm
        norms = features.flatten(1).norm(dim=1)
        features = features / norms.view(-1, *([1] * (features.dim() - 1)))

        return features.detach().to('cpu').numpy()

    def classifier_fit(self, train_dataset, val_dataset, params):
        """Fit classifier on the union of training dataset and exemplars,
        and validate it on val_dataset.

        Every parameter combination of `params` is fitted on the training
        features and scored by accuracy on val_dataset; the best classifier
        is stored in self.clf.

        Args:
            train_dataset: training set of the current split
            val_dataset: dataset used to score each parameter combination
            params: parameter grid accepted by sklearn's ParameterGrid
        """

        # Union of training dataset and exemplars
        exemplars_dataset = Exemplars(self.exemplars, self.train_transform)
        train_dataset_with_exemplars = ConcatDataset([exemplars_dataset, train_dataset])

        # Convert datasets to numpy format and extract normalized features
        # X contains samples, y contains labels
        X, y = self.dataset_to_numpy(train_dataset_with_exemplars)
        X_features = self._normalized_features(X)

        X_val, y_val = self.dataset_to_numpy(val_dataset)
        X_val_features = self._normalized_features(X_val)

        # Initialize classifier
        self.clf = KNeighborsClassifier()

        # Run validation
        best_clf = None
        best_grid = None
        best_score = 0

        for grid in ParameterGrid(params):
            self.clf.set_params(**grid)
            self.clf.fit(X_features, y)
            score = accuracy_score(y_val, self.clf.predict(X_val_features))

            # Always keep the first candidate: with a strict `>` against an
            # initial score of 0, a grid whose candidates all score 0 would
            # leave self.clf set to None.
            if best_clf is None or score > best_score:
                best_clf = deepcopy(self.clf)
                best_score = score
                best_grid = grid

        # Set the classifier to the best clf found in validation
        self.clf = best_clf

        print(f"Best classifier: {best_grid} with score {best_score}")

    def classifier_predict(self, test_dataset):
        """Predict labels of test_dataset.

        Returns:
            y_test (np.ndarray): ground-truth labels, in dataset order
            y_pred (np.ndarray): labels predicted by self.clf, in dataset order
        """

        X_test, y_test = self.dataset_to_numpy(test_dataset)

        # Extract features from the test set
        X_test_features = self._normalized_features(X_test)

        y_pred = self.clf.predict(X_test_features)

        return y_test, y_pred

    def dataset_to_numpy(self, dataset):
        """Materialize a dataset of (image, label) pairs as numpy arrays.

        Returns:
            X (np.ndarray): images, shape (len(dataset), 3, 32, 32)
            y (np.ndarray): integer labels, shape (len(dataset),)
        """
        # Preallocate arrays
        X = np.zeros((len(dataset), 3, 32, 32))
        y = np.zeros(len(dataset), dtype=int)

        # batch_size=1 and no shuffling: sample idx of the loader is sample idx of the dataset
        dataloader = DataLoader(dataset, batch_size=1)

        for idx, (image, labels) in enumerate(dataloader):
            X[idx] = image[0].numpy()
            y[idx] = labels.numpy()[0]

        return X, y

    def test_knn(self, test_dataset, train_dataset, params):
        """Test the model.

        Args:
            test_dataset: dataset on which to test the network
            train_dataset: training set used to train the last split
            params: parameter grid on which to perform hyperparameter tuning
        Returns:
            accuracy (float): accuracy of the model on the test set
            predictions (torch.Tensor): predicted labels, in dataset order
        """

        self.net.train(False)
        if self.best_net is not None: self.best_net.train(False)  # Set Network to evaluation mode
        if self.old_net is not None: self.old_net.train(False)

        with torch.no_grad():
            # Use test_dataset as validation set for hyperparameter tuning.
            # This is cheating, but we are not interested in tackling the
            # problem of classifier validation.
            self.classifier_fit(train_dataset, test_dataset, params)
            y_truth, y_pred = self.classifier_predict(test_dataset)

        # Calculate accuracy
        accuracy = accuracy_score(y_truth, y_pred)

        print(f"Test accuracy (iCaRL with KNN): {accuracy} ")

        return accuracy, torch.tensor(y_pred)

In [9]:
# Number of training epochs used by the KNN experiment below
NUM_EPOCHS = 70

In [10]:
# logs[run_i][split_i] is a dict with the metrics of that split
logs = [[] for _ in range(NUM_RUNS)]

# Hyperparameter grid explored for the KNN classifier
params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance']
}

for run_i in range(NUM_RUNS):
    # Start every run from a freshly initialized network
    net = resnet32()
    icarl_knn = iCaRLwithKNN(DEVICE, net, LR, MOMENTUM, WEIGHT_DECAY, MILESTONES, GAMMA, NUM_EPOCHS, BATCH_SIZE, train_transform, test_transform)

    for split_i in range(NUM_BATCHES):
        print(f"## Split {split_i} of run {run_i} ##")

        icarl_knn.incremental_train(split_i, train_subsets[run_i][split_i], val_subsets[run_i][split_i])

        # Ground-truth labels of the test set, in dataset order
        targets = torch.stack([label[0] for _, label in DataLoader(test_subsets[run_i][split_i])])

        logs[run_i].append({})

        # Test KNN classifier
        acc, preds = icarl_knn.test_knn(test_subsets[run_i][split_i], train_subsets[run_i][split_i], params)
        logs[run_i][split_i]['knn_accuracy'] = acc
        logs[run_i][split_i]['knn_conf_mat'] = confusion_matrix(targets.to('cpu'), preds.to('cpu'))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Validation loss: 0.059828482036079676, Validation accuracy: 0.6428571428571429
Epoch: 35, LR: [2]
Train loss: 0.08033870197463744, Train accuracy: 0.7428836633663366
Validation loss: 0.059602743812969754, Validation accuracy: 0.5825892857142857
Epoch: 36, LR: [2]
Train loss: 0.0803436139727583, Train accuracy: 0.7337561881188119
Validation loss: 0.05953434056469372, Validation accuracy: 0.5915178571428571
Epoch: 37, LR: [2]
Train loss: 0.08054419127431246, Train accuracy: 0.7252475247524752
Validation loss: 0.06684192376477378, Validation accuracy: 0.6160714285714286
Epoch: 38, LR: [2]
Train loss: 0.0799854500311436, Train accuracy: 0.7479888613861386
Validation loss: 0.06229854854089873, Validation accuracy: 0.5625
Epoch: 39, LR: [2]
Train loss: 0.08026272464211624, Train accuracy: 0.7501547029702971
Validation loss: 0.0575578792818955, Validation accuracy: 0.6584821428571429
Epoch: 40, LR: [2]
Train loss: 0.081415623795

In [17]:
# Accuracy matrix: one row per run, one column per split
test_accuracy = np.array([
    [logs[run_i][i]['knn_accuracy'] for i in range(10)]
    for run_i in range(NUM_RUNS)
])

# Per-split statistics across runs: column 0 is the mean, column 1 the standard deviation
test_accuracy_stats = np.stack(
    [test_accuracy.mean(axis=0), test_accuracy.std(axis=0)],
    axis=-1,
)

In [18]:
# One row per split: [mean accuracy across runs, standard deviation across runs]
test_accuracy_stats

array([[0.86466667, 0.02304103],
       [0.7525    , 0.027313  ],
       [0.67788889, 0.02464614],
       [0.60216667, 0.01992625],
       [0.52093333, 0.01208562],
       [0.48155556, 0.01866931],
       [0.45080952, 0.03085016],
       [0.40954167, 0.02563939],
       [0.38718519, 0.02158061],
       [0.34983333, 0.01723375]])

### Cosine linear layer

In [None]:
from math import sqrt

# distillation
class LFCLoss(nn.Module):
    """Classification loss plus a feature-distillation term.

    The classification term is a cross entropy on the network outputs. Once
    more than one batch of classes has been seen, a cosine embedding loss
    between the features of the new and of the old network is added, and the
    two terms are weighted by the proportion of new and old classes.

    Args:
        weight: unused, accepted for signature compatibility
        reduction: unused, accepted for signature compatibility
        lambda_base (float): base weight of the distillation term
            (2 here; the original comment notes that the paper uses 5)
        class_batch_size (int): number of classes added at every
            incremental step
    """

    def __init__(self, weight = None, reduction = 'mean', lambda_base = 2, class_batch_size = 10):
        super(LFCLoss, self).__init__()

        self.lambda_base = lambda_base
        self.class_batch_size = class_batch_size

        # Build the criteria once instead of at every forward call
        self.clf_criterion = nn.CrossEntropyLoss()
        self.dist_criterion = nn.CosineEmbeddingLoss()

    def forward(self, new_outputs, new_targets, new_features=None, old_features=None, num_classes=10):
        """
        Args:
            new_outputs (torch.tensor): Size = [batch, num_classes]. Outputs (logits) of the new network
            new_targets (torch.tensor): Size = [batch]. Class indices of the targets
                (nn.CrossEntropyLoss expects indices, not one hot vectors)
            new_features (torch.tensor): features of the batch from the new network
            old_features (torch.tensor): features of the batch from the old network
            num_classes (int): number of classes seen so far
        Returns:
            torch.tensor: scalar loss
        """

        clf_loss = self.clf_criterion(new_outputs, new_targets)

        # First batch of classes (or no features available): nothing to distill from
        if num_classes <= self.class_batch_size or new_features is None or old_features is None:
            return clf_loss

        num_old_classes = num_classes - self.class_batch_size
        cur_lambda = self.lambda_base * sqrt(num_classes/num_old_classes) # from paper

        # Target 1 for every sample: new and old features should point in the
        # same direction. Size and device follow the actual batch, so a
        # smaller last batch and non-CUDA devices work as well.
        same_direction = torch.ones(new_features.size(0), device=new_features.device)
        dist_loss = self.dist_criterion(new_features, old_features, same_direction)

        # Weight the two terms by the proportion of new and old classes
        clf = self.class_batch_size/num_classes
        dist = num_old_classes/num_classes

        loss = clf*clf_loss + dist*dist_loss*cur_lambda

        return loss

In [None]:
from model.resnet_cifar import resnet32cosine

class iCaRLwithCosine(iCaRL):
    """iCaRL variant trained with a cosine final layer: the criterion receives
    the features of the new and of the old network in addition to the outputs."""

    def do_batch(self, batch, labels):
        """Train network for a batch. Loss is applied here.
        Args:
            batch: batch of data used for training the network
            labels: targets of the batch
        Returns:
            loss: output of the criterion applied
            running_corrects: number of correctly classified elements
        """
        batch = batch.to(self.device)
        labels = labels.to(self.device)

        self.optimizer.zero_grad()

        num_classes = self.output_neurons_count()

        if self.old_net is None:
            outputs = self.net(batch)
            loss = self.criterion(outputs, labels)

        else:
            # Old features are a fixed target: evaluation mode, no autograd graph
            self.old_net.train(False)
            with torch.no_grad():
                old_net_batch_features = self.old_net(batch, features=True)

            # Take the new features straight from the network being trained.
            # extract_features() is not used here because it switches
            # self.net to evaluation mode and, when self.VALIDATE is set,
            # reads the features from best_net instead of self.net.
            new_net_batch_features = self.net(batch, features=True)

            outputs = self.net(batch)
            loss = self.criterion(outputs, labels, new_net_batch_features, old_net_batch_features, num_classes)

        # Get predictions
        _, preds = torch.max(outputs.data, 1)

        # Accuracy over NEW IMAGES, not over all images
        running_corrects = torch.sum(preds == labels.data).data.item()

        # Backward pass: computes gradients
        loss.backward()

        self.optimizer.step()

        return loss, running_corrects

    def extract_features(self, sample, batch=True, transform=None, old_net=False):
        """Extract features with the networks in evaluation mode.

        Args:
            sample: batch of tensors (batch=True) or single PIL image (batch=False)
            batch: whether `sample` is a batch
            transform: transform applied to the PIL image when batch is False
            old_net: if True use old_net; otherwise use best_net when
                self.VALIDATE is set, else net
        Returns:
            features of the batch, or of the single image when batch is False
        """
        assert not (batch is False and transform is None), "if a PIL image is passed to extract_features, a transform must be defined"

        self.net.train(False)
        if self.best_net is not None: self.best_net.train(False)
        if self.old_net is not None: self.old_net.train(False)

        if batch is False: # Treat sample as single PIL image
            sample = transform(sample)
            sample = sample.unsqueeze(0) # https://stackoverflow.com/a/59566009/6486336

        sample = sample.to(self.device)

        if old_net:
            features = self.old_net(sample, features=True)
        else:
            if self.VALIDATE:
                features = self.best_net(sample, features=True)
            else:
                features = self.net(sample, features=True)

        if batch is False:
            features = features[0]

        return features

    def test(self, test_dataset):
        """Test the network with its own final layer.

        Args:
            test_dataset: dataset on which to test the network
        Returns:
            accuracy (float): accuracy of the model on the test set
            all_preds (torch.Tensor): predicted labels, in dataset order
        """
        self.net.train(False)
        if self.best_net is not None: self.best_net.train(False) # Set Network to evaluation mode
        if self.old_net is not None: self.old_net.train(False)

        # No shuffling: the returned predictions must be in dataset order so
        # that callers can compare them with the dataset targets
        self.test_dataloader = DataLoader(test_dataset, batch_size=self.BATCH_SIZE, shuffle=False, num_workers=4)

        running_corrects = 0
        total = 0

        batch_preds = [] # to store all predictions

        for images, labels in self.test_dataloader:
            images = images.to(self.device)
            labels = labels.to(self.device)
            total += labels.size(0)

            # Forward Pass
            with torch.no_grad():
                if self.VALIDATE:
                    outputs = self.best_net(images)
                else:
                    outputs = self.net(images)

            # Get predictions
            _, preds = torch.max(outputs.data, 1)

            # Update Corrects
            running_corrects += torch.sum(preds == labels.data).data.item()

            # Store batch predictions
            batch_preds.append(preds.to(self.device))

        # Calculate accuracy
        accuracy = running_corrects / float(total)

        all_preds = torch.cat(batch_preds, dim=0) if batch_preds else torch.tensor([], dtype=torch.long)

        print(f"Test accuracy (Cosine): {accuracy}")

        return accuracy, all_preds

    def validate(self):
        """Compute loss and accuracy of self.net on self.val_dataloader.

        Returns:
            val_loss (float): criterion averaged over the validation batches
            val_accuracy (float): accuracy on the validation set
        """
        self.net.train(False)
        if self.old_net is not None: self.old_net.train(False)
        if self.best_net is not None: self.best_net.train(False)

        running_val_loss = 0
        running_corrects = 0
        total = 0
        batch_idx = 0

        # Validation needs no gradients: skip building the autograd graph
        with torch.no_grad():
            for images, labels in self.val_dataloader:
                images = images.to(self.device)
                labels = labels.to(self.device)
                total += labels.size(0)

                # New net forward pass
                outputs = self.net(images)
                # Criterion called without features
                loss = self.criterion(outputs, labels)

                running_val_loss += loss.item()

                # Get predictions
                _, preds = torch.max(outputs.data, 1)

                # Update the number of correctly classified validation samples
                running_corrects += torch.sum(preds == labels.data).data.item()

                batch_idx += 1

        # Calculate scores
        val_loss = running_val_loss / batch_idx
        val_accuracy = running_corrects / float(total)

        print(f"Validation loss: {val_loss}, Validation accuracy: {val_accuracy}")

        return val_loss, val_accuracy

    def increment_classes(self, n=10):
        """Add n classes in the final cosine layer."""

        old_fc = self.net.fc
        in_features = old_fc.in_features  # size of each input sample
        out_features = old_fc.out_features  # size of each output sample
        weight = old_fc.weight.data
        eta = old_fc.eta.data

        # Build the larger layer from the class of the current one: no
        # CosineLayer name is imported in this notebook.
        # NOTE(review): assumes the layer constructor takes
        # (in_features, out_features), as the previous call did.
        new_fc = type(old_fc)(in_features, out_features+n).to(weight.device)

        # Keep the weights learned for the old classes and the learned eta
        new_fc.weight.data[:out_features] = weight
        new_fc.eta.data = eta

        self.net.fc = new_fc

In [None]:
# Training settings used by the cosine layer experiment below
NUM_EPOCHS = 70
# Overrides the LR = 2 defined in the Arguments section
LR = 0.1

In [None]:
# logs[run_i][split_i] is a dict with the metrics of that split
logs = [[] for _ in range(NUM_RUNS)]

for run_i in range(NUM_RUNS):
    # Start every run from a freshly initialized network
    net = resnet32cosine()
    icarl_cosine = iCaRLwithCosine(DEVICE, net, LR, MOMENTUM, WEIGHT_DECAY, MILESTONES, GAMMA, NUM_EPOCHS, BATCH_SIZE, train_transform, test_transform)
    icarl_cosine.criterion = LFCLoss()

    for split_i in range(NUM_BATCHES):
        print(f"## Split {split_i} of run {run_i} ##")

        icarl_cosine.incremental_train(split_i, train_subsets[run_i][split_i], val_subsets[run_i][split_i])

        # Ground-truth labels of the test set, in dataset order.
        # NOTE(review): the confusion matrix below is meaningful only if
        # `preds` is returned in dataset order as well.
        targets = torch.stack([label[0] for _, label in DataLoader(test_subsets[run_i][split_i])])

        logs[run_i].append({})

        # Test Cosine layer classifier
        acc, preds = icarl_cosine.test(test_subsets[run_i][split_i])
        logs[run_i][split_i]['cosine_accuracy'] = acc
        logs[run_i][split_i]['cosine_conf_mat'] = confusion_matrix(targets.to('cpu'), preds.to('cpu'))