In [69]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import numpy as np
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
import torch.utils.data as data
from torch.utils.data.sampler import Sampler
import math
import copy
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

!pip install faiss-gpu
import faiss

torch.manual_seed(0)



<torch._C.Generator at 0x7f23b4a07d98>

The model used in this analysis is a simple cnn model with 2 convolutional layer and two fully connected layers. The model is split into features, classifier and top_layer to mimic the architecture used in the original paper. See [here](https://github.com/facebookresearch/deepcluster/tree/master/models)

In [70]:
class SimpleCnn(nn.Module):
    
    def __init__(self, k=10):
        
        super(SimpleCnn, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(8),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        
        self.classifier = nn.Sequential(
            nn.Linear(7*7*16, 64),
            nn.ReLU()
        )

        self.top_layer = nn.Linear(64, k)
        self._initialize_weights()
    
    def forward(self, x):
        
        out = self.features(x)
        out = out.reshape(out.size(0), -1)
        out = self.classifier(out)
        if self.top_layer:
            out = self.top_layer(out)
        return out
    
    def _initialize_weights(self):
        for y, m in enumerate(self.modules()):
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                for i in range(m.out_channels):
                    m.weight.data[i].normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()


def train_supervised(model, device, train_loader, epoch):
    model.train()
    torch.set_grad_enabled(True)
    optimizer = torch.optim.SGD(
        filter(lambda x: x.requires_grad, model.parameters()),
        lr=0.05,
        momentum=0.9,
        weight_decay=10**(-5)
    )

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    for e in range(epoch):

      for batch_idx, (data, target) in enumerate(train_loader):
          data, target = data.to(device), target.to(device)
          optimizer.zero_grad()
          output = model(data)
          loss = criterion(output, target)
          loss.backward()
          optimizer.step()
          if batch_idx % 100 == 0:
              print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                  e, batch_idx * len(data), len(train_loader.dataset),
                  100. * batch_idx / len(train_loader), loss.item()))
            

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [71]:
# choose device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

For simplicity, the whole analysis is done on the mnist dataset.


In [72]:
transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
        ])
mnist_train = datasets.MNIST('../data', train=True, download=True,
                       transform=transform)
mnist_test = datasets.MNIST('../data', train=False,
                       transform=transform)

# data is splitted in 3 datasets
# 1) 55k images - used for unsupervised training 
# 2) 5k images - used for training of linear calssifier on top of features extracted from network trained with DeepCluster
# 3) 10k images - test set

unsupervised_pretrain, supervised_train = torch.utils.data.random_split(mnist_train, [55000, 5000])



train_loader_unsupervised = torch.utils.data.DataLoader(unsupervised_pretrain, batch_size=64,
                                             shuffle=False, num_workers=4)

train_loader_supervised = torch.utils.data.DataLoader(supervised_train, batch_size=64,
                                             shuffle=False, num_workers=4)

test_loader = torch.utils.data.DataLoader(mnist_test, batch_size=64,
                                             shuffle=True, num_workers=4)

The following code snippets are taken from the Deepcluster github repository (https://github.com/facebookresearch/deepcluster) an adapted to the task stated above

In [73]:
def cluster_assign(images_lists, dataset):
    """Creates a dataset from clustering, with clusters as labels.
    Args:
        images_lists (list of list): for each cluster, the list of image indexes
                                    belonging to this cluster
        dataset (list): initial dataset
    Returns:
        ReassignedDataset(torch.utils.data.Dataset): a dataset with clusters as
                                                     labels
    """
    assert images_lists is not None
    pseudolabels = []
    image_indexes = []
    for cluster, images in enumerate(images_lists):
        image_indexes.extend(images)
        pseudolabels.extend([cluster] * len(images))

    t = transforms.Compose([
               transforms.ToTensor(),
               transforms.Normalize(mean=(0.1307,), std=(0.3081,))]
           )

    return ReassignedDataset(image_indexes, pseudolabels, dataset, t)


In [74]:
class ReassignedDataset(data.Dataset):
    """A dataset where the new images labels are given in argument. This assigns
    each image withits "pseudolabel"
    Args:
        image_indexes (list): list of data indexes
        pseudolabels (list): list of labels for each data
        dataset (list): list of tuples with paths to images
        transform (callable, optional): a function/transform that takes in
                                        an PIL image and returns a
                                        transformed version
    """

    def __init__(self, image_indexes, pseudolabels, dataset, transform=None):
        self.imgs = self.make_dataset(image_indexes, pseudolabels, dataset)
        self.transform = transform

    def make_dataset(self, image_indexes, pseudolabels, dataset):
        label_to_idx = {label: idx for idx, label in enumerate(set(pseudolabels))}
        images = []
        for j, idx in enumerate(image_indexes):
            path = dataset[idx][0]
            pseudolabel = label_to_idx[pseudolabels[j]]
            images.append((path, pseudolabel))
        return images

    def __getitem__(self, index):
        """
        Args:
            index (int): index of data
        Returns:
            tuple: (image, pseudolabel) where pseudolabel is the cluster of index datapoint
        """
        img, pseudolabel = self.imgs[index]
        return img, pseudolabel

    def __len__(self):
        return len(self.imgs)

In [75]:
class UnifLabelSampler(Sampler):
    """Samples elements uniformely accross pseudolabels.
        Args:
            N (int): size of returned iterator.
            images_lists: dict of key (target), value (list of data with this target)
    """

    def __init__(self, N, images_lists):
        self.N = N
        self.images_lists = images_lists
        self.indexes = self.generate_indexes_epoch()

    def generate_indexes_epoch(self):
        nmb_non_empty_clusters = 0
        for i in range(len(self.images_lists)):
            if len(self.images_lists[i]) != 0:
                nmb_non_empty_clusters += 1

        size_per_pseudolabel = int(self.N / nmb_non_empty_clusters) + 1
        res = np.array([])

        for i in range(len(self.images_lists)):
            # skip empty clusters
            if len(self.images_lists[i]) == 0:
                continue
            indexes = np.random.choice(
                self.images_lists[i],
                size_per_pseudolabel,
                replace=(len(self.images_lists[i]) <= size_per_pseudolabel)
            )
            res = np.concatenate((res, indexes))

        np.random.shuffle(res)
        res = list(res.astype('int'))
        if len(res) >= self.N:
            return res[:self.N]
        res += res[: (self.N - len(res))]
        return res

    def __iter__(self):
        return iter(self.indexes)

    def __len__(self):
        return len(self.indexes)

In [76]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def learning_rate_decay(optimizer, t, lr_0):
    for param_group in optimizer.param_groups:
        lr = lr_0 / np.sqrt(1 + lr_0 * param_group['weight_decay'] * t)
        param_group['lr'] = lr


In [77]:
def compute_features(dataloader, model, N, get_labels=False):

    model.eval()
    labels = []

    # discard the label information in the dataloader
    for i, (input_tensor, label) in enumerate(dataloader):
        input_var = torch.autograd.Variable(input_tensor.cuda(), requires_grad=False)
        aux = model(input_var).data.cpu().numpy()

        if i == 0:
            features = np.zeros((N, aux.shape[1]), dtype='float32')

        aux = aux.astype('float32')
        if i < len(dataloader) - 1:
            features[i * 64: (i + 1) * 64] = aux
        else:
            # special treatment for final batch
            features[i * 64:] = aux

        # measure elapsed time

        labels.append(label.numpy())

    labels = np.concatenate(labels)

    if get_labels:
      return features, labels
    
    else:
      return features


In [78]:
def train(loader, model, crit, opt, epoch):
    """Training of the CNN.
        Args:
            loader (torch.utils.data.DataLoader): Data loader
            model (nn.Module): CNN
            crit (torch.nn): loss
            opt (torch.optim.SGD): optimizer for every parameters with True
                                   requires_grad in model except top layer
            epoch (int)
    """
    losses = AverageMeter()
    # switch to train mode
    model.train()

    # create an optimizer for the last fc layer
    optimizer_tl = torch.optim.SGD(
        model.top_layer.parameters(),
        lr=0.01,
        weight_decay=10**-5,
    )

    for i, (input_tensor, target) in enumerate(loader):

        target = target.cuda(async=True)
        input_var = torch.autograd.Variable(input_tensor.cuda())
        target_var = torch.autograd.Variable(target)

        output = model(input_var)
        loss = crit(output, target_var)

        # record loss
        losses.update(loss.data, input_tensor.size(0))

        # compute gradient and do SGD step
        opt.zero_grad()
        optimizer_tl.zero_grad()
        loss.backward()
        opt.step()
        optimizer_tl.step()

    return losses.avg

In [79]:
def DeepCluster(model, device, train_loader, epoch, k):

    fd = int(model.top_layer.weight.size()[1])
    model.top_layer = None

    model = model.to(device)


    optimizer = torch.optim.SGD(
        filter(lambda x: x.requires_grad, model.parameters()),
        lr=0.05,
        momentum=0.9,
        weight_decay=10**(-5)
    )

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)
    #cluster_step


    for e in range(epoch):
         
      model.top_layer = None
      model.classifier = nn.Sequential(*list(model.classifier.children())[:-1])

      features = compute_features(train_loader, model, len(unsupervised_pretrain))

       # only 64 dims, so no PCA
      pipeline = Pipeline([('scaling', StandardScaler())])
      
      post_scale = pipeline.fit_transform(features)
      post_norm = normalize(post_scale, norm="l2")

      n_data, d = post_norm.shape

      # faiss implementation of k-means
      clus = faiss.Clustering(d, k)
      clus.seed = np.random.randint(1234)

      clus.niter = 20
      clus.max_points_per_centroid = 60000

      res = faiss.StandardGpuResources()
      flat_config = faiss.GpuIndexFlatConfig()
      flat_config.useFloat16 = False
      flat_config.device = 0
      index = faiss.GpuIndexFlatL2(res, d, flat_config)

      #get new cluster labels
      clus.train(post_norm, index)
      _, I = index.search(post_norm, 1)

      labels = np.squeeze(I)

      unique, counts = np.unique(labels, return_counts=True)
      print(dict(zip(unique, counts)))

      images_lists = [[] for i in range(k)]
      for i in range(len(unsupervised_pretrain)):
            images_lists[int(labels[i])].append(i)


      # create new dataset from pseudolabels
      train_dataset = cluster_assign(images_lists, unsupervised_pretrain)

      #print(len(train_dataset))
      #print(images_lists)

      # sample images from uniform distribution over classes
      sampler = UnifLabelSampler(int(1 * len(train_dataset)),
                                   images_lists)


      train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=64,
            num_workers=4,
            sampler=sampler,
        )
      
      # reset last layer
      mlp = list(model.classifier.children())
      mlp.append(nn.ReLU(inplace=True).cuda())
      model.classifier = nn.Sequential(*mlp)
      model.top_layer = nn.Linear(fd, k)
      model.top_layer.weight.data.normal_(0, 0.01)
      model.top_layer.bias.data.zero_()
      model.top_layer.cuda()



      # train step
      torch.set_grad_enabled(True)
      loss = train(train_dataloader, model, criterion, optimizer, e)
      print("Epoch Nr:" + str(e))
      print(loss.cpu().numpy())


In [80]:
def linear_model(model_base, train_loader, test_loader):

  model = copy.deepcopy(model_base)
  model.to(device)
  model.top_layer = None
  model.classifier = nn.Sequential(*list(model.classifier.children())[:-1])
  features,labels = compute_features(train_loader, model, len(supervised_train), get_labels=True)

  clf = make_pipeline(StandardScaler(),LinearSVC(random_state=0, tol=1e-5, max_iter =10000))
  clf.fit(features, labels)

  x_test = []
  y_true = []

  torch.set_grad_enabled(False)
  for idx, (pics, labels) in enumerate(test_loader):
    pics = pics.to(device)

    model.eval()
    features_test = model(pics)
    x_test.append(features_test.cpu().numpy())
    y_true.append(labels)

  x_test = np.concatenate(x_test)
  y_true = np.concatenate(y_true)

  y_pred = clf.predict(x_test)

  print("Test Accuracy: " + str(accuracy_score(y_true, y_pred)))

Train CNN in self supervised manner using deepcluster (55k images) - 5 epochs

In [81]:
simpleCNN = SimpleCnn()
simpleCNN = simpleCNN.to(device)
DeepCluster(simpleCNN, device, train_loader_unsupervised, 5, 10)

{0: 4222, 1: 7192, 2: 4011, 3: 6000, 4: 5477, 5: 6803, 6: 4735, 7: 4269, 8: 5453, 9: 6838}
Epoch Nr:0
0.3598836
{0: 7284, 1: 7326, 2: 6206, 3: 5224, 4: 5829, 5: 4062, 6: 4982, 7: 4263, 8: 4475, 9: 5349}
Epoch Nr:1
0.16842784
{0: 4801, 1: 5556, 2: 4300, 3: 4050, 4: 4765, 5: 4738, 6: 5431, 7: 7502, 8: 6148, 9: 7709}
Epoch Nr:2
0.15201522
{0: 4276, 1: 5427, 2: 5259, 3: 6610, 4: 4182, 5: 6070, 6: 7175, 7: 6017, 8: 4843, 9: 5141}
Epoch Nr:3
0.1853473
{0: 5946, 1: 5718, 2: 4478, 3: 5679, 4: 6239, 5: 5843, 6: 6056, 7: 5187, 8: 5059, 9: 4795}
Epoch Nr:4
0.18122402


Initialize a model with random weights (baseline)

In [82]:
random_CNN = SimpleCnn()

Initialize a model that is trained in a supervised manner on the 55k train images (should act as an upper bound for the performance)

In [83]:
trainCNN = SimpleCnn()
trainCNN = trainCNN.to(device)
train_supervised(trainCNN, device, train_loader_unsupervised, 5)



Compare the results of a linear model trained on the features of the fist fc layer (5000 labeled images). Evaluated on test set (10000 images)

In [84]:
# random weight cnn (lower bound)
linear_model(random_CNN, train_loader_supervised, test_loader)

Test Accuracy: 0.8916


In [85]:
# cnn trained self supervised
linear_model(simpleCNN, train_loader_supervised, test_loader)

Test Accuracy: 0.9411


In [86]:
# cnn trained supervised
linear_model(trainCNN, train_loader_supervised, test_loader)

Test Accuracy: 0.9804
