In [1]:
# Colab-only: mount Google Drive at /content/gdrive. load_data() below keeps
# the CIFAR-10 files under /content/gdrive/MyDrive/6699/data, so this cell has
# to run before any data is loaded.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import random_split
import torch.optim as optim 
import numpy as np
import os
import math
import random
from tqdm import tqdm
from torch.optim.optimizer import Optimizer, required


In [3]:
def load_data(data_set, label_noise, augment_data=False,
              root='/content/gdrive/MyDrive/6699/data'):
    """
    Load a torchvision data set, corrupt a fraction of the training labels,
    and return the train and test sets.

    Parameters
    ----------
    data_set - str, name of the data set to load; only "cifar10" is implemented
    label_noise - float in [0, 1], fraction of training samples whose label is
        replaced by a label drawn uniformly at random from all classes (the
        drawn label may coincide with the original one)
    augment_data - boolean, whether or not to use random cropping and horizontal
        flipping to augment training data (the test data is never augmented)
    root - str, directory in which torchvision stores / looks up the data files

    Returns
    -------
    (trainset, testset) - the two torchvision data sets; label noise is written
        in place into `trainset.targets`, the test labels are left untouched

    Raises
    ------
    ValueError - if `label_noise` is outside [0, 1] or `data_set` is unsupported
    """
    # Only list what is actually implemented so the error message is truthful.
    datasets = ["cifar10"]

    # Validate before touching the disk / network.
    if not 0 <= label_noise <= 1:
        raise ValueError(
            f"label_noise must be a fraction in [0, 1], got {label_noise}"
        )

    if data_set == "cifar10":
        test_transform = transforms.Compose([transforms.ToTensor()])
        if augment_data:
            # Standard CIFAR augmentation: pad-and-crop back to 32x32 plus a
            # random left/right flip, applied to the training images only.
            train_transform = transforms.Compose([
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
            ])
        else:
            train_transform = test_transform
        trainset = torchvision.datasets.CIFAR10(
            root=root, train=True, download=True, transform=train_transform
        )
        testset = torchvision.datasets.CIFAR10(
            root=root, train=False, download=True, transform=test_transform
        )
    else:
        raise ValueError(
            f"Please enter a data set from the following options: {datasets}"
        )

    # apply label noise to the data set
    if label_noise > 0:
        num_samples = len(trainset.targets)
        num_noisy = int(label_noise * num_samples)
        # replace=False: every chosen index is distinct, so exactly `num_noisy`
        # samples are corrupted (sampling with replacement hits fewer).
        noisy_idx = np.random.choice(num_samples, size=num_noisy, replace=False)
        # randint's `high` is exclusive; +1 keeps the largest class reachable.
        noisy_labels = np.random.randint(
            low=min(trainset.targets),
            high=max(trainset.targets) + 1,
            size=num_noisy,
        )
        for idx, label in zip(noisy_idx, noisy_labels):
            # Store plain Python ints, matching the existing entries.
            trainset.targets[int(idx)] = int(label)

    return trainset, testset

In [4]:
class SGLD(Optimizer):
    """ Stochastic Gradient Langevin Dynamics Sampler with preconditioning.
        Optimization variable is viewed as a posterior sample under Stochastic
        Gradient Langevin Dynamics with noise rescaled in each dimension
        according to RMSProp.
    """
    def __init__(self,
          params,
          lr=1e-2,
          precondition_decay_rate=0.95,
          num_pseudo_batches=1,
          num_burn_in_steps=3000,
          diagonal_bias=1e-8) -> None:
        """ Set up a SGLD Optimizer.

        Parameters
        ----------
        params : iterable
            Parameters serving as optimization variable.
        lr : float, optional
            Base learning rate for this optimizer.
            Must be tuned to the specific function being minimized.
            Default: `1e-2`.
        precondition_decay_rate : float, optional
            Exponential decay rate of the rescaling of the preconditioner (RMSprop).
            Should be smaller than but nearly `1` to approximate sampling from the posterior.
            Default: `0.95`
        num_pseudo_batches : int, optional
            Effective number of minibatches in the data set.
            Trades off noise and prior with the SGD likelihood term.
            Note: Assumes loss is taken as mean over a minibatch.
            Otherwise, if the sum was taken, divide this number by the batch size.
            Default: `1`.
        num_burn_in_steps : int, optional
            Number of iterations to collect gradient statistics to update the
            preconditioner before starting to draw noisy samples.
            Default: `3000`.
        diagonal_bias : float, optional
            Term added to the diagonal of the preconditioner to prevent it from
            degenerating.
            Default: `1e-8`.

        Raises
        ------
        ValueError
            If `lr` or `num_burn_in_steps` is negative.

        """
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if num_burn_in_steps < 0:
            raise ValueError("Invalid num_burn_in_steps: {}".format(num_burn_in_steps))

        defaults = dict(
            lr=lr, precondition_decay_rate=precondition_decay_rate,
            num_pseudo_batches=num_pseudo_batches,
            num_burn_in_steps=num_burn_in_steps,
            # Forward the caller's value; it used to be pinned to 1e-8 here,
            # which silently ignored the constructor argument.
            diagonal_bias=diagonal_bias,
        )
        super().__init__(params, defaults)


    @torch.no_grad()
    def step(self, closure=None):
        """ Perform one preconditioned SGLD update on every parameter that has a gradient.

        Parameters
        ----------
        closure : callable, optional
            Re-evaluates the model and returns the loss. It is called with
            gradients enabled.

        Returns
        -------
        The value returned by `closure`, or `None` if no closure was given.

        """
        loss = None

        if closure is not None:
            # step() itself runs under no_grad; the closure needs autograd.
            with torch.enable_grad():
                loss = closure()

        # Fixed damping applied to the injected Gaussian noise.
        noise_scale = 0.01

        for group in self.param_groups:
            lr = group["lr"]
            num_pseudo_batches = group["num_pseudo_batches"]
            precondition_decay_rate = group["precondition_decay_rate"]
            num_burn_in_steps = group["num_burn_in_steps"]
            diagonal_bias = group["diagonal_bias"]

            for parameter in group["params"]:

                if parameter.grad is None:
                    continue

                gradient = parameter.grad
                state = self.state[parameter]

                #  State initialization {{{ #

                if len(state) == 0:
                    state["iteration"] = 0
                    # Running average of squared gradients, started at one.
                    state["momentum"] = torch.ones_like(parameter)

                #  }}} State initialization #

                state["iteration"] += 1

                momentum = state["momentum"]

                #  Momentum update {{{ #
                momentum.add_(
                    (1.0 - precondition_decay_rate) * ((gradient ** 2) - momentum)
                )
                #  }}} Momentum update #

                preconditioner = (
                    1. / torch.sqrt(momentum + diagonal_bias)
                )

                # Deterministic (drift) part of the update.
                drift = 0.5 * preconditioner * gradient * num_pseudo_batches
                parameter.add_(-lr * drift)

                # Noise is injected only once the burn-in phase is over; during
                # burn-in nothing is sampled at all.
                if state["iteration"] > num_burn_in_steps:
                    # lr * (noise / sqrt(lr)) is folded into sqrt(lr) * noise,
                    # which stays finite (no update) when lr == 0.
                    noise = torch.randn_like(gradient) * torch.sqrt(preconditioner)
                    parameter.add_(-(noise_scale * lr ** 0.5) * noise)

        return loss

In [5]:
def CNN(width, num_classes=10, in_channels=3):
    """
    Build the 4-stage convolutional classifier whose capacity is set by `width`.

    Stage k (k = 0..3) is Conv3x3 -> BatchNorm -> ReLU with `width * 2**k`
    output channels; stages 1-3 are each followed by a 2x2 max-pool. A final
    4x4 max-pool, ReLU, flatten and linear layer produce the class scores.
    The pooling chain reduces the spatial size by 2*2*2*4 = 32, so the linear
    layer (8*width inputs) expects 32x32 images.

    Layer order and indices are fixed (19 modules, indices 0-18), so
    state_dict keys do not depend on how the model is constructed here.

    Parameters
    ----------
    width - int >= 1, number of channels of the first convolution
    num_classes - int, number of output scores of the final linear layer
    in_channels - int, number of channels of the input images

    Returns
    -------
    nn.Sequential mapping (N, in_channels, 32, 32) to (N, num_classes)

    Raises
    ------
    ValueError - if `width` is smaller than 1
    """
    if width < 1:
        raise ValueError(f"width must be a positive integer, got {width}")

    def conv_stage(c_in, c_out, pool):
        # One Conv -> BatchNorm -> ReLU stage, optionally halving the resolution.
        stage = [
            nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(c_out),
            nn.ReLU(),
        ]
        if pool:
            stage.append(nn.MaxPool2d(kernel_size=2, stride=2))
        return stage

    # (input channels, output channels, pool afterwards?)
    stage_specs = [
        (in_channels, width, False),
        (width, 2 * width, True),
        (2 * width, 4 * width, True),
        (4 * width, 8 * width, True),
    ]

    modules = []
    for c_in, c_out, pool in stage_specs:
        modules.extend(conv_stage(c_in, c_out, pool))

    modules.extend([
        nn.MaxPool2d(kernel_size=4, stride=4),
        # Kept although its input is already non-negative: dropping it would
        # shift the index (and state_dict key) of the linear layer.
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(in_features=8 * width, out_features=num_classes),
    ])

    return nn.Sequential(*modules)

In [6]:
# Shape sanity check: push a random batch of five 32x32 RGB images through the
# narrowest network one child module at a time and print each output shape.
model = CNN(1)
x = torch.rand(5, 3, 32, 32)
for name, layer in model.named_children():
    x = layer(x)
    shape_line = "({}) : {}".format(name, x.shape)
    print(shape_line)

(0) : torch.Size([5, 1, 32, 32])
(1) : torch.Size([5, 1, 32, 32])
(2) : torch.Size([5, 1, 32, 32])
(3) : torch.Size([5, 2, 32, 32])
(4) : torch.Size([5, 2, 32, 32])
(5) : torch.Size([5, 2, 32, 32])
(6) : torch.Size([5, 2, 16, 16])
(7) : torch.Size([5, 4, 16, 16])
(8) : torch.Size([5, 4, 16, 16])
(9) : torch.Size([5, 4, 16, 16])
(10) : torch.Size([5, 4, 8, 8])
(11) : torch.Size([5, 8, 8, 8])
(12) : torch.Size([5, 8, 8, 8])
(13) : torch.Size([5, 8, 8, 8])
(14) : torch.Size([5, 8, 4, 4])
(15) : torch.Size([5, 8, 1, 1])
(16) : torch.Size([5, 8, 1, 1])
(17) : torch.Size([5, 8])
(18) : torch.Size([5, 10])


In [7]:
class inverse_squareroot_lr:
  """
  Inverse-square-root learning-rate schedule (the one used with SGD in the paper).

  Each call returns the rate for the current gradient step and then advances
  the internal step counter. The rate is init_lr / sqrt(1 + k), where k is the
  number of completed windows of `n_steps` calls, so it starts at `init_lr`
  and drops once every `n_steps` batches (512 by default).
  """

  def __init__(self, n_steps=512, init_lr=0.1):
      self.init_lr = init_lr     # rate returned during the first window
      self.n = n_steps           # window length, in gradient steps
      self.gradient_steps = 0    # number of calls made so far

  def __call__(self):
      # Completed windows so far determine the decay denominator.
      decay = math.sqrt(1.0 + math.floor(self.gradient_steps / self.n))
      current_lr = self.init_lr / decay
      self.gradient_steps += 1
      return current_lr

In [8]:
class Classifier():
    """
    Trains a model with the SGLD optimizer and reports its classification
    error on the training and test loaders.

    The trained weights are written to `<cwd>/models/<name>/best.pt`; both
    evaluate_* methods reload that checkpoint before measuring the error.
    """

    def __init__(self, name, model, trainset, testset, use_cuda=False):

        '''
        @name: Experiment name; also the name of the sub-directory of ./models that holds the checkpoint.
        @model: the nn.Module to train (moved to the selected device).
        @trainset: iterable yielding (inputs, labels) training batches, e.g. a DataLoader.
        @testset: iterable yielding (inputs, labels) test batches, e.g. a DataLoader.
        @use_cuda: whether or not to use cuda; raises if CUDA is requested but unavailable.
        '''

        self.name = name
        if use_cuda and not torch.cuda.is_available():
            raise Exception("Asked for CUDA but GPU not found")

        self.use_cuda = use_cuda

        self.model = model.to(self._device)
        self.trainset = trainset
        self.testset = testset
        self.init_lr = 0.1
        self.criterion = nn.CrossEntropyLoss() #use cross entropy loss
        self.optim = SGLD(self.model.parameters(), lr=self.init_lr) #SGLD over all model params

        self.save_path = os.path.join(os.getcwd(), 'models', self.name)
        os.makedirs(self.save_path, exist_ok=True)

    @property
    def _device(self):
        # Device string derived from the use_cuda flag.
        return 'cuda' if self.use_cuda else 'cpu'

    @property
    def _checkpoint_path(self):
        # Location of the checkpoint written by train() and read by evaluate_*().
        return os.path.join(self.save_path, 'best.pt')

    def train(self, epochs, save=True):
        '''
        Run `epochs` passes over the training loader, updating the model with
        an inverse-square-root learning-rate schedule (the rate drops every
        512 gradient steps, counted from the start of this call).

        @epochs: number of epochs to train
        @save: whether or not to save the final weights as best.pt
        '''

        n_step = 512
        gradient_steps = 0
        device = self._device
        optimizer = self.optim

        for epoch in tqdm(range(epochs)):

            self.model.train()

            for inputs, labels in self.trainset:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Learning rate for this gradient step, applied to every group.
                lr = self.init_lr / math.sqrt(1.0 + math.floor(gradient_steps / n_step))
                gradient_steps += 1
                for group in optimizer.param_groups:
                    group['lr'] = lr

                optimizer.zero_grad()
                loss = self.criterion(self.model(inputs), labels)
                loss.backward()
                optimizer.step()

        if save:
            # Saved once, after the last epoch; evaluate_* reloads this file.
            torch.save(self.model.state_dict(), self._checkpoint_path)

    def _error_rate(self, loader):
        '''
        Reload best.pt and return the fraction of misclassified samples in
        `loader`. Prints a hint and returns None when no checkpoint exists.
        Raises ValueError when the loader yields no samples.
        '''
        checkpoint = self._checkpoint_path
        if not os.path.exists(checkpoint):
            print('It appears you are testing the model without training. Please train first')
            return None

        # map_location lets a checkpoint saved on GPU be evaluated on CPU too.
        self.model.load_state_dict(torch.load(checkpoint, map_location=self._device))
        self.model.eval()

        device = self._device
        correct = 0
        total = 0
        # No gradients are needed for evaluation; skip building the graph.
        with torch.no_grad():
            for inputs, labels in loader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = self.model(inputs)
                _, predicted = torch.max(outputs, 1)

                total += labels.shape[0]
                correct += (predicted == labels).sum().item()

        if total == 0:
            raise ValueError('Cannot compute an error rate: the data loader is empty')

        return 1 - correct/total

    def evaluate_train(self):
        '''
        Error rate (1 - accuracy) of the saved checkpoint on the training loader,
        or None if nothing has been saved yet.
        '''
        return self._error_rate(self.trainset)

    def evaluate_test(self):
        '''
        Error rate (1 - accuracy) of the saved checkpoint on the test loader,
        or None if nothing has been saved yet.
        '''
        return self._error_rate(self.testset)

In [10]:
# --- Experiment configuration ------------------------------------------------
seed = 0                 # fixes label noise, subset selection, shuffling and SGLD noise
batch_size = 128
num_workers = 2
# epochs = 500_000 // (50_000 // 128) # total number of desired SGD steps / number of batches per epoch (= 1282)
epochs = 1000
label_noise_int = 20     # percentage of training labels that are randomised
label_noise = label_noise_int / 100
train_subset_size = 10000
test_subset_size = 2000

# Seed every random number generator the experiment draws from so that a
# Restart & Run All reproduces the same data and the same training run.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# --- Data --------------------------------------------------------------------
full_trainset, full_testset = load_data("cifar10", label_noise)

# Keep a fixed random subset of each split; the remainder is discarded.
split_generator = torch.Generator().manual_seed(seed)
train_subset, _ = random_split(
    full_trainset,
    [train_subset_size, len(full_trainset) - train_subset_size],
    generator=split_generator,
)
test_subset, _ = random_split(
    full_testset,
    [test_subset_size, len(full_testset) - test_subset_size],
    generator=split_generator,
)

# The cells below expect `trainset` / `testset` to be the batch loaders.
trainset = torch.utils.data.DataLoader(train_subset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
testset = torch.utils.data.DataLoader(test_subset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

Files already downloaded and verified
Files already downloaded and verified


In [None]:
# Width sweep: every width from 1 to 16, then every fourth width from 20 to 64.
# One fresh model is trained per width; its train/test error is collected.
train_result, test_result = [], []
convnet_widths = list(range(1, 17)) + list(range(20, 65, 4))
for width in convnet_widths:
  # The experiment name is also the checkpoint sub-directory under ./models.
  experiment_name = f'CNN_{width}_{label_noise_int}%noise'
  model_name = f'CNN_{width}_{label_noise_int}%noise'

  model = CNN(width)
  classifier = Classifier(experiment_name, model, trainset, testset, use_cuda=True)
  classifier.train(epochs=epochs)

  # Both evaluations reload the checkpoint written by train().
  train_error = classifier.evaluate_train()
  test_error = classifier.evaluate_test()
  print(f'width = {width}, train_error = {train_error}, test_error = {test_error}')

  train_result += [train_error]
  test_result += [test_error]

100%|██████████| 1000/1000 [21:14<00:00,  1.27s/it]


width = 1, train_error = 0.6472, test_error = 0.629


100%|██████████| 1000/1000 [21:29<00:00,  1.29s/it]


width = 2, train_error = 0.5396000000000001, test_error = 0.5409999999999999


100%|██████████| 1000/1000 [22:00<00:00,  1.32s/it]


width = 3, train_error = 0.5363, test_error = 0.5375


100%|██████████| 1000/1000 [22:14<00:00,  1.33s/it]


width = 4, train_error = 0.5154000000000001, test_error = 0.5075000000000001


100%|██████████| 1000/1000 [22:36<00:00,  1.36s/it]


width = 5, train_error = 0.6345000000000001, test_error = 0.641


100%|██████████| 1000/1000 [22:32<00:00,  1.35s/it]


width = 6, train_error = 0.659, test_error = 0.649


100%|██████████| 1000/1000 [22:55<00:00,  1.38s/it]


width = 7, train_error = 0.486, test_error = 0.488


100%|██████████| 1000/1000 [23:12<00:00,  1.39s/it]


width = 8, train_error = 0.6159, test_error = 0.5915


100%|██████████| 1000/1000 [23:31<00:00,  1.41s/it]


width = 9, train_error = 0.6253, test_error = 0.622


100%|██████████| 1000/1000 [26:35<00:00,  1.60s/it]


width = 10, train_error = 0.6786, test_error = 0.6759999999999999


100%|██████████| 1000/1000 [27:00<00:00,  1.62s/it]


width = 11, train_error = 0.6694, test_error = 0.6699999999999999


100%|██████████| 1000/1000 [25:38<00:00,  1.54s/it]


width = 12, train_error = 0.6493, test_error = 0.6355


100%|██████████| 1000/1000 [26:19<00:00,  1.58s/it]


width = 13, train_error = 0.6402, test_error = 0.642


100%|██████████| 1000/1000 [27:08<00:00,  1.63s/it]


width = 14, train_error = 0.7602, test_error = 0.7444999999999999


100%|██████████| 1000/1000 [28:01<00:00,  1.68s/it]


width = 15, train_error = 0.7123999999999999, test_error = 0.6950000000000001


100%|██████████| 1000/1000 [28:33<00:00,  1.71s/it]


width = 16, train_error = 0.5432, test_error = 0.5485


100%|██████████| 1000/1000 [41:22<00:00,  2.48s/it]


width = 20, train_error = 0.8318, test_error = 0.8345


100%|██████████| 1000/1000 [36:50<00:00,  2.21s/it]


width = 24, train_error = 0.5562, test_error = 0.538


100%|██████████| 1000/1000 [40:26<00:00,  2.43s/it]


width = 28, train_error = 0.7735, test_error = 0.7555000000000001


100%|██████████| 1000/1000 [38:43<00:00,  2.32s/it]


width = 32, train_error = 0.6675, test_error = 0.659


100%|██████████| 1000/1000 [43:28<00:00,  2.61s/it]


width = 36, train_error = 0.8012, test_error = 0.81


100%|██████████| 1000/1000 [51:29<00:00,  3.09s/it]


width = 40, train_error = 0.604, test_error = 0.585


100%|██████████| 1000/1000 [55:19<00:00,  3.32s/it]


width = 44, train_error = 0.5666, test_error = 0.5685


 78%|███████▊  | 785/1000 [46:41<12:47,  3.57s/it]

In [None]:
# Width sweep over 17..30 (inclusive). Note: this resets the result lists.
train_result = list()
test_result = list()
for width in range(17, 31):
  # Name of the experiment; the classifier stores its checkpoint under it.
  experiment_name = model_name = f'CNN_{width}_{label_noise_int}%noise'

  # Build, train and score a fresh network of this width.
  model = CNN(width)
  classifier = Classifier(experiment_name, model, trainset, testset, use_cuda=True)
  classifier.train(epochs=epochs)
  train_error = classifier.evaluate_train()
  test_error = classifier.evaluate_test()

  print(f'width = {width}, train_error = {train_error}, test_error = {test_error}')
  train_result.append(train_error)
  test_result.append(test_error)

100%|██████████| 1282/1282 [21:50<00:00,  1.02s/it]


width = 17, train_error = 0.0, test_error = 0.4195


100%|██████████| 1282/1282 [22:58<00:00,  1.08s/it]


width = 18, train_error = 0.0, test_error = 0.42800000000000005


100%|██████████| 1282/1282 [24:32<00:00,  1.15s/it]


width = 19, train_error = 0.0, test_error = 0.38449999999999995


100%|██████████| 1282/1282 [26:07<00:00,  1.22s/it]


width = 20, train_error = 0.0, test_error = 0.40900000000000003


100%|██████████| 1282/1282 [27:26<00:00,  1.28s/it]


width = 21, train_error = 0.0, test_error = 0.394


100%|██████████| 1282/1282 [28:05<00:00,  1.31s/it]


width = 22, train_error = 0.0, test_error = 0.394


100%|██████████| 1282/1282 [29:21<00:00,  1.37s/it]


width = 23, train_error = 0.0, test_error = 0.372


100%|██████████| 1282/1282 [29:54<00:00,  1.40s/it]


width = 24, train_error = 0.0, test_error = 0.392


100%|██████████| 1282/1282 [35:06<00:00,  1.64s/it]


width = 25, train_error = 0.0, test_error = 0.389


 35%|███▌      | 455/1282 [12:46<23:21,  1.69s/it]

In [None]:
# Width sweep over 1..30 (inclusive); the result lists start empty again.
train_result, test_result = [], []
for width in range(1, 31):
  #Provide name to model experiment
  experiment_name = f'CNN_{width}_{label_noise_int}%noise'
  model_name = f'CNN_{width}_{label_noise_int}%noise'

  model = CNN(width)
  classifier = Classifier(experiment_name, model, trainset, testset, use_cuda=True)

  # Train, then measure the error of the saved checkpoint on both loaders.
  classifier.train(epochs=epochs)
  train_error = classifier.evaluate_train()
  test_error = classifier.evaluate_test()

  print(f'width = {width}, train_error = {train_error}, test_error = {test_error}')
  train_result.extend([train_error])
  test_result.extend([test_error])

100%|██████████| 1282/1282 [28:10<00:00,  1.32s/it]


width = 1, train_error = 0.632, test_error = 0.6295


100%|██████████| 1282/1282 [27:59<00:00,  1.31s/it]


width = 2, train_error = 0.48660000000000003, test_error = 0.5569999999999999


100%|██████████| 1282/1282 [28:03<00:00,  1.31s/it]


width = 3, train_error = 0.35329999999999995, test_error = 0.589


100%|██████████| 1282/1282 [27:41<00:00,  1.30s/it]


width = 4, train_error = 0.22499999999999998, test_error = 0.609


100%|██████████| 1282/1282 [28:00<00:00,  1.31s/it]


width = 5, train_error = 0.08540000000000003, test_error = 0.602


100%|██████████| 1282/1282 [30:13<00:00,  1.41s/it]


width = 6, train_error = 0.17769999999999997, test_error = 0.579


100%|██████████| 1282/1282 [30:42<00:00,  1.44s/it]


width = 7, train_error = 0.0, test_error = 0.567


100%|██████████| 1282/1282 [29:53<00:00,  1.40s/it]


width = 8, train_error = 9.999999999998899e-05, test_error = 0.5389999999999999


100%|██████████| 1282/1282 [30:04<00:00,  1.41s/it]


width = 9, train_error = 0.0, test_error = 0.5075000000000001


100%|██████████| 1282/1282 [31:52<00:00,  1.49s/it]


width = 10, train_error = 0.0, test_error = 0.502


100%|██████████| 1282/1282 [31:52<00:00,  1.49s/it]


width = 11, train_error = 0.0, test_error = 0.487


100%|██████████| 1282/1282 [32:35<00:00,  1.53s/it]


width = 12, train_error = 0.0, test_error = 0.493


100%|██████████| 1282/1282 [32:45<00:00,  1.53s/it]


width = 13, train_error = 0.0, test_error = 0.4585


100%|██████████| 1282/1282 [34:30<00:00,  1.62s/it]


width = 14, train_error = 0.0, test_error = 0.44199999999999995


100%|██████████| 1282/1282 [34:38<00:00,  1.62s/it]


width = 15, train_error = 9.999999999998899e-05, test_error = 0.45599999999999996


100%|██████████| 1282/1282 [34:27<00:00,  1.61s/it]


width = 16, train_error = 0.0, test_error = 0.42500000000000004


 92%|█████████▏| 1180/1282 [32:19<02:47,  1.65s/it]