In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import random_split
import torch.optim as optim 
import numpy as np
import os
import math
import random
from tqdm import tqdm

In [2]:
def load_data(data_set, label_noise, augment_data=False):
    """
    Helper Function to Load data in the form of a tensorflow data set, apply label noise, and return the
    train data and test data.

    Parameters
    ----------
    data_set - str, name of data set to load from tf.keras.datasets
    label_noise - float, percentage of training data to add noise to
    augment_data - boolean, whether or not to use random cropping and horizontal flipping to augment training data
    """

    datasets = ["cifar10", "cifar100", "mnist"]

    # load Cifar 10, Cifar 100, or mnis data set
    if data_set == "cifar10":
      transform = transforms.Compose([transforms.ToTensor()])
      trainset = torchvision.datasets.CIFAR10(root='/content/gdrive/MyDrive/6699/data', train=True, download=True, transform=transform)
      testset = torchvision.datasets.CIFAR10(root='/content/gdrive/MyDrive/6699/data', train=False, download=True, transform=transform)
    # elif data_set == "cifar100":
    #     get_data = tf.keras.datasets.cifar100
    # elif data_set == "mnist":
    #     get_data = tf.keras.datasets.mnist
    else:
      raise Exception(
          f"Please enter a data set from the following options: {datasets}"
      )
    
    # apply label noise to the data set
    if 0 < label_noise:
      random_idx = np.random.choice(
          trainset.data.shape[0], int(label_noise * trainset.data.shape[0])
      )
      rand_labels = np.random.randint(
          low=min(trainset.targets), high=max(trainset.targets), size=len(random_idx)
      )
      # print(trainset.targets[random_idx[0]])
      # print(trainset.targets[random_idx[1]])
      for i in range(len(random_idx)):

        trainset.targets[random_idx[i]] = rand_labels[i]
    # print(trainset.targets[random_idx[0]])
    # print(trainset.targets[random_idx[1]])
    return trainset, testset

In [3]:
def CNN(width):
  layers = nn.Sequential(
      nn.Conv2d(in_channels=3, out_channels=width, kernel_size=3, stride=1, padding=1),
      nn.BatchNorm2d(width),
      nn.ReLU(),
      # nn.MaxPool2d(kernel_size=2, stride=2),
      
      nn.Conv2d(in_channels=width, out_channels=2*width, kernel_size=3, stride=1, padding=1),
      nn.BatchNorm2d(2*width),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=2, stride=2),

      nn.Conv2d(in_channels=2*width, out_channels=4*width, kernel_size=3, stride=1, padding=1),
      nn.BatchNorm2d(4*width),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=2, stride=2),

      nn.Conv2d(in_channels=4*width, out_channels=8*width, kernel_size=3, stride=1, padding=1),
      nn.BatchNorm2d(8*width),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=2, stride=2),

      nn.MaxPool2d(kernel_size=4, stride=4),
      nn.ReLU(),
      nn.Flatten(),
      nn.Linear(in_features=8*width, out_features=10),

    )

  return layers

In [4]:
model = CNN(1)
x = torch.rand([5,3,32,32])
for name, module in model.named_children():
  x = module(x)
  # print(name)
  print("({}) : {}".format(name,x.shape))

(0) : torch.Size([5, 1, 32, 32])
(1) : torch.Size([5, 1, 32, 32])
(2) : torch.Size([5, 1, 32, 32])
(3) : torch.Size([5, 2, 32, 32])
(4) : torch.Size([5, 2, 32, 32])
(5) : torch.Size([5, 2, 32, 32])
(6) : torch.Size([5, 2, 16, 16])
(7) : torch.Size([5, 4, 16, 16])
(8) : torch.Size([5, 4, 16, 16])
(9) : torch.Size([5, 4, 16, 16])
(10) : torch.Size([5, 4, 8, 8])
(11) : torch.Size([5, 8, 8, 8])
(12) : torch.Size([5, 8, 8, 8])
(13) : torch.Size([5, 8, 8, 8])
(14) : torch.Size([5, 8, 4, 4])
(15) : torch.Size([5, 8, 1, 1])
(16) : torch.Size([5, 8, 1, 1])
(17) : torch.Size([5, 8])
(18) : torch.Size([5, 10])


In [None]:
class inverse_squareroot_lr:
  """
  This is the learning rate used with SGD in the paper (Inverse square root decay).
  Learning Rate starts at 0.1 and then drops every 512 batches.
  """

  def __init__(self, n_steps=512, init_lr=0.1):
      self.n = n_steps
      self.gradient_steps = 0
      self.init_lr = init_lr

  def __call__(self):
      lr = self.init_lr / math.sqrt(
          1.0 + math.floor(self.gradient_steps / self.n)
      )
      self.gradient_steps += 1
      return lr

In [None]:
class Classifier():

    def __init__(self, name, model, trainset, testset, use_cuda=False):
        
        '''
        @name: Experiment name. Will define stored results etc. 
        @model: Either a GradBasicNet() or a GradAlexNet()
        @dataloaders: Dictionary with keys train, val and test and corresponding dataloaders
        @class_names: list of classes, where the idx of class name corresponds to the label used for it in the data
        @use_cuda: whether or not to use cuda
        '''
        
        self.name = name
        if use_cuda and not torch.cuda.is_available():
            raise Exception("Asked for CUDA but GPU not found")
            
        self.use_cuda = use_cuda
        
        self.model = model.to('cuda' if use_cuda else 'cpu')
        self.trainset = trainset
        self.testset = testset
        self.init_lr = 0.1
        self.criterion = nn.CrossEntropyLoss() #use cross entropy loss
        self.optim = optim.SGD(model.parameters(), lr=self.init_lr) #use SGD with suggest hyperparams; you must select all the model params
        # self.optim = optim.SGD(model.parameters(), lr=0.1)


        save_path = os.path.join(os.getcwd(), 'models', self.name)
        if not os.path.exists(save_path):
            os.makedirs(save_path)
            
        self.save_path = save_path

    def train(self, epochs, save=True):
        '''
        @epochs: number of epochs to train
        @save: whether or not to save the checkpoints
        '''

        # best_val_accuracy = - math.inf
        gradient_steps = 0
        for epoch in tqdm(range(epochs)):

            self.model.train()

            n_step = 512
            
            # TODO Iterate over the training dataloader (see how it is done for validation below) and make sure
            # to call the optim.zero_grad(), loss.backward() and optim.step()
            for idx, data in enumerate(self.trainset):
                inputs, labels = data

                inputs = inputs.to('cuda' if self.use_cuda else 'cpu')
                labels = labels.to('cuda' if self.use_cuda else 'cpu')

                outputs = self.model(inputs)

                loss = self.criterion(outputs, labels)

                optimizer = self.optim

                lr = self.init_lr / math.sqrt(1.0 + math.floor(gradient_steps/n_step))
                gradient_steps += 1
                optimizer.param_groups[0]['lr'] = lr
                # print(gradient_steps, optimizer.param_groups[0]['lr'])

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        if save:
            #  Make sure that your saving pipeline is working well. 
            # Is os library working on your file system? 
            # Is your model being saved and reloaded fine? 
            # When you do the kernel viz, activation maps, 
            # and GradCAM you must be using the model you have saved before.
            
            # torch.save(self.model.state_dict(), os.path.join(self.save_path, f'epoch_{epoch}.pt'))
            
            torch.save(self.model.state_dict(), os.path.join(self.save_path, 'best.pt'))
                  

        # print('Done training!')                       

    def evaluate_train(self):
        
        try:
            assert os.path.exists(os.path.join(self.save_path, 'best.pt'))
            
        except:
            print('It appears you are testing the model without training. Please train first')
            return
        
        self.model.load_state_dict(torch.load(os.path.join(self.save_path, 'best.pt')))
        self.model.eval()

        #total = len(self.dataloaders['test'])
        
        correct = 0.0
        total = 0.0
        for idx, data in enumerate(self.trainset):
            
                inputs, labels = data

                inputs = inputs.to('cuda' if self.use_cuda else 'cpu')
                labels = labels.to('cuda' if self.use_cuda else 'cpu')
                
                outputs = self.model(inputs)
                _, predicted = torch.max(outputs, 1)
                
                total += labels.shape[0]
                correct += (predicted == labels).sum().item()
                
        # print(f'Training Accuracy: {100 * correct/total}%')
        return 1 - correct/total

    def evaluate_test(self):
        
        try:
            assert os.path.exists(os.path.join(self.save_path, 'best.pt'))
            
        except:
            print('It appears you are testing the model without training. Please train first')
            return
        
        self.model.load_state_dict(torch.load(os.path.join(self.save_path, 'best.pt')))
        self.model.eval()

        #total = len(self.dataloaders['test'])
        
        correct = 0.0
        total = 0.0
        for idx, data in enumerate(self.testset):
            
                inputs, labels = data

                inputs = inputs.to('cuda' if self.use_cuda else 'cpu')
                labels = labels.to('cuda' if self.use_cuda else 'cpu')
                
                outputs = self.model(inputs)
                _, predicted = torch.max(outputs, 1)
                total += labels.shape[0]
                correct += (predicted == labels).sum().item()
                
        # print(f'Testing Accuracy: {100 * correct/total}%')
        return 1 - correct/total

In [None]:
batch_size = 128
num_workers = 2
epochs = 500_000 // (50_000 // 128) # total number desirec SGD steps / number batches per epoch
label_noise_int = 20
label_noise = label_noise_int / 100
trainset, testset = load_data("cifar10", label_noise)

trainset, _ = random_split(trainset, [10000, len(trainset)-10000])
testset, _ = random_split(testset, [2000, len(testset)-2000])

trainset = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
testset = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /content/gdrive/MyDrive/6699/data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting /content/gdrive/MyDrive/6699/data/cifar-10-python.tar.gz to /content/gdrive/MyDrive/6699/data
Files already downloaded and verified


In [None]:
train_result = []
test_result = []
for width in range(17, 31):
  experiment_name = f'CNN_{width}_{label_noise_int}%noise'  #Provide name to model experiment
  model_name = f'CNN_{width}_{label_noise_int}%noise' 

  model = CNN(width)

  classifier = Classifier(experiment_name, model, trainset, testset, use_cuda=True)

  classifier.train(epochs=epochs)
  train_error = classifier.evaluate_train()
  test_error = classifier.evaluate_test()
  print(f'width = {width}, train_error = {train_error}, test_error = {test_error}')
  train_result.append(train_error)
  test_result.append(test_error)

100%|██████████| 1282/1282 [30:20<00:00,  1.42s/it]


width = 26, train_error = 0.0, test_error = 0.377


100%|██████████| 1282/1282 [31:11<00:00,  1.46s/it]


width = 27, train_error = 0.0, test_error = 0.37


100%|██████████| 1282/1282 [31:46<00:00,  1.49s/it]


width = 28, train_error = 0.0, test_error = 0.374


100%|██████████| 1282/1282 [33:01<00:00,  1.55s/it]


width = 29, train_error = 0.0, test_error = 0.359


100%|██████████| 1282/1282 [33:54<00:00,  1.59s/it]


width = 30, train_error = 0.0, test_error = 0.36550000000000005


In [None]:
train_result = []
test_result = []
for width in range(17, 31):
  experiment_name = f'CNN_{width}_{label_noise_int}%noise'  #Provide name to model experiment
  model_name = f'CNN_{width}_{label_noise_int}%noise' 

  model = CNN(width)

  classifier = Classifier(experiment_name, model, trainset, testset, use_cuda=True)

  classifier.train(epochs=epochs)
  train_error = classifier.evaluate_train()
  test_error = classifier.evaluate_test()
  print(f'width = {width}, train_error = {train_error}, test_error = {test_error}')
  train_result.append(train_error)
  test_result.append(test_error)

100%|██████████| 1282/1282 [21:50<00:00,  1.02s/it]


width = 17, train_error = 0.0, test_error = 0.4195


100%|██████████| 1282/1282 [22:58<00:00,  1.08s/it]


width = 18, train_error = 0.0, test_error = 0.42800000000000005


100%|██████████| 1282/1282 [24:32<00:00,  1.15s/it]


width = 19, train_error = 0.0, test_error = 0.38449999999999995


100%|██████████| 1282/1282 [26:07<00:00,  1.22s/it]


width = 20, train_error = 0.0, test_error = 0.40900000000000003


100%|██████████| 1282/1282 [27:26<00:00,  1.28s/it]


width = 21, train_error = 0.0, test_error = 0.394


100%|██████████| 1282/1282 [28:05<00:00,  1.31s/it]


width = 22, train_error = 0.0, test_error = 0.394


100%|██████████| 1282/1282 [29:21<00:00,  1.37s/it]


width = 23, train_error = 0.0, test_error = 0.372


100%|██████████| 1282/1282 [29:54<00:00,  1.40s/it]


width = 24, train_error = 0.0, test_error = 0.392


100%|██████████| 1282/1282 [35:06<00:00,  1.64s/it]


width = 25, train_error = 0.0, test_error = 0.389


 35%|███▌      | 455/1282 [12:46<23:21,  1.69s/it]

In [None]:
train_result = []
test_result = []
for width in range(1, 31):
  experiment_name = f'CNN_{width}_{label_noise_int}%noise'  #Provide name to model experiment
  model_name = f'CNN_{width}_{label_noise_int}%noise' 

  model = CNN(width)

  classifier = Classifier(experiment_name, model, trainset, testset, use_cuda=True)

  classifier.train(epochs=epochs)
  train_error = classifier.evaluate_train()
  test_error = classifier.evaluate_test()
  print(f'width = {width}, train_error = {train_error}, test_error = {test_error}')
  train_result.append(train_error)
  test_result.append(test_error)

100%|██████████| 1282/1282 [28:10<00:00,  1.32s/it]


width = 1, train_error = 0.632, test_error = 0.6295


100%|██████████| 1282/1282 [27:59<00:00,  1.31s/it]


width = 2, train_error = 0.48660000000000003, test_error = 0.5569999999999999


100%|██████████| 1282/1282 [28:03<00:00,  1.31s/it]


width = 3, train_error = 0.35329999999999995, test_error = 0.589


100%|██████████| 1282/1282 [27:41<00:00,  1.30s/it]


width = 4, train_error = 0.22499999999999998, test_error = 0.609


100%|██████████| 1282/1282 [28:00<00:00,  1.31s/it]


width = 5, train_error = 0.08540000000000003, test_error = 0.602


100%|██████████| 1282/1282 [30:13<00:00,  1.41s/it]


width = 6, train_error = 0.17769999999999997, test_error = 0.579


100%|██████████| 1282/1282 [30:42<00:00,  1.44s/it]


width = 7, train_error = 0.0, test_error = 0.567


100%|██████████| 1282/1282 [29:53<00:00,  1.40s/it]


width = 8, train_error = 9.999999999998899e-05, test_error = 0.5389999999999999


100%|██████████| 1282/1282 [30:04<00:00,  1.41s/it]


width = 9, train_error = 0.0, test_error = 0.5075000000000001


100%|██████████| 1282/1282 [31:52<00:00,  1.49s/it]


width = 10, train_error = 0.0, test_error = 0.502


100%|██████████| 1282/1282 [31:52<00:00,  1.49s/it]


width = 11, train_error = 0.0, test_error = 0.487


100%|██████████| 1282/1282 [32:35<00:00,  1.53s/it]


width = 12, train_error = 0.0, test_error = 0.493


100%|██████████| 1282/1282 [32:45<00:00,  1.53s/it]


width = 13, train_error = 0.0, test_error = 0.4585


100%|██████████| 1282/1282 [34:30<00:00,  1.62s/it]


width = 14, train_error = 0.0, test_error = 0.44199999999999995


100%|██████████| 1282/1282 [34:38<00:00,  1.62s/it]


width = 15, train_error = 9.999999999998899e-05, test_error = 0.45599999999999996


100%|██████████| 1282/1282 [34:27<00:00,  1.61s/it]


width = 16, train_error = 0.0, test_error = 0.42500000000000004


 92%|█████████▏| 1180/1282 [32:19<02:47,  1.65s/it]