
Optimization for machine learning - Mini Project: Large mini-batch scaled learning rate and gradual warm-up

## 1. Loading libraries, modules and setting directories

In [None]:
## Run this cell if PyTorch is not installed on Colab
#!pip3 install torch torchvision

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

from google.colab import drive
drive.mount('/content/gdrive', force_remount= True)

import sys
sys.path.append('gdrive/My Drive/Colab Notebooks/OptML/')

import torch
import torchvision
import torchvision.transforms as transforms

from models import CNN as CNN


import time
import math

Mounted at /content/gdrive


### Creating device and checking GPU availability 

In [5]:
### Checking if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# 2. Downloading and normalizing CIFAR 10

In [6]:
# Normalize dataset and transform into tensor
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5),(0.5, 0.5, 0.5))
    ]
)

In [7]:
# Load dataset
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)

classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')

used_categories = range(len(classes))

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=0.0, max=170498071.0), HTML(value='')))


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


## 3. Training the network - scaled learning rate and gradual warm-up 

#### Training and testing is set up with a simple validation loop (no CV), looping over for the trainset for a specified number of epochs. At every epoch, the loss, accuracy and time is captured as raw data for later plotting. If GPU was available the model is transferred to the device along with a transformation of the tensor into cuda. To load the data for every batchsize, the torch Dataloader has been utilized. The Cross Entropy loss is used as the loss function and the optimizer used througout the training is the SGD (no momentum or weight-decay). Learning rate equal to 0.001 for all of the small minibatches.

### Scaled learning rate
#### In this training setup the learning rate is scaled according to the batch size (differing from baseline notebooks) - for the largest batchsizes training becomes meaningless, which is commented and described more thoroughly in the report. This can be contributed to the very large learning rate that arises with the multiplication of the large batch sizes, along with a less sophisticated netork compared the the studies of Goyal et al (2017) and  Masters D., Luschi C. (2018).

### Gradual warm-up
#### Furthermore a gradual warm-up of the learning rate was implemented to see if this would improve predictions even further. Again training became meaningless at some epoch for the largest batchsizes. Four settings for the warm-up rate has been implemented - exponential, linear, discontinuous (step function) and no warm-up. Used setting is specified in the variable "gradual_warmup_scheme".

In [10]:
#from torch.autograd import Variable
import torch.optim as optim

# Settings for training
power = list(range(5,0,-1))
p = 0 
batches = [128, 256, 512, 1048, 2048]
num_epoch = 70
learning_rate_global = 0.001

accuracies = []
epochs = []
time_pr_epoch = []
batch_sizes = []
loss_pr_epoch = []



# Scheme can be either exponential/linear/discontinuous/nowarmup
gradual_warmup_scheme= "exponential"


for BATCH_SIZE in batches:
  model = CNN()
  model.to(device)
  print("######### BATCH SIZE = ",BATCH_SIZE," ######### " )

  # Learning rate divide
  learning_rate = learning_rate_global * BATCH_SIZE
  # Initialize trainloader and test loader   
  trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE,
                                            shuffle=True, num_workers=2)
  testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE,
                                          shuffle=True, num_workers=2)

  criterion =  torch.nn.CrossEntropyLoss()
  warmup= (num_epoch*5)/100

  print("Warmup for ",warmup," epochs.")
  for epoch in range(num_epoch):  # Loop over the dataset multiple times


      # Setup gradual warmup schemes
      if(gradual_warmup_scheme == "exponential"):
        
        # Continue with our initial learning rate after reaching 5% of the epochs
        if(warmup==1):
          optimizer = optim.SGD(model.parameters(),lr=learning_rate)
          print("normal LR: ", learning_rate)      
        if(warmup>1):
          lr_warmup = (learning_rate/(2**warmup))
          optimizer = optim.SGD(model.parameters(),lr=lr_warmup )
          print("warmup LR: ", lr_warmup)
          warmup=warmup-1

      # Setup gradual warmup schemes
      if(gradual_warmup_scheme == "linear"):
        
        # Continue with our initial learning rate after reaching 5% of the epochs
        if(warmup==1):
          optimizer = optim.SGD(model.parameters(),lr=learning_rate)
          print("normal LR: ", learning_rate)      
        if(warmup>1):
          slope = learning_rate/warmup
          lr_warmup = epoch*slope
          optimizer = optim.SGD(model.parameters(),lr=lr_warmup )
          print("warmup LR: ", lr_warmup)
          warmup=warmup-1

      # Setup gradual warmup schemes
      if(gradual_warmup_scheme == "discontinuous"):
        
        # Continue with our initial learning rate after reaching 5% of the epochs
        if(warmup==1):
          optimizer = optim.SGD(model.parameters(),lr=learning_rate)
          print("normal LR: ", learning_rate)      
        if(warmup>1):
          lr_warmup = (learning_rate/warmup)
          optimizer = optim.SGD(model.parameters(),lr=0.0005 )
          print("warmup LR: ", lr_warmup)
          warmup=warmup-1
      
      # Setup gradual warmup schemes
      if(gradual_warmup_scheme == "nowarmup"):
        optimizer = optim.SGD(model.parameters(),lr=learning_rate)
      
      




      start_time = time.time()

      running_loss = 0.0
      running_loss_for_epoch = 0.0

      for i, data in enumerate(trainloader, 0):
          
          # Get the inputs
          inputs, labels = data
          

          # Wrap them in Variable
          inputs, labels = inputs.to(device), labels.to(device)

          # Zero the parameter gradients
          optimizer.zero_grad()

          # Forward + backward + optimize
          outputs = model(inputs)
          loss = criterion(outputs,labels)
          loss.backward()
          optimizer.step()
          

          # Print statistics
          running_loss += loss.item()
          running_loss_for_epoch += loss.item()
          if i % 100 == 99:    # print every 100 mini-batches
              print('[epoch : %d, minibatch : %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / 100))
              running_loss = 0.0

          correct = 0
          total = 0

      for i, data in enumerate(testloader, 0):
          inputs, labels = data
          inputs, labels = inputs.to(device), labels.to(device)

          outputs = model(inputs)
          _, predicted = torch.max(outputs.data, 1)
          total += labels.size(0)
          correct += (predicted == labels).sum().item()

      print('Accuracy of the network on the {} test images: {:4.2f} %'.format(
          testset.data.shape[0], 100 * (correct / total)))

      finish_time = (time.time() - start_time)
      batches_pr_epoch = math.ceil(trainset.data.shape[0] / BATCH_SIZE)
      loss_pr_epoch.append(running_loss_for_epoch / batches_pr_epoch)
      accuracies.append(100 * (correct / total))
      batch_sizes.append(BATCH_SIZE)
      epochs.append(epoch)
      time_pr_epoch.append(finish_time)
      
    
  print('Finished Training')

  
print("loss pr epoch", loss_pr_epoch)
print("accuracies", accuracies)
print("batch_sizes", batch_sizes)
print("epochs", epochs)
print("time_pr_epoch", time_pr_epoch)

######### BATCH SIZE =  2048  ######### 
Warmup for  3.5  epochs.
warmup LR:  0.18101933598375616
Accuracy of the network on the 10000 test images: 29.64 %
warmup LR:  0.3620386719675123
Accuracy of the network on the 10000 test images: 18.11 %
warmup LR:  0.7240773439350247
Accuracy of the network on the 10000 test images: 12.86 %
Accuracy of the network on the 10000 test images: 22.02 %
Accuracy of the network on the 10000 test images: 22.65 %
Accuracy of the network on the 10000 test images: 14.80 %
Accuracy of the network on the 10000 test images: 23.78 %
Accuracy of the network on the 10000 test images: 33.06 %
Accuracy of the network on the 10000 test images: 34.23 %
Accuracy of the network on the 10000 test images: 29.86 %
Accuracy of the network on the 10000 test images: 40.77 %
Accuracy of the network on the 10000 test images: 46.27 %
Accuracy of the network on the 10000 test images: 42.52 %
Accuracy of the network on the 10000 test images: 51.43 %
Accuracy of the network on t

KeyboardInterrupt: ignored

## 4. Saving results

#### Results are saved in a tab-seperated csv.

In [9]:
import pandas as pd

results = pd.DataFrame(
    {'batch_size': batch_sizes,
     'epoch': epochs,
     'accuracy': accuracies,
     'loss': loss_pr_epoch,
     'time': time_pr_epoch
    })

results.to_csv('gdrive/MyDrive/results_large_batches_scaled_lr_warmup.csv', sep ='\t')