Part1. Train VGG16 with quantization-aware training (15%)

 - Train for 4-bit input activation and 4-bit weight to achieve >90% accuracy. 

 - This time, however, reduce the number of input channels and the number of output channels of one chosen convolution layer to 8 each. (v)

 - Also, remove the batch normalization layer after the squeezed convolution. (v)

  e.g., replace "conv -> relu -> batchnorm" with "conv -> relu"

 - This layer will be mapped on your 8x8 2D systolic array. Thus, reducing to 8 channels helps your layer's mapping in an array nicely without tiling.

 - This time, compute your "psum_recovered" as in HW5, but including the ReLU, and compare it with the prehooked input of the next layer (instead of your computed psum_ref).

 - [hint] It is recommended not to reduce the channels of a Conv layer that sits too early in the network, because an early layer's feature map size (nij) is large, which incurs long verification cycles.

   (recommended location: around 27-th layer, e.g., features[27] for VGGNet)

 - Measure of success: accuracy >90% with 8 input/output channels + error < 10^-3 for psum_recovered for VGGNet.

In [1]:
import argparse
import os
import time
import shutil

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn     

import torchvision
import torchvision.transforms as transforms

from models import *

import os
# Order CUDA devices by PCI bus ID and expose only device "0" to this process.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="0"

# NOTE(review): a `global` statement at module scope has no effect;
# best_prec is assigned later, right before the training loop.
global best_prec
use_gpu = torch.cuda.is_available()
print('=> Building model...')
    
    
batch_size = 128
model_name = "VGG16_quant"
# VGG16_quant is presumably provided by `from models import *` — not visible here.
model = VGG16_quant()
print(model)

# Per-channel mean/std used to normalize both splits
# (presumably CIFAR-10 channel statistics — TODO confirm).
normalize = transforms.Normalize(mean=[0.491, 0.482, 0.447], std=[0.247, 0.243, 0.262])


# Training split: random 32x32 crop after 4-pixel padding and random
# horizontal flip, then tensor conversion and normalization.
train_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]))
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)


# Test split: no augmentation, only tensor conversion and normalization.
test_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ]))

testloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)


print_freq = 100 # every 100 batches, accuracy printed. Here, each batch includes "batch_size" data points
# CIFAR10 has 50,000 training data, and 10,000 validation data.
# CIFAR10 has 50,000 training data, and 10,000 validation data.

def train(trainloader, model, criterion, optimizer, epoch):
    """Run one training epoch over ``trainloader``.

    Every batch is moved to the GPU with ``.cuda()``, passed through
    ``model``, scored with ``criterion`` and used for a single ``optimizer``
    step.  Running loss and top-1 precision are kept in ``AverageMeter``
    objects and a status line is printed every ``print_freq`` batches.
    ``epoch`` is only used in the printed status line.  Returns nothing.
    """
    iter_timer = AverageMeter()
    load_timer = AverageMeter()
    loss_meter = AverageMeter()
    prec_meter = AverageMeter()

    status_line = ('Epoch: [{0}][{1}/{2}]\t'
                   'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                   'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                   'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                   'Prec {top1.val:.3f}% ({top1.avg:.3f}%)')

    model.train()

    tick = time.time()
    for step, (images, labels) in enumerate(trainloader):
        # time spent waiting for the loader to deliver this batch
        load_timer.update(time.time() - tick)

        images, labels = images.cuda(), labels.cuda()

        # forward pass
        logits = model(images)
        loss = criterion(logits, labels)

        # statistics, weighted by the number of samples in the batch
        n_samples = images.size(0)
        top1_prec = accuracy(logits, labels)[0]
        loss_meter.update(loss.item(), n_samples)
        prec_meter.update(top1_prec.item(), n_samples)

        # backward pass and SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # duration of the whole iteration, then restart the clock
        iter_timer.update(time.time() - tick)
        tick = time.time()

        if step % print_freq == 0:
            print(status_line.format(
                epoch, step, len(trainloader), batch_time=iter_timer,
                data_time=load_timer, loss=loss_meter, top1=prec_meter))

            

def validate(val_loader, model, criterion):
    """Evaluate ``model`` on ``val_loader`` and return the mean top-1 precision.

    The model is put in eval mode and all batches are processed under
    ``torch.no_grad()`` after being moved to the GPU with ``.cuda()``.  A
    status line is printed every ``print_freq`` batches and a summary line
    at the end.

    Returns:
        The running average of the top-1 precision (in percent) over all
        samples seen.
    """
    iter_timer = AverageMeter()
    loss_meter = AverageMeter()
    prec_meter = AverageMeter()

    status_line = ('Test: [{0}/{1}]\t'
                   'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                   'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                   'Prec {top1.val:.3f}% ({top1.avg:.3f}%)')

    # evaluation mode
    model.eval()

    tick = time.time()
    with torch.no_grad():
        for step, (images, labels) in enumerate(val_loader):
            images, labels = images.cuda(), labels.cuda()

            # forward pass
            logits = model(images)
            loss = criterion(logits, labels)

            # statistics, weighted by the number of samples in the batch
            n_samples = images.size(0)
            top1_prec = accuracy(logits, labels)[0]
            loss_meter.update(loss.item(), n_samples)
            prec_meter.update(top1_prec.item(), n_samples)

            # duration of this iteration, then restart the clock
            iter_timer.update(time.time() - tick)
            tick = time.time()

            # print_freq controls how often the status is printed
            if step % print_freq == 0:
                print(status_line.format(
                    step, len(val_loader), batch_time=iter_timer,
                    loss=loss_meter, top1=prec_meter))

    print(' * Prec {top1.avg:.3f}% '.format(top1=prec_meter))
    return prec_meter.avg


def accuracy(output, target, topk=(1,)):
    """Compute the precision@k for each k in ``topk``.

    Args:
        output: score tensor; the top-k is taken along dimension 1, so each
            row is expected to hold the class scores of one sample.
        target: tensor of true class indices, one per row of ``output``.
        topk: iterable of k values to evaluate.

    Returns:
        A list with one 0-dim tensor per entry of ``topk`` holding the
        percentage (0-100) of samples whose target is among the k
        highest-scoring classes.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    # Indices of the maxk best classes per sample, transposed to (maxk, batch)
    # so that row j holds every sample's (j+1)-th best guess.
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # reshape instead of view: for k > 1 the slice of the transposed
        # tensor is not contiguous and view(-1) raises a RuntimeError.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


class AverageMeter(object):
    """Keep the most recent value and the running weighted mean of a series."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the latest value, mean, weighted sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

        
def save_checkpoint(state, is_best, fdir):
    """Save ``state`` to ``fdir/checkpoint.pth`` with ``torch.save``.

    When ``is_best`` is truthy the saved file is additionally copied to
    ``fdir/model_best.pth.tar``.
    """
    checkpoint_path = os.path.join(fdir, 'checkpoint.pth')
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    best_path = os.path.join(fdir, 'model_best.pth.tar')
    shutil.copyfile(checkpoint_path, best_path)


def adjust_learning_rate(optimizer, epoch):
    """Multiply every param group's learning rate by 0.1 at epochs 100, 200 and 300.

    ``optimizer.param_groups`` is modified in place; for any other ``epoch``
    nothing changes.
    """
    milestones = (100, 200, 300)
    if epoch not in milestones:
        return
    for group in optimizer.param_groups:
        group['lr'] = group['lr'] * 0.1

#model = nn.DataParallel(model).cuda()
#all_params = checkpoint['state_dict']
#model.load_state_dict(all_params, strict=False)
#criterion = nn.CrossEntropyLoss().cuda()
#validate(testloader, model, criterion)

=> Building model...
VGG_quant(
  (features): Sequential(
    (0): QuantConv2d(
      3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
      (weight_quant): weight_quantize_fn()
    )
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): QuantConv2d(
      64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
      (weight_quant): weight_quantize_fn()
    )
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): QuantConv2d(
      64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
      (weight_quant): weight_quantize_fn()
    )
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): QuantConv2d(
      128, 128, kernel_size=(3, 3), stride

In [2]:
# Hyper-parameters and objects for this training run.
lr = 1e-2
weight_decay = 1e-3
epochs = 100
best_prec = 0
model = model.cuda()
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=weight_decay)
# weight decay: for regularization to prevent overfitting
     

if not os.path.exists('result'):
    os.makedirs('result')
    
# Directory passed to save_checkpoint, which writes checkpoint.pth and
# model_best.pth.tar into it.
fdir = 'result/'+str(model_name)+str('_1206_2bit')

if not os.path.exists(fdir):
    os.makedirs(fdir)
        
#PATH = "result/VGG16_quant1129_5/model_best.pth.tar"
#checkpoint = torch.load(PATH)
#model.load_state_dict(checkpoint['state_dict'])
#device = torch.device("cuda") 
        
        
        
# NOTE(review): with epochs = 100 the loop covers epochs 0..99, while
# adjust_learning_rate only acts at epochs 100, 200 and 300, so the learning
# rate is never decayed in this run — confirm that this is intended.
for epoch in range(0, epochs):
    adjust_learning_rate(optimizer, epoch)

    train(trainloader, model, criterion, optimizer, epoch)
    
    # evaluate on test set
    print("Validation starts")
    prec = validate(testloader, model, criterion)

    # remember best precision and save checkpoint
    is_best = prec > best_prec
    best_prec = max(prec,best_prec)
    # NOTE(review): '{:1f}' means minimum width 1 with the default 6 decimals;
    # '{:.1f}' may have been intended.
    print('best acc: {:1f}'.format(best_prec))
    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'best_prec': best_prec,
        'optimizer': optimizer.state_dict(),
    }, is_best, fdir)

Epoch: [0][0/391]	Time 1.862 (1.862)	Data 0.444 (0.444)	Loss 2.4682 (2.4682)	Prec 7.812% (7.812%)
Epoch: [0][100/391]	Time 0.045 (0.063)	Data 0.002 (0.006)	Loss 2.3877 (2.4142)	Prec 11.719% (14.534%)
Epoch: [0][200/391]	Time 0.045 (0.054)	Data 0.002 (0.004)	Loss 2.0488 (2.2710)	Prec 21.094% (16.834%)
Epoch: [0][300/391]	Time 0.045 (0.051)	Data 0.002 (0.003)	Loss 2.0191 (2.2028)	Prec 16.406% (18.246%)
Validation starts
Test: [0/79]	Time 0.316 (0.316)	Loss 2.0058 (2.0058)	Prec 17.188% (17.188%)
 * Prec 23.160% 
best acc: 23.160000
Epoch: [1][0/391]	Time 0.384 (0.384)	Data 0.347 (0.347)	Loss 1.9399 (1.9399)	Prec 23.438% (23.438%)
Epoch: [1][100/391]	Time 0.045 (0.049)	Data 0.002 (0.006)	Loss 2.0678 (1.9813)	Prec 22.656% (24.134%)
Epoch: [1][200/391]	Time 0.045 (0.047)	Data 0.002 (0.005)	Loss 2.0899 (1.9734)	Prec 17.969% (24.056%)
Epoch: [1][300/391]	Time 0.045 (0.047)	Data 0.002 (0.004)	Loss 2.0215 (1.9588)	Prec 22.656% (24.647%)
Validation starts
Test: [0/79]	Time 0.393 (0.393)	Loss 2.08

In [4]:
# Reload the best checkpoint written during training and measure test accuracy.
PATH = "result/VGG16_quant_1206_2bit/model_best.pth.tar"
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['state_dict'])
device = torch.device("cuda") 

model.cuda()
model.eval()

# NOTE(review): test_loss is never accumulated in the loop below, so it stays 0
# and the division further down has no effect on anything printed.
test_loss = 0
correct = 0

with torch.no_grad():
    for data, target in testloader:
        data, target = data.to(device), target.to(device) # loading to GPU
        output = model(data)
        # index of the highest score per sample; keepdim so view_as can match shapes
        pred = output.argmax(dim=1, keepdim=True)  
        correct += pred.eq(target.view_as(pred)).sum().item()

test_loss /= len(testloader.dataset)

print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(testloader.dataset),
        100. * correct / len(testloader.dataset)))


Test set: Accuracy: 8336/10000 (83%)



In [5]:
## Send an image and use prehook to grab the inputs of all the QuantConv2d layers

class SaveOutput:
    """Callable recorder: stores the second argument of every call in ``outputs``.

    The call signature ``(module, module_in)`` lets an instance be registered
    as a forward pre-hook; ``module`` itself is ignored.
    """

    def __init__(self):
        self.clear()

    def __call__(self, module, module_in):
        self.outputs.append(module_in)

    def clear(self):
        """Drop everything recorded so far by binding a fresh empty list."""
        self.outputs = []
        
######### Save inputs from selected layer ##########
save_output = SaveOutput()
device = torch.device("cuda" if use_gpu else "cpu") 
counter =0
# Register the recorder as a forward pre-hook on every torch.nn.Conv2d
# instance in the model; counter numbers the hooked layers in the printout.
# NOTE(review): running this cell again registers the hook a second time on
# the same layers — presumably the cell is meant to run once per session.
for layer in model.modules():
    if isinstance(layer, torch.nn.Conv2d):
        print("prehooked")
        counter += 1
        print(layer, counter)
        layer.register_forward_pre_hook(save_output)       ## Input for the module will be grabbed
####################################################

# One forward pass on a single training batch; every hooked layer appends its
# input to save_output.outputs during this call.
dataiter = iter(trainloader)
images, labels = next(dataiter)
images = images.to(device)
out = model(images)

prehooked
QuantConv2d(
  3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
  (weight_quant): weight_quantize_fn()
) 1
prehooked
QuantConv2d(
  64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
  (weight_quant): weight_quantize_fn()
) 2
prehooked
QuantConv2d(
  64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
  (weight_quant): weight_quantize_fn()
) 3
prehooked
QuantConv2d(
  128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
  (weight_quant): weight_quantize_fn()
) 4
prehooked
QuantConv2d(
  128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
  (weight_quant): weight_quantize_fn()
) 5
prehooked
QuantConv2d(
  256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
  (weight_quant): weight_quantize_fn()
) 6
prehooked
QuantConv2d(
  256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
  (weight_quant): weight_quantize_fn()
) 7
prehooked
QuantConv2d(
  2

In [6]:
# NOTE(review): w_bit is assumed to match the weight bit width the loaded
# checkpoint was trained with — TODO confirm.
w_bit = 2
quantConv2d = model.features[27]
print(quantConv2d)
#weight_q = model.layer1[0].conv2.weight_q # quantized value is stored during the training
weight_q = quantConv2d.weight_q
print(weight_q)

weight_alpha = quantConv2d.weight_quant.wgt_alpha
print(weight_alpha)
# Step size: alpha divided by the largest positive level, 2**(w_bit-1) - 1.
w_delta = weight_alpha / (2 ** (w_bit - 1) - 1)   # delta can be calculated by using alpha and w_bit
weight_int =  weight_q / w_delta# w_int can be calculated by weight_q and w_delta
print(weight_int) # you should see clean integer numbers

QuantConv2d(
  8, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
  (weight_quant): weight_quantize_fn()
)
Parameter containing:
tensor([[[[ 1.8314,  0.0000, -0.0000],
          [ 1.8314,  0.0000, -0.0000],
          [ 1.8314,  1.8314,  0.0000]],

         [[-1.8314, -1.8314, -1.8314],
          [ 1.8314,  0.0000, -1.8314],
          [ 1.8314,  0.0000, -0.0000]],

         [[-1.8314, -1.8314, -1.8314],
          [-1.8314, -0.0000, -1.8314],
          [ 0.0000,  0.0000,  0.0000]],

         [[ 1.8314,  1.8314, -0.0000],
          [ 1.8314,  1.8314, -0.0000],
          [ 1.8314,  1.8314, -0.0000]],

         [[ 1.8314,  1.8314,  0.0000],
          [-1.8314, -0.0000, -1.8314],
          [-1.8314, -0.0000, -0.0000]],

         [[-0.0000, -0.0000, -0.0000],
          [-0.0000, -0.0000, -0.0000],
          [-0.0000, -0.0000, -0.0000]],

         [[ 0.0000, -0.0000, -0.0000],
          [ 1.8314,  1.8314,  1.8314],
          [ 0.0000, -0.0000, -1.8314]],

         [[ 1.8314, -

In [7]:
##### Find "weight_int" for features[27] ####
w_bit = 2
weight_q = model.features[27].weight_q
w_alpha = model.features[27].weight_quant.wgt_alpha
# Step size: alpha divided by the largest positive level, 2**(w_bit-1) - 1.
w_delta = w_alpha /(2**(w_bit-1)-1)

# Dividing the quantized weights by the step size gives the integer levels.
weight_int = weight_q / w_delta
print(weight_int)

tensor([[[[ 1.,  0., -0.],
          [ 1.,  0., -0.],
          [ 1.,  1.,  0.]],

         [[-1., -1., -1.],
          [ 1.,  0., -1.],
          [ 1.,  0., -0.]],

         [[-1., -1., -1.],
          [-1., -0., -1.],
          [ 0.,  0.,  0.]],

         [[ 1.,  1., -0.],
          [ 1.,  1., -0.],
          [ 1.,  1., -0.]],

         [[ 1.,  1.,  0.],
          [-1., -0., -1.],
          [-1., -0., -0.]],

         [[-0., -0., -0.],
          [-0., -0., -0.],
          [-0., -0., -0.]],

         [[ 0., -0., -0.],
          [ 1.,  1.,  1.],
          [ 0., -0., -1.]],

         [[ 1., -0., -0.],
          [-1., -1., -1.],
          [-1., -1., -1.]]],


        [[[ 0.,  1.,  1.],
          [ 1.,  1.,  1.],
          [ 0.,  0., -0.]],

         [[-1., -1., -1.],
          [-1., -1., -1.],
          [-0.,  0., -0.]],

         [[-1.,  0.,  1.],
          [-0.,  1.,  1.],
          [ 0.,  1.,  1.]],

         [[-1., -0.,  1.],
          [ 0.,  0.,  1.],
          [-1., -0., -1.]],

  

In [8]:
# Entry 8 of the recorded pre-hook inputs; [0] takes the first element of the
# recorded input. Intended to be the input of model.features[27] — this
# assumes features[27] is the 9th hooked conv; TODO confirm.
act = save_output.outputs[8][0] # input of 27th quantconv
act_alpha  = model.features[27].act_alpha
# NOTE(review): act_bit is assumed to match the activation bit width used in
# training — TODO confirm.
act_bit = 2
# act_quantization is presumably provided by `from models import *`.
act_quant_fn = act_quantization(act_bit)
act_q = act_quant_fn(act, act_alpha)
# Step size: alpha divided by the largest level, 2**act_bit - 1.
act_delta = act_alpha / (2**act_bit - 1)
act_int = act_q / act_delta
print(act_int)

tensor([[[[1., 2., 1., 0.],
          [0., 1., 1., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.]],

         [[0., 0., 0., 0.],
          [0., 0., 1., 0.],
          [0., 1., 2., 1.],
          [0., 1., 1., 0.]],

         [[1., 1., 0., 1.],
          [0., 0., 0., 0.],
          [1., 0., 0., 0.],
          [1., 0., 0., 0.]],

         ...,

         [[0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.]],

         [[0., 0., 0., 1.],
          [1., 1., 0., 1.],
          [1., 1., 0., 0.],
          [0., 0., 0., 0.]],

         [[1., 1., 1., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.]]],


        [[[1., 2., 3., 3.],
          [0., 0., 0., 2.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.]],

         [[0., 0., 0., 0.],
          [0., 0., 1., 1.],
          [0., 1., 3., 1.],
          [0., 0., 0., 0.]],

         [[2., 1., 1., 2.],
          [2., 1., 0., 1.],
          [0., 

In [9]:
# Entry 9 of the recorded pre-hook inputs, i.e. the input of the next hooked
# conv; used below as the reference for the recovered output.
out_ref = save_output.outputs[9][0]

In [10]:
# Bias-free 3x3 convolution with 8 input and 8 output channels whose weight
# is replaced by the integer weights computed earlier.
conv_int = torch.nn.Conv2d(8,8, kernel_size = 3, padding = 1, bias = False)
conv_int.weight = torch.nn.parameter.Parameter(weight_int)
out_int = conv_int(act_int)
# presumably the ReLU that follows features[27] — TODO confirm
relu = model.features[28]
# Scale the integer result back by both step sizes, then apply the ReLU.
out_recovered = out_int * act_delta * w_delta
out_recovered = relu(out_recovered)


In [11]:
# Mean absolute element-wise difference between the hooked reference and the
# recovered output.
difference = abs(out_ref - out_recovered)
print(difference.mean())

tensor(2.7509e-07, device='cuda:0', grad_fn=<MeanBackward0>)
