<a href="https://colab.research.google.com/github/Krishna14/quantization-bit-serial/blob/main/modelQuantize.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### This cell contains the code for the Quantizer. The quantizer applies asymmetric quantization from ```FP32``` to signed N-bit integers (```INTN```).

In [None]:
from torch.quantization import QuantStub, DeQuantStub
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

class Quantizer(nn.Module):
  """
    Asymmetric FP32 -> signed N-bit quantizer.

    Offers hand-written helpers (``quantizeTensor``, ``quantizeTensorBits``,
    ``generateTwosComplement``) plus a thin wrapper around PyTorch's
    prepare/convert quantization flow (``quantizeModel``).

    NOTE: the methods use ``np`` and ``os``; in this notebook those names are
    imported by a later cell, which must have run before the methods are called.
  """
  def __init__(self, model_fp32, inp_tensor, numBits, debug_mode=False):
    """
      Args:
        model_fp32 - Model that ``quantizeModel`` prepares and converts in place
        inp_tensor - Tensor kept on the instance as ``self.input``
        numBits - Bit width used by ``quantizeActivations``
        debug_mode - Kept on the instance as ``self._debug_mode``
    """
    super(Quantizer, self).__init__()
    self.input = inp_tensor
    self.quant = torch.quantization.QuantStub()
    self.model_fp32 = model_fp32
    self.dequant = torch.quantization.DeQuantStub()
    self.numBits = numBits
    self._debug_mode = debug_mode

  def quantizeTensor(self, input_tensor, numBits, isActivations=False, debug_mode=False):
    """
      Quantize a tensor to signed ``numBits``-wide integer values.

      Args:
        input_tensor - Tensor to quantize (3-D when isActivations, else 4-D)
        numBits - number of bits to quantize the value.
        isActivations - Selects which rank check is applied (3-D vs 4-D)
        debug_mode - Print the observed min/max when True
      Returns:
        (output, q_scale, zero_point, minVal) where
          q_scale    = (max - min) / (2**numBits - 1)
          zero_point = round(min / q_scale)
          output     = round(x / q_scale - zero_point) - 2**(numBits-1),
                       a NEW tensor with the shape and dtype of the input.
      Raises:
        AssertionError - if the tensor rank does not match ``isActivations``
        ZeroDivisionError - if all elements are equal (q_scale is 0)

      The input tensor is left untouched. (Previously the values were written
      back through the NumPy view that shares memory with the tensor, which
      silently replaced e.g. the model weights with their quantized values.)
    """
    print("quantizeTensor: type(input_tensor) = {}".format(type(input_tensor)))
    minVal, maxVal = torch.min(input_tensor).item(), torch.max(input_tensor).item()
    if isActivations:
      assert len(input_tensor.shape) == 3, "Shape of activations tensor isn't correct"
    else:
      assert len(input_tensor.shape) == 4, "Shape of weights tensor isn't correct"

    print("Types are {} and {}".format(type(minVal), type(maxVal)))
    if debug_mode:
      print("quantizeTensor: min and max values of the tensor are {} and {}".format(minVal, maxVal))
    divisor, shifter = ((2 ** numBits) - 1), 2**(numBits-1)
    q_scale = float((maxVal - minVal)/divisor)
    zero_point = round(minVal/q_scale)
    # detach() is a no-op for tensors that do not require grad, so one path suffices.
    source_np = input_tensor.detach().numpy()
    # astype() copies, so the caller's storage is never written to. np.rint
    # rounds half to even, matching the built-in round() used per element before.
    quantized_np = np.rint(source_np.astype(np.float64) / q_scale - zero_point) - shifter
    output = torch.tensor(quantized_np.astype(source_np.dtype))
    assert output.shape == input_tensor.shape
    return output, q_scale, zero_point, minVal

  def quantizeTensorBits(self, input_tensor, numBits, isActivations=False,
                         debug_mode=False):
    """
      Quantize a tensor and expand every value into its two's complement bits.

      Args:
        input_tensor - Tensor which needs to be quantized
        numBits - Width of every quantized value in bits
        isActivations - Forwarded to ``quantizeTensor`` (3-D vs 4-D rank check)
        debug_mode - Forwarded to ``quantizeTensor``
      Returns:
        (bits, scale, zero_point, minVal); ``bits`` has the input's shape plus
        a trailing axis of length ``numBits`` holding 0/1 values, MSB first.

      Works for any rank accepted by ``quantizeTensor`` (the previous version
      indexed four axes unconditionally and failed for 3-D activations).
    """
    signQuantizedTensor, scale, zero_point, minVal = self.quantizeTensor(
        input_tensor, numBits, isActivations, debug_mode)
    if isActivations:
      assert len(input_tensor.shape) == 3, "quantizeTensorBits: activations tensor has wrong shape"
    else:
      assert len(input_tensor.shape) == 4, "quantizeTensorBits: weights tensor has wrong shape"

    # Read the quantized values from quantizeTensor's result rather than from
    # the (now unmodified) input tensor.
    quantized_np = signQuantizedTensor.numpy()
    output = np.zeros(quantized_np.shape + (numBits,))
    for index in np.ndindex(quantized_np.shape):
      output[index] = self.generateTwosComplement(quantized_np[index], numBits)
    return torch.tensor(output), scale, zero_point, minVal

  def generateTwosComplement(self, value, numBits, debug_mode=False):
    """
      Args:
        value - Input number for which we need to generate two's complement
                (truncated to an int first)
        numBits - Number of bits in the representation
        debug_mode - Print the sign bit and the resulting bit list
      Returns:
        List of ``numBits`` ints (0/1), most significant bit first. Values
        outside [-2**(numBits-1), 2**(numBits-1) - 1] wrap modulo 2**numBits.
    """
    bitMask = (2 ** numBits) - 1
    original = int(value)
    # Masking a Python int yields the two's complement bit pattern directly.
    pattern = original & bitMask
    result = [(pattern >> position) & 1 for position in range(numBits - 1, -1, -1)]
    if debug_mode:
      print("Sign is {}".format(pattern >> (numBits - 1)))
      print("Result for input = {} is {}".format(original, result))
    return result

  def print_size_of_model(self, model, filename="defaultModel.p"):
    """
      Save ``model.state_dict()`` to ``filename`` and print the file size.

      Returns:
        The size of the written file in MB (bytes / 1e6).
    """
    torch.save(model.state_dict(), filename)
    size_mb = os.path.getsize(filename)/1e6
    print("Size (MB): ", size_mb)
    return size_mb

  def quantizeActivations(self, inputs):
    """
      Quantize ``inputs`` with ``quantizeTensorBits`` using ``self.numBits``.

      Also sets the default qconfig on the wrapped model.
      Returns:
        The tuple produced by ``quantizeTensorBits``.
    """
    self.model_fp32.qconfig = torch.quantization.default_qconfig
    # Use the instance's bit width instead of a notebook-level global.
    x = self.quantizeTensorBits(inputs, self.numBits)
    return x

  def quantizeModel(self):
    """
      Run PyTorch's prepare/convert quantization on ``self.model_fp32`` in
      place using the default qconfig, printing the serialized model size
      before and after. Writes "defaultVGG16.p" and "quantizedVGG16.p" to the
      working directory. Returns nothing.
    """
    self.model_fp32.qconfig = torch.quantization.default_qconfig
    print("Quantization configuration is {}".format(self.model_fp32.qconfig))
    print("Model size prior to Quantization is ")
    model_size_pre_quantize = self.print_size_of_model(self.model_fp32, "defaultVGG16.p")
    torch.quantization.prepare(self.model_fp32, inplace=True)
    print("Post training quantization preparation step")
    self.model_fp32.eval()
    # Conversion from FP32 to Fixed point representation
    torch.quantization.convert(self.model_fp32, inplace=True)
    print("Model size post Quantization is ")
    model_size_post_quantize = self.print_size_of_model(self.model_fp32, "quantizedVGG16.p")
    # Print statements after quantizing the model
    print("quantized the model as per the default quantization configuration")

### This cell has the code for the ```Bit-Serial``` product computation of two tensors.

In [None]:
import math

class SerialInnerProduct:
  """
    This class models the behavior of a Serial Inner Product (SIP) unit: a
    convolution is evaluated one weight bit at a time, consuming the weights
    as two's complement bit planes (MSB first) produced by the quantizer.
  """
  # Default constructor
  def __init__(self, inputs, weights, kernel_size_x, \
               kernel_size_y, numBits, quantizer, \
               stride_x=1, stride_y=1, padding_x=0, padding_y=0):
    """
      Args: inputs  - Input activations, indexed as (channel, row, col)
            weights - Weights (only stored on the instance)
            kernel_size_x - Filter size (Fx)
            kernel_size_y - Filter size (Fy)
            numBits - Bit width passed to the quantizer
            quantizer - Object providing ``quantizeTensorBits``
            stride_x, stride_y - Convolution strides
            padding_x, padding_y - Zero padding added on both sides of each axis
      Returns:
            None
    """
    self._activations = inputs
    self._weights = weights
    # Configuration for 16 x 16 SIP unit
    self._num_input_lanes = 16  # Convert this to a knob
    self._num_weight_lanes = 16 # Convert this to a knob
    self._kernel_size_x = kernel_size_x
    self._kernel_size_y = kernel_size_y
    self._stride_x = stride_x
    self._stride_y = stride_y
    self._pad_x = padding_x
    self._pad_y = padding_y
    # Pad the stored activations right away when padding was requested
    if (self._pad_x > 0 or self._pad_y > 0):
      self._activations = self.padInputs()
    self.numBits = numBits
    self.qzer = quantizer

  # Function to pad the inputs based on the parameters specified in the constructor
  def padInputs(self):
    """
      Args: None
      Returns: A new zero-padded numpy array holding ``self._activations``
               with ``_pad_x`` rows/``_pad_y`` columns of zeros on each side.
    """
    pad_x, pad_y = self._pad_x, self._pad_y
    channels = self._activations.shape[0]
    rows, cols = self._activations.shape[1], self._activations.shape[2]
    new_array = np.zeros((channels, rows + (2*pad_x), cols + (2*pad_y)))
    # Offset by the actual padding (a fixed offset of 1 is only right for pad == 1)
    new_array[:, pad_x:pad_x + rows, pad_y:pad_y + cols] = self._activations
    return new_array

  # Process the given window with the activations and weights
  def processWindow(self, activations, weights, bitNumber, debug_mode=False):
    """
      Args:
          activations - Input activations, one per lane
          weights - Per-lane weight bits; ``weights[i][bitNumber]`` is read
          bitNumber - Which weight bit to apply (0 is the most significant bit)
          debug_mode - Print the result when True
      Returns:
          Array shaped like ``activations``: each activation is kept when its
          weight bit is 1 (negated for bitNumber 0, the sign bit), else 0.
    """
    result = np.zeros(activations.shape)
    # The MSB of a two's complement number carries negative weight
    sign = -1 if bitNumber == 0 else 1
    for i in range(len(activations)):
      bit = int(weights[i][bitNumber])
      assert bit in (0, 1), "bit neither 1 nor 0 = " + str(bit)
      result[i] = sign * activations[i] if bit == 1 else 0
    if debug_mode:
      print("Result is {}".format(result))
    return result

  # This is used to process a bulk of 16 x 16 tiles to generate the given output
  def processTile(self, inputs, weights, bitNumber, debug_mode=False):
    """
      Args:
          inputs - Given input activation lanes
          weights - Weight bits, one row of lanes per filter
          bitNumber - The exact bit number which needs to be processed
          debug_mode - Print shapes when True
      Returns:
          Array of shape (weights.shape[0], weights.shape[1]) with one
          ``processWindow`` result per filter row.
    """
    if debug_mode:
      print("processTile: inputs shape = {} and weights shape = {}".format(inputs.shape, weights.shape))
    partialOutputs = np.zeros((weights.shape[0], weights.shape[1]))
    for filterIndex in range(weights.shape[0]):
      weight_lanes = weights[filterIndex][:]
      if debug_mode:
        print("processTile: weight_lanes shape = {} and weight_lanes value = {}".format(weight_lanes.shape, weight_lanes))
      partialOutputs[filterIndex][:] = self.processWindow(inputs, weight_lanes, bitNumber, False)
    if debug_mode:
      print("Shape of partialOutputs = {}".format(partialOutputs.shape))
    return partialOutputs

  def _accumulateBitSerial(self, inputs, weightBits, q_scale, minVal, pixelsPerWindow):
    """Accumulate the convolution one weight bit plane at a time.

      A quantized weight q stands for q_scale*q + (minVal + 2**(numBits-1)*q_scale),
      so besides the per-bit terms the constant offset must multiply EVERY
      input element of the receptive field (added once, during bit 0).
    """
    inputChannels = inputs.shape[0]
    numFilters = weightBits.shape[0]
    outputSize_x = int((inputs.shape[1] - self._kernel_size_x)/self._stride_x) + 1
    outputSize_y = int((inputs.shape[2] - self._kernel_size_y)/self._stride_y) + 1
    outputImages = np.zeros((numFilters, outputSize_x, outputSize_y), dtype=float)
    msbWeight = 2 ** (self.numBits - 1)
    offset = minVal + (msbWeight * q_scale)
    for bitNumber in range(self.numBits):
      # Bit 0 is the sign bit of the two's complement value
      bitWeight = -msbWeight if bitNumber == 0 else 2 ** (self.numBits - 1 - bitNumber)
      for output_r in range(outputSize_x):
        for output_c in range(outputSize_y):
          for kernel_row in range(self._kernel_size_x):
            for kernel_col in range(self._kernel_size_y):
              r = output_r * self._stride_x + kernel_row
              c = output_c * self._stride_y + kernel_col
              # One window = the channels fed to the input lanes in one step
              for windowStart in range(0, inputChannels, pixelsPerWindow):
                windowEnd = min(windowStart + pixelsPerWindow, inputChannels)
                lanes = inputs[windowStart:windowEnd, r, c]
                if bitNumber == 0:
                  outputImages[:, output_r, output_c] += offset * lanes.sum()
                # One group = the filters served by the weight lanes in one step
                for filterStart in range(0, numFilters, self._num_weight_lanes):
                  filterEnd = min(filterStart + self._num_weight_lanes, numFilters)
                  bits = weightBits[filterStart:filterEnd, windowStart:windowEnd, kernel_row, kernel_col, bitNumber]
                  outputImages[filterStart:filterEnd, output_r, output_c] += bitWeight * q_scale * (bits @ lanes)
    return outputImages

  # TODO: We need to model implementation of ReLU, MaxPooling of every layer
  # TO THINK: Whether we could insert ways to process these layers in the same hardware
  def processInputWeights(self, inputs, weights, layerNumber, pixelsPerWindow, debug_mode=False):
    """
      Args:
        inputs - Input activations, indexed as (channel, row, col)
        weights - FP32 weights, indexed as (filter, channel, kernel_row, kernel_col);
                  quantized here through ``self.qzer.quantizeTensorBits``
        layerNumber - Zero-based layer index; selects the golden output file
        pixelsPerWindow - Number of input channels processed per window
        debug_mode - Print the input dimensions when True
      Returns:
        Array (numFilters, out_rows, out_cols) with the bit-serial convolution
        of the (padded) inputs with the dequantized weights. No bias is added.

      The golden output file is only used for a diagnostic print; when it
      cannot be read the computation still runs and a notice is printed.
    """
    self._activations = inputs
    if self._pad_x > 0 or self._pad_y > 0:
      inputs = self.padInputs()
    # Work on a float ndarray so tensors and arrays are handled alike
    inputs = np.asarray(inputs, dtype=float)
    inputChannels, inputSize_x, inputSize_y = inputs.shape[0], inputs.shape[1], inputs.shape[2]
    # quantize the weights initially
    weight, q_scale, zero_point, minVal = self.qzer.quantizeTensorBits(weights, self.numBits, False)
    print("q_scale {}, zero_point {}, minVal {} for weights tensor".format(q_scale, zero_point, minVal))
    if debug_mode:
      print("Input channels = {}, input rows = {}, input cols = {}".format(inputChannels, inputSize_x, inputSize_y))
      print("Input shape = {}".format(inputs.shape))
    fileName = "/content/gdrive/MyDrive/goldenOutputsv2/outputConvLayer"+str(layerNumber + 1)+".npy"
    try:
      expectedImages = np.load(fileName)
    except OSError:
      expectedImages = None
    outputImages = self._accumulateBitSerial(inputs, np.asarray(weight), q_scale, minVal, pixelsPerWindow)
    if expectedImages is None:
      print("Expected output unavailable: could not load {}".format(fileName))
    else:
      print("Expected output is {}".format(expectedImages[0]))
    print("Actual output is {}".format(outputImages))
    return outputImages

### Wrapper class for a Convolutional layer and VGG16 network

In [None]:
class ConvLayer(nn.Module):
	"""Thin wrapper around a single ``nn.Conv2d``.

	The constructor arguments are remembered as ``in_size``, ``out_size`` and
	``kernel_size``; the convolution itself is registered as ``conv``.
	"""
	def __init__(self, in_channels, out_channels, kernel_size, padding = 0):
		super().__init__()
		self.in_size = in_channels
		self.out_size = out_channels
		self.kernel_size = kernel_size
		self.conv = nn.Conv2d(
			self.in_size,
			self.out_size,
			kernel_size=self.kernel_size,
			padding=padding,
		)

	def forward(self, x):
		"""Apply the wrapped convolution to ``x`` and return the result."""
		convolved = self.conv(x)
		return convolved

class VGG16(nn.Module):
	"""VGG16-style network: 13 3x3 ConvLayers, each followed by BatchNorm+ReLU,
	five 2x2 max-pool steps, and two fully connected layers producing 10 outputs.

	Sub-module names and their registration order are kept exactly as before so
	that existing checkpoints and ``named_parameters()`` ordering still match.
	"""
	def __init__(self):
		super().__init__()

		def conv3x3(n_in, n_out):
			# 3x3 convolution with padding 1
			return ConvLayer(in_channels=n_in, out_channels=n_out, kernel_size=3, padding=1)

		def bn_relu(width):
			# BatchNorm followed by in-place ReLU
			return nn.Sequential(nn.BatchNorm2d(width), nn.ReLU(inplace=True))

		def halve():
			# 2x2 max pooling with stride 2
			return nn.Sequential(nn.MaxPool2d(kernel_size=2, stride=2))

		# Stage 1: 3 -> 64 channels
		self.conv1_1 = conv3x3(3, 64)
		self.layer1_1 = bn_relu(64)
		self.conv1_2 = conv3x3(64, 64)
		self.layer1_2 = bn_relu(64)
		self.layer1_2p = halve()

		# Stage 2: 64 -> 128 channels
		self.conv2_1 = conv3x3(64, 128)
		self.layer2_1 = bn_relu(128)
		self.conv2_2 = conv3x3(128, 128)
		self.layer2_2 = bn_relu(128)
		self.layer2_2p = halve()

		# Stage 3: 128 -> 256 channels
		self.conv3_1 = conv3x3(128, 256)
		self.layer3_1 = bn_relu(256)
		self.conv3_2 = conv3x3(256, 256)
		self.layer3_2 = bn_relu(256)
		self.conv3_3 = conv3x3(256, 256)
		self.layer3_3 = bn_relu(256)
		self.layer3_3p = halve()

		# Stage 4: 256 -> 512 channels
		self.conv4_1 = conv3x3(256, 512)
		self.layer4_1 = bn_relu(512)
		self.conv4_2 = conv3x3(512, 512)
		self.layer4_2 = bn_relu(512)
		self.conv4_3 = conv3x3(512, 512)
		self.layer4_3 = bn_relu(512)
		self.layer4_3p = halve()

		# Stage 5: 512 channels; the last block carries its pooling inline
		self.conv5_1 = conv3x3(512, 512)
		self.layer5_1 = bn_relu(512)
		self.conv5_2 = conv3x3(512, 512)
		self.layer5_2 = bn_relu(512)
		self.conv5_3 = conv3x3(512, 512)
		self.layer5_3 = nn.Sequential(
			nn.BatchNorm2d(512),
			nn.ReLU(inplace=True),
			nn.MaxPool2d(kernel_size=2, stride=2),
		)

		# Classifier head
		self.fc1 = nn.Linear(512, 512)
		self.fc2 = nn.Linear(512, 10)

	def forward(self, x):
		"""Run ``x`` through all feature stages, flatten, then apply fc1 and fc2."""
		feature_stages = (
			"conv1_1", "layer1_1", "conv1_2", "layer1_2", "layer1_2p",
			"conv2_1", "layer2_1", "conv2_2", "layer2_2", "layer2_2p",
			"conv3_1", "layer3_1", "conv3_2", "layer3_2", "conv3_3", "layer3_3", "layer3_3p",
			"conv4_1", "layer4_1", "conv4_2", "layer4_2", "conv4_3", "layer4_3", "layer4_3p",
			"conv5_1", "layer5_1", "conv5_2", "layer5_2", "conv5_3", "layer5_3",
		)
		out = x
		for stage in feature_stages:
			out = getattr(self, stage)(out)

		# Flatten everything except the batch dimension
		out = out.view(out.size(0), -1)
		out = self.fc1(out)
		return self.fc2(out)

### Testing Quantization framework

In [None]:
import torchvision
import torchvision.transforms as transforms
# For Quantization purposes
from torch.quantization import QuantStub, DeQuantStub
import torchvision.models as models

# import numpy as np
import numpy as np
from time import time
import sys
import pandas as pd
import json

from sklearn import cluster
from joblib import Parallel, delayed
from scipy.optimize import linear_sum_assignment

import pickle
# Notebook setup: global knobs, Drive mount, CIFAR-10 test data, the FP32
# model, and a Quantizer built from the collected test images.
torch.set_printoptions(linewidth=120)
torch.set_grad_enabled(True)	 # On by default, leave it here for clarity

import os
from google.colab import drive
import struct

# NOTE(review): the next four values are not read anywhere in the visible code.
n_patterns = 32
n_groups = 8
Lambda_rate = 1.2
eval_pattern = 0
# Customized information
debug_mode = True
# integer_bits + frac_bits is the total bit width handed to the Quantizer below
integer_bits = 4
frac_bits = 4

# Model and golden-output files are read from Google Drive
drive.mount('/content/gdrive',force_remount=True)

# Training-time augmentation pipeline (defined here; only the test pipeline is
# used in the visible code)
transform_train = transforms.Compose([
	transforms.RandomCrop(32, padding=4),
	transforms.RandomHorizontalFlip(),
	transforms.ToTensor(),
	transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Test pipeline: tensor conversion and per-channel normalization only
transform_test = transforms.Compose([
	transforms.ToTensor(),
	transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

print(os.getcwd(), os.listdir())

# CIFAR-10 test split, served in batches of 100 in a fixed order
test_data = torchvision.datasets.CIFAR10(root='./data/', train=False, download=True, transform=transform_test)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=100, shuffle=False, num_workers=2)
import os
print("CWD is {}".format(os.getcwd()))
print(os.listdir())


# Obtaining the FP32 version of the model
import torchvision.models as models
#model_vgg16 = models.vgg16(pretrained=True)
# NOTE: torch.load unpickles the file, which can execute arbitrary code; only
# load checkpoints from a trusted source.
model_vgg16 = torch.load('/content/gdrive/MyDrive/model_vgg16_cifar10_bline.torch', map_location=torch.device('cpu'))

# Quantizing the activations for this input model
# Collect every test batch (labels are discarded) into one tensor
activations = []
for data in test_loader:
  inputs, labels = data
  activations.append(inputs)
activations = torch.tensor(np.concatenate(activations, axis=0))
if debug_mode:
  print("activations type = {}, shape = {}".format(type(activations), activations.shape))

# The quantizer is configured with the full bit width (integer + fractional)
qzer = Quantizer(model_vgg16, activations, integer_bits+frac_bits, debug_mode)
# Quantizing the given model and obtaining the quantized version of the model
#qzer.quantizeModel()
# TODO: This is probably not required
# qzer.quantizeActivations(activations)

Mounted at /content/gdrive
/content ['.config', 'gdrive', 'data', 'sample_data']
Files already downloaded and verified
CWD is /content
['.config', 'gdrive', 'data', 'sample_data']
activations type = <class 'torch.Tensor'>, shape = torch.Size([10000, 3, 32, 32])


### The input activations (the first test image) are selected here

In [None]:
# Take the first entry of `activations` as the sample input and report the
# shape of the sample and of the full activations tensor.
input_data = activations[0]
print("Input Shape = {}".format(input_data.shape))
print("Shape of activations = {}".format(activations.shape))

Input Shape = torch.Size([3, 32, 32])
Shape of activations = torch.Size([10000, 3, 32, 32])


In [None]:
# convert(inputTensor)
def convert(inputTensor):
  """
    Return a float64 copy of a 4-D tensor.

    Args: inputTensor - 4-D tensor (quantized tensors are dequantized first)
    Returns: A new ``torch.float64`` tensor with the same shape and values;
             it shares no memory with the input and does not require grad.
    Raises: AssertionError if the input is not 4-dimensional.
  """
  assert len(inputTensor.shape) == 4
  source = torch.as_tensor(inputTensor)
  if source.is_quantized:
    # Quantized tensors must be turned into real values before changing dtype
    source = source.dequantize()
  # One vectorised cast replaces the former element-by-element Python loop
  return source.detach().to(dtype=torch.float64, copy=True)

### Code to obtain the weights of the pre-trained model

In [None]:
# Load the FP32 model and collect its conv weights, BatchNorm weights and
# biases into module-level lists for the cells that follow.
# NOTE: torch.load unpickles the file, which can execute arbitrary code; only
# load checkpoints from a trusted source.
nonquantized_model = torch.load('/content/gdrive/MyDrive/model_vgg16_cifar10_bline.torch', map_location=torch.device('cpu'))
#quantizedModel = torch.load('./quantizedVGG16.p',map_location=torch.device('cpu'))
#print("Type of quantized model = {}".format(type(quantizedModel)))
print("Type of non quantized model = {}".format(type(nonquantized_model)))
#print(len(quantizedModel.keys()), quantizedModel.keys())
#print(quantizedModel['features.0.bias'], quantizedModel['features.0.weight'])
#weight_layer1 = quantizedModel['conv1_1.conv.weight']
#outputChannels = weight_layer1.shape[0]
#inputChannels = weight_layer1.shape[1]
#kernel_rows = weight_layer1.shape[2]
#kernel_cols = weight_layer1.shape[3]
numBits = 8

# Evaluate Quantization per Tensor
# PyTorch Quantization Per Tensor
#print(quantizedModel.keys())
# Biases, Weights, Scales
biases = []
scales = []
zero_points = []
# Accumulators filled by whichever branch below runs
values = []
nonquantized_weights = []
weights = []
convLayerNames = []
batchNormLayerNames = []
batchNormWeights = []

# Set this to TRUE if preTrained model is used for quantizing
preTrainedModel = False
quantized = False
# NOTE(review): the `quantized` branch reads `quantizedModel`, whose assignment
# is commented out above; enabling it as-is would raise NameError.
if quantized:
  # Understood quantization currently
  for k, v in quantizedModel.items():
    if preTrainedModel: 
      # Keys containing "features" are handled in this branch
      if "features" in k and "weight" in k:
        print("WEIGHT: key = {}, Type(value) = {}, value.shape = {}".format(k, type(v), v.shape))
        values.append(v)
        weights.append(convert(v))
        print("SCALES: key = {}, scale = {}".format(k, v.q_scale()))
        scales.append(v.q_scale())
      elif "features" in k and "bias" in k:
        print("BIAS: key = {}, Type(value) = {}, value.shape = {}".format(k, type(v), v.shape))
        biases.append(v)
      elif "features" in k and "zero_point" in k:
        print("ZERO_POINT: type = {}".format(type(v)))
        print("ZERO_POINT: ", v.item())
        zero_points.append(v.item())
    else:
      # Keys containing "conv" are handled in this branch
      if "conv" in k:
        if "weight" in k:
          print("WEIGHT: key = {}, Type(value) = {}, value.shape = {}".format(k, type(v), v.shape))
          nonquantized_weights.append(v)
          weights.append(convert(v))
          print("SCALES: key = {}, scale = {}".format(k, v.q_scale()))
          scales.append(v.q_scale())
          #print("ZERO_POINT: key = {}, zero_point = {}".format(k, v))
        elif "bias" in k:
          # Biases are only printed here, not collected
          print("BIAS: key = {}, Type(value) = {}, value.shape = {}".format(k , type(v), v.shape))
        elif "zero_point" in k:
          # v is a tensor
          zero_points.append(v.item())
          print("ZERO_POINT: ", v.item())
else:
  # Go through every layer and get the weights of the model
  for name, param in nonquantized_model.named_parameters():
    if "conv" in name:
      if "weight" in name:
        convLayerNames.append(name)
        # The same parameter object goes into both lists (no copy is made)
        weights.append(param)
        nonquantized_weights.append(param)
        #print("NONQUANTIZED: CONV LAYER {}, WEIGHTS SHAPE {}".format(name, param.shape))
      elif "bias" in name:
        biases.append(param)
        #print("NONQUANTIZED: CONV LAYER {}, BIAS SHAPE {}".format(name, param))
    elif "layer" in name:
      if "weight" in name:
        batchNormLayerNames.append(name)
        batchNormWeights.append(param)
        #print("NONQUANTIZED: SEQUENTIAL LAYER {}, WEIGHTS SHAPE {}".format(name, param.shape))
      elif "bias" in name:
        # Conv biases and "layer" biases end up interleaved in the same list
        biases.append(param)
        #print("NONQUANTIZED: SEQUENTIAL LAYER {}, BIAS SHAPE {}".format(name, param))

# Measurements have been made here
if quantized:
  print("Length of quantized weights = {}".format(len(weights)))
else:
  print("Length of nonquantized weights = {}".format(len(nonquantized_weights)))

Type of non quantized model = <class '__main__.VGG16'>
Length of nonquantized weights = 13


In [None]:
# Pack the collected conv-layer weights into a single NumPy array; this
# rebinds the module-level `weights` list.
# NOTE(review): the layers have different shapes, so this presumably yields an
# object array (or fails on NumPy versions that reject ragged input) — confirm.
weights = np.array(nonquantized_weights)

In [None]:
# Mount Google Drive at /content/gdrive (without force_remount, unlike the
# setup cell). The model and golden-output files are read from there.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# TODO: Set these as knobs and setup values for each one of them
integer_bits, frac_bits = 4, 4
stride_x, stride_y = 1, 1
padding_x, padding_y = 1, 1
kernel_size_x, kernel_size_y = 3, 3
SIPTile = SerialInnerProduct(input_data, weights, \
                             kernel_size_x, kernel_size_y, \
                             integer_bits + frac_bits, qzer, \
                             stride_x, stride_y, \
                             padding_x, padding_y)
# Read through the code to compute the layerNumber and weights for the given layer
activationFilePath = "/content/gdrive/MyDrive/goldenOutputs/convLayer"
weightFilePath = "/content/gdrive/MyDrive/quantizedWeights/vgg16_model_quantized"
activations = []

# Just pass the first output through the model
for i in range(1, 14):
  convLayer = np.load(activationFilePath + str(i) + ".npy")
  print("Shape of first image of convLayer = {} is {}".format(i+1, convLayer[0,:,:,:].shape))
  activations.append(convLayer[0,:,:,:])

pixelsPerWindow = 16
weightLen = len(weights)
for layerNumber in range(0, len(nonquantized_weights)):
  print("Continuing with the execution of layer number {}".format(layerNumber))
  weights = nonquantized_weights[layerNumber]
  if weights.requires_grad:
    weights = weights.detach()
  inputs = torch.from_numpy(activations[layerNumber])
  print("At layernumber = {}, type(inputs), type(weights) = {}, {}".format(layerNumber, type(inputs), type(weights)))
  print("At layernumber = {}, inputs.shape, weights.shape = {}, {}".format(layerNumber, inputs.shape, weights.shape))
  outputs = SIPTile.processInputWeights(inputs, weights, layerNumber, pixelsPerWindow, False)
  np.save("/content/gdrive/MyDrive/quantizedOutputs/activationsLayer"+str(layerNumber+1), outputs)
  print("Proceeding with the execution of layer number {}".format(layerNumber+1))

Shape of first image of convLayer = 2 is (3, 32, 32)
Shape of first image of convLayer = 3 is (64, 32, 32)
Shape of first image of convLayer = 4 is (64, 16, 16)
Shape of first image of convLayer = 5 is (128, 16, 16)
Shape of first image of convLayer = 6 is (128, 8, 8)
Shape of first image of convLayer = 7 is (256, 8, 8)
Shape of first image of convLayer = 8 is (256, 8, 8)
Shape of first image of convLayer = 9 is (256, 4, 4)
Shape of first image of convLayer = 10 is (512, 4, 4)
Shape of first image of convLayer = 11 is (512, 4, 4)
Shape of first image of convLayer = 12 is (512, 2, 2)
Shape of first image of convLayer = 13 is (512, 2, 2)
Shape of first image of convLayer = 14 is (512, 2, 2)
Continuing with the execution of layer number 0
At layernumber = 0, type(inputs), type(weights) = <class 'torch.Tensor'>, <class 'torch.Tensor'>
At layernumber = 0, inputs.shape, weights.shape = torch.Size([3, 32, 32]), torch.Size([64, 3, 3, 3])
quantizeTensor: type(input_tensor) = <class 'torch.Tenso

KeyboardInterrupt: ignored

In [None]:
# Compare the bit-serial output of layer 1 with the golden (original) output:
# print shapes, value range, the first row of the first channel of each, and
# the number of non-zero entries per output channel.
quantized_output = np.load("/content/gdrive/MyDrive/quantizedOutputs/activationsLayer" + str(1) + ".npy")
original_output = np.load("/content/gdrive/MyDrive/goldenOutputs/outputs_convLayer" + str(1) + ".npy")
# The results are available here
#print(scales[0], len(scales))
print("Shape of quantized o/p = {} and original o/p = {}".format(quantized_output.shape, original_output.shape))
print(quantized_output.shape)
minOutput, maxOutput = np.min(quantized_output), np.max(quantized_output)
print("min and max of output are {} and {}".format(minOutput, maxOutput))
# 8-bit scale and zero point of the output range; only referenced by the
# commented-out prints below
q_scale = (maxOutput - minOutput)/255
zero_point = round(minOutput/q_scale)
#print("q_scale of output tensor is {}".format(q_scale))
#print("zero_point is {}".format(zero_point))
#print("Quantized value is {}".format(round((quantized_output[0][0][3]/q_scale) - zero_point)-128))
# quantized_output has no batch axis; original_output does, hence the extra index
print(quantized_output[0,0])
print(original_output[0,0,0])
#print(scales[0], zero_points[0])
# Count the non-zero values in every output channel
numNonZeroQuantized = [0]*quantized_output.shape[0]
for idx in range(0, quantized_output.shape[0]):
  numNonZeroQuantized[idx] = np.count_nonzero(quantized_output[idx,:,:])
print(numNonZeroQuantized)

Shape of quantized o/p = (64, 32, 32) and original o/p = (100, 64, 32, 32)
(64, 32, 32)
min and max of output are -9.602738576316584 and 5.730517546559387
[0.83762959 1.31520707 1.26840806 1.15626893 1.20936132 1.34911803
 1.35902524 1.29573031 1.36324786 1.40598055 1.38623875 1.28771938
 1.25243054 1.28503194 1.29528314 1.31506737 1.38608674 1.45427131
 1.4578509  1.39166469 1.34686341 1.39870204 1.46430591 1.49781568
 1.54173731 1.56475284 1.67353174 1.73785197 1.82485383 1.89866897
 2.02741259 1.68175563]
[0.38503942 0.6964835  0.74173236 0.68195057 0.6717471  0.759539
 0.76300675 0.7356571  0.7247607  0.737562   0.73675555 0.71046275
 0.7387419  0.7998264  0.81721026 0.8006569  0.77809197 0.76845104
 0.7570511  0.69142395 0.6063319  0.5796229  0.5752282  0.6195668
 0.623373   0.6172741  0.61819047 0.62541693 0.61613566 0.56735975
 0.5220558  0.61489373]
[1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 102

### This segment of the notebook tests the baseline implementation of the bit-serial architecture. Very simple tests are used to verify that the implementation is correct.

In [None]:
import math

import numpy as np

# Bit-serial convolution check for the first conv layer.
# The layer weights are quantized to signed 8-bit values, expanded into their
# two's-complement bits, and the convolution is then accumulated one weight bit
# at a time so that it can be compared with the layer's own output.
model = torch.load('/content/gdrive/MyDrive/model_vgg16_cifar10_bline.torch', map_location=torch.device('cpu'))
inputMatrix = np.load('/content/gdrive/MyDrive/goldenOutputs/convLayer1.npy')
print("Shape of Input matrix is {}".format(inputMatrix.shape))
inputTensor = torch.tensor(inputMatrix)
# NOTE(review): the Quantizer.__init__ shown at the top of this notebook takes
# (model_fp32, inp_tensor, numBits, debug_mode=False), so frac_bits would be
# bound to debug_mode here - confirm this call matches the current class.
qzer = Quantizer(nonquantized_model, inputTensor, integer_bits, frac_bits)

convMatrix = nonquantized_weights[0].detach().numpy()
print("Shape of weights = {}".format(convMatrix.shape))
org_convMatrix = np.copy(convMatrix)
expectedResult = model.conv1_1(torch.from_numpy(inputMatrix))
print("Expected result shape = {}".format(expectedResult.shape))

# Weights quantization
maxVal = np.max(convMatrix)
minVal = np.min(convMatrix)
print("minVal and maxVal are {} and {}".format(minVal, maxVal))
q_scale = float((maxVal - minVal)/255)
print("q_scale value is {}".format(q_scale))
zero_point = round(minVal/q_scale)
print("Zero point is {}".format(zero_point))
# Quantize in place: q = round(w/q_scale - zero_point) - 128.
# convMatrix comes from .detach().numpy(), so this also overwrites the values
# held by nonquantized_weights[0]; org_convMatrix keeps the original weights.
for val in np.nditer(convMatrix, op_flags=['readwrite']):
  val[...] = round(val/q_scale - zero_point) - 128

# Test whether the values are between -128 and 127
print(np.max(convMatrix), np.min(convMatrix))

numFilters = convMatrix.shape[0]
inputChannels = convMatrix.shape[1]
numImages = inputMatrix.shape[0]

assert inputChannels == inputMatrix.shape[1], "Number of inputChannels is " + str(inputChannels)
numRows, numCols = inputMatrix.shape[2], inputMatrix.shape[3]
kernelRows, kernelCols = convMatrix.shape[2], convMatrix.shape[3]

# numBits, pixelsPerWindow, numWeightLanes, convMatrixQuantized
numBits = 8
pixelsPerWindow = 16
numWeightLanes = 16
convMatrixQuantized = np.zeros((numFilters, inputChannels, kernelRows, kernelCols, numBits), dtype=np.int8)
print(convMatrix.shape)

# This is the matrix after padding the inputs with zeros
inputMatrixPadded = np.zeros((numImages, inputChannels, numRows+2, numCols+2), dtype=float)
inputMatrixPadded[:,:,1:numRows+1,1:numCols+1] = inputMatrix

# From here on numRows / numCols describe the padded input
numRows, numCols = inputMatrixPadded.shape[2], inputMatrixPadded.shape[3]

print(convMatrix[0][0][0][0], convMatrixQuantized[0][0][0][0])
# Expand every quantized weight into its 8 two's-complement bits (index 0 is
# the sign bit, as the printed example for -25 -> 11100111 shows).
for i in range(convMatrix.shape[0]):
  for j in range(convMatrix.shape[1]):
    for k in range(convMatrix.shape[2]):
      for l in range(convMatrix.shape[3]):
        convMatrixQuantized[i][j][k][l][:] = qzer.generateTwosComplement(convMatrix[i,j,k,l], 8, False)

print(convMatrix[0][0][0][0], convMatrixQuantized[0][0][0][0])
# Assume strides are 1 and 1
outputRows, outputCols = int(numRows - kernelRows) + 1, int(numCols - kernelCols) + 1
result = np.zeros((numImages, numFilters, outputRows, outputCols), dtype=float)
print("Shape of convMatrixQuantized = {}, result = {}".format(convMatrixQuantized.shape, result.shape))
print("numBits = {}, numRows = {}, numCols = {}".format(numBits, numRows, numCols))
print("kernel rows = {}, kernel cols = {}, inputChannels = {}, numFilters = {}".format(kernelRows, kernelCols, inputChannels, numFilters))
print("pixelsPerWindow = {}, numWeightLanes = {}".format(pixelsPerWindow, numWeightLanes))

# Inverting the quantization above gives w ~= q_scale * (q + 128 + zero_point).
# Each weight/activation product therefore splits into a bit-serial part
# (q_scale * q * x) and a constant-offset part (zeroPointOffset * x); the
# offset part has to be accumulated for EVERY (pixel, kr, kc) tap, not once
# per output element.
zeroPointOffset = (zero_point + 128) * q_scale
numWindows = math.ceil(inputChannels/pixelsPerWindow)
numLaneGroups = math.ceil(numFilters/numWeightLanes)

# NOTE(review): expectedResult comes from model.conv1_1, which presumably also
# adds a bias term that this loop does not model - confirm when comparing.
# This needs to be accelerated on the CPU
for image in np.arange(0, numImages):
  # Only the first image is simulated; the pure-Python loop nest is very slow.
  if image >= 1:
    break
  for bitNumber in np.arange(0, numBits):
    # Two's-complement weight of this bit position: the sign bit counts -128.
    bitWeight = -128 if bitNumber == 0 else 2**(7-bitNumber)
    for r in np.arange(0, numRows):
      for c in np.arange(0, numCols):
        for kr in np.arange(0, kernelRows):
          for kc in np.arange(0, kernelCols):
            # Input pixel (r, c) under kernel tap (kr, kc) feeds output (r-kr, c-kc).
            output_r, output_c = int(r - kr), int(c - kc)
            if (r < kr or c < kc or output_r >= outputRows or output_c >= outputCols):
              continue
            for window in np.arange(0, numWindows):
              windowStartPoint, windowEndPoint = window * pixelsPerWindow, min((window + 1) * pixelsPerWindow, inputChannels)
              for laneGroup in np.arange(0, numLaneGroups):
                filterStartPoint, filterEndPoint = laneGroup * numWeightLanes, min((laneGroup + 1) * numWeightLanes, numFilters)
                for f in np.arange(filterStartPoint, filterEndPoint):
                  for pixel in np.arange(windowStartPoint, windowEndPoint):
                    activation = inputMatrixPadded[image, pixel, r, c]
                    # Add the offset term exactly once per tap (during the first bit pass).
                    if bitNumber == 0:
                      result[image, f, output_r, output_c] += zeroPointOffset * activation
                    # Index the weight bits by the actual filter f, not by the lane-group index.
                    bit = convMatrixQuantized[f][pixel][kr][kc][bitNumber]
                    if bit == 1:
                      result[image, f, output_r, output_c] += (bitWeight * activation * q_scale)
    print("At bitnumber = {}, result matrix is {}".format(7-bitNumber, result[0][0]))

# Here, we print the expected result of the given matrix
print("Expected result = {}".format(expectedResult[0][0]))

Shape of Input matrix is (100, 3, 32, 32)
Shape of weights = (64, 3, 3, 3)
Expected result shape = torch.Size([100, 64, 32, 32])
minVal and maxVal are -0.8069930672645569 and 1.0306917428970337
q_scale value is 0.007206607332416609
Zero point is -112
127.0 -128.0
(64, 3, 3, 3)
-25.0 [0 0 0 0 0 0 0 0]
-25.0 [1 1 1 0 0 1 1 1]
Shape of convMatrixQuantized = (64, 3, 3, 3, 8), result = (100, 64, 32, 32)
numBits = 8, numRows = 34, numCols = 34
kernel rows = 3, kernel cols = 3, inputChannels = 3, numFilters = 64
pixelsPerWindow = 16, numWeightLanes = 16
At bitnumber = 7, result matrix is [[ 3.78128067  5.34399467  4.7686642  ... 10.04196974 11.28131521
   8.52170656]
 [ 5.99114069  8.60637154  7.71051283 ... 15.30463829 17.01542574
  12.55001618]
 [ 6.74676105  9.8347355   8.89968361 ... 14.76301908 16.22068092
  11.75587846]
 ...
 [ 3.10746624  7.96715229 12.13065646 ... 13.12958593 17.59984337
  15.02092809]
 [ 2.695035    7.68667464 11.50233736 ... 13.20490811 18.56974271
  15.24943779]
 [

In [None]:
# Count the non-zero entries in every channel of the first image stored in the
# saved conv layer 1 outputs.
outputLayer1 = np.load('/content/gdrive/MyDrive/goldenOutputs/outputs_convLayer1.npy')
firstImage = outputLayer1[0]
print(firstImage.shape)
print(outputLayer1[0,0,0,1])
nonZeroPerChannel = []
for idx in range(firstImage.shape[0]):
  nonZeroPerChannel.append(np.count_nonzero(firstImage[idx]))
print(nonZeroPerChannel)

(64, 32, 32)
0.6964835
[1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024]


In [None]:
# Dump the first entry of inputLayer1 for inspection.
# NOTE(review): inputLayer1 is not assigned in this part of the notebook -
# confirm an earlier cell defines it before running this one.
print(inputLayer1[0])

In [None]:
# Scratch check of element-wise multiplication: a random 3x3 matrix times a
# matrix of ones gives back the random matrix.
a = np.random.random(size=(3, 3))
b = np.ones_like(a)
c = a * b
print(a, b, c, sep="\n")

In [None]:
# Report, for each channel of the saved quantized activations, the percentage
# of entries that are non-zero.
filePath = "/content/gdrive/MyDrive/quantizedOutputs/activationsLayer"
startLayer, endLayer = 3, 4
for layerNumber in range(startLayer, endLayer):
  fileName = filePath + "{}.npy".format(layerNumber)
  npArray = np.load(fileName)
  print("Shape of npArray at layerNumber = {} is {}".format(layerNumber, npArray.shape))
  # Only the first layer in the range is analysed.
  if layerNumber > startLayer:
    break
  channelSize = npArray.shape[1]*npArray.shape[2]
  for i in range(npArray.shape[0]):
    o1 = np.count_nonzero(npArray[i])
    percent = (100*o1)/channelSize
    print("For channel {}, percentage nonzeros {}".format(i+1, percent))
  print("Shape of numpy array {}".format(npArray.shape))

In [None]:
# Report, for each channel of the first image in the saved golden outputs, the
# percentage of entries that are non-zero.
filePath = "/content/gdrive/MyDrive/goldenOutputs/outputs_convLayer"
startLayer, endLayer = 3, 4
for layerNumber in range(startLayer, endLayer):
  fileName = filePath + "{}.npy".format(layerNumber)
  npArray = np.load(fileName)
  print("Shape of npArray at layerNumber = {} is {}".format(layerNumber, npArray.shape))
  # Only the first layer in the range is analysed.
  if layerNumber > startLayer:
    break
  channelSize = npArray.shape[2]*npArray.shape[3]
  for img in range(0, npArray.shape[0]):
    # Only the first image is analysed.
    if img > 0:
      break
    for i in range(npArray.shape[1]):
      o1 = np.count_nonzero(npArray[img][i])
      percent = float((100*o1)/channelSize)
      print("For channel {}, percentage nonzeros {}".format(i+1, percent))
  print("Shape of numpy array {}".format(npArray.shape))

In [None]:
def performReLU(inputTensor):
  """
    Apply ReLU (element-wise max(0, x)) to a tensor.

    Args:
      inputTensor - torch tensor to rectify; it is not modified.
    Returns:
      A new torch tensor of the same shape and dtype with every negative
      entry replaced by 0.
  """
  # Convert to numpy first; the original code called the tensor itself
  # (inputTensor()), which raises a TypeError.
  input_np = inputTensor.detach().cpu().numpy()
  # np.maximum allocates a fresh array, so the caller's tensor is left untouched.
  return torch.tensor(np.maximum(input_np, 0))

In [None]:
# Apply ReLU to the saved quantized activations of layer 1 and report, per
# channel, the percentage of entries that remain non-zero.
npArray = np.load("/content/gdrive/MyDrive/quantizedOutputs/activationsLayer1.npy")
activationTensor = torch.tensor(npArray)
postReLUTensor = performReLU(activationTensor)
for channel in range(0, postReLUTensor.shape[0]):
  value = torch.count_nonzero(postReLUTensor[channel])
  # Scale by 100 so the printed number really is a percentage, matching the
  # other sparsity cells in this notebook (previously a 0..1 fraction).
  percent = float(value) * 100 / (postReLUTensor.shape[1] * postReLUTensor.shape[2])
  print("For channel {}, percent {}".format(channel, percent))