##### Import Libraries

In [1]:
# %pip install scikit-image
# %pip install ipdb
# %pip install torchnet
# %pip install visdom
# %pip install ipywidgets

In [2]:
%load_ext autoreload
%autoreload 2
import os
import torch
from utils.config import opt
from model import FasterRCNNVGG16   
from trainer import FasterRCNNTrainer
from data.util import  read_image
from utils.vis_tool import vis_bbox
from utils import array_tool as at

%matplotlib inline

In [3]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader

import os
import cv2
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings('ignore')

import os    
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [4]:
# Get cpu or gpu device for training.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using {DEVICE} device")
if torch.cuda.is_available(): print(f'device name: {torch.cuda.get_device_name(0)}')

Using cuda device
device name: TITAN RTX


In [5]:
cpu_device = torch.device('cpu')
gpu_device = torch.device('cuda')

##### Define Size Func

In [6]:
def print_model_size(model):
    torch.save(model.state_dict(), "tmp.pt")
    print("%.2f MB" %(os.path.getsize("tmp.pt")/1e6))
    os.remove('tmp.pt')

Load Model and Load Pretrained Weight Values

In [7]:
faster_rcnn = FasterRCNNVGG16().cpu()
faster_rcnn.cpu()

FasterRCNNVGG16(
  (extractor): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=

In [8]:
pretrained_weights = torch.load("sehyun_pretrained/1batch_20220707_fasterrcnn_11_epoch_model.pth")
pretrained_weights = pretrained_weights['model']
pretrained_weights

OrderedDict([('extractor.0.weight',
              tensor([[[[-5.5373e-01,  1.4270e-01,  5.2896e-01],
                        [-5.8312e-01,  3.5655e-01,  7.6566e-01],
                        [-6.9022e-01, -4.8019e-02,  4.8409e-01]],
              
                       [[ 1.7548e-01,  9.8630e-03, -8.1413e-02],
                        [ 4.4089e-02, -7.0323e-02, -2.6035e-01],
                        [ 1.3239e-01, -1.7279e-01, -1.3226e-01]],
              
                       [[ 3.1303e-01, -1.6591e-01, -4.2752e-01],
                        [ 4.7519e-01, -8.2677e-02, -4.8700e-01],
                        [ 6.3203e-01,  1.9308e-02, -2.7753e-01]]],
              
              
                      [[[ 2.3254e-01,  1.2666e-01,  1.8605e-01],
                        [-4.2805e-01, -2.4349e-01,  2.4628e-01],
                        [-2.5066e-01,  1.4177e-01, -5.4864e-03]],
              
                       [[-1.4076e-01, -2.1903e-01,  1.5041e-01],
                        [-8.4127e-01, -

In [9]:
model = faster_rcnn.cpu()
model.load_state_dict(pretrained_weights)

<All keys matched successfully>

Change to Eval Mode

Model Layer Fusion

In [10]:
modules_to_fuse = [ 
    ["extractor.0", "extractor.1"],
    ["extractor.2", "extractor.3"],
    ["extractor.5", "extractor.6"],
    ["extractor.7", "extractor.8"],
    ["extractor.10", "extractor.11"],
    ["extractor.12", "extractor.13"],
    ["extractor.14", "extractor.15"],
    ["extractor.17", "extractor.18"],
    ["extractor.19", "extractor.20"],
    ["extractor.21", "extractor.22"],
    ["extractor.24", "extractor.25"],
    ["extractor.26", "extractor.27"],
    ["extractor.28", "extractor.29"],

    ["head.classifier.0", "head.classifier.1"],
    ["head.classifier.2", "head.classifier.3"]
    ]


fused_model = torch.quantization.fuse_modules(model, modules_to_fuse)
fused_model.cpu()

FasterRCNNVGG16(
  (extractor): Sequential(
    (0): ConvReLU2d(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (1): Identity()
    (2): ConvReLU2d(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (3): Identity()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): ConvReLU2d(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (6): Identity()
    (7): ConvReLU2d(
      (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (8): Identity()
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): ConvReLU2d(
      (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (11): Identity()
    (12): Co

In [11]:
# quant_stubbed_model = nn.Sequential(torch.quantization.QuantStub(),
#                                     fused_model, 
#                                     torch.quantization.DeQuantStub())

In [12]:
class quantStubModel(nn.Module):
    def __init__(self, model_fp32):
        super(quantStubModel, self).__init__()
        # QuantStub converts tensors from floating point to quantized.
        # This will only be used for inputs.
        self.quant = torch.quantization.QuantStub()

        self.model_fp32 = model_fp32

        # DeQuantStub converts tensors from quantized to floating point.
        # This will only be used for outputs.
        self.dequant = torch.quantization.DeQuantStub()
        
        # self.model_fp32 = model_fp32

    def forward(self, x):
        x = self.quant(x)
        x = self.model_fp32(x)
        x = self.dequant(x)
        return x

# creating nn.Module with stubs for inputs and outputs
quant_stubbed_model = quantStubModel(model_fp32=fused_model).cpu()

##### Define Quantization Mode

In [13]:
# colab requires fbgemm backend
use_fbgemm = True

if use_fbgemm == True:
  quantization_config = torch.quantization.get_default_qconfig('fbgemm')
  torch.backends.quantized.engine = 'fbgemm'

else:
  quantization_config = torch.quantization.default_qconfig
  torch.backends.quantized.engine = 'qnnpack'

In [14]:
quantized_model = quantStubModel(model_fp32=fused_model)

fused_model.qconfig  = quantization_config

# Set the backend on which the quantized kernels need to be run
fused_model_prepared = torch.quantization.prepare(fused_model, inplace=False)

fused_model_prepared.eval().cpu()

FasterRCNNVGG16(
  (extractor): Sequential(
    (0): ConvReLU2d(
      (0): Conv2d(
        3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
        (activation_post_process): HistogramObserver()
      )
      (1): ReLU(
        inplace=True
        (activation_post_process): HistogramObserver()
      )
    )
    (1): Identity()
    (2): ConvReLU2d(
      (0): Conv2d(
        64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
        (activation_post_process): HistogramObserver()
      )
      (1): ReLU(
        inplace=True
        (activation_post_process): HistogramObserver()
      )
    )
    (3): Identity()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): ConvReLU2d(
      (0): Conv2d(
        64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
        (activation_post_process): HistogramObserver()
      )
      (1): ReLU(
        inplace=True
        (activation_post_process): HistogramObserver()
      )
    )
    

##### Define DataLoader

In [15]:
class CoCo_TestDataset(Dataset):
    def __init__(self, rootDir, folder, tf=None):
        """Dataset class for CoCo data

        Args:
            rootDir (str): path to directory containing CoCo image data
            folder (str) : 'train' or 'val' folder
            tf (optional): transformation to apply. Defaults to None
        """        
        self.rootDir = rootDir
        self.folder = folder
        self.transform = tf

        # read rgb image list
        ### sourceImgFolder =  os.path.join(self.rootDir, 'leftImg8bit', self.folder)
        sourceImgFolder =  os.path.join(self.rootDir, self.folder)
        self.sourceImgFiles  = [os.path.join(sourceImgFolder, x) for x in sorted(os.listdir(sourceImgFolder))]

        # read label image list
        ### labelImgFolder =  os.path.join(self.rootDir, 'gtFine', self.folder)
        labelImgFolder =  os.path.join(self.rootDir, self.folder)
        self.labelImgFiles  = [os.path.join(labelImgFolder, x) for x in sorted(os.listdir(labelImgFolder))]
    
    def __len__(self):
        return len(self.sourceImgFiles)
  
    def __getitem__(self, index):
        # read source image and convert to RGB, apply transform
        sourceImage = cv2.imread(f"{self.sourceImgFiles[index]}", -1)
        sourceImage = cv2.cvtColor(sourceImage, cv2.COLOR_BGR2RGB)
        if self.transform is not None:
            sourceImage = self.transform(sourceImage)

        # read label image and convert to torch tensor
        labelImage  = torch.from_numpy(cv2.imread(f"{self.labelImgFiles[index]}", -1)).long()
        return sourceImage, labelImage  
        # return sourceImage

##### Calibration

In [16]:
tf = transforms.Compose([
                    # transforms.ToPILImage(),
                    transforms.ToTensor(),
                    # transforms.Resize((320, 320)),
                    transforms.Normalize(mean=(0.485, 0.56, 0.406), std=(0.229, 0.224, 0.225))
                ])

TEST_BATCH_SIZE = 1

# Creating Test set and Test Dataloaders
test_set = CoCo_TestDataset(rootDir= 'data/coco/', folder='test2017', tf = tf)
test_set = torch.utils.data.Subset(test_set, indices=np.arange(1))
test_dataloader  = DataLoader(test_set, batch_size=TEST_BATCH_SIZE)

In [17]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [18]:
if torch.cuda.is_available():
    print('cuda')
    fused_model_prepared.cuda()# 

cuda


In [19]:
with torch.no_grad():
# with torch.inference_mode():
    for inputs, labels in tqdm(test_dataloader):
      # print(inputs[0].shape)
      # print(inputs[0])

      # real_inputs = inputs[0]

      gpu_inputs = inputs.to(gpu_device)
      # gpu_inputs = gpu_inputs.cuda()

      _ = fused_model_prepared(gpu_inputs)

fused_model_prepared.eval()
fused_model

100%|██████████| 1/1 [00:00<00:00,  3.14it/s]


FasterRCNNVGG16(
  (extractor): Sequential(
    (0): ConvReLU2d(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (1): Identity()
    (2): ConvReLU2d(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (3): Identity()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): ConvReLU2d(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (6): Identity()
    (7): ConvReLU2d(
      (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (8): Identity()
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): ConvReLU2d(
      (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (11): Identity()
    (12): Co

##### Quantization Done!

In [20]:
fused_model_prepared.cpu()
quantized_model = torch.quantization.convert(fused_model_prepared, inplace=False)
quantized_model.state_dict()

OrderedDict([('extractor.0.weight',
              tensor([[[[-0.5525,  0.1441,  0.5285],
                        [-0.5825,  0.3543,  0.7627],
                        [-0.6906, -0.0480,  0.4864]],
              
                       [[ 0.1742,  0.0120, -0.0841],
                        [ 0.0420, -0.0721, -0.2582],
                        [ 0.1321, -0.1742, -0.1321]],
              
                       [[ 0.3123, -0.1681, -0.4264],
                        [ 0.4744, -0.0841, -0.4864],
                        [ 0.6305,  0.0180, -0.2762]]],
              
              
                      [[[ 0.2309,  0.1254,  0.1847],
                        [-0.4289, -0.2441,  0.2441],
                        [-0.2507,  0.1386, -0.0066]],
              
                       [[-0.1386, -0.2177,  0.1518],
                        [-0.8446, -0.3497,  0.5608],
                        [-0.2441,  0.5213,  0.5410]],
              
                       [[-0.3167, -0.3695, -0.1320],
                    

##### Model Save

In [21]:
# torch.save(quantized_model, 'sehyun_quantized/quantized_model_FasterRCNN_0801.pth')
torch.save(quantized_model.state_dict(), 'sehyun_quantized/quantized_model_FasterRCNN_statedict_0804.pth')

In [22]:
# torch.jit.save(torch.jit.script(quantized_model), 'sehyun_quantized/quantized_model_FasterRCNN_statedict_0811.pth')
# torch.jit.script(quantized_model, "dsds.pth")

---

##### Compare

In [23]:
print_model_size(quantized_model)
print_model_size(model)

137.37 MB
548.33 MB


---

In [24]:
# quantized_model

# np.random.seed(44)
# dummy_input = torch.rand(1, 3, 512, 512).to("cpu", dtype=torch.float)  # Corresponds to a 512*512 RGB image
# test_output = quantized_model(dummy_input) 

# unscripted_top2 = F.softmax(test_output, dim=1).topk(2).indices
# print('Python model top 2 results:\n  {}'.format(unscripted_top2))


img = read_image('misc/demo6.jpg')
img = torch.from_numpy(img)[None]
img.to("cpu", dtype=torch.float)
# img = img.cpu()

quantized_model.cpu()
# # quantized_model
quantized_model(img)


# _bboxes, _labels, _scores = quantized_model.predict(img, visualize=True)

RuntimeError: Could not run 'quantized::conv2d_relu.new' with arguments from the 'CPU' backend. 'quantized::conv2d_relu.new' is only available for these backends: [QuantizedCPU].

---

In [None]:
import time
from PIL import Image
from torchvision import transforms

input_image = Image.open("Classification_Quant_Completed/cat.jpg")
preprocess = transforms.Compose([
    transforms.Resize(32),
    transforms.CenterCrop(32),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
])

input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model

In [None]:
img = read_image('misc/demo6.jpg')
img = torch.from_numpy(img)[None]

In [None]:
model.to(cpu_device)
model.eval()

input_batch.to(cpu_device)
img.to(cpu_device)

quantized_model.to(cpu_device)
quantized_model.eval()

FasterRCNNVGG16(
  (extractor): Sequential(
    (0): QuantizedConvReLU2d(3, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.09948394447565079, zero_point=0, padding=(1, 1))
    (1): Identity()
    (2): QuantizedConvReLU2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.22371403872966766, zero_point=0, padding=(1, 1))
    (3): Identity()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): QuantizedConvReLU2d(64, 128, kernel_size=(3, 3), stride=(1, 1), scale=0.23829995095729828, zero_point=0, padding=(1, 1))
    (6): Identity()
    (7): QuantizedConvReLU2d(128, 128, kernel_size=(3, 3), stride=(1, 1), scale=0.3613240718841553, zero_point=0, padding=(1, 1))
    (8): Identity()
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): QuantizedConvReLU2d(128, 256, kernel_size=(3, 3), stride=(1, 1), scale=0.41364625096321106, zero_point=0, padding=(1, 1))
    (11): Identity()
    (12): QuantizedConvReLU2d(256, 256

In [None]:
model(img)

RuntimeError: rois.device().is_cpu() INTERNAL ASSERT FAILED at "/root/project/torchvision/csrc/cpu/ROIPool_cpu.cpp":130, please report a bug to PyTorch. rois must be a CPU tensor