In [1]:
#imports
import torch
import numpy as np
import pandas as pd
import torchvision.models as models
from torch.utils.tensorboard import SummaryWriter
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset
import time
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import copy
import os
import pickle
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
#read files
def unpickle(file):
    
    with open(file, 'rb') as fo:
        dictionary = pickle.load(fo, encoding='bytes')
    return dictionary

In [4]:
#data processing
def reshape_images(data_dict):
    reshaped = data_dict.numpy().reshape(len(data_dict), 1024, 3, order = 'F').reshape(len(data_dict), 32,32,3)
    reshaped_processed = torch.from_numpy(reshaped).float().permute(0, 3, 1, 2)
    return reshaped_processed

## Settings

In [5]:
#data settings
subset = False #for local running
DATA_LOCAL = False
BATCH_SIZE = 16
#mean and std of cifar100 dataset
CIFAR100_TRAIN_MEAN = (0.5070751592371323, 0.48654887331495095, 0.4409178433670343)
CIFAR100_TRAIN_STD = (0.2673342858792401, 0.2564384629170883, 0.27615047132568404)
ENABLE_VAL = False #we do not have validation set then
#model settings
USE_TENSORBOARD = False
if USE_TENSORBOARD:
    foo = SummaryWriter()
use_gpu = True

#lr scheduler
BASE_LR = 0.001
EPOCH_DECAY = 4
DECAY_WEIGHT = 0.5

DEVICE = 'cpu'
if use_gpu and torch.cuda.is_available():
    DEVICE = 'cuda'

## Load Data and Process

In [6]:
if DATA_LOCAL:
    test_dict = unpickle('../cifar-100-python/test')
    train_dict = unpickle('../cifar-100-python/train')
    meta = unpickle('../cifar-100-python/meta')
    label_names = meta[b'fine_label_names']
    #load to local
    train_data = torch.from_numpy(train_dict[b'data'])
    train_y = torch.tensor(train_dict[b'fine_labels'])
    test_data = torch.from_numpy(test_dict[b'data'])
    test_y = torch.tensor(test_dict[b'fine_labels'])
    
    def see_plot(phase, sample_id, test_y = test_y, label_names = label_names, test_data = test_data, train_y = train_y, train_data = train_data):
    
      if phase == 'train':
          data = train_data
          y = train_y
      elif phase == 'test':
          data = test_data
          y = test_y
      assert sample_id < len(data)
      plt.imshow(data[sample_id].numpy().reshape(-1,3, order = 'F').reshape(32,32,3))
      labeli = y[sample_id].item()
      plt.title('label: ' + label_names[labeli].decode("utf-8") + ', label id: ' + str(labeli))

In [7]:
if DATA_LOCAL:
    # see pictures by sample id
    see_plot('train', 16)

In [33]:
def get_training_dataloader(mean, std, batch_size=16, num_workers=2, shuffle=True, large_subset = False):
    """ return training dataloader
    Args:
        mean: mean of cifar100 training dataset
        std: std of cifar100 training dataset
        path: path to cifar100 training python dataset
        batch_size: dataloader batchsize
        num_workers: dataloader num_works
        shuffle: whether to shuffle
    Returns: train_data_loader:torch dataloader object
    """

    transform_train = transforms.Compose([
        #transforms.ToPILImage(),
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    cifar100_training = torchvision.datasets.CIFAR100(root='cifar-100-python', train=True, download=True, transform=transform_train)
    if subset:
        cifar100_training = torch.utils.data.Subset(cifar100_training, [i for i in range(100)])
    elif large_subset:
        cifar100_training = torch.utils.data.Subset(cifar100_training, list(range(0, len(cifar100_training), 10)))

    cifar100_training_loader = DataLoader(
        cifar100_training, shuffle=shuffle, num_workers=num_workers, batch_size=batch_size)

    return cifar100_training_loader

In [34]:
def get_test_dataloader(mean, std, batch_size=16, num_workers=2, shuffle=True):
    """ return training dataloader
    Args:
        mean: mean of cifar100 test dataset
        std: std of cifar100 test dataset
        path: path to cifar100 test python dataset
        batch_size: dataloader batchsize
        num_workers: dataloader num_works
        shuffle: whether to shuffle
    Returns: cifar100_test_loader:torch dataloader object
    """

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    cifar100_test = torchvision.datasets.CIFAR100(root='cifar-100-python', train=False, download=True, transform=transform_test)
    if subset:
        cifar100_test = torch.utils.data.Subset(cifar100_test, [i for i in range(100)])
    cifar100_test_loader = DataLoader(
        cifar100_test, shuffle=shuffle, num_workers=num_workers, batch_size=batch_size)

    return cifar100_test_loader

In [35]:
CIFAR100_TRAIN_MEAN = (0.5070751592371323, 0.48654887331495095, 0.4409178433670343)
CIFAR100_TRAIN_STD = (0.2673342858792401, 0.2564384629170883, 0.27615047132568404)
cifar100_test_loader = get_test_dataloader(
    CIFAR100_TRAIN_MEAN,
    CIFAR100_TRAIN_STD,
    num_workers = 4,
    batch_size = BATCH_SIZE,
    shuffle = False
)

Files already downloaded and verified


In [36]:
cifar100_training_loader = get_training_dataloader(
    CIFAR100_TRAIN_MEAN,
    CIFAR100_TRAIN_STD,
    num_workers = 4,
    batch_size = BATCH_SIZE,
    shuffle = True
)

Files already downloaded and verified


In [37]:
cifar100_training_loader_subset = get_training_dataloader(
    CIFAR100_TRAIN_MEAN,
    CIFAR100_TRAIN_STD,
    num_workers = 4,
    batch_size = BATCH_SIZE,
    shuffle = False,
    large_subset = True
)

Files already downloaded and verified


In [38]:
dset_loaders = {'train': cifar100_training_loader, 
                'train_large_subset': cifar100_training_loader_subset, 
                'test': cifar100_test_loader}
dset_sizes = {'train': len(cifar100_training_loader.dataset), 
              'train_large_subset': len(cifar100_training_loader_subset.dataset), 
              'test': len(cifar100_test_loader.dataset)}

## Load Model and Utilities

In [13]:
#train model
def train_model(model, criterion, optimizer, lr_scheduler, num_epochs=5):
    
    model.to(DEVICE)
    since = time.time()

    best_model = model
    best_acc = 0.0
    if ENABLE_VAL:
        phases = ['train', 'val']
    else:
        phases = ['train']
        
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in phases:
            if phase == 'train':
                mode='train'
                optimizer = lr_scheduler(optimizer, epoch)
                model.train()  # Set model to training mode
            else:
                model.eval()
                mode='val'

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for index, (inputs, labels) in enumerate(dset_loaders[phase]):
                
                inputs, labels = inputs.float().to(DEVICE), labels.long().to(DEVICE)

                # Set gradient to zero to delete history of computations in previous epoch. Track operations so that differentiation can be done automatically.
                optimizer.zero_grad()
                
                outputs = model(inputs)
                _, preds = torch.max(outputs.data, 1)
                
                loss = criterion(outputs, labels)

                # backward + optimize only if in training phase
                if phase == 'train':
                    loss = Variable(loss, requires_grad = True)
                    loss.backward()
                    optimizer.step()
                    
                running_loss += loss.item()
                running_corrects += torch.sum(preds == labels.data)
                
                if index % 500 == 0 and index > 0:
                    print('{}/{} with loss {:.4f}'.format(index, dset_sizes['train']/16, running_loss/index))
                
            epoch_loss = running_loss / dset_sizes[phase]
            epoch_acc = running_corrects.item() / float(dset_sizes[phase])
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))


            # deep copy the model
            if phase == 'val':
                if USE_TENSORBOARD:
                    foo.add_scalar('epoch_loss',epoch_loss,epoch)
                    foo.add_scalar('epoch_acc',epoch_acc,epoch)
                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model = copy.deepcopy(model)
                    pickle.dump(best_model, open('best_model.pkl', 'wb'))
                    print('new best accuracy = ',best_acc)
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    print('returning and looping back')
    if USE_TENSORBOARD:
        foo.close()
    return best_model

# This function changes the learning rate over the training model.
def exp_lr_scheduler(optimizer, epoch, init_lr=BASE_LR, lr_decay_epoch=EPOCH_DECAY):
    """Decay learning rate by a factor of DECAY_WEIGHT every lr_decay_epoch epochs."""
    lr = init_lr * (DECAY_WEIGHT**(epoch // lr_decay_epoch))

    if epoch % lr_decay_epoch == 0:
        print('LR is set to {}'.format(lr))

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    return optimizer

#test model
def test_model(model, data = 'test'):
    model.to(DEVICE)
    model.eval()
    
    running_loss = 0
    running_corrects = 0
    with torch.no_grad():
      for batch_index, (inputs, labels) in enumerate(dset_loaders[data]):
          inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
          outputs = model(inputs)
          _, preds = torch.max(outputs.data, 1)
          corrects = torch.sum(preds == labels.data) 
          loss = criterion(outputs, labels)
          running_loss += loss
          running_corrects += corrects
          del inputs
          del labels
          del outputs
          torch.cuda.empty_cache()

    accuracy = (running_corrects / float(dset_sizes['test'])).item()
    loss = (running_loss / dset_sizes['test']).item()
    print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                'test', loss, accuracy))
    return loss, accuracy

In [14]:
import torch
import torch.nn as nn

cfg = {
    'A' : [64,     'M', 128,      'M', 256, 256,           'M', 512, 512,           'M', 512, 512,           'M'],
    'B' : [64, 64, 'M', 128, 128, 'M', 256, 256,           'M', 512, 512,           'M', 512, 512,           'M'],
    'D' : [64, 64, 'M', 128, 128, 'M', 256, 256, 256,      'M', 512, 512, 512,      'M', 512, 512, 512,      'M'],
    'E' : [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M']
}

class VGG(nn.Module):

    def __init__(self, features, num_class=100):
        super().__init__()
        self.features = features
        self.classifier = nn.Sequential(
            nn.Linear(512, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, num_class)
        )

    def forward(self, x):
        output = self.features(x)
        output = output.view(output.size()[0], -1)
        output = self.classifier(output)

        return output

def make_layers(cfg, batch_norm=False):
    layers = []

    input_channel = 3
    for l in cfg:
        if l == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            continue

        layers += [nn.Conv2d(input_channel, l, kernel_size=3, padding=1)]

        if batch_norm:
            layers += [nn.BatchNorm2d(l)]

        layers += [nn.ReLU(inplace=True)]
        input_channel = l

    return nn.Sequential(*layers)

def vgg11_bn():
    return VGG(make_layers(cfg['A'], batch_norm=True))

def vgg13_bn():
    return VGG(make_layers(cfg['B'], batch_norm=True))

def vgg16_bn():
    return VGG(make_layers(cfg['D'], batch_norm=True))

def vgg19_bn():
    return VGG(make_layers(cfg['E'], batch_norm=True))

In [15]:
def load_vgg(path):
    model = vgg16_bn()
    weights = torch.load(path, map_location=DEVICE)
    model.load_state_dict(weights)
    model.to(DEVICE)

    return model

In [16]:
#vgg16 = load_vgg('../vgg16-197-best.pth')
vgg16 = load_vgg('./drive/MyDrive/vgg16-197-best.pth')

In [17]:
criterion = torch.nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(vgg16.parameters(), lr=0.001)
#train_model(vgg16, criterion, optimizer, exp_lr_scheduler, num_epochs = 3);

In [25]:
start = time.time()
loss, accuracy = test_model(vgg16)
end = time.time()
print(loss, accuracy)
print('time seconds:', end - start)

test Loss: 0.1027 Acc: 0.7201
0.10269515216350555 0.7200999855995178
time seconds: 136.56451988220215


In [None]:
#visualize tensorboard -- a little buggy...
if USE_TENSORBOARD:
    %load_ext tensorboard
    %tensorboard --logdir=runs

## Quantization

In [19]:
def print_size_of_model(model, label=""):
    torch.save(model.state_dict(), "temp.p")
    size=os.path.getsize("temp.p")
    print("model: ",label,' \t','Size (KB):', size/1e3)
    os.remove('temp.p')
    return size

### Quant Dynamics with no retrain

In [23]:
DEVICE = 'cpu'
vgg16_quant1 = torch.quantization.quantize_dynamic(
    vgg16.to(DEVICE),  # the original model
    {torch.nn.Linear},  # a set of layers to dynamically quantize
    dtype=torch.qint8)  # the target dtype for quantized weights
start = time.time()
loss, accuracy = test_model(vgg16_quant1)
end = time.time()
print(loss, accuracy)
print('time seconds:', end - start)

test Loss: 0.1027 Acc: 0.7201
0.1027042344212532 0.7200999855995178
time seconds: 127.8513925075531


In [24]:
# compare the sizes
f=print_size_of_model(vgg16,"baseline")
q=print_size_of_model(vgg16_quant1,"dynamic quantization")
print("{0:.2f} times smaller".format(f/q))

model:  baseline  	 Size (KB): 136126.773
model:  dynamic quantization  	 Size (KB): 78277.298
1.74 times smaller


### Quant Static with no retrain

In [39]:
my_qconfig = torch.quantization.QConfig(activation=torch.quantization.MinMaxObserver.with_args(dtype=torch.quint8), 
                                        weight=torch.quantization.default_observer.with_args(dtype=torch.qint8))

In [40]:
vgg16_quant2 = copy.deepcopy(vgg16)
#add layers
#set_parameter_requires_grad(vgg16_quant2, feature_extracting = True)
vgg16_quant2.features = nn.Sequential(torch.quantization.QuantStub(), vgg16_quant2.features)
vgg16_quant2.classifier = nn.Sequential(vgg16_quant2.classifier, torch.quantization.DeQuantStub())
#set configs
#vgg16_quant2.qconfig = torch.quantization.get_default_qconfig('fbgemm')
vgg16_quant2.qconfig = my_qconfig
vgg16_quant2 = torch.quantization.fuse_modules(vgg16_quant2, [['features.1.0', 'features.1.1', 'features.1.2'],
                                                             ['features.1.3', 'features.1.4', 'features.1.5'],
                                                             ['features.1.7', 'features.1.8', 'features.1.9'],
                                                             ['features.1.10', 'features.1.11', 'features.1.12'],
                                                             ['features.1.14', 'features.1.15', 'features.1.16'],
                                                             ['features.1.17', 'features.1.18', 'features.1.19'],
                                                             ['features.1.20', 'features.1.21', 'features.1.22'],
                                                             ['features.1.24', 'features.1.25', 'features.1.26'],
                                                             ['features.1.27', 'features.1.28', 'features.1.29'],
                                                             ['features.1.30', 'features.1.31', 'features.1.32'],
                                                             ['features.1.34', 'features.1.35', 'features.1.36'],
                                                             ['features.1.37', 'features.1.38', 'features.1.39'],
                                                             ['features.1.40', 'features.1.41', 'features.1.42']])
vgg16_quant2 = torch.quantization.prepare(vgg16_quant2)
loss, accuracy = test_model(vgg16_quant2, data = 'train_large_subset')
print(loss, accuracy)
vgg16_quant2 = torch.quantization.convert(vgg16_quant2)

test Loss: 0.0003 Acc: 0.4991
0.00025286932941526175 0.499099999666214


  reduce_range will be deprecated in a future release of PyTorch."


In [41]:
start = time.time()
loss, accuracy = test_model(vgg16_quant2)
end = time.time()
print(loss, accuracy)
print('time seconds:', end - start)

test Loss: 0.1029 Acc: 0.7181
0.10290030390024185 0.7181000113487244
time seconds: 63.51117753982544


In [42]:
# compare the sizes
f=print_size_of_model(vgg16,"baseline")
q=print_size_of_model(vgg16_quant2,"static quantization")
print("{0:.2f} times smaller".format(f/q))

model:  baseline  	 Size (KB): 136126.773
model:  static quantization  	 Size (KB): 34067.257
4.00 times smaller


### Quant Static with Retrain -- BUGGY

In [None]:
vgg16_quant3 = copy.deepcopy(vgg16)
#add layers
vgg16_quant3.features = nn.Sequential(torch.quantization.QuantStub(), vgg16_quant3.features)
vgg16_quant3.classifier = nn.Sequential(vgg16_quant3.classifier, torch.quantization.DeQuantStub())
#set configs
vgg16_quant3.qconfig = torch.quantization.get_default_qconfig('fbgemm')
vgg16_quant3 = torch.quantization.prepare_qat(vgg16_quant3)
#retrain
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(vgg16_quant3.parameters(), lr=0.001)
train_model(vgg16_quant3, criterion, optimizer, exp_lr_scheduler, num_epochs = 1);
vgg16_quant3.eval()
vgg16_quant3 = torch.quantization.convert(vgg16_quant3)


Epoch 0/0
----------
LR is set to 0.001
train Loss: 0.1743 Acc: 0.4400
Training complete in 0m 52s
Best val Acc: 0.000000
returning and looping back


In [None]:
loss, accuracy = test_model(vgg16_quant3)
print(loss, accuracy)

test Loss: 0.0485 Acc: 0.0200
0.048465121537446976 0.019999999552965164


In [None]:
# compare the sizes
f=print_size_of_model(vgg16,"baseline")
q=print_size_of_model(vgg16_quant3,"static quantization with retrain")
print("{0:.2f} times smaller".format(f/q))

model:  baseline  	 Size (KB): 538686.817
model:  static quantization with retrain  	 Size (KB): 134857.157
3.99 times smaller
