In [1]:
# autotime reports the execution time of every cell (the "time: ..." lines in the outputs).
%load_ext autotime
# autoreload in mode 2 re-imports all modules before each cell runs, so edits to
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import git
# Find the git repository that contains the current directory (searching parent
# directories) and switch the working directory to its top level -- presumably so
# that repo-relative imports and paths used later in the notebook resolve.
git_repo = git.Repo(os.getcwd(), search_parent_directories=True)
os.chdir(git_repo.git.rev_parse("--show-toplevel"))

time: 187 ms


# PyTorch Quantization

This tutorial will look at the different [quantization methods](https://pytorch.org/docs/stable/quantization.html) that are currently supported by PyTorch.

1. Post Training Dynamic Quantization
    + Popular for LSTMs/Transformers where execution time is dominated by loading the weights from memory
2. Post Training Static Quantization
    + The most commonly used form of quantization: weights are quantized ahead of time, and the quantization parameters for activations are determined during a calibration step
3. Quantization Aware Training (QAT)
    + This is used if Post Training Static Quantization does not provide enough accuracy. QAT performs fake quantization during training. 
    
For this example we will use the TorchText IMDB sentiment dataset and a simple CNN model to get started quickly.


### Setting up the environment

We'll get started by setting up the environment, loading the packages we need and setting the random seed. Then we can define the fields in the torchtext dataset. 

In [3]:
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

import torchtext
import numpy as np

from modules.models.pytorch_basic_cnn import CNN


# Single seed shared by every random number generator seeded below.
SEED = 42

# Seed Python's, NumPy's and PyTorch's generators so repeated runs draw the same numbers.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Restrict cuDNN to deterministic algorithms (only matters when running on a GPU).
torch.backends.cudnn.deterministic = True

time: 1.57 s


In [4]:
# Field definitions: review text is tokenized with spaCy and batched with the
# batch dimension first; labels are stored as floats, which is the dtype the
# BCEWithLogitsLoss criterion used later in this notebook expects for targets.
TEXT = torchtext.data.Field(tokenize = 'spacy', batch_first = True)
LABEL = torchtext.data.LabelField(dtype = torch.float)

# Official IMDB train/test splits.
train_data, test_data = torchtext.datasets.IMDB.splits(TEXT, LABEL)

# `split` expects a `random.getstate()` value as `random_state`. `random.seed()`
# itself returns None, so seed first and then pass the generator state explicitly
# to get a reproducible train/validation split (default split ratio).
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

# Upper bound on the number of distinct tokens kept in the vocabulary.
MAX_VOCAB_SIZE = 25000

# Build the vocabulary from the training split only (no leakage from the
# validation/test data) and attach 100-dimensional GloVe vectors to it.
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d")

LABEL.build_vocab(train_data)

time: 3min 24s


In [5]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, in the same order as the datasets passed in; the
# `device` argument controls where the batch tensors are created.
train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

time: 112 ms


In [6]:
# Hyperparameters handed to the CNN constructor below.
INPUT_DIM = len(TEXT.vocab)  # number of entries in the vocabulary built from the training data
EMBEDDING_DIM = 100  # must match the 100-d GloVe vectors attached to TEXT.vocab
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1  # one output value per example (trained with BCEWithLogitsLoss later on)
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # vocabulary index of the padding token

# CNN comes from modules/models/pytorch_basic_cnn.py, which is not shown here; the
# meaning of each positional argument is inferred from the constant names -- TODO confirm.
model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

time: 115 ms


In [7]:
# Pre-trained vectors attached to the vocabulary by TEXT.build_vocab(..., vectors=...).
pretrained_embeddings = TEXT.vocab.vectors

# Overwrite the embedding layer's weights in place with the pre-trained vectors.
# copy_ returns the destination tensor, which is why this cell displays it.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.5551,  0.0393, -0.3477,  ..., -0.2534,  0.3526,  0.0432],
        [ 0.2581,  0.4548, -0.6745,  ...,  0.1374, -0.2201,  0.7585],
        [-0.4779,  0.7344, -0.4383,  ..., -0.1864, -0.0872,  0.2465]])

time: 86.9 ms


In [8]:
# Vocabulary index of the unknown-word token.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Set the embedding rows of the unknown and padding tokens to all zeros.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

time: 74.8 ms


In [9]:
import torch.optim as optim

# Move the model to the target device *before* building the optimizer: the
# torch.optim documentation recommends that the optimized parameters already
# live in their final location when the optimizer is constructed.
model = model.to(device)

# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

# Adam with its default hyperparameters over all model parameters.
optimizer = optim.Adam(model.parameters())

time: 45.7 ms


In [10]:
def binary_accuracy(preds, y):
    """
    Fraction of predictions in a batch that match the labels.

    `preds` holds raw logits; they are passed through a sigmoid and rounded to
    0 or 1 before being compared element-wise with `y`. The result is a
    tensor holding a ratio (e.g. 0.8 for 8 out of 10 correct), not a count.
    """
    probabilities = torch.sigmoid(preds)
    hard_labels = probabilities.round()

    # Boolean matches become 1.0 / 0.0 so they can be summed and divided.
    hits = hard_labels.eq(y).float()
    return hits.sum() / len(hits)

time: 39.7 ms


In [11]:
def train(model, iterator, optimizer, criterion):
    """
    Run one training pass over `iterator`, updating `model` through `optimizer`.

    Every batch must expose `.text` (fed to the model) and `.label` (targets
    for `criterion` and for the accuracy computation).

    Returns a tuple (loss, accuracy): the per-batch loss and per-batch accuracy,
    each summed over the epoch and divided by the number of batches.
    """
    model.train()

    batch_losses = []
    batch_accs = []

    for batch in iterator:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Drop dimension 1 of the model output before comparing with the labels.
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
        batch_accs.append(batch_acc.item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

time: 52.3 ms


In [12]:
def evaluate(model, iterator, criterion):
    """
    Score `model` on every batch of `iterator` without updating it.

    The model is switched to evaluation mode and gradient tracking is
    disabled for the whole pass. Every batch must expose `.text` and `.label`.

    Returns a tuple (loss, accuracy): the per-batch loss and per-batch accuracy,
    each summed over the pass and divided by the number of batches.
    """
    model.eval()

    batch_losses = []
    batch_accs = []

    with torch.no_grad():
        for batch in iterator:
            # Drop dimension 1 of the model output before comparing with the labels.
            logits = model(batch.text).squeeze(1)

            batch_losses.append(criterion(logits, batch.label).item())
            batch_accs.append(binary_accuracy(logits, batch.label).item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

time: 45.9 ms


In [13]:
import time

def epoch_time(start_time, end_time):
    """
    Split the interval between two timestamps (in seconds) into whole minutes
    and the remaining whole seconds.

    Returns a tuple (minutes, seconds) of ints; fractional seconds are dropped.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds

time: 45.3 ms


In [None]:
# Number of full passes over the training data.
N_EPOCHS = 5

# Lowest validation loss seen so far; starts at infinity so the first epoch always counts as an improvement.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # time.time() is wall-clock time, so the reported duration covers both the
    # training and the validation pass.
    # NOTE(review): any pause of the machine during an epoch (e.g. sleep/suspend)
    # would be included too -- possibly the cause of outlier epoch times; confirm.
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint the weights only when the validation loss improves, so the
    # saved file always holds the best model seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 15m 31s
	Train Loss: 0.499 | Train Acc: 74.63%
	 Val. Loss: 0.343 |  Val. Acc: 85.41%
Epoch: 02 | Epoch Time: 13m 49s
	Train Loss: 0.309 | Train Acc: 87.06%
	 Val. Loss: 0.294 |  Val. Acc: 87.59%
Epoch: 03 | Epoch Time: 8140m 16s
	Train Loss: 0.225 | Train Acc: 91.27%
	 Val. Loss: 0.273 |  Val. Acc: 88.58%
Epoch: 04 | Epoch Time: 10275m 20s
	Train Loss: 0.153 | Train Acc: 94.37%
	 Val. Loss: 0.269 |  Val. Acc: 89.40%
