In [1]:
import os
import re
import time
import spacy
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data
import pandas as pd
from models import Gen
import warnings as wrn
import logging
from tqdm import tqdm

SEED = 2021

wrn.filterwarnings('ignore')
os.environ['SP_DIR'] = '/opt/conda/lib/python3.11/site-packages'
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cuda.deterministic = True

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
# MODE = 'inference' # 'train' or 'inference' or 'none'

### Load Data

In [4]:
train_data = pd.read_csv('data/ag_news/train.csv', header=None, usecols=[0,2])
train_data.columns = ['label', 'text']
valid_data = pd.read_csv('data/ag_news/valid.csv', header=None, usecols=[0,2])
valid_data.columns = ['label', 'text']
test_data = pd.read_csv('data/ag_news/test.csv', header=None, usecols=[0,2])
test_data.columns = ['label', 'text']

In [5]:
TRAIN_SIZE = len(train_data)
VALID_SIZE = len(valid_data)
TEST_SIZE = len(test_data)
print(TRAIN_SIZE, VALID_SIZE, TEST_SIZE)

120000 5000 7600


### Clean Data

In [6]:
# def clean_text(text):
#     return re.sub(r'[^A-Za-z0-9]+', ' ', str(text))

In [7]:
# train_data['text'] = train_data['text'].apply(clean_text)
# valid_data['text'] = valid_data['text'].apply(clean_text)
# test_data['text'] = test_data['text'].apply(clean_text)

# train_data.to_csv('data/ag_news3/train_clean.csv', index=False, header=False)
# valid_data.to_csv('data/ag_news3/valid_clean.csv', index=False, header=False)
# test_data.to_csv('data/ag_news3/test_clean.csv', index=False, header=False)

### Tokenize Data

In [8]:
spacy_en = spacy.load('en_core_web_sm')

def spacy_tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [9]:
LABEL = data.LabelField()
TEXT = data.Field(tokenize=spacy_tokenizer, batch_first=True, include_lengths=True)
fields = [('label', LABEL), ('text', TEXT)]

In [10]:
train_dataset = data.TabularDataset(path='data/ag_news/train_clean.csv', format='csv', fields=fields, skip_header=True)
valid_dataset = data.TabularDataset(path='data/ag_news/valid_clean.csv', format='csv', fields=fields, skip_header=True)
test_dataset = data.TabularDataset(path='data/ag_news/test_clean.csv', format='csv', fields=fields, skip_header=True)

In [11]:
print(vars(train_dataset.examples[0]))

{'label': '4', 'text': ['Apple', 'yesterday', 'released', 'a', '12', '7', 'MB', 'security', 'update', 'that', 'consists', 'of', 'several', 'revised', 'components', 'including', 'Apache', 'AppKit', 'HIToolbox', 'Kerberos', 'Postfix', 'PSNormalizer', 'Safari', 'and', 'Terminal']}


In [12]:
TEXT.build_vocab(train_dataset, min_freq=5)
LABEL.build_vocab(train_dataset)

In [13]:
label_counts = {LABEL.vocab.itos[i]: LABEL.vocab.freqs[LABEL.vocab.itos[i]] for i in range(len(LABEL.vocab))}
print("Number of instances per class:", label_counts)

print("Size of text vocab:",len(TEXT.vocab))

print("Size of label vocab:",len(LABEL.vocab))

Number of instances per class: {'2': 30000, '3': 30000, '4': 30000, '1': 29999}
Size of text vocab: 27797
Size of label vocab: 4


In [14]:
TEXT.vocab.freqs.most_common(10)

[('the', 174008),
 ('to', 96378),
 ('a', 95595),
 ('of', 89434),
 ('in', 76339),
 ('and', 66138),
 ('on', 47406),
 ('s', 43763),
 ('for', 37311),
 ('39', 31877)]

In [15]:
BATCH_SIZE = 32

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_dataset, valid_dataset),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device
)

test_iterator = data.BucketIterator(
    test_dataset,
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device
)

### Define Batches

In [16]:
VOCAB_SIZE= len(TEXT.vocab)
WORD_EMB_DIM = 100
LABEL_EMB_DIM = 100
HID_DIM = 100
NLAYERS = 1
NCLASS = len(LABEL.vocab)
DROPOUT = 0
USE_CUDA = torch.cuda.is_available()
TIED = False
USE_BIAS = False
CONCAT_LABEL = 'hidden'
AVG_LOSS = False
ONE_HOT = False
BIT_WIDTH = 5

LR = 1e-4
LOG_INTERVAL = 200
CLIP = 1.0
LOGGING = logging.INFO

In [17]:
model = Gen(VOCAB_SIZE, WORD_EMB_DIM, LABEL_EMB_DIM, HID_DIM, NLAYERS, NCLASS, DROPOUT, USE_CUDA, TIED, USE_BIAS, CONCAT_LABEL, AVG_LOSS, ONE_HOT).to(device)
criterion = nn.CrossEntropyLoss(reduce=False).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

In [18]:
def init_hidden(model, bsz):
    weight = next(model.parameters())
    # Return hidden state and cell state as 2D tensors
    return (weight.new_zeros(NLAYERS, HID_DIM),
            weight.new_zeros(NLAYERS, HID_DIM))

In [19]:
def evaluate(valid_iterator, model, criterion, mode='valid', model_state=0):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    cnt = 0
    
    with torch.no_grad():
        for batch in tqdm(valid_iterator, desc=f"Evaluating ({mode})", leave=True):
            sents = [torch.tensor(row) for row in batch.text[0]]
            labels = batch.label
            # y_exts = [torch.full((batch.text[0].shape[1],), labels[i], dtype=torch.long) for i in range(len(labels))]
            y_exts = []
            for y_label in range(NCLASS):
                y_ext = []
                for d in sents:
                    y_ext.append(torch.LongTensor([y_label] * (len(d) - 1)))
                y_exts.append(y_ext)
            
            
            hidden = init_hidden(model, len(sents))
            x = nn.utils.rnn.pack_sequence([s[:-1] for s in sents])
            x_pred = nn.utils.rnn.pack_sequence([s[1:] for s in sents])

            # p_y = torch.FloatTensor([0.071] * len(seq_len))

            losses = []
            for y_ext in y_exts:
                y_ext = nn.utils.rnn.pack_sequence(y_ext)

                if device.type == 'cuda':
                    x, y_ext, x_pred, labels = x.cuda(), y_ext.cuda(), x_pred.cuda(), labels.cuda()

                # output (batch_size, )
                hidden = init_hidden(model, len(sents))
                
                out = model(x, x_pred, y_ext, hidden, criterion, model_state)
                
                loss_matrix = criterion(out, x_pred.data)

                LM_loss = nn.utils.rnn.pad_packed_sequence(nn.utils.rnn.PackedSequence(
                    loss_matrix, x.batch_sizes))[0].transpose(0,1)
                sum_loss = torch.sum(LM_loss, dim = 1)
                
                losses.append(sum_loss)

            losses = torch.cat(losses, dim=0).view(-1, len(sents))
            prediction = torch.argmin(losses, dim=0)

            num_correct = (prediction == labels).float().sum()

            total_loss += torch.sum(torch.min(losses, dim=0)[0]).item()
            total_correct += num_correct.item()
            cnt += 1

    return total_loss / cnt, total_correct / (VALID_SIZE if mode == 'valid' else TEST_SIZE if mode == 'test' else '0') * 100.0

In [20]:
# if MODE == 'inference':
#     model.load_state_dict(torch.load('gen_lstm_spacy_best_val.pth'))
#     model.eval()
    
#     test_loss, test_acc = evaluate(test_iterator, model, criterion, mode='test', model_state='fp')
#     print('=' * 89)
#     print(f'Test Loss: {test_loss} | Test Acc:  {test_acc}')
#     print('=' * 89)

In [21]:
from brevitas.graph.quantize import preprocess_for_quantize
from ptq_common import quantize_model, apply_bias_correction, apply_act_equalization

In [22]:
# pre_model = preprocess_for_quantize(
#             model,
#             equalize_iters=20,
#             equalize_merge_bias=True,
#             merge_bn=True,
#             channel_splitting_ratio=0.0,
#             channel_splitting_split_input=False)

In [23]:
dtype = getattr(torch, 'float')
device = torch.device('cpu')
print("device is set to ",device)
print("Quantizing the model")
quant_model = quantize_model(
        model.to(device),
        dtype=dtype,
        device=device,
        backend='layerwise',
        scale_factor_type='float_scale',
        bias_bit_width=32,
        weight_bit_width=BIT_WIDTH,
        weight_narrow_range=False,
        weight_param_method='stats',
        weight_quant_granularity='per_tensor',
        weight_quant_type='sym',
        layerwise_first_last_bit_width=BIT_WIDTH,
        act_bit_width=BIT_WIDTH,
        act_param_method='stats',
        act_quant_percentile=99.99,
        act_quant_type='sym',
        quant_format='int',
        layerwise_first_last_mantissa_bit_width=4,
        layerwise_first_last_exponent_bit_width=3,
        weight_mantissa_bit_width=4,
        weight_exponent_bit_width=3,
        act_mantissa_bit_width=4,
        act_exponent_bit_width=3).to(device) 

print("Quantization completed!")
device = torch.device('cuda')
print("device is set back to ",device)
model=model.to(device)
quant_model=quant_model.to(device)

device is set to  cpu
Quantizing the model
Quantization completed!
device is set back to  cuda


In [24]:
quant_model.load_state_dict(torch.load(f'./GenQuantResults/ModelParameter_Gen_{BIT_WIDTH}bit.pth'))
quant_model=quant_model.to(device)

In [25]:
test_loss, test_acc = evaluate(test_iterator, quant_model, criterion, mode='test', model_state='quant')
print('=' * 89)
print(f'Test Loss: {test_loss} | Test Acc before calibration:  {test_acc}')
print('=' * 89)

Evaluating (test): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 238/238 [1:52:52<00:00, 28.46s/it]

Test Loss: 10497.739056755514 | Test Acc before calibration:  60.578947368421055



