In [2]:
import numpy as np
from tqdm import tqdm
import torch
import torch.nn.utils.prune as prune
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForTokenClassification
from transformers import AdamW
from utils import EarlyStopping, TokenClassificationDataset
import io
from torch.utils.data import DataLoader
import optparse
import pickle
from constants import *
import os.path
import os

In [54]:
import time

In [55]:
def inference_latency(model, inputs, num_samples=100, num_warmups=100):
    """Measure the forward-pass latency of ``model`` on a single example.

    The example's ``'ids'`` and ``'masks'`` tensors each get a leading axis of
    size 1 and are passed to the model as ``model(ids, masks)``. First
    ``num_warmups`` untimed passes are run, then ``num_samples`` timed passes,
    all under ``torch.no_grad()``.

    Args:
        model: Callable invoked as ``model(ids, masks)``. Its train/eval mode
            is not touched here; the caller is expected to have set it.
        inputs: Mapping with tensor entries ``'ids'`` and ``'masks'``.
        num_samples: Number of timed forward passes; must be at least 1.
        num_warmups: Number of untimed forward passes run beforehand.

    Returns:
        Tuple ``(elapsed_time, elapsed_time / num_samples)`` in seconds: the
        total time of the timed passes and the mean time per pass.

    Raises:
        ValueError: If ``num_samples`` is less than 1 (the mean would be
            undefined or negative).
    """
    if num_samples < 1:
        raise ValueError(f"num_samples must be >= 1, got {num_samples}")

    # Loop-invariant: add the leading axis once so the timed loop measures
    # only the forward pass.
    ids = torch.unsqueeze(inputs['ids'], 0)
    masks = torch.unsqueeze(inputs['masks'], 0)

    # CUDA work is queued asynchronously, so the clock must only be read after
    # the device has drained. Skipped for CPU tensors, where synchronize()
    # would be pointless (and fails on CPU-only installs).
    needs_sync = ids.is_cuda

    with torch.no_grad():
        for _ in range(num_warmups):
            model(ids, masks)
        if needs_sync:
            torch.cuda.synchronize()

        # perf_counter is monotonic and high resolution; time.time() can jump
        # when the system clock is adjusted.
        stime = time.perf_counter()
        for _ in range(num_samples):
            model(ids, masks)
        if needs_sync:
            torch.cuda.synchronize()
        etime = time.perf_counter()

    elapsed_time = etime - stime
    return elapsed_time, elapsed_time / num_samples
        

In [3]:
# Limit the GPUs visible to this process, then pick CUDA when PyTorch reports
# it as available and fall back to the CPU otherwise.
os.environ.update({"CUDA_VISIBLE_DEVICES": CUDA_VISIBLE_DEVICES})
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [5]:
# Label set for the NER task (NER_TARGET is presumably provided by
# `from constants import *` — confirm).
target_list = NER_TARGET

In [20]:
# Number of labels; the same value is passed as `num_labels` when the model is built.
len(target_list)

9

In [9]:
# Build PhoBERT with a token-classification head sized to the label set, move
# it to the selected device, then overwrite its weights with the state dict
# stored at CHECKPOINT_PATH.
num_classes = len(target_list)
model = AutoModelForTokenClassification.from_pretrained(
    "vinai/phobert-base",
    num_labels=num_classes,
)
model = model.to(device)
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
model.load_state_dict(
    torch.load(CHECKPOINT_PATH, map_location=device)
)

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this mo

<All keys matched successfully>

In [10]:
# Dynamic quantization of every nn.Linear to int8 weights.
# PyTorch's quantized Linear kernels are CPU-only, but `model` was placed on
# `device`, which may be CUDA. Quantize a CPU deep copy so the conversion works
# on GPU machines and `model` itself stays on its original device.
# inplace=True converts that copy directly instead of copying a second time.
import copy

quantized_model = torch.quantization.quantize_dynamic(
    copy.deepcopy(model).to("cpu"),
    {torch.nn.Linear},
    dtype=torch.qint8,
    inplace=True,
)

In [15]:
# Persist the quantized weights under RESULT_PATH.
torch.save(
    quantized_model.state_dict(),
    f"{RESULT_PATH}/quantized_NER_Pretrained_phoBERT.pt",
)

In [17]:
# On-disk checkpoint sizes in whole MiB, as (quantized, original).
tuple(
    os.path.getsize(f"{RESULT_PATH}/{checkpoint_name}") // 2 ** 20
    for checkpoint_name in (
        "quantized_NER_Pretrained_phoBERT.pt",
        "NER_Pretrained_phoBERT.pt",
    )
)

(269, 512)

In [21]:
# Read the whole NER test file as UTF-8 text and split it on blank lines, so
# each list entry is one blank-line-delimited chunk of the file.
target_list = NER_TARGET
with open(NER_PATH_TEST, mode='r', encoding='utf-8') as f:
    test_task = f.read().split('\n\n')
print('Load NER data sucessfully!')

Load NER data sucessfully!


In [23]:
# Load the tokenizer for "vinai/phobert-base" (the same checkpoint name the
# model is built from), requesting the fast implementation, and report the
# device selected earlier.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=True)
print(f'Loaded tokenizer, using {device}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded tokenizer, using cpu


In [43]:
# Keep only the first entry of the test split so the latency benchmark runs on
# a single example. This rebinds `test_task`; re-run the loading cell to get
# the full list back.
test_task = test_task[:1]    
# Wrap the entry in the project's dataset class (from utils); items are later
# indexed with 'ids' and 'masks'.
test_dataset = TokenClassificationDataset(test_task, target_list, tokenizer)

In [47]:
# Show the first example's token ids with a leading axis of size 1 added.
test_dataset[0]['ids'].unsqueeze(0)

tensor([[    0,  1108,  1612,  1896,   802,  9443,   238,    60,    48,    82,
            78,  7164,   126, 13098,    72,   150,   355, 29618,    26,   337,
            44,    13,   283,   523,    28,   224,   366,     7,   327,     5,
             2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,  

In [58]:
# Baseline: un-quantized model, 50 warm-up passes then 100 timed passes.
inference_latency(
    model,
    inputs=test_dataset[0],
    num_samples=100,
    num_warmups=50,
)

(46.35512590408325, 0.4635512590408325)

In [59]:
# Dynamically quantized model, with the same warm-up and sample counts as the
# un-quantized run.
inference_latency(
    quantized_model,
    inputs=test_dataset[0],
    num_samples=100,
    num_warmups=50,
)

(31.60396456718445, 0.3160396456718445)