# Import libraries

In [None]:
from transformers import pipeline 
import numpy as np
import torch
import pandas as pd 
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification,AutoTokenizer
from utils import *
# Checkpoint identifier of the model loaded by the pipeline below.
bert_ckpt = "phnghiapro/distilbert-base-uncased-distilled-clinc"

# Use the first CUDA device when one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Build the text-classification pipeline on the selected device and report
# both the requested device and the one the pipeline's model reports.
pipe = pipeline(task='text-classification', model=bert_ckpt, device=device)
print("device : ", device)
print("device of pipeline : ", pipe.model.device)

# plot the distribution of the weights

In [None]:
# `.to('cpu')` is applied to the pipeline's model before reading its
# parameters, so the tensors below can be converted to NumPy arrays.
cpu_model = pipe.model.to('cpu')
state_dict = cpu_model.state_dict()

# Weight matrix inspected (and later quantized) in the cells that follow.
weights = state_dict['distilbert.transformer.layer.0.attention.out_lin.weight']

# Histogram over every entry of the matrix: 250 bins spanning [-0.2, 0.2].
flat_values = weights.flatten().numpy()
plt.hist(flat_values, bins=250, range=(-0.2, 0.2), edgecolor="C0")
plt.show()

In [None]:
# The weight values sit in a narrow band (roughly [-0.1, 0.1]), so the full
# observed span max - min is divided into 255 equal steps.
zero_point = 0
w_max, w_min = weights.max(), weights.min()
scale = (w_max - w_min) / 255

In [None]:
# Manual quantization: rescale by `scale`, shift by `zero_point`, clamp to
# [-128, 127], round, and cast to 8-bit signed integers.
# The final line is a bare expression so the notebook displays the result.
rescaled = weights / scale + zero_point
torch.clamp(rescaled, min=-128, max=127).round().to(torch.int8)

# Compare time and size 


In [None]:
# Quantize `weights` with PyTorch's built-in per-tensor quantizer, reusing the
# `scale` and `zero_point` computed in an earlier cell.
from torch import quantize_per_tensor 
dtype = torch.qint8
quantized_weights = quantize_per_tensor(weights, scale=scale, zero_point=zero_point, dtype=dtype) 
# Display the integer representation of the quantized tensor (cell output).
quantized_weights.int_repr() 


## compare time

In [None]:
# %%timeit
# weights @ weights

In [None]:
# `q_fn` supplies the `mul` used on the quantized tensors in the
# (commented-out) `%%timeit` cell that follows.
from torch.nn.quantized import QFunctional 
q_fn = QFunctional() 


In [None]:
# %%timeit
# q_fn.mul(quantized_weights, quantized_weights)

In [None]:
# (1.26 milliseconds) / (95.9 microseconds) =
# 13.1386861
# i.e. about 13 times faster

## compare size 


In [None]:
import sys

# Size reported by sys.getsizeof for the storage behind each tensor.
original_nbytes = sys.getsizeof(weights.storage())
quantized_nbytes = sys.getsizeof(quantized_weights.storage())

# Ratio of original to quantized size (bare expression -> cell output).
original_nbytes / quantized_nbytes
# Recorded result: the storage shrinks by a factor of about 4
# 3.999755879241598


# Quantize the model

In [None]:
from torch.quantization import quantize_dynamic

# Load the checkpoint again as a standalone model and place it on the CPU.
model = AutoModelForSequenceClassification.from_pretrained(bert_ckpt)
model = model.to('cpu')

# Dynamically quantize the model, restricted to `torch.nn.Linear` modules,
# using the qint8 dtype.
linear_only = {torch.nn.Linear}
model_quantized = quantize_dynamic(model, linear_only, dtype=torch.qint8)

In [None]:
from utils import *

## Plot all the model benchmarks

In [None]:
# #Distillation + quantization
# model_ckpt = "phnghiapro/distilbert-base-uncased-distilled-clinc"
# tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# pipe = pipeline("text-classification",
#                 model=model_quantized,
#                 tokenizer=tokenizer,
#                 device=device
#                 )
# optim_type = "Distillation + quantization"
# pb = PerformanceBenchmark(pipe, clinc["test"], optim_type=optim_type)
# perf_metrics = pb.run_benchmark()
# # perf_metrics.update(pb.run_benchmark())


# Distillation: benchmark the distilled checkpoint on its own.
# `PerformanceBenchmark` and `clinc` are not defined in this file; presumably
# they come from `from utils import *` -- TODO confirm.
distilled_ckpt = "phnghiapro/distilbert-base-uncased-distilled-clinc"
pipe = pipeline('text-classification', 
                model=distilled_ckpt,
                # device=device
                )
optim_type = "Distillation"
distilbert_benchmark = PerformanceBenchmark(pipe, clinc["test"], optim_type)
# Run the benchmark object built on the line above. The earlier code called
# `pb.run_benchmark()`, but `pb` is only assigned in the commented-out
# "Distillation + quantization" section: on a fresh kernel that raises
# NameError, and on a stale kernel it would benchmark a different pipeline.
perf_metrics = distilbert_benchmark.run_benchmark()
# perf_metrics.update(distilbert_benchmark.run_benchmark())



In [None]:
# Plot the collected metrics, passing the label of the current run.
# `plot_metrics` is not defined in this file; presumably it is provided by
# `from utils import *` -- TODO confirm.
plot_metrics(perf_metrics, optim_type)