In [18]:
from functools import partial
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTQuantizer, ORTModelForSequenceClassification
from optimum.onnxruntime.configuration import AutoQuantizationConfig, AutoCalibrationConfig
import warnings
warnings.filterwarnings("ignore")

model_id = "distilbert-base-uncased-finetuned-sst-2-english"

onnx_model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
quantizer = ORTQuantizer.from_pretrained(onnx_model)
qconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=True)

Framework not specified. Using pt to export the model.
Using the export variant default. Available variants are:
    - default: The default ONNX variant.

***** Exporting submodel 1/1: DistilBertForSequenceClassification *****
Using framework PyTorch: 2.2.2


In [19]:
def preprocess_fn(ex, tokenizer):
    return tokenizer(ex["sentence"])

calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    preprocess_function=partial(preprocess_fn, tokenizer=tokenizer),
    num_samples=50,
    dataset_split="train",
)

calibration_config = AutoCalibrationConfig.minmax(calibration_dataset)

ranges = quantizer.fit(
    dataset=calibration_dataset,
    calibration_config=calibration_config,
    operators_to_quantize=qconfig.operators_to_quantize,
)

Using static quantization schema (dataset: parquet, method: CalibrationMethod.MinMax)
Creating calibrator: CalibrationMethod.MinMax(CalibrationConfig(dataset_name='parquet', dataset_config_name='sst2', dataset_split='train', dataset_num_samples=50, method=<CalibrationMethod.MinMax: 0>, num_bins=None, num_quantized_bins=None, percentile=None, moving_average=False, averaging_constant=0.01))
Collecting tensors statistics...
Computing calibration ranges


In [20]:
model_quantized_path = quantizer.quantize(
    save_dir="Static Quantization/distilbert",
    calibration_tensors_range=ranges,
    quantization_config=qconfig,
)

Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: True)
Quantizing model...
Saving quantized model at: Static Quantization/distilbert (external data format: False)
Configuration saved in Static Quantization/distilbert/ort_config.json


In [21]:
import onnx

# Load the ONNX model
onnx_model = onnx.load("Static Quantization/distilbert/model_quantized.onnx")

# Iterate through the model's nodes
for node in onnx_model.graph.node:
    for input_tensor in node.input:
        input_info = next((i for i in onnx_model.graph.input if i.name == input_tensor), None)
        if input_info:
            data_type = input_info.type.tensor_type.elem_type
            print(f"Input tensor '{input_tensor}' of node '{node.name}' has data type: {onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[data_type]}")

    for output_tensor in node.output:
        output_info = next((o for o in onnx_model.graph.output if o.name == output_tensor), None)
        if output_info:
            data_type = output_info.type.tensor_type.elem_type
            print(f"Output tensor '{output_tensor}' of node '{node.name}' has data type: {onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[data_type]}")


Input tensor 'input_ids' of node '/distilbert/embeddings/word_embeddings/Gather' has data type: int64
Input tensor 'attention_mask' of node '/distilbert/transformer/layer.0/attention/Equal' has data type: int64
Output tensor 'logits' of node '/classifier/Gemm_Add' has data type: float32


In [22]:
import onnx

# Load the ONNX model
onnx_model = onnx.load("augmented_model.onnx")

# Iterate through the model's nodes
for node in onnx_model.graph.node:
    for input_tensor in node.input:
        input_info = next((i for i in onnx_model.graph.input if i.name == input_tensor), None)
        if input_info:
            data_type = input_info.type.tensor_type.elem_type
            print(f"Input tensor '{input_tensor}' of node '{node.name}' has data type: {onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[data_type]}")

    for output_tensor in node.output:
        output_info = next((o for o in onnx_model.graph.output if o.name == output_tensor), None)
        if output_info:
            data_type = output_info.type.tensor_type.elem_type
            print(f"Output tensor '{output_tensor}' of node '{node.name}' has data type: {onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[data_type]}")


Input tensor 'input_ids' of node '/distilbert/embeddings/word_embeddings/Gather' has data type: int64
Input tensor 'attention_mask' of node '/distilbert/transformer/layer.0/attention/Equal' has data type: int64
Output tensor 'logits' of node '/classifier/Gemm' has data type: float32
Output tensor '/distilbert/transformer/layer.4/attention/Reshape_4_output_0_ReduceMin' of node '/distilbert/transformer/layer.4/attention/Reshape_4_output_0_ReduceMin_Reshape' has data type: float32
Output tensor '/distilbert/transformer/layer.4/attention/Reshape_4_output_0_ReduceMax' of node '/distilbert/transformer/layer.4/attention/Reshape_4_output_0_ReduceMax_Reshape' has data type: float32
Output tensor '/distilbert/transformer/layer.4/attention/Transpose_1_output_0_ReduceMin' of node '/distilbert/transformer/layer.4/attention/Transpose_1_output_0_ReduceMin_Reshape' has data type: float32
Output tensor '/distilbert/transformer/layer.4/attention/Transpose_1_output_0_ReduceMax' of node '/distilbert/trans

In [23]:
import os

# Check the size of the original ONNX model
original_model_size = os.path.getsize("augmented_model.onnx")
print(f"Original Model Size: {original_model_size / 1e6} MB")

# Check the size of the quantized ONNX model
quantized_model_size = os.path.getsize("Static Quantization/distilbert/model_quantized.onnx")
print(f"Quantized Model Size: {quantized_model_size / 1e6} MB")


Original Model Size: 268.157142 MB
Quantized Model Size: 67.610578 MB
