# ONNX + ONNX Runtime
Before you begin, make sure you have all the necessary libraries installed:

- pip install optimum[exporters]
- pip install onnxruntime==1.16

It is possible to export 🤗 Transformers and Diffusers models to the ONNX format and perform graph optimization as well as quantization easily:
- optimum-cli export onnx -m deepset/roberta-base-squad2 --optimize O2 roberta_base_qa_onnx

The model can then be quantized using onnxruntime:
- optimum-cli onnxruntime quantize \
  --avx512 \
  --onnx_model roberta_base_qa_onnx \
  -o quantized_roberta_base_qa_onnx

In [71]:
# Bring in the ONNX Runtime question-answering model wrapper from Optimum and
# the tokenizer / pipeline helpers from Transformers used by the cells below.
from optimum.onnxruntime import ORTModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline

# Hub identifier of the checkpoint that was exported to ONNX with `optimum-cli`.
model_id = "deepset/roberta-base-squad2"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Print the tokenizer repr (vocab size, max length, special tokens) as a sanity check.
print(tokenizer)

RobertaTokenizerFast(name_or_path='deepset/roberta-base-squad2', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}


In [73]:
# Load the exported ONNX model from the local `roberta_base_qa_onnx` directory
# (the output directory passed to the `optimum-cli export onnx` command).
model = ORTModelForQuestionAnswering.from_pretrained("roberta_base_qa_onnx")
# Show the configuration that was loaded together with the model.
print(model.config)

RobertaConfig {
  "_name_or_path": "roberta_base_qa_onnx/",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}



In [81]:
# Inspect two attributes of the loaded ORT model:
# its `base_model_prefix` string and the `preprocessors` it carries
# (the recorded output shows these are tokenizer instances).
print(model.base_model_prefix)
print(model.preprocessors)

optimized_model
[RobertaTokenizerFast(name_or_path='roberta_base_qa_onnx', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}, RobertaTokenizerFast(name_or_path='roberta_base_qa_onnx', vocab_size=50265, mo

In [83]:
# Build a question-answering pipeline from the ONNX Runtime model and the
# tokenizer loaded earlier. No `device` argument is passed, which is what
# triggers the "Model will be on CPU" notice when a GPU is available.
qa_pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)
# Print the pipeline object to confirm which pipeline class was created.
print(qa_pipe)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


<transformers.pipelines.question_answering.QuestionAnsweringPipeline object at 0x14a5cbaf0>


In [84]:
# Run one extractive question-answering query through the pipeline.
question = "What's Optimum?"
context = "Optimum is an awesome library everyone should use!"
results = qa_pipe(question=question, context=context)
# Bare last expression so the notebook displays the result
# (a dict with 'score', 'start', 'end' and 'answer').
results

{'score': 0.532249927520752,
 'start': 11,
 'end': 29,
 'answer': 'an awesome library'}

# Check the Model Precision Before Quantization:
- Before quantization, the model's weights are typically stored in fp32 (32-bit floating point). This is the default precision for most models unless you explicitly convert or quantize them to a different format.

# Check the Quantized Model's Precision:
- To verify that your model has been quantized to int8, you can inspect the ONNX model's nodes to see whether the operations use int8 tensors. You can do this by loading the ONNX model and examining its nodes and the data types of their tensors.

In [86]:
import onnx
from onnx.helper import tensor_dtype_to_np_dtype

# Load the quantized ONNX model
onnx_model = onnx.load("quantized_roberta_base_qa_onnx/model_quantized.onnx")

# Index the graph-level inputs and outputs by name once, instead of rescanning
# both lists for every tensor of every node. `setdefault` keeps the first entry
# seen for a name, which matches a first-match linear search.
graph_inputs = {}
for graph_input in onnx_model.graph.input:
    graph_inputs.setdefault(graph_input.name, graph_input)

graph_outputs = {}
for graph_output in onnx_model.graph.output:
    graph_outputs.setdefault(graph_output.name, graph_output)

# Report the element type of every node tensor that is also a graph-level
# input or output.
# NOTE(review): only `graph.input` / `graph.output` are consulted here; weight
# tensors stored in `graph.initializer` are not inspected, so this listing on
# its own does not show whether the weights are int8 - confirm separately.
for node in onnx_model.graph.node:
    for input_tensor in node.input:
        input_info = graph_inputs.get(input_tensor)
        if input_info is not None:
            data_type = input_info.type.tensor_type.elem_type
            # `tensor_dtype_to_np_dtype` replaces the deprecated
            # `onnx.mapping.TENSOR_TYPE_TO_NP_TYPE` lookup table.
            print(f"Input tensor '{input_tensor}' of node '{node.name}' has data type: {tensor_dtype_to_np_dtype(data_type)}")

    for output_tensor in node.output:
        output_info = graph_outputs.get(output_tensor)
        if output_info is not None:
            data_type = output_info.type.tensor_type.elem_type
            print(f"Output tensor '{output_tensor}' of node '{node.name}' has data type: {tensor_dtype_to_np_dtype(data_type)}")


Input tensor 'attention_mask' of node '' has data type: int64
Input tensor 'input_ids' of node '/roberta/embeddings/Equal' has data type: int64
Input tensor 'input_ids' of node '/roberta/Shape_1' has data type: int64
Input tensor 'input_ids' of node '/roberta/embeddings/word_embeddings/Gather' has data type: int64
Output tensor 'start_logits' of node '/Squeeze' has data type: float32
Output tensor 'end_logits' of node '/Squeeze_1' has data type: float32


In [87]:
import os

# Size on disk of the exported (non-quantized) ONNX model, reported in
# megabytes (bytes / 1e6).
original_model_size = os.stat("roberta_base_qa_onnx/model.onnx").st_size
original_size_mb = original_model_size / 1e6
print(f"Original Model Size: {original_size_mb} MB")

# Same measurement for the quantized model, for a side-by-side comparison.
quantized_model_size = os.stat("quantized_roberta_base_qa_onnx/model_quantized.onnx").st_size
quantized_size_mb = quantized_model_size / 1e6
print(f"Quantized Model Size: {quantized_size_mb} MB")


Original Model Size: 496.27695 MB
Quantized Model Size: 124.539271 MB


In [88]:
import onnx
from onnx.helper import tensor_dtype_to_np_dtype

# Load the quantized ONNX model
onnx_model = onnx.load("quantized_roberta_base_qa_onnx/model_quantized.onnx")

# Index the graph-level inputs and outputs by name once, instead of rescanning
# both lists for every tensor of every node. `setdefault` keeps the first entry
# seen for a name, which matches a first-match linear search.
graph_inputs = {}
for graph_input in onnx_model.graph.input:
    graph_inputs.setdefault(graph_input.name, graph_input)

graph_outputs = {}
for graph_output in onnx_model.graph.output:
    graph_outputs.setdefault(graph_output.name, graph_output)

# List every node with its operator type, followed by the element type of any
# of its tensors that is also a graph-level input or output. Operator names
# such as DequantizeLinear in this listing are what indicate quantized ops.
for node in onnx_model.graph.node:
    print(f"Node: {node.name}, OpType: {node.op_type}")
    for input_tensor in node.input:
        input_info = graph_inputs.get(input_tensor)
        if input_info is not None:
            data_type = input_info.type.tensor_type.elem_type
            # `tensor_dtype_to_np_dtype` replaces the deprecated
            # `onnx.mapping.TENSOR_TYPE_TO_NP_TYPE` lookup table.
            print(f"  Input tensor '{input_tensor}' has data type: {tensor_dtype_to_np_dtype(data_type)}")

    for output_tensor in node.output:
        output_info = graph_outputs.get(output_tensor)
        if output_info is not None:
            data_type = output_info.type.tensor_type.elem_type
            print(f"  Output tensor '{output_tensor}' has data type: {tensor_dtype_to_np_dtype(data_type)}")


Node: , OpType: Cast
  Input tensor 'attention_mask' has data type: int64
Node: /roberta/embeddings/Equal, OpType: Equal
  Input tensor 'input_ids' has data type: int64
Node: /roberta/Shape_1, OpType: Shape
  Input tensor 'input_ids' has data type: int64
Node: /roberta/embeddings/word_embeddings/Gather, OpType: Gather
  Input tensor 'input_ids' has data type: int64
Node: MaskReduceSum_0, OpType: ReduceSum
Node: /roberta/embeddings/Not, OpType: Not
Node: /roberta/Gather_1, OpType: Gather
Node: /roberta/Reshape, OpType: Reshape
Node: /roberta/embeddings/word_embeddings/Gather_output_0_DequantizeLinear, OpType: DequantizeLinear
Node: /roberta/embeddings/Cast, OpType: Cast
Node: /roberta/Unsqueeze_2, OpType: Unsqueeze
Node: /roberta/Equal, OpType: Equal
Node: /roberta/embeddings/CumSum, OpType: CumSum
Node: /roberta/Slice, OpType: Slice
Node: /roberta/Where, OpType: Where
Node: /roberta/embeddings/Mul, OpType: Mul
Node: /roberta/Expand, OpType: Expand
Node: /roberta/embeddings/Cast_1, OpTy