# Optimum API for Deployment with ONNX and inference
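This notebook follows [this tutorial](https://www.philschmid.de/optimizing-transformers-with-optimum) on optimizing Transformers models with Optimum, applied here to a Vietnamese summarization model.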

## 1. Install `Optimum` for ONNX Runtime

In [None]:
!pip install "optimum[onnxruntime]" evaluate[evaluator] --upgrade


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## 2. Convert a Hugging Face Transformers model to ONNX for inference

In [1]:
from pathlib import Path
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSeq2SeqLM

# Identifier of the summarization checkpoint this notebook targets
# (presumably a Hugging Face Hub repo id — TODO confirm).
model_id = "VietAI/vit5-base-vietnews-summarization"
# Relative directory used as the save/load location for every ONNX artifact below.
onnx_path = Path("onnx")
# Task name handed to `pipeline(...)` in the inference cells.
task = "summarization"

2023-05-24 23:36:26.986719: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-24 23:36:27.070553: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-24 23:36:27.071454: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:

# Load the vanilla Transformers checkpoint named by `model_id` and export it to ONNX.
# The source has to be `model_id`: `onnx_path` is only filled by the `save_pretrained`
# calls in a later cell, so on a fresh run it holds nothing to export or tokenize with.
model = ORTModelForSeq2SeqLM.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Framework not specified. Using pt to export to ONNX.
Using framework PyTorch: 2.0.1+cu118
Overriding 1 configuration item(s)
	- use_cache -> False


verbose: False, log level: Level.ERROR



Using framework PyTorch: 2.0.1+cu118
Overriding 1 configuration item(s)
	- use_cache -> True
  if causal_mask.shape[1] < attention_mask.shape[1]:


verbose: False, log level: Level.ERROR



Using framework PyTorch: 2.0.1+cu118
Overriding 1 configuration item(s)
	- use_cache -> True
Asked a sequence length of 16, but a sequence length of 1 will be used with use_past == True for `decoder_input_ids`.
  elif past_key_value.shape[2] != key_value_states.shape[1]:


verbose: False, log level: Level.ERROR



In [None]:
# Save the exported ONNX model and its tokenizer side by side in `onnx_path`.
# The tokenizer call is the cell's last expression, so its return value
# (the written tokenizer file paths) is what the cell displays.
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

('onnx/tokenizer_config.json',
 'onnx/special_tokens_map.json',
 'onnx/spiece.model',
 'onnx/added_tokens.json',
 'onnx/tokenizer.json')

In [None]:
# Smoke-test the exported ONNX model by wrapping it in a Transformers pipeline for `task`.
optimum_summarizer = pipeline(task, model=model, tokenizer=tokenizer)

In [None]:
# Sample Vietnamese paragraph used as the summarization input.
text = 'VietAI là tổ chức phi lợi nhuận với sứ mệnh ươm mầm tài năng về trí tuệ nhân tạo và xây dựng một cộng đồng các chuyên gia trong lĩnh vực trí tuệ nhân tạo đẳng cấp quốc tế tại Việt Nam.'
# Run the ONNX-backed pipeline and print its raw return value.
prediction = optimum_summarizer(text)
print(prediction)

[{'summary_text': 'ViệtAI là tổ chức phi lợi nhuận duy nhất được thành lập năm 2003'}]


## 3. Use the ORTOptimizer to optimize the model

In [None]:
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

# Optimization settings: level 99 enables all optimizations.
optimization_config = OptimizationConfig(optimization_level=99)

# Build the optimizer from the exported model.
optimizer = ORTOptimizer.from_pretrained(model)

# Apply the configuration; the optimized files are saved into `onnx_path`.
optimizer.optimize(optimization_config=optimization_config, save_dir=onnx_path)



Optimizing model...
Configuration saved in onnx/ort_config.json
Optimized model saved at: onnx (external data format: False; saved all tensor to one file: True)


PosixPath('onnx')

In [None]:
from transformers import pipeline

# Load the optimized model. This seq2seq export consists of separate encoder,
# decoder and decoder-with-past graphs, so there is no single "model_optimized.onnx".
# Each optimized file is named explicitly so that the un-optimized graphs saved in
# the same directory are not the ones that end up being loaded.
model = ORTModelForSeq2SeqLM.from_pretrained(
    onnx_path,
    encoder_file_name="encoder_model_optimized.onnx",
    decoder_file_name="decoder_model_optimized.onnx",
    decoder_with_past_file_name="decoder_with_past_model_optimized.onnx",
)

# create optimized pipeline and run it on the sample text
optimized_summarizer = pipeline(task, model=model, tokenizer=tokenizer)
optimized_summarizer(text)


[{'summary_text': 'ViệtAI là tổ chức phi lợi nhuận duy nhất được thành lập năm 2003'}]

## 4. Apply dynamic quantization using ORTQuantizer from Optimum

In [None]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# Dynamic quantization settings (is_static=False) from the avx512_vnni preset,
# with per-channel quantization turned off.
dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

# The optimized export is made of three ONNX graphs; each is quantized on its own.
optimized_filenames = [
    'decoder_model_optimized.onnx',
    'decoder_with_past_model_optimized.onnx',
    'encoder_model_optimized.onnx',
]
for filename in optimized_filenames:
    # bind a quantizer to one optimized graph inside `onnx_path`
    dynamic_quantizer = ORTQuantizer.from_pretrained(model_or_path=onnx_path, file_name=filename)

    # quantize that graph and save the result back into the same directory
    model_quantized_path = dynamic_quantizer.quantize(
        quantization_config=dqconfig,
        save_dir=onnx_path,
    )

Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: False)
Quantizing model...
Saving quantized model at: onnx (external data format: False)
Configuration saved in onnx/ort_config.json
Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: False)
Quantizing model...
Saving quantized model at: onnx (external data format: False)
Configuration saved in onnx/ort_config.json
Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: False)
Quantizing model...
Saving quantized model at: onnx (external data format: False)
Configuration saved in onnx/ort_config.json


In [None]:
# NOTE(review): absolute, Colab/Google-Drive-specific path. Changing the working
# directory here also changes where the relative `onnx_path` ("onnx") resolves for
# every later cell — confirm this directory really contains the exported files, or
# drop this cell when running elsewhere.
%cd /content/drive/MyDrive/pre-entrance

/content/drive/MyDrive/pre-entrance


In [None]:
import os

# Compare the on-disk size (in MB) of every optimized graph with its quantized counterpart.
# Relies on `optimized_filenames` from the quantization cell, listed in the same order.
BYTES_PER_MB = 1024 * 1024
quantized_filenames = [
    'decoder_model_optimized_quantized.onnx',
    'decoder_with_past_model_optimized_quantized.onnx',
    'encoder_model_optimized_quantized.onnx',
]
for idx, quantized_name in enumerate(quantized_filenames):
    size = os.path.getsize(onnx_path / optimized_filenames[idx]) / BYTES_PER_MB
    quantized_model = os.path.getsize(onnx_path / quantized_name) / BYTES_PER_MB

    print(f"Model file size: {size:.2f} MB")
    print(f"Quantized Model file size: {quantized_model:.2f} MB")


Model file size: 643.82 MB
Quantized Model file size: 240.73 MB
Model file size: 589.80 MB
Quantized Model file size: 227.21 MB
Model file size: 429.92 MB
Quantized Model file size: 187.01 MB


## 5. Test inference with the quantized model

In [4]:
from transformers import pipeline

# Load the quantized model. The quantization step wrote one
# "*_optimized_quantized.onnx" file per graph, so there is no single
# "model_quantized.onnx"; every file is named explicitly to make sure the
# quantized graphs are the ones that get loaded.
model = ORTModelForSeq2SeqLM.from_pretrained(
    onnx_path,
    encoder_file_name="encoder_model_optimized_quantized.onnx",
    decoder_file_name="decoder_model_optimized_quantized.onnx",
    decoder_with_past_file_name="decoder_with_past_model_optimized_quantized.onnx",
)
tokenizer = AutoTokenizer.from_pretrained(onnx_path)

# create the summarization pipeline on top of the quantized model
optimized_summarizer = pipeline(task, model=model, tokenizer=tokenizer)

# sample Vietnamese paragraph used as the summarization input
text = 'VietAI là tổ chức phi lợi nhuận với sứ mệnh ươm mầm tài năng về trí tuệ nhân tạo và xây dựng một cộng đồng các chuyên gia trong lĩnh vực trí tuệ nhân tạo đẳng cấp quốc tế tại Việt Nam.'

output = optimized_summarizer(text)


The ONNX file decoder_model_optimized_quantized.onnx is not a regular name used in optimum.onnxruntime that are ['decoder_model.onnx', 'decoder_model_quantized.onnx', 'decoder_model_optimized.onnx'], the ORTModelForSeq2SeqLM might not behave as expected.
The ONNX file decoder_with_past_model_optimized_quantized.onnx is not a regular name used in optimum.onnxruntime that are ['decoder_with_past_model.onnx', 'decoder_with_past_model_quantized.onnx', 'decoder_with_past_model_optimized.onnx'], the ORTModelForSeq2SeqLM might not behave as expected.
The ONNX file encoder_model_optimized_quantized.onnx is not a regular name used in optimum.onnxruntime, the ORTModelForConditionalGeneration might not behave as expected.


In [5]:
# Print just the generated summary string from the first result entry.
summary_text = output[0]['summary_text']
print(summary_text)

ViệtAI là tổ chức phi lợi nhuận duy nhất trên thế giới về trí tuệ nhân tạo
