In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
from datetime import datetime

In [53]:
# ---- Run configuration ----------------------------------------------------
# DATASET_SIZE is embedded in the run name followed by a 'k' suffix;
# DATASET_IS_BALANCED selects the 'bal' / 'imbal' tag of the run name.
DATASET_SIZE = 480
DATASET_IS_BALANCED = False

# Run name, e.g. 'bert-finetune_480k_imbal'
training_name = f"bert-finetune_{DATASET_SIZE}k_{'bal' if DATASET_IS_BALANCED else 'imbal'}"

# change the date to the date of training
training_args_datetime = datetime(2023, 12, 20)

# Absolute folder of this run (resolved against the current working directory)
training_storing_folder = Path(training_name).resolve()

# <run folder>/<run name>_<YYYY-MM-DD>_model
model_path = training_storing_folder / f'{training_name}_{training_args_datetime:%Y-%m-%d}_model'

In [54]:
# Sanity check: show the final path component of the model directory
model_path.name

'bert-finetune_480k_imbal_2023-12-20_model'

In [55]:
# Export the fine-tuned model to ONNX and save it together with the tokenizer.

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from optimum.onnxruntime import ORTModelForSequenceClassification


# The tokenizer comes from the 'bert-base-cased' hub checkpoint, not from model_path.
# NOTE(review): presumably the fine-tuned model kept the base vocabulary -- confirm.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# export=True converts the checkpoint stored at model_path to ONNX while loading it
ort_model = ORTModelForSequenceClassification.from_pretrained(model_path, export=True)

# Target folder: <run folder>/<model folder name>_onnx
save_directory = training_storing_folder / (model_path.name + '_onnx')

# exist_ok=True makes the creation idempotent without a separate exists() check
# (no check-then-create race), and still fails fast if the path is a regular file.
save_directory.mkdir(parents=True, exist_ok=True)

print('Saving model to {}'.format(save_directory))

tokenizer.save_pretrained(save_directory)
ort_model.save_pretrained(save_directory)

Framework not specified. Using pt to export to ONNX.
Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Using framework PyTorch: 2.1.0+cu121
Overriding 1 configuration item(s)
	- use_cache -> False


Saving model to /root/FYP/NLP/dev-workspace/sa/bert_2023-12-13/bert-finetune_480k_imbal/bert-finetune_480k_imbal_2023-12-20_model_onnx


In [56]:
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

# Graph-optimization settings for a CPU target; for the meaning of each option see
# https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization
optimization_config = OptimizationConfig(
    optimization_level=2,
    enable_transformers_specific_optimizations=True,
    optimize_for_gpu=False,
)

# Optimizer bound to the ONNX model exported in the previous step
optimizer = ORTOptimizer.from_pretrained(ort_model)

# Writes two files into save_directory:
#   - model_optimized.onnx
#   - ort_config.json
# Kept as the cell's last expression so the returned path is displayed.
optimizer.optimize(
    optimization_config=optimization_config,
    save_dir=save_directory,
)

Optimizing model...
Configuration saved in /root/FYP/NLP/dev-workspace/sa/bert_2023-12-13/bert-finetune_480k_imbal/bert-finetune_480k_imbal_2023-12-20_model_onnx/ort_config.json
Optimized model saved at: /root/FYP/NLP/dev-workspace/sa/bert_2023-12-13/bert-finetune_480k_imbal/bert-finetune_480k_imbal_2023-12-20_model_onnx (external data format: False; saved all tensor to one file: True)


PosixPath('/root/FYP/NLP/dev-workspace/sa/bert_2023-12-13/bert-finetune_480k_imbal/bert-finetune_480k_imbal_2023-12-20_model_onnx')

In [57]:
# load the ONNX model and tokenizer

from transformers import AutoTokenizer
from onnxruntime import InferenceSession

# Tokenizer files were written to save_directory alongside the ONNX model
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# - The path is passed as a plain string, which every onnxruntime release accepts.
# - The execution provider is named explicitly: some onnxruntime builds that ship
#   several providers refuse to create a session when `providers` is omitted, and
#   the model was optimized with optimize_for_gpu=False, so CPU is the intended target.
session = InferenceSession(
    str(save_directory / "model_optimized.onnx"),
    providers=["CPUExecutionProvider"],
)

# Graph input / output names, used to build the feed and to select outputs
input_names = [node.name for node in session.get_inputs()]
output_names = [node.name for node in session.get_outputs()]

# Smoke test with an arbitrary sentence.
# ONNX Runtime expects NumPy arrays as input
inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
outputs = session.run(output_names=output_names, input_feed=dict(inputs))

In [58]:
# Names of the ONNX graph inputs, as reported by session.get_inputs()
input_names

['input_ids', 'attention_mask', 'token_type_ids']

In [59]:
# Names of the ONNX graph outputs, as reported by session.get_outputs()
output_names

['logits']

In [60]:
# Load the original (non-ONNX) Hugging Face model from model_path so that its
# logits can be compared against the ONNX session's output.

from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments


hg_model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [61]:
# Sample inputs for the parity check; each sample is its own one-element list,
# so every tokenizer call receives a batch containing a single sentence.
test_data = [
    ['I like the game'],
    ["I do not like it."],
    ["It crashes when I just run on my pc."],
]

In [62]:
# test inference: run every sample through both the original Hugging Face model
# and the optimized ONNX session, then check that the logits agree.

import torch  # needed only for torch.no_grad() below

pred_hg = []
perd_onnx = []   # (sic) name kept unchanged in case other cells refer to it

for texts in test_data:

    # hg inference -- no_grad() skips autograd bookkeeping, which is not needed
    # for a pure forward pass and only costs memory
    hg_inputs = tokenizer(texts, return_tensors="pt", max_length=tokenizer.model_max_length, truncation=True)
    with torch.no_grad():
        hg_outputs = hg_model(**hg_inputs)

    # onnx inference -- same text, tokenized to NumPy arrays for ONNX Runtime
    onnx_inputs = tokenizer(texts, return_tensors="np", max_length=tokenizer.model_max_length, truncation=True)
    onnx_outputs = session.run(output_names=output_names, input_feed=dict(onnx_inputs))

    perd_onnx.append(onnx_outputs[0])          # only get the unsoftmaxed logits
    pred_hg.append(hg_outputs.logits.numpy())  # only get the unsoftmaxed logits


print(pred_hg)
print(perd_onnx)

# compare the results sample by sample; err_msg identifies the offending input
for i, (hg_logits, onnx_logits) in enumerate(zip(pred_hg, perd_onnx)):
    np.testing.assert_allclose(
        hg_logits,
        onnx_logits,
        rtol=1e-3,
        atol=1e-3,
        err_msg='Hugging Face and ONNX logits differ for test_data[{}]'.format(i),
    )

[array([[-3.1710236,  3.2229025]], dtype=float32), array([[ 1.2209499 , -0.87591463]], dtype=float32), array([[ 0.76088977, -0.63426894]], dtype=float32)]
[array([[-3.1710243,  3.2229028]], dtype=float32), array([[ 1.22095  , -0.8759148]], dtype=float32), array([[ 0.76088965, -0.6342685 ]], dtype=float32)]
