# Start by creating an ONNX model from a transformers embedding model

## This assumes your current working directory is the local path where the model files were downloaded.

In [4]:
# !pip install "optimum[onnxruntime]==1.5.0" transformers evaluate mkl-include mkl --upgrade
!pip install "optimum[onnxruntime-gpu]" transformers evaluate mkl-include mkl --upgrade
!pip install tf-keras

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting mkl
  Downloading mkl-2024.1.0-py2.py3-none-win_amd64.whl.metadata (1.4 kB)
Collecting intel-openmp==2024.* (from mkl)
  Downloading intel_openmp-2024.1.0-py2.py3-none-win_amd64.whl.metadata (1.2 kB)
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting tf-keras
  Downloading tf_keras-2.16.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.16.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.7 MB 640.0 kB/s eta 0:00:03
   ---- ----------------------------------- 0.2/1.7 MB 2.1 MB/s eta 0:00:01
   -------------- ------------------------- 0.6/1.7 MB 4.8 MB/s eta 0:00:01
   -------------------------- ------------- 1.1/1.7 MB 6.5 MB/s eta 0:00:01
   ---------------------------------------  1.7/1.7 MB 7.8 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 M

In [5]:
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer
from pathlib import Path
 
 
# Directory holding the downloaded transformers checkpoint (the current working directory).
model_id = "./"
# Directory that receives the exported ONNX model and the tokenizer files.
onnx_path = Path("onnx_opt")

# Load the vanilla transformers checkpoint and convert it to ONNX.
# `export=True` replaces `from_transformers=True`, which is deprecated and
# will be removed in optimum 2.0.
model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Save the ONNX checkpoint and the tokenizer side by side.
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)




The argument `from_transformers` is deprecated, and will be removed in optimum 2.0.  Use `export` instead
Framework not specified. Using pt to export the model.
Using the export variant default. Available variants are:
    - default: The default ONNX variant.

***** Exporting submodel 1/1: RobertaModel *****
Using framework PyTorch: 2.3.0+cpu
Overriding 1 configuration item(s)
	- use_cache -> False


('onnx_opt\\tokenizer_config.json',
 'onnx_opt\\special_tokens_map.json',
 'onnx_opt\\vocab.json',
 'onnx_opt\\merges.txt',
 'onnx_opt\\added_tokens.json',
 'onnx_opt\\tokenizer.json')

# Now for inference: define custom helpers so that PyTorch-style `encode` calls work

In [6]:
from transformers import Pipeline
import torch.nn.functional as F
import torch
 
# copied from the model card
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings, counting only positions kept by the mask.

    Args:
        model_output: Indexable model result whose first element holds all
            token embeddings.
        attention_mask: Mask tensor with one dimension fewer than the token
            embeddings; it is expanded along a new trailing dimension to the
            embeddings' size and cast to float.

    Returns:
        The mask-weighted sum of the token embeddings over dimension 1,
        divided by the mask sum over dimension 1 (clamped to at least 1e-9
        so a fully masked row does not divide by zero).
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / mask_total
 
 
class SentenceEmbeddingPipeline(Pipeline):
    """Pipeline that turns text into mean-pooled, L2-normalized sentence embeddings."""

    def _sanitize_parameters(self, **kwargs):
        # This pipeline exposes no tunable parameters, so all three kwarg
        # dictionaries (the first being the preprocess kwargs) stay empty.
        return {}, {}, {}

    def preprocess(self, inputs):
        # Tokenize with padding and truncation, returning PyTorch tensors.
        return self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')

    def _forward(self, model_inputs):
        # Carry the attention mask along; postprocess needs it for pooling.
        model_result = self.model(**model_inputs)
        return {"outputs": model_result, "attention_mask": model_inputs["attention_mask"]}

    def postprocess(self, model_outputs):
        # Mask-aware mean pooling over the token embeddings ...
        pooled = mean_pooling(model_outputs["outputs"], model_outputs["attention_mask"])
        # ... followed by L2 normalization along dimension 1.
        return F.normalize(pooled, p=2, dim=1)

# Use the previously loaded & converted model (without loading the locally saved ONNX file) for inference

In [22]:
# Build the embedding pipeline around the model and tokenizer already in memory.
vanilla_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)

# Embed a single example sentence.
sample_sentence = "Could you assist me in finding my lost card?"
pred = vanilla_emb(sample_sentence)

# Show the first five values of the first sentence embedding.
first_embedding = pred[0]
print(first_embedding[:5])
# NOTE: the exact values depend on the model that was loaded.

tensor([ 0.0446, -0.0113, -0.0328, -0.0394, -0.0135])


# On local terminal, CD into the ONNX path where the new files are. 

# Load the locally saved ONNX model & run inference on it

In [26]:
# Load the model from the saved ONNX directory and run inference on it.

model_id = "./onnx_opt/"
onnx_path = Path("./")  # kept from the original cell; not used in this cell

# Load the already-exported ONNX model. Nothing is converted here, so
# `export=False` (this replaces the deprecated `from_transformers=False`).
model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=False)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# init pipeline
vanilla_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)

# run inference
pred = vanilla_emb("Could you assist me in finding my lost card?")

# print an excerpt from the sentence embedding
print(pred[0][:5])
# NOTE: the exact values depend on the model that was loaded.

The argument `from_transformers` is deprecated, and will be removed in optimum 2.0.  Use `export` instead


tensor([ 0.0446, -0.0113, -0.0328, -0.0394, -0.0135])
