In [21]:
#REFERENCE EMBEDDING OUTPUT FROM ORIGINAL MODEL EXECUTION USING TRANSFORMERS LIBRARY
import torch
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
import numpy as np
# AD NOTE: I had to drop the sentence_transformers because of keras incompatibility 
# But I was able to get the same output as sentence-transformers by using the mean pooling method.
# Print tensors/arrays with 8 decimals, no scientific notation, on wide lines,
# so the three embedding variants in this notebook can be compared by eye.
_print_opts = dict(precision=8, linewidth=200, threshold=1000)
torch.set_printoptions(sci_mode=False, **_print_opts)
np.set_printoptions(suppress=True, **_print_opts)

# Load tokenizer and encoder; .eval() returns the module itself, switched to
# inference mode (dropout disabled).
model_name = 'intfloat/multilingual-e5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).eval()

input_texts = [
    "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or traini     ng for a marathon. Check out the chart below to see how much protein you should be eating each day.",
]

# Tokenize. padding=True pads only up to the longest text in the batch (so a
# single text gets no padding); max_length=512 is just the truncation limit.
inputs = tokenizer(
    input_texts,
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

# Sentence embedding = L2-normalised, attention-masked mean of token embeddings.
with torch.no_grad():
    outputs = model(**inputs)

    # last_hidden_state: [batch_size, sequence_length, hidden_size]
    token_embeddings = outputs.last_hidden_state
    # attention_mask:    [batch_size, sequence_length], 1 = real token, 0 = padding
    attention_mask = inputs['attention_mask']

    # Stretch the mask over the hidden dimension so it can zero out padded positions.
    attention_mask_expanded = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()

    # Sum the unmasked token vectors and divide by the number of real tokens;
    # the clamp guards against division by zero for an all-padding row.
    masked_sum = (token_embeddings * attention_mask_expanded).sum(dim=1)
    token_counts = attention_mask_expanded.sum(dim=1).clamp(min=1e-9)

    # Scale every sentence vector to unit L2 length.
    sentence_embedding = F.normalize(masked_sum / token_counts, p=2, dim=1)

# Keep a numpy copy: this is the reference the converted models are compared against.
sentence_embedding_hf = sentence_embedding.numpy()

print(sentence_embedding_hf.shape)
print(sentence_embedding_hf)



(1, 384)
[[-0.02603657 -0.04028142 -0.0407016  -0.0567059   0.09808024 -0.00689326  0.00147932  0.04843688  0.11323299 -0.02740479 -0.00866384  0.0041722   0.05230619 -0.04768711 -0.06603136  0.08920389
   0.06334062 -0.0531108   0.00967995 -0.10728    -0.00384981 -0.02598385 -0.00927944  0.07551413  0.06361946  0.01656309  0.04170515  0.01730528  0.01455702 -0.04344152 -0.05670433 -0.04429421
   0.07144866 -0.03361619  0.04803946 -0.00959462 -0.08393569 -0.04850754  0.05855654 -0.05139397  0.01839359  0.05547391  0.00980077  0.04608278  0.02681234  0.07292694 -0.06347434  0.05774028
   0.00521451 -0.0223504  -0.04456337  0.06401621  0.0201432   0.04503602  0.07350688 -0.04566628 -0.01399929 -0.04260228 -0.08010492 -0.05667777  0.06421689 -0.0662206  -0.01281161  0.00306563
   0.06230233  0.06887282 -0.02185547  0.02037258 -0.06924744 -0.05492327 -0.05856651  0.04827426  0.02585801 -0.04206208  0.07226781 -0.00066223  0.02808696 -0.04768767 -0.02578073 -0.0346549
  -0.05184536 -0.02213

In [None]:
#TFLITE CONVERSION AND EXECUTION OF CONVERTED MODEL

import torch
from torch.fx import symbolic_trace
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
import numpy as np
import ai_edge_torch
import tensorflow as tf
import os 

tflite_model_path = "./e5_embedding_model.tflite"

torch.set_printoptions(precision=8, sci_mode=False, linewidth=200, threshold=1000)
np.set_printoptions(precision=8, suppress=True, linewidth=200, threshold=1000)
class E5EmbeddingModel(nn.Module):
    """Encoder plus pooling head that maps token ids straight to sentence embeddings.

    Wraps the Hugging Face model named by ``model_name`` and appends
    attention-masked mean pooling followed by L2 normalisation, so the whole
    pipeline lives in one module. The matching tokenizer is kept on the
    instance as ``self.tokenizer`` for convenience.
    """

    def __init__(self, model_name):
        """Load the pretrained encoder and its tokenizer for ``model_name``."""
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Return one unit-length embedding per input sequence.

        Args:
            input_ids: token ids, passed through to the wrapped encoder.
            attention_mask: same leading shape as ``input_ids``; non-zero
                entries mark the positions that contribute to the mean.

        Returns:
            Tensor with the sequence dimension (dim 1) pooled away and each
            row normalised to L2 norm 1.
        """
        # Per-token vectors from the encoder: [batch, sequence, hidden].
        hidden_states = self.model(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state

        # Broadcast the mask over the hidden dimension as floats so that
        # masked-out positions contribute zero to the sum.
        weights = attention_mask.unsqueeze(-1).expand_as(hidden_states).float()

        summed = (hidden_states * weights).sum(dim=1)
        # Number of contributing tokens; clamped so an all-zero mask cannot divide by zero.
        counts = torch.clamp(weights.sum(dim=1), min=1e-9)

        return nn.functional.normalize(summed / counts, p=2, dim=1)

# Build the wrapper (encoder + pooling + normalisation) in inference mode;
# .eval() returns the module itself.
complete_model = E5EmbeddingModel('intfloat/multilingual-e5-small').eval()

# Same passage as the reference cell so the embeddings are directly comparable.
input_texts = [
    "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or traini     ng for a marathon. Check out the chart below to see how much protein you should be eating each day.",
]

# Tokenize to a fixed length: padding="max_length" always pads to 512 tokens,
# so these tensors double as fixed-shape sample inputs for the conversion step.
inputs = complete_model.tokenizer(
    input_texts,
    max_length=512,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)

# The tokenizer returns a dict-like batch; show the shape of each entry.
print("Tokenizer's output:")
for field_name, field_tensor in inputs.items():
    print(f"{field_name}: {field_tensor.shape}\n")


print ("Running the Pytorch Embeddings Neural Network program...")
print("\n\n")

# Sanity-check what is fed to the model: dtypes first, then shapes
# (both tensors are expected to be torch.int64 with shape [1, 512]).
ids_pt = inputs['input_ids']
mask_pt = inputs['attention_mask']
for tensor_attr in ("dtype", "shape"):
    print(getattr(ids_pt, tensor_attr))
    print(getattr(mask_pt, tensor_attr))

# Reference run of the wrapper in plain PyTorch (no gradients needed).
with torch.no_grad():
    sentence_embedding_pytorch = complete_model(inputs["input_ids"], inputs["attention_mask"])

# Shapes of the tensors that will be handed to the converter as sample inputs.
print("Export-time input shape:", inputs['input_ids'].shape)
print("Export-time attention shape:", inputs['attention_mask'].shape)

# Convert the wrapper to TFLite with ai_edge_torch and write it to disk.
# A failure is reported and then re-raised: everything below needs
# `edge_model`, so continuing would either hit a NameError or silently reuse
# a stale `edge_model` left in the kernel by an earlier run.
try:
    #TODO AD: note that it's possible to do FP16 quantization here - but it's not done by default
    edge_model = ai_edge_torch.convert(
        complete_model, (inputs['input_ids'], inputs['attention_mask'])
    )

    # Announce the export before it starts, not after it has finished.
    print(f"Exporting to {tflite_model_path}...")
    edge_model.export(tflite_model_path)

    # Verify the file was created; a missing file is a failure, not a success.
    if not os.path.exists(tflite_model_path):
        raise FileNotFoundError(f"TFLite file was not created: {tflite_model_path}")
    file_size = os.path.getsize(tflite_model_path)
    print(f"TFLite model exported successfully: {file_size} bytes")

    print("Direct TFLite conversion successful!")
except Exception as e:
    print(f"TFLiteConversion failed: {e}")
    raise


print("Running the TFLite Neural Network program...")
print("\n\n")

# The converted model is fed exactly the tensors used as sample inputs above
# (expected: torch.int64, shape [1, 512]).
print(inputs['input_ids'].dtype)
print(inputs['attention_mask'].dtype)
print(inputs['input_ids'].shape)
print(inputs['attention_mask'].shape)


# Run the converted model by calling it like a function.
# NOTE(review): no_grad is kept from the original; presumably the TFLite
# runtime does not need it - confirm before removing.
with torch.no_grad():
    sentence_embedding_tflite = edge_model(inputs['input_ids'], inputs['attention_mask'])
    print("Edge model output shape:", sentence_embedding_tflite.shape)
    print("Edge model output:", sentence_embedding_tflite)

# Compare the three embeddings via mean absolute element-wise difference.
# The PyTorch result is a torch tensor (hence .numpy()); the TFLite result and
# sentence_embedding_hf (from the reference cell) are already numpy arrays.
print("CONVERSION SUMMARY")
print("HF Transformers: Sentence Embedding shape:", sentence_embedding_hf.shape)  # shape: (1, hidden_size)
print("PYTORCH: Sentence Embedding shape:", sentence_embedding_pytorch.shape)  # shape: (1, hidden_size)
print("TFLite:  Sentence Embedding shape:", sentence_embedding_tflite.shape)
print("TFLite - HF Transformers Mean Difference:", np.mean(np.abs(sentence_embedding_tflite - sentence_embedding_hf)))
print("PYTORCH - HF Transformers Mean Difference:", np.mean(np.abs(sentence_embedding_pytorch.numpy() - sentence_embedding_hf)))
print("PYTORCH - TFLite Mean Difference:", np.mean(np.abs(sentence_embedding_pytorch.numpy() - sentence_embedding_tflite)))




Tokenizer's output:
input_ids: torch.Size([1, 512])

attention_mask: torch.Size([1, 512])

Running the Pytorch Embeddings Neural Network program...



torch.int64
torch.int64
torch.Size([1, 512])
torch.Size([1, 512])
Export-time input shape: torch.Size([1, 512])
Export-time attention shape: torch.Size([1, 512])
INFO:tensorflow:Assets written to: /var/folders/2v/7m66d7dj3q71w4mx7vh6xgf40000gn/T/tmpjjllzqq9/assets


INFO:tensorflow:Assets written to: /var/folders/2v/7m66d7dj3q71w4mx7vh6xgf40000gn/T/tmpjjllzqq9/assets
W0000 00:00:1749452264.342669 19994713 tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
W0000 00:00:1749452264.342691 19994713 tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.


Exporting to ./e5_embedding_model.tflite...
TFLite model exported successfully: 470394716 bytes
Direct TFLite conversion successful!
Running the TFLite Neural Network program...



torch.int64
torch.int64
torch.Size([1, 512])
torch.Size([1, 512])
Edge model output shape: (1, 384)
Edge model output: [[-0.02603654 -0.04028143 -0.04070158 -0.05670586  0.09808024 -0.00689328  0.00147928  0.04843687  0.11323302 -0.02740473 -0.00866385  0.00417222  0.05230613 -0.04768719 -0.0660314   0.08920392
   0.06334058 -0.05311074  0.00967995 -0.10727999 -0.00384988 -0.02598386 -0.00927937  0.07551409  0.06361946  0.01656306  0.04170508  0.0173052   0.01455698 -0.04344153 -0.05670442 -0.04429423
   0.0714488  -0.03361628  0.04803939 -0.00959461 -0.08393559 -0.04850765  0.05855655 -0.05139392  0.01839353  0.05547389  0.00980082  0.04608277  0.02681228  0.07292698 -0.06347436  0.05774029
   0.00521456 -0.02235041 -0.04456339  0.06401615  0.02014327  0.045036    0.07350691 -0.04566626 -0.01399923 -0.04260