In [7]:
#REFERENCE EMBEDDING OUTPUT FROM ORIGINAL MODEL EXECUTION USING TRANSFORMERS LIBRARY
import torch
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
import numpy as np
# AD NOTE: I had to drop sentence_transformers because of a Keras incompatibility,
# but I was able to get the same output as sentence-transformers by using the mean pooling method.
# Print settings shared by torch and numpy: 8 decimal places, 200-character
# lines, and summarization only above 1000 elements. Scientific notation is
# switched off in both libraries (sci_mode=False / suppress=True).
shared_print_opts = dict(precision=8, linewidth=200, threshold=1000)
torch.set_printoptions(sci_mode=False, **shared_print_opts)
np.set_printoptions(suppress=True, **shared_print_opts)

# Load the pretrained checkpoint and its tokenizer; eval() switches the model
# to inference mode and returns the same module, so it can be chained.
model_name = 'intfloat/multilingual-e5-small'
model = AutoModel.from_pretrained(model_name).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# One example using the "passage: " prefix. The text (including its irregular
# spacing) is kept verbatim so the printed reference embedding stays reproducible.
input_texts = [
    "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or traini     ng for a marathon. Check out the chart below to see how much protein you should be eating each day.",
]

# Tokenize to PyTorch tensors. padding=True pads only up to the longest text in
# the batch; truncation caps every sequence at max_length tokens.
inputs = tokenizer(
    input_texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt',
)

# Compute the sentence embedding without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)
    # AD: last_hidden_state has shape [batch_size, sequence_length, hidden_size].
    # The tokenizer call above uses padding=True, so sequence_length is the length
    # of the longest sequence in the batch (at most max_length=512), not a fixed 512.
    # attention_mask has shape [batch_size, sequence_length] and holds 1 for real
    # tokens and 0 for padding tokens.
    # Steps: expand the mask to the embedding shape, zero out padded positions,
    # average over the sequence dimension (mean pooling), then L2-normalize.
    token_embeddings = outputs.last_hidden_state
    attention_mask = inputs['attention_mask']

    # Broadcast the mask across the hidden dimension so it lines up element-wise
    attention_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

    # Masked mean over the sequence dimension; the clamp guards against dividing
    # by zero when a row contains no real tokens
    masked_sum = (token_embeddings * attention_mask_expanded).sum(dim=1)
    token_counts = attention_mask_expanded.sum(dim=1).clamp(min=1e-9)
    sentence_embedding = masked_sum / token_counts

    # Scale each embedding to unit L2 norm
    sentence_embedding = F.normalize(sentence_embedding, p=2, dim=1)

# NumPy copy, i.e. the same output format as sentence-transformers
sentence_embedding_hf = sentence_embedding.numpy()

print(sentence_embedding_hf.shape)
print(sentence_embedding_hf)



(1, 384)
[[-0.02603657 -0.04028142 -0.0407016  -0.0567059   0.09808024 -0.00689326  0.00147932  0.04843688  0.11323299 -0.02740479 -0.00866384  0.0041722   0.05230619 -0.04768711 -0.06603136  0.08920389
   0.06334062 -0.0531108   0.00967995 -0.10728    -0.00384981 -0.02598385 -0.00927944  0.07551413  0.06361946  0.01656309  0.04170515  0.01730528  0.01455702 -0.04344152 -0.05670433 -0.04429421
   0.07144866 -0.03361619  0.04803946 -0.00959462 -0.08393569 -0.04850754  0.05855654 -0.05139397  0.01839359  0.05547391  0.00980077  0.04608278  0.02681234  0.07292694 -0.06347434  0.05774028
   0.00521451 -0.0223504  -0.04456337  0.06401621  0.0201432   0.04503602  0.07350688 -0.04566628 -0.01399929 -0.04260228 -0.08010492 -0.05667777  0.06421689 -0.0662206  -0.01281161  0.00306563
   0.06230233  0.06887282 -0.02185547  0.02037258 -0.06924744 -0.05492327 -0.05856651  0.04827426  0.02585801 -0.04206208  0.07226781 -0.00066223  0.02808696 -0.04768767 -0.02578073 -0.0346549
  -0.05184536 -0.02213

In [11]:
#COREML CONVERSION AND EXECUTION OF CONVERTED COREML MODEL

import torch
from torch.fx import symbolic_trace
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
import numpy as np
import os 
import coremltools as ct
from torch.fx import symbolic_trace

# Destination of the exported Core ML package
coreml_model_path = "./e5_embedding_model.mlpackage"

# Print settings shared by torch and numpy: 8 decimal places, 200-character
# lines, summarization only above 1000 elements, no scientific notation.
shared_print_opts = dict(precision=8, linewidth=200, threshold=1000)
torch.set_printoptions(sci_mode=False, **shared_print_opts)
np.set_printoptions(suppress=True, **shared_print_opts)
class E5EmbeddingModel(nn.Module):
    """Encoder plus pooling head that returns one unit-length vector per input row.

    ``forward`` masks out padding positions, mean-pools the remaining token
    embeddings and L2-normalizes the result, so the full ids-to-embedding
    computation lives in a single module that can be traced as a whole.
    """

    def __init__(self, model_name):
        """Load the pretrained encoder and the matching tokenizer for ``model_name``."""
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Return L2-normalized, mask-aware mean-pooled embeddings.

        Args:
            input_ids: token ids, expected shape [batch_size, sequence_length].
            attention_mask: same shape as ``input_ids``; 1 marks a real token,
                0 marks padding.

        Returns:
            Tensor of shape [batch_size, hidden_size]. A row whose mask is all
            zeros yields a zero vector.
        """
        # Per-token embeddings (padding included): [batch, sequence, hidden]
        hidden_states = self.model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # Mask stretched over the hidden dimension, as float for arithmetic
        keep = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        # Sum of the non-padding token embeddings per sequence
        masked_total = (hidden_states * keep).sum(dim=1)
        # Number of non-padding tokens; the clamp avoids division by zero
        token_count = torch.clamp(keep.sum(dim=1), min=1e-9)
        return nn.functional.normalize(masked_total / token_count, p=2, dim=1)

# Create the complete model
complete_model = E5EmbeddingModel('intfloat/multilingual-e5-small')
complete_model.eval()


# One example using the "passage: " prefix. The text (including its irregular
# spacing) is kept verbatim so results stay comparable across runs.
input_texts = [
    "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or traini     ng for a marathon. Check out the chart below to see how much protein you should be eating each day.",
]

# 3. Tokenize the text. padding="max_length" together with max_length=512
# pads/truncates every sequence to exactly 512 tokens, giving fixed-shape tensors.
tokenizer_options = {
    "return_tensors": "pt",
    "padding": "max_length",
    "truncation": True,
    "max_length": 512,
}
inputs = complete_model.tokenizer(input_texts, **tokenizer_options)
# The tokenizer returns a dict-like object (input_ids, attention_mask);
# list each entry together with its tensor shape.
print("Tokenizer's output:")
for field_name, tensor in inputs.items():
    print(f"{field_name}: {tensor.shape}\n")


print ("Running the Pytorch Embeddings Neural Network program...")
print("\n\n")

# Sanity check of the model inputs: both dtypes first (torch.int64 expected),
# then both shapes (torch.Size([1, 512]) for one text padded to max_length=512).
for attribute in ("dtype", "shape"):
    for field_name in ("input_ids", "attention_mask"):
        print(getattr(inputs[field_name], attribute))

# 4. Generate the embedding with the eager PyTorch module (no gradient tracking)
example_inputs = (inputs["input_ids"], inputs["attention_mask"])
with torch.no_grad():
    sentence_embedding_pytorch = complete_model(*example_inputs)

# 5. Trace the module with the same example tensors; the traced module is what
# gets handed to the Core ML converter.
traced_pytorch_model = torch.jit.trace(complete_model, example_inputs)

# Optional debugging output for the eager PyTorch embedding:
# print("PYTORCH: Embedding shape:", sentence_embedding_pytorch.shape)  # shape: (1, hidden_size)
# print("PYTORCH: Embedding:", sentence_embedding_pytorch)

# Shapes of the example tensors used for tracing/export
print("Export-time input shape:", example_inputs[0].shape)
print("Export-time attention shape:", example_inputs[1].shape)


try:
    # Convert the traced PyTorch module to Core ML. Both inputs are declared as
    # int32 tensors with the exact shapes of the example tensors, and the single
    # output is named "sentence_embedding".
    # Note: for transformer models, compute_precision is worth choosing deliberately.
    coreml_model = ct.convert(
        traced_pytorch_model,
        source="pytorch",
        inputs=[
            ct.TensorType(name=input_name, shape=inputs[input_name].shape, dtype=np.int32)
            for input_name in ("input_ids", "attention_mask")
        ],
        outputs=[ct.TensorType(name="sentence_embedding")],
        compute_precision=ct.precision.FLOAT32,  # Can use FLOAT16 for smaller model size
    )

    # Write the converted model to disk
    coreml_model.save(coreml_model_path)
    print(f"CoreML model saved to: {coreml_model_path}")

    # Confirm that the package actually exists at the target path
    export_status = (
        "CoreML model exported successfully"
        if os.path.exists(coreml_model_path)
        else "ERROR: CoreML model was not created"
    )
    print(export_status)

    print("CoreML conversion successful!")

except Exception as err:
    # Best effort: report the failure and mark the model as unavailable so the
    # inference step below can be skipped instead of crashing.
    print(f"CoreML conversion failed: {err}")
    coreml_model = None

print("Running the CoreML Neural Network program...")
print("\n\n")

# Same sanity check as before the conversion: both dtypes first (torch.int64
# expected), then both shapes (torch.Size([1, 512]) for the padded example).
for attribute in ("dtype", "shape"):
    for field_name in ("input_ids", "attention_mask"):
        print(getattr(inputs[field_name], attribute))

if coreml_model is None:
    print("Skipping CoreML inference due to conversion failure")
else:
    try:
        # Core ML inputs: the tokenizer tensors as int32 NumPy arrays
        # (int32 matches the input dtype declared at conversion time)
        coreml_inputs = {
            input_name: inputs[input_name].numpy().astype(np.int32)
            for input_name in ("input_ids", "attention_mask")
        }

        # Run the converted model and pick its named output
        coreml_prediction = coreml_model.predict(coreml_inputs)
        sentence_embedding_coreml = coreml_prediction["sentence_embedding"]

        print("CoreML model output shape:", sentence_embedding_coreml.shape)
        print("CoreML model output:", sentence_embedding_coreml)

        # Compare against sentence_embedding_hf, the reference embedding from the
        # HF Transformers cell (that cell must have been run beforehand).
        print("CONVERSION SUMMARY")
        print("HF Transformers: Sentence Embedding shape:", sentence_embedding_hf.shape)  # shape: (1, hidden_size)
        print("CoreML:  Sentence Embedding shape:", sentence_embedding_coreml.shape)
        embedding_delta = sentence_embedding_coreml - sentence_embedding_hf
        print("CoreML - HF Transformers Mean Difference:", np.mean(np.abs(embedding_delta)))

    except Exception as err:
        # Best effort: report the problem rather than aborting the notebook
        print(f"CoreML inference failed: {err}")




Tokenizer's output:
input_ids: torch.Size([1, 512])

attention_mask: torch.Size([1, 512])

Running the Pytorch Embeddings Neural Network program...



torch.int64
torch.int64
torch.Size([1, 512])
torch.Size([1, 512])




Export-time input shape: torch.Size([1, 512])
Export-time attention shape: torch.Size([1, 512])


Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 567/568 [00:00<00:00, 6504.61 ops/s]
Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 148.48 passes/s]
Running MIL default pipeline: 100%|██████████| 87/87 [00:00<00:00, 104.36 passes/s]
Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 162.84 passes/s]


CoreML model saved to: ./e5_embedding_model.mlpackage
CoreML model exported successfully
CoreML conversion successful!
Running the CoreML Neural Network program...



torch.int64
torch.int64
torch.Size([1, 512])
torch.Size([1, 512])
CoreML model output shape: (1, 384)
CoreML model output: [[-0.02603657 -0.04028145 -0.04070156 -0.0567059   0.09808026 -0.00689322  0.00147932  0.04843691  0.11323304 -0.02740478 -0.00866384  0.00417222  0.05230611 -0.04768718 -0.06603134  0.0892039
   0.06334062 -0.0531108   0.00967996 -0.10728001 -0.00384989 -0.02598388 -0.00927937  0.07551409  0.06361942  0.01656307  0.04170508  0.01730528  0.01455699 -0.04344153 -0.05670436 -0.04429425
   0.07144875 -0.03361621  0.04803944 -0.00959456 -0.08393563 -0.04850756  0.05855657 -0.05139393  0.01839356  0.05547391  0.0098008   0.0460828   0.02681229  0.07292693 -0.06347437  0.0577403
   0.00521456 -0.02235039 -0.04456335  0.06401617  0.02014329  0.04503596  0.0735069  -0.04566622 -0.01399922 -0.04260225 -0.08010