In [25]:
#REFERENCE EMBEDDING OUTPUT FROM ORIGINAL MODEL EXECUTION USING TRANSFORMERS LIBRARY
#Run this part first...
import torch
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
import numpy as np

#https://huggingface.co/docs/hub/datasets-usage
#https://huggingface.co/datasets/rag-datasets/rag-mini-bioasq
from datasets import load_dataset
from datasets import load_dataset_builder


# AD NOTE: sentence_transformers was dropped because of a keras incompatibility.
# The same output as sentence-transformers is obtained by applying the mean
# pooling method to the raw transformers model output.

# Print options: 8 decimals, no scientific notation, wide lines, so that the
# embeddings printed by torch and numpy use the same layout.
torch.set_printoptions(precision=8, sci_mode=False, linewidth=200, threshold=1000)
np.set_printoptions(precision=8, suppress=True, linewidth=200, threshold=1000)

# Show the column schema of the "text-corpus" configuration.
ds_builder = load_dataset_builder("rag-datasets/rag-mini-bioasq", "text-corpus")
print(ds_builder.info.features)

# Load the corpus and keep only its "passages" split.
corpus = load_dataset("rag-datasets/rag-mini-bioasq", "text-corpus")
print(corpus)
corpus = corpus["passages"]


# Load model and tokenizer
model_name = 'intfloat/multilingual-e5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

# Get first 100 passages from the corpus dataset, each prefixed with "passage: ".
# Rows are sliced first and the column selected second, so only 100 rows are
# read instead of materializing the whole "passage" column and then slicing it.
input_texts = [
    "passage: " + text for text in corpus[:100]["passage"]
]

print("Sample passages:")
for i, text in enumerate(input_texts):
    print(f"\nPassage {i+1}:")
    # Long passages are cut to 200 characters followed by "..."; short ones print whole.
    print((text[:200] + "...") if len(text) > 200 else text)


# input_texts = [
#     "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or traini     ng for a marathon. Check out the chart below to see how much protein you should be eating each day.",
#     "passage: CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or traini     ng for a marathon. Check out the chart below to see how much protein you should be eating each day.",
#     "passage: protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or traini     ng for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    
# ]

# Tokenize the whole batch in one call. With padding=True the sequences are
# padded to the longest sequence of this batch; truncation=True together with
# max_length=512 caps every sequence at 512 tokens.
inputs = tokenizer(
    input_texts,
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
print(inputs)

# Get embeddings (no gradients are needed for inference)
with torch.no_grad():
    outputs = model(**inputs)

    # AD: the model output is one vector per token, shaped
    # [batch_size, padded_sequence_length, hidden_size].
    # To obtain a single vector per passage:
    #   1. expand the attention mask (1 for real tokens, 0 for padding tokens,
    #      shaped [batch_size, padded_sequence_length]) to the shape of the
    #      token embeddings,
    #   2. zero out the padding positions and average over the sequence axis
    #      (mean pooling),
    #   3. scale the result to unit length (L2 norm).
    token_embeddings = outputs.last_hidden_state
    attention_mask = inputs['attention_mask']

    # Step 1: mask broadcast to the token-embedding shape, as float for the arithmetic.
    attention_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

    # Step 2: masked sum over the sequence axis divided by the number of real
    # tokens; the clamp guards against a division by zero.
    masked_sum = torch.sum(token_embeddings * attention_mask_expanded, 1)
    token_counts = torch.clamp(attention_mask_expanded.sum(1), min=1e-9)
    sentence_embedding = masked_sum / token_counts

    # Step 3: L2-normalize each passage embedding.
    sentence_embedding = F.normalize(sentence_embedding, p=2, dim=1)

# Convert to numpy for same output format as sentence-transformers
sentence_embedding_hf = sentence_embedding.numpy()

print(sentence_embedding_hf.shape)
print(sentence_embedding_hf)



{'passage': Value(dtype='string', id=None), 'id': Value(dtype='int64', id=None)}
Sample passages:

Passage 1:
passage: New data on viruses isolated from patients with subacute thyroiditis de Quervain 
are reported. Characteristic morphological, cytological, some physico-chemical 
and biological features of th...

Passage 2:
passage: We describe an improved method for detecting deficiency of the acid hydrolase, 
alpha-1,4-glucosidase in leukocytes, the enzyme defect in glycogen storage 
disease Type II (Pompe disease). Th...

Passage 3:
passage: We have studied the effects of curare on responses resulting from iontophoretic 
application of several putative neurotransmitters onto Aplysia neurons. These 
neurons have specific receptors...

Passage 4:
passage: Kinetic and electrophoretic properties of 230--300 fold purified preparations of 
glucose-6-phosphate dehydrogenase (G6PD) from red cells of donors and patients 
with acute drug hemolytic ane...

Passage 5:
passage: Male Wistar speci

In [26]:
#COREML CONVERSION AND EXECUTION OF CONVERTED COREML MODEL
#Run this part after first part... 

import torch
from torch.fx import symbolic_trace
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
import numpy as np
import os 
import coremltools as ct
from torch.fx import symbolic_trace

# Location the converted Core ML package is written to.
coreml_model_path = "./e5_embedding_model.mlpackage"

# Identical print layout for torch and numpy: 8 decimals, wide lines, no
# scientific notation (sci_mode=False for torch, suppress=True for numpy).
_print_opts = dict(precision=8, linewidth=200, threshold=1000)
torch.set_printoptions(sci_mode=False, **_print_opts)
np.set_printoptions(suppress=True, **_print_opts)
class E5EmbeddingModel(nn.Module):
    """Encoder wrapper whose forward pass returns one unit-length vector per sequence.

    The wrapped model's per-token output is mean-pooled over the non-padding
    tokens and L2-normalized, so the whole pipeline lives in a single module.
    The matching tokenizer is kept on the instance as ``self.tokenizer``.
    """

    def __init__(self, model_name):
        """Load the pretrained model and tokenizer identified by *model_name*."""
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Return L2-normalized, attention-masked mean-pooled embeddings.

        Args:
            input_ids: token ids, passed through to the wrapped model.
            attention_mask: [batch_size, sequence_length] tensor that is
                non-zero (1) for real tokens and 0 for padding tokens.

        Returns:
            Tensor of shape [batch_size, hidden_size]; each row has unit L2
            norm (an all-padding row comes out as zeros).
        """
        encoder_output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Per-token vectors: [batch_size, sequence_length, hidden_size], padding included.
        hidden_states = encoder_output.last_hidden_state
        # Broadcast the mask over the hidden dimension so padded positions contribute zero.
        weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        masked_total = (hidden_states * weights).sum(dim=1)
        # Number of real tokens per sequence; the clamp avoids a division by zero.
        token_count = torch.clamp(weights.sum(dim=1), min=1e-9)
        return nn.functional.normalize(masked_total / token_count, p=2, dim=1)

# Build the wrapper module (encoder + mean pooling + L2 normalization) and
# put it in eval mode before running and tracing it.
complete_model = E5EmbeddingModel('intfloat/multilingual-e5-small')
complete_model.eval()

# Tokenize the passages. padding="max_length" pads every sequence to
# max_length (512) tokens, so the tensor shapes printed below are
# [number_of_passages, 512].
inputs = complete_model.tokenizer(
    input_texts,
    max_length=512,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)
# The tokenizer output is a dictionary of tensors (input_ids, attention_mask).
print("Tokenizer's output:")
for key, value in inputs.items():
    print(f"{key}: {value.shape}\n")


print ("Running the Pytorch Embeddings Neural Network program...")
print("\n\n")

# dtype of both inputs first (should be torch.int64), then their shapes
# (both should be identical).
for tensor_name in ("input_ids", "attention_mask"):
    print(inputs[tensor_name].dtype)
for tensor_name in ("input_ids", "attention_mask"):
    print(inputs[tensor_name].shape)

# Positional example inputs, in the order forward() expects them.
example_inputs = (inputs['input_ids'], inputs['attention_mask'])

# Generate the embeddings with the PyTorch wrapper (no gradients needed).
with torch.no_grad():
    sentence_embedding_pytorch = complete_model(*example_inputs)

# Trace the wrapper with the same example inputs.
traced_pytorch_model = torch.jit.trace(complete_model, example_inputs)

# Uncomment to inspect the PyTorch embeddings:
# print("PYTORCH: Embedding shape:", sentence_embedding_pytorch.shape)
# print("PYTORCH: Embedding:", sentence_embedding_pytorch)

print("Export-time input shape:", inputs['input_ids'].shape)
print("Export-time attention shape:", inputs['attention_mask'].shape)


# Convert the traced model to Core ML and save it. Any failure is reported and
# leaves coreml_model set to None so the inference step below can be skipped
# (deliberate best-effort at the top level of the notebook cell).
try:
    # Convert PyTorch model to CoreML
    # Note: For transformer models, you might need to specify compute_precision for optimization
    # NOTE(review): both input shapes are copied from the example batch, so the
    # exported model is declared with exactly that [batch, sequence] shape —
    # confirm this fixed batch size is what the consumer of the model expects.
    coreml_model = ct.convert(
        traced_pytorch_model,
        source="pytorch",
        inputs=[
            # Inputs are declared as int32; the inference step casts the
            # int64 tokenizer tensors accordingly.
            ct.TensorType(name="input_ids", shape=inputs['input_ids'].shape, dtype=np.int32),
            ct.TensorType(name="attention_mask", shape=inputs['attention_mask'].shape, dtype=np.int32),
        ],
        outputs=[ct.TensorType(name="sentence_embedding")],
        compute_precision=ct.precision.FLOAT32,  # Can use FLOAT16 for smaller model size
    )

    # Save CoreML model
    coreml_model.save(coreml_model_path)
    print(f"CoreML model saved to: {coreml_model_path}")

    # Verify model was created; success is only announced when the package
    # is actually present on disk (previously it was printed unconditionally,
    # even right after the ERROR line).
    if os.path.exists(coreml_model_path):
        print("CoreML model exported successfully")
        print("CoreML conversion successful!")
    else:
        print("ERROR: CoreML model was not created")

except Exception as e:
    print(f"CoreML conversion failed: {e}")
    coreml_model = None

print("Running the CoreML Neural Network program...")
print("\n\n")

# dtype of both tokenizer tensors first (should be torch.int64), then their
# shapes (both should be identical).
for tensor_name in ("input_ids", "attention_mask"):
    print(inputs[tensor_name].dtype)
for tensor_name in ("input_ids", "attention_mask"):
    print(inputs[tensor_name].shape)

if coreml_model is None:
    print("Skipping CoreML inference due to conversion failure")
else:
    try:
        # The converted model declares int32 inputs, so cast the int64
        # tokenizer tensors before handing them to CoreML.
        coreml_inputs = {
            tensor_name: inputs[tensor_name].numpy().astype(np.int32)
            for tensor_name in ("input_ids", "attention_mask")
        }

        # Run inference with CoreML model
        coreml_prediction = coreml_model.predict(coreml_inputs)
        sentence_embedding_coreml = coreml_prediction["sentence_embedding"]

        print("CoreML model output shape:", sentence_embedding_coreml.shape)
        print("CoreML model output:", sentence_embedding_coreml)

        # Comparison summary against the reference embeddings
        # (sentence_embedding_hf) computed in the first cell; both arrays
        # should have the same [number_of_passages, hidden_size] shape.
        print("CONVERSION SUMMARY")
        print("HF Transformers: Sentence Embedding shape:", sentence_embedding_hf.shape)
        print("CoreML:  Sentence Embedding shape:", sentence_embedding_coreml.shape)
        mean_abs_diff = np.mean(np.abs(sentence_embedding_coreml - sentence_embedding_hf))
        print("CoreML - HF Transformers Mean Difference:", mean_abs_diff)

    except Exception as e:
        print(f"CoreML inference failed: {e}")

print ("If model is verified properly: now you need to manually compile the model using the following commands to complete the conversion and copy to prebuilt/models folder")
#TODO AD automate this: now you need to compile the model and generate the objective c headers using the following commands
# The commands are built from coreml_model_path so they always point at the
# package this notebook actually saved (the previously hard-coded
# "e5_embedding_model_i512a512_FP32.mlpackage" did not match the saved file name).
mlmodel_path = f"{coreml_model_path}/Data/com.apple.CoreML/model.mlmodel"
print(f"xcrun coremlc compile {mlmodel_path} .")
print(f"xcrun coremlc generate {mlmodel_path} .")

Tokenizer's output:
input_ids: torch.Size([100, 512])

attention_mask: torch.Size([100, 512])

Running the Pytorch Embeddings Neural Network program...



torch.int64
torch.int64
torch.Size([100, 512])
torch.Size([100, 512])




Export-time input shape: torch.Size([100, 512])
Export-time attention shape: torch.Size([100, 512])


Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 567/568 [00:00<00:00, 5063.41 ops/s]
Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 140.31 passes/s]
Running MIL default pipeline: 100%|██████████| 87/87 [00:00<00:00, 99.57 passes/s] 
Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 180.97 passes/s]


CoreML model saved to: ./e5_embedding_model.mlpackage
CoreML model exported successfully
CoreML conversion successful!
Running the CoreML Neural Network program...



torch.int64
torch.int64
torch.Size([100, 512])
torch.Size([100, 512])
CoreML model output shape: (100, 384)
CoreML model output: [[ 0.0800216  -0.01421368 -0.08343757 ...  0.03563565  0.0520995   0.05273295]
 [ 0.09403674 -0.00406726 -0.07337061 ...  0.06022897  0.00974567  0.04333089]
 [ 0.07225356 -0.01599408 -0.04516048 ...  0.03383964  0.05420528  0.04943083]
 ...
 [ 0.09703236 -0.03234687 -0.06825696 ...  0.02683572  0.05403959  0.04884185]
 [ 0.0545549  -0.06540271 -0.05851531 ...  0.01909437  0.06365164  0.03874499]
 [ 0.06400624 -0.04059309 -0.06402634 ...  0.06717853  0.04043268  0.06252333]]
CONVERSION SUMMARY
HF Transformers: Sentence Embedding shape: (100, 384)
CoreML:  Sentence Embedding shape: (100, 384)
CoreML - HF Transformers Mean Difference: 3.3987803e-08
If model is verified properly: now you need to ma