In [3]:
from sentence_transformers import SentenceTransformer
# Reference run: embed the examples with sentence-transformers so the
# hand-rolled pipeline can be compared against these values.
model = SentenceTransformer("intfloat/multilingual-e5-small")

# Two "query: " and two "passage: " inputs (English and Chinese).
# NOTE(review): both passages contain stray runs of spaces (e.g. "i     s",
# "traini     ng") that look like line-wrap artifacts from copy-paste. They are
# kept verbatim because the text is what gets embedded -- confirm whether they
# are intended.
input_texts = [
    "query: how much protein should a female eat",
    "query: 南瓜的家常做法",
    "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 i     s 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or traini     ng for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "passage: 1.清炒南瓜丝 原料:嫩南瓜半个 调料:葱、盐、白糖、鸡精 做法: 1、南瓜用刀薄薄的削去表面一层皮     ,用勺子刮去瓤 2、擦成细丝(没有擦菜板就用刀慢慢切成细丝) 3、锅烧热放油,入葱花煸出香味 4、入南瓜丝快速翻炒一分钟左右,     放盐、一点白糖和鸡精调味出锅 2.香葱炒南瓜 原料:南瓜1只 调料:香葱、蒜末、橄榄油、盐 做法: 1、将南瓜去皮,切成片 2、油     锅8成热后,将蒜末放入爆香 3、爆香后,将南瓜片放入,翻炒 4、在翻炒的同时,可以不时地往锅里加水,但不要太多 5、放入盐,炒匀      6、南瓜差不多软和绵了之后,就可以关火 7、撒入香葱,即可出锅",
]

# Ask the library for normalized vectors, then show the shape followed by the
# values (one item per output line, same as two consecutive print calls).
embeddings = model.encode(input_texts, normalize_embeddings=True)
print(embeddings.shape, embeddings, sep="\n")



(4, 384)
[[ 0.04217593 -0.03877138 -0.05730907 ...  0.025732    0.03076709
   0.04966558]
 [ 0.05337083 -0.01525038 -0.03673498 ...  0.07745236  0.06234612
   0.01122018]
 [-0.02770222 -0.03813098 -0.03982561 ...  0.06744722  0.05041993
   0.05880518]
 [ 0.01656801 -0.00350602 -0.03791859 ...  0.06404015  0.07488152
   0.01880834]]


In [None]:
import torch
from torch.fx import symbolic_trace
import torch.nn as nn
import executorch.exir as exir
from executorch.extension.pybindings.portable_lib import _load_for_executorch
from transformers import AutoModel, AutoTokenizer

class E5EmbeddingModel(nn.Module):
    """Encoder wrapper that returns mask-weighted, L2-normalised mean embeddings.

    The tokenizer for the same checkpoint is loaded next to the encoder and
    exposed as ``self.tokenizer``; ``forward`` itself only consumes tensors.
    """

    def __init__(self, model_name):
        """Load the encoder and tokenizer identified by ``model_name``."""
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Embed a batch and return one unit-norm vector per row.

        Args:
            input_ids: token id tensor, passed straight to the wrapped encoder.
            attention_mask: mask tensor passed to the encoder and reused as the
                pooling weights, so positions with mask 0 add nothing to the
                average.

        Returns:
            The encoder's ``last_hidden_state`` averaged over dim 1 using the
            mask as weights, then L2-normalised along dim 1.
        """
        hidden = self.model(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state

        # Broadcast the mask to the hidden-state shape and cast to float so it
        # can be used as multiplicative weights.
        weights = attention_mask.unsqueeze(-1).expand_as(hidden).float()
        weighted_sum = (hidden * weights).sum(dim=1)

        # Clamp the denominator so an all-zero mask row does not divide by zero.
        weight_total = torch.clamp(weights.sum(dim=1), min=1e-9)

        return torch.nn.functional.normalize(weighted_sum / weight_total, p=2, dim=1)

# Build the pooling wrapper and put it in eval mode (eval() returns the module).
complete_model = E5EmbeddingModel('intfloat/multilingual-e5-small').eval()

# One "passage: " example, reused below as the example input for export.
# NOTE(review): the text contains stray runs of spaces ("i     s",
# "traini     ng") that look like copy-paste line-wrap artifacts; kept verbatim
# because they are part of what gets tokenized.
input_texts = [
    "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 i     s 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or traini     ng for a marathon. Check out the chart below to see how much protein you should be eating each day.",
]

# Tokenize. With padding="max_length" and max_length=512 every row is padded
# (or truncated) to 512 tokens, so the tensors have a fixed second dimension.
inputs = complete_model.tokenizer(
    input_texts,
    max_length=512,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)

# The tokenizer returns a dict-like batch; show the shape of each entry.
print("Tokenizer's output:")
for field_name, field_tensor in inputs.items():
    print(f"{field_name}: {field_tensor.shape}\n")

print("Running the Pytorch Embeddings Neural Network program...", "\n\n", sep="\n")

# Debug dump: both dtypes first, then both shapes.
for attr in ("dtype", "shape"):
    for field_name in ("input_ids", "attention_mask"):
        print(getattr(inputs[field_name], attr))

# Eager PyTorch forward pass; gradients are not needed for inference.
with torch.no_grad():
    embedding = complete_model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    )

# One pooled vector per input text.
print("PYTORCH: Embedding shape:", embedding.shape)
print("PYTORCH: Embedding:", embedding)

print("Export-time input shape:", inputs['input_ids'].shape)
print("Export-time attention shape:", inputs['attention_mask'].shape)

# Export to ExecuTorch.
# No dynamic_shapes are passed to torch.export, so the exported program is
# traced for the shapes of these example inputs (batch size and the 512-token
# padded length) -- feed the .pte inputs of the same shape.
with torch.no_grad():
    exported_program = torch.export.export(
        complete_model,
        (inputs['input_ids'], inputs['attention_mask'])
    )

# Print the exported program's graph
# print("Exported Program Graph:")
# print(exported_program.graph_module.graph)

# Lower: ATen graph -> Edge dialect -> ExecuTorch program.
edge_program = exir.to_edge(exported_program)
executorch_program = edge_program.to_executorch()

# Single source of truth for the artifact path (written here, loaded below).
PTE_PATH = "e5_complete.pte"
with open(PTE_PATH, "wb") as f:
    executorch_program.write_to_file(f)


print ("Exported to ExecuTorch successfully!")
print ("Running the Executorch Neural Network program...")
print("\n\n")

# Debug dump of what is fed to the runtime: both dtypes, then both shapes.
# These are the same tensors that were used as export-time example inputs.
for attr in ("dtype", "shape"):
    for field_name in ("input_ids", "attention_mask"):
        print(getattr(inputs[field_name], attr))

# Load the serialized program with the ExecuTorch Python runtime bindings.
# NOTE: this rebinds `model`, which the first cell uses for the
# SentenceTransformer instance; the name is kept for backward compatibility
# with any later cells.
model = _load_for_executorch(PTE_PATH)

with torch.no_grad():
    # forward() takes the inputs as one tuple and returns a sequence of
    # outputs; the pooled embedding is the first (and only expected) entry.
    embedding_et = model.forward((inputs['input_ids'], inputs['attention_mask']))[0]

# Label fixed: these values come from ExecuTorch, not from eager PyTorch.
print("EXECUTORCH: Embedding shape:", embedding_et.shape)
print("EXECUTORCH: Embedding:", embedding_et)

# Quantify the agreement between the two backends instead of eyeballing the
# truncated tensor printouts. `embedding` is the eager PyTorch result computed
# earlier in this cell from the same inputs.
max_abs_diff = (embedding_et - embedding).abs().max().item()
print(f"Max abs difference (ExecuTorch vs PyTorch): {max_abs_diff:.3e}")



Tokenizer's output:
input_ids: torch.Size([4, 512])

attention_mask: torch.Size([4, 512])

Running the Pytorch Embeddings Neural Network program...



torch.int64
torch.int64
torch.Size([4, 512])
torch.Size([4, 512])
PYTORCH: Embedding shape: torch.Size([4, 384])
PYTORCH: Embedding: tensor([[ 0.0422, -0.0388, -0.0573,  ...,  0.0257,  0.0308,  0.0497],
        [ 0.0534, -0.0153, -0.0367,  ...,  0.0775,  0.0623,  0.0112],
        [-0.0277, -0.0381, -0.0398,  ...,  0.0674,  0.0504,  0.0588],
        [ 0.0166, -0.0035, -0.0379,  ...,  0.0640,  0.0749,  0.0188]])
Export-time input shape: torch.Size([4, 512])
Export-time attention shape: torch.Size([4, 512])


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[program.cpp:136] InternalConsistency verification requested but not available


Exported to ExecuTorch successfully!
Running the Executorch Neural Network program...



torch.int64
torch.int64
torch.Size([4, 512])
torch.Size([4, 512])
EXECUTORCH: Embedding shape: torch.Size([4, 384])
PYTORCH: Embedding: tensor([[ 0.0422, -0.0388, -0.0573,  ...,  0.0257,  0.0308,  0.0497],
        [ 0.0534, -0.0153, -0.0367,  ...,  0.0775,  0.0623,  0.0112],
        [-0.0277, -0.0381, -0.0398,  ...,  0.0674,  0.0504,  0.0588],
        [ 0.0166, -0.0035, -0.0379,  ...,  0.0640,  0.0749,  0.0188]])
