In [2]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from pathlib import Path

In [3]:
# Local directory (relative to the notebook's working directory) that the
# model is saved to and later reloaded from.
model_path = "../models/embeddings/mxbai-embed-large-v1"

In [4]:
def create_path_if_not_exist(path: str) -> None:
    """
    Create a directory path recursively if it does not exist.

    Prints whether the directory was created or was already present.

    Arguments:
    path: The directory path to create.

    Raises:
    FileExistsError: If `path` exists but is not a directory (for example a
        regular file), since it cannot be used as a directory.
    """
    directory = Path(path)

    # Check for a *directory*, not mere existence: a regular file at `path`
    # must not be reported as an existing directory.
    if directory.is_dir():
        print(f"Path '{path}' already exists.")
        return

    # exist_ok=True tolerates another process creating the directory between
    # the check above and this call; mkdir still raises FileExistsError when
    # `path` exists and is not a directory.
    directory.mkdir(parents=True, exist_ok=True)
    print(f"Path '{path}' created successfully.")

In [5]:
# Make sure the directory the model will be saved into exists.
create_path_if_not_exist(model_path)

Path '../models/embeddings/mxbai-embed-large-v1' created successfully.


In [6]:
# 1. Specify the preferred embedding dimensionality; it is passed to the model
#    as `truncate_dim` below.
dimensions = 1024
# 2. Load the model by its "mixedbread-ai/mxbai-embed-large-v1" identifier.
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1", truncate_dim=dimensions)

In [9]:
# For retrieval you need to pass this prompt.
def test_embeeding(model: SentenceTransformer) -> None:
    """
    Print the cosine similarity between a query and a few sample passages.

    The query (which carries the retrieval prompt prefix) is encoded together
    with four candidate passages in a single `model.encode` call. The first
    embedding is then compared against the remaining ones with `cos_sim`, and
    the result is printed.

    Arguments:
    model: The SentenceTransformer used to encode the texts.
    """
    query = 'Represent this sentence for searching relevant passages: A man is eating a piece of bread'
    passages = [
        "A man is eating food.",
        "A man is eating pasta.",
        "The girl is carrying a baby.",
        "A man is riding a horse.",
    ]

    # Encode the query first so that it ends up at index 0 of the result.
    vectors = model.encode([query, *passages])

    # Optional: the embeddings could be quantized at this point, e.g.
    # quantize_embeddings(vectors, precision="ubinary")

    query_vector, passage_vectors = vectors[0], vectors[1:]
    print('similarities:', cos_sim(query_vector, passage_vectors))

In [8]:
# Persist the loaded model into the local directory given by model_path.
model.save(path=model_path)

In [10]:
# Load the model from the local directory instead of by hub identifier.
# This rebinds `model`; note that no `truncate_dim` is passed here.
model = SentenceTransformer(model_path)

In [11]:
# Run the similarity check with the model loaded from model_path.
test_embeeding(model=model)

similarities: tensor([[0.7920, 0.6369, 0.1651, 0.3621]])
