In [2]:
from sentence_transformers import SentenceTransformer

In [3]:
# Four short example sentences; both the dense and the sparse encoder below embed this list.
sentences = [
    "That is a happy person",       # baseline sentence
    "That is a happy dog",          # same template, different subject
    "That is a very happy person",  # near-paraphrase of the baseline
    "Today is a sunny day",         # unrelated topic
]

## Embeddings

In [4]:
# Load the pretrained dense sentence-embedding checkpoint by its model id.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [5]:
# Encode every sentence in one call; the result displayed further down is a
# float32 array of shape (4, 384), i.e. one row per input sentence.
embeddings = model.encode(sentences)

  return forward_call(*args, **kwargs)


In [7]:
# Bare last expression so the notebook renders the array; its repr elides the
# middle columns with "..." rather than printing every value.
embeddings

array([[-0.03387694,  0.0919416 ,  0.04870139, ..., -0.01439267,
        -0.02754978,  0.04475824],
       [ 0.00504997,  0.06316976,  0.01415726, ...,  0.04035439,
         0.07584126,  0.09087352],
       [-0.00248314,  0.091517  ,  0.04838616, ..., -0.0264111 ,
        -0.07529819,  0.02803203],
       [-0.01629126,  0.10406609,  0.09740777, ...,  0.00676729,
        -0.08788462,  0.03404385]], shape=(4, 384), dtype=float32)

In [8]:
# Score every sentence embedding against every other one; the result shown
# below is a 4x4 tensor with 1.0 on the diagonal (each sentence vs. itself).
# NOTE(review): presumably cosine similarity (the model's default) — confirm.
similarities = model.similarity(embeddings, embeddings)

In [9]:
# Render the pairwise similarity matrix; row/column i corresponds to sentences[i].
similarities

tensor([[1.0000, 0.6946, 0.9429, 0.2569],
        [0.6946, 1.0000, 0.6211, 0.2491],
        [0.9429, 0.6211, 1.0000, 0.2106],
        [0.2569, 0.2491, 0.2106, 1.0000]])

## Reranker
1. A reranker (Cross Encoder) calculates a similarity score for a given pair of texts.
2. Generally provides superior performance compared to a Sentence Transformer (a.k.a. bi-encoder) model.
3. Often slower than a Sentence Transformer model, as it requires computation for each pair rather than each text.
4. Because of the previous two characteristics (better accuracy but lower speed), Cross Encoders are often used to re-rank the top-k results from a Sentence Transformer model.

In [10]:
from sentence_transformers import CrossEncoder

In [12]:
# Load the pretrained cross-encoder (reranker) checkpoint by its model id.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2")

In [13]:
# Search query whose relevance to each candidate passage will be scored.
query = "How many people live in Berlin?"

# Candidate passages: all mention Berlin, but only the first one states its population.
# Long literals are wrapped with adjacent-string concatenation; the values are unchanged.
passages = [
    (
        "Berlin had a population of 3,520,031 registered inhabitants "
        "in an area of 891.82 square kilometers."
    ),
    (
        "Berlin has a yearly total of about 135 million day visitors, "
        "making it one of the most-visited cities in the European Union."
    ),
    (
        "In 2013 around 600,000 Berliners were registered in one of "
        "the more than 2,300 sport and fitness clubs."
    ),
]

In [15]:
# Pair the query with each candidate passage, then score all pairs in one call;
# the output holds one score per pair, in the same order as `passages`.
query_passage_pairs = [(query, text) for text in passages]
scores = cross_encoder.predict(query_passage_pairs)
scores

  return forward_call(*args, **kwargs)


array([8.607141 , 5.506264 , 6.3529844], dtype=float32)

The `rank` method computes the same scores in a single call and returns the passages sorted from highest to lowest score.

In [16]:
# Score the passages against the query and get them back ordered by relevance:
# the output below is a list of dicts sorted by descending 'score', where
# 'corpus_id' is the passage's index in `passages`.
rank = cross_encoder.rank(query, passages)
rank

  return forward_call(*args, **kwargs)


[{'corpus_id': 0, 'score': np.float32(8.607141)},
 {'corpus_id': 2, 'score': np.float32(6.3529844)},
 {'corpus_id': 1, 'score': np.float32(5.506264)}]

## Sparse Encoders
1. Calculates sparse vector representations in which most dimensions are zero.
2. Provides efficiency benefits for large-scale retrieval systems due to the sparse nature of the embeddings.
3. Often more interpretable than dense embeddings, with non-zero dimensions corresponding to specific tokens.
4. Complementary to dense embeddings, enabling hybrid search systems that combine the strengths of both approaches.

In [17]:
from sentence_transformers import SparseEncoder

In [18]:
# Load the pretrained sparse encoder checkpoint by its model id.
# NOTE: this rebinds `model`, replacing the dense SentenceTransformer loaded earlier.
model = SparseEncoder("naver/splade-cocondenser-ensembledistil")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [19]:
# Encode the same example sentences with the sparse encoder.
# NOTE: this rebinds `embeddings`, so the dense array computed earlier is no
# longer reachable; earlier cells re-run after this point would see the sparse values.
embeddings = model.encode(sentences)

  return forward_call(*args, **kwargs)


In [20]:
# Render the sparse embeddings; the repr is a sparse tensor that lists its stored indices.
# NOTE(review): that listing makes the cell output very long — consider showing
# a compact summary such as `embeddings.shape` instead.
embeddings

tensor(indices=tensor([[    0,     0,     0,     0,     0,     0,     0,     0,
                            0,     0,     0,     0,     0,     0,     0,     0,
                            0,     0,     0,     0,     0,     0,     0,     0,
                            0,     0,     0,     0,     0,     0,     0,     0,
                            0,     0,     0,     0,     0,     0,     0,     0,
                            0,     0,     0,     0,     0,     0,     0,     0,
                            0,     0,     0,     0,     0,     0,     0,     0,
                            0,     0,     0,     0,     0,     0,     0,     0,
                            0,     0,     0,     0,     0,     0,     0,     0,
                            0,     0,     0,     0,     0,     0,     0,     0,
                            0,     0,     0,     0,     1,     1,     1,     1,
                            1,     1,     1,     1,     1,     1,     1,     1,
                            1,     1,   