In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer

# Name of the pretrained checkpoint, kept in one place so it is easy to swap.
MODEL_NAME = "all-MiniLM-L6-v2"

# Instantiate the Sentence Transformer used below for encoding and similarity scoring.
model = SentenceTransformer(MODEL_NAME)

In [2]:
# Sample corpus; each sentence becomes one record in `dataset`.
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# Tag every sentence with its position in the list so it can be
# looked up by id later on.
dataset = [
    {'id': position, 'text': sentence}
    for position, sentence in enumerate(sentences)
]

In [3]:
# Show the assembled records (a list of {'id', 'text'} dicts) to confirm their structure.
print(dataset)

[{'id': 0, 'text': 'The weather is lovely today.'}, {'id': 1, 'text': "It's so sunny outside!"}, {'id': 2, 'text': 'He drove to the stadium.'}]


In [4]:
# Encode all texts with a single batched call instead of one call per record:
# `encode` accepts a list of sentences and returns one embedding row per
# input, in input order.
# Subscript access (rather than .get) makes a record that lacks 'text' or 'id'
# fail right here instead of passing None along.
texts = [record['text'] for record in dataset]
text_embeddings = model.encode(texts)

# Pair each record id with its embedding row; later cells iterate over
# this list of (id, embedding) tuples.
embeddings = [
    (record['id'], embedding)
    for record, embedding in zip(dataset, text_embeddings)
]

In [5]:
# Print the (id, embedding) pairs for inspection.
# NOTE(review): this writes every embedding vector in full; consider showing
# only the ids and array shapes to keep the cell output readable.
print(embeddings)

[(0, array([ 1.91957578e-02,  1.20085388e-01,  1.59598306e-01,  6.70659021e-02,
        5.00748232e-02, -2.59187259e-02,  5.64681925e-02, -9.28577706e-02,
       -3.76114286e-02,  6.32381253e-03, -4.28877324e-02,  4.02829377e-03,
        4.72778548e-03,  3.24676372e-02,  4.95197326e-02,  5.29818200e-02,
       -4.04454954e-02, -2.14837622e-02, -3.02760527e-02,  2.20857915e-02,
       -1.60775810e-01,  8.08077678e-02, -2.80131120e-02,  8.06255639e-02,
       -2.85814367e-02,  5.35818487e-02,  1.26382560e-02,  4.79190759e-02,
        5.71119180e-03, -3.25831026e-02, -2.61571202e-02,  8.00957680e-02,
        1.47315487e-02, -3.24082151e-02, -4.12551910e-02, -9.68343765e-03,
        5.00792288e-04, -1.56286791e-01, -6.77877441e-02,  4.88779582e-02,
        1.88976079e-02, -7.97722563e-02,  2.43868083e-02,  5.46695292e-03,
        1.10656917e-02, -7.77950697e-03, -2.20388398e-02,  3.50319780e-02,
        1.06080197e-01, -4.72456124e-03, -5.78185655e-02,  2.34565958e-02,
       -5.92035800e-

In [15]:
# Free-text query to compare against every stored embedding.
query = "The weather is lovely today"
query_embedding = model.encode(query)

# Each entry is (record id, similarity between that record's embedding
# and the query embedding), in the same order as `embeddings`.
results = [
    (record_id, model.similarity(record_embedding, query_embedding))
    for record_id, record_embedding in embeddings
]
results

[(0, tensor([[0.9903]])), (1, tensor([[0.6515]])), (2, tensor([[0.1066]]))]

In [10]:
# Order the (id, score) pairs from most to least similar, in place.
# Each score is a single-element tensor; converting it to a plain float gives
# the sort an ordinary scalar key instead of relying on the truthiness of
# tensor comparisons.
# NOTE(review): the saved execution counts show this cell was last run before
# the scoring cell that builds `results`; re-run the notebook top to bottom so
# the displayed ranking is actually produced by this sort.
results.sort(key=lambda scored: float(scored[1]), reverse=True)

In [16]:
# Index the records by id once, so each result is resolved with a dictionary
# lookup instead of rescanning `dataset` for every result.
# Iterating in reverse keeps the first record for any repeated id, which
# matches what a front-to-back scan would return.
records_by_id = {record['id']: record for record in reversed(dataset)}

# Print each result as: id, similarity score, full record.
for dataset_id, rank in results:
    # An id that is absent from `dataset` raises KeyError here.
    text = records_by_id[dataset_id]
    print(dataset_id, rank, text)

0 tensor([[0.9903]]) {'id': 0, 'text': 'The weather is lovely today.'}
1 tensor([[0.6515]]) {'id': 1, 'text': "It's so sunny outside!"}
2 tensor([[0.1066]]) {'id': 2, 'text': 'He drove to the stadium.'}
