In [3]:
# Optional one-time setup: uncomment the line below to install the two
# embedding libraries this notebook imports. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
# %pip install -U FlagEmbedding sentence_transformers

In [4]:
from FlagEmbedding import FlagModel

In [5]:
# Model identifier shared by both the FlagEmbedding and the
# Sentence-Transformers sections, so the two libraries load the same model.
model_name = "BAAI/bge-small-en-v1.5"

**Prepare data**

In [6]:
# Four short sentences used as embedding inputs throughout the notebook.
# The wording is kept exactly as-is: the recorded similarity values below
# were produced from these exact strings.
doc1, doc2, doc3, doc4 = (
    "I walked with the dog",
    "I feed the cat",
    "I ate burger to lunch",
    "I took a coffé and chocolate",
)

# Two batches of two documents each, used for the batch-vs-batch comparisons.
sentences_1 = [doc1, doc2]
sentences_2 = [doc3, doc4]

# Using FlagEmbedding

In [18]:
# Load the model through FlagEmbedding.
# A query instruction is configured here, but every cell in this notebook
# calls plain `model.encode(...)`.
# NOTE(review): the instruction is presumably only applied by FlagModel's
# query-specific encoding path, not by `encode` — confirm against the
# FlagEmbedding documentation.
model = FlagModel(
    model_name,
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
    use_fp16=True,
)

In [8]:
# Embed each document with its own encode() call, in the order doc1..doc4.
embeddings_1, embeddings_2, embeddings_3, embeddings_4 = (
    model.encode(text) for text in (doc1, doc2, doc3, doc4)
)

# (heading, left vector, right vector) for every pair that gets reported.
comparisons = (
    ("similarity of doc1 to doc2:", embeddings_1, embeddings_2),
    ("similarity of doc3 to doc4:", embeddings_3, embeddings_4),
    ("similarity of doc1 to doc3:", embeddings_1, embeddings_3),
    ("similarity of doc1 to doc4:", embeddings_1, embeddings_4),
)

# The similarity score is the product of the two embeddings. The recorded
# output shows one scalar per pair, so each embedding is assumed to be a
# single 1-D vector here (`.T` is then a no-op and is kept only to mirror the
# batch cells).
for heading, left, right in comparisons:
    similarity = left @ right.T
    print(heading)
    print(similarity)

similarity of doc1 to doc2:
0.6212703
similarity of doc3 to doc4:
0.69927436
similarity of doc1 to doc3:
0.6102983
similarity of doc1 to doc4:
0.6857992


In [9]:
# Encode each list of sentences as one batch, sentences_1 first.
embeddings_1, embeddings_2 = (
    model.encode(batch) for batch in (sentences_1, sentences_2)
)

# Entry [i, j] compares sentences_1[i] with sentences_2[j]; with two sentences
# per batch the recorded output is a 2 x 2 grid.
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

[[0.6102983  0.6857991 ]
 [0.64514744 0.6762895 ]]


# Using Sentence-Transformers

In [10]:
from sentence_transformers import SentenceTransformer

In [24]:
# Load the same model (`model_name`) through Sentence-Transformers.
# This rebinds `model`: every cell run after this one uses the
# SentenceTransformer instance instead of the FlagModel created earlier.
model = SentenceTransformer(model_name)

In [16]:
# Embed each document with its own encode() call, in the order doc1..doc4.
# normalize_embeddings=True requests normalized vectors, so the product
# computed below acts as a cosine similarity.
embeddings_1, embeddings_2, embeddings_3, embeddings_4 = (
    model.encode(text, normalize_embeddings=True)
    for text in (doc1, doc2, doc3, doc4)
)

# (heading, left vector, right vector) for every pair that gets reported.
comparisons = (
    ("similarity of doc1 to doc2:", embeddings_1, embeddings_2),
    ("similarity of doc3 to doc4:", embeddings_3, embeddings_4),
    ("similarity of doc1 to doc3:", embeddings_1, embeddings_3),
    ("similarity of doc1 to doc4:", embeddings_1, embeddings_4),
)

# The recorded output shows one scalar per pair, so each embedding is assumed
# to be a single 1-D vector here (`.T` is then a no-op and is kept only to
# mirror the batch cells).
for heading, left, right in comparisons:
    similarity = left @ right.T
    print(heading)
    print(similarity)

similarity of doc1 to doc2:
0.62127036
similarity of doc3 to doc4:
0.6992743
similarity of doc1 to doc3:
0.61029816
similarity of doc1 to doc4:
0.6857992


In [27]:
# Encode each list of sentences as one normalized batch, sentences_1 first.
embeddings_1, embeddings_2 = (
    model.encode(batch, normalize_embeddings=True)
    for batch in (sentences_1, sentences_2)
)

# Entry [i, j] compares sentences_1[i] with sentences_2[j]; with two sentences
# per batch the recorded output is a 2 x 2 grid.
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

[[0.6102983  0.6857992 ]
 [0.64514744 0.67628956]]


**LIST EMBEDDINGS**

In [28]:
# Put all four documents into a single batch and compare every document with
# every other one (including itself).
sentences = [
    doc1,
    doc2,
    doc3,
    doc4,
]
embeddings = model.encode(sentences, normalize_embeddings=True)

# Multiplying the embedding matrix by its own transpose gives a square matrix:
# entry [i, j] compares sentences[i] with sentences[j], so the diagonal holds
# each sentence's similarity with itself (about 1 in the recorded output).
similarity = embeddings @ embeddings.T
print(similarity)


[[1.0000001  0.6212704  0.61029834 0.6857991 ]
 [0.6212704  1.0000001  0.64514756 0.67628956]
 [0.61029834 0.64514756 1.         0.6992745 ]
 [0.6857991  0.67628956 0.6992745  1.        ]]


In [29]:
# Sentences expressing a positive attitude towards pizza, in varied wording
# and casing.
pizza_fans = [
    "I love pizza",
    "i love Pizza",
    "i have a passion for Pizza",
    "I like pizza",
    "Pizza is my favorite food",
    "I think pizza is yummy!",
    "I love eating pizza",
    "I love to eat pizza",
    "I like eating pizza",
    "I am obsessed with pizza",
    "I am addicted to pizza",
]

# One sentence on an unrelated topic, as a contrast case.
off_topic = [
    "Uranium-235 has a half-life of 703.8 million years",
]

# Sentences expressing a negative attitude towards pizza.
pizza_haters = [
    "I HATE pizza",
    "Pizza is disgusting!",
    "Pizza is a horrible food",
]

# Same 15 sentences in the same order as before: fans, off-topic, haters.
sentences = pizza_fans + off_topic + pizza_haters

embeddings = model.encode(sentences, normalize_embeddings=True)

# Square matrix: entry [i, j] compares sentences[i] with sentences[j].
# In the recorded output the first two rows are identical, i.e. "I love pizza"
# and "i love Pizza" received the same scores — presumably because the model
# ignores letter case (verify against the model's tokenizer).
similarity = embeddings @ embeddings.T
print(similarity)

[[1.0000002  1.0000002  0.91963625 0.9501734  0.9380901  0.8924688
  0.9479071  0.94965065 0.9204278  0.90476555 0.88284546 0.42250344
  0.77583843 0.71252453 0.73855567]
 [1.0000002  1.0000002  0.91963625 0.9501734  0.9380901  0.8924688
  0.9479071  0.94965065 0.9204278  0.90476555 0.88284546 0.42250344
  0.77583843 0.71252453 0.73855567]
 [0.91963625 0.91963625 1.         0.8910183  0.8995117  0.84936094
  0.88761234 0.90647984 0.87563735 0.923127   0.8867707  0.40718925
  0.7265849  0.6905863  0.731738  ]
 [0.9501734  0.9501734  0.8910183  1.         0.92236316 0.8956665
  0.91339517 0.9253816  0.95442617 0.86526144 0.8565476  0.43088847
  0.78160536 0.717241   0.7427681 ]
 [0.9380901  0.9380901  0.8995117  0.92236316 0.99999994 0.8922594
  0.92269695 0.9268213  0.9112665  0.891978   0.87388015 0.4465599
  0.7537708  0.7129973  0.76871884]
 [0.8924688  0.8924688  0.84936094 0.8956665  0.8922594  1.0000001
  0.8729282  0.8810704  0.8767743  0.8262595  0.8083968  0.42865402
  0.698357

In [None]:
# TODO
# Another example of calculating similarity
# https://www.architecture-performance.fr/ap_blog/using-a-local-sentence-embedding-model-for-similarity-calculation/