In [69]:
from sentence_transformers import SentenceTransformer, util
import torch

model = SentenceTransformer('all-MiniLM-L6-v2')
embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [70]:
#sentences for encoding of the apple batch
sentences = [
            'The Batch of apples has class 3.',
            'Apple class is 3.',
            'Classification of the batch of apples is 3.'
            '3 is the classificationnumber.',
            'Batchclass is 3.',
            'The batch has 4 blotched apples.',
            '4 blotched appels are in this batch.',
            'You have 3 scabbed apples in this batch.',
            'The bast has 3 apples classified as beeing scab.',
            'There are 72 healthy apples.', 
            'This batch conssits 72 healthy apples.',
            '1 apple was rot.',
            'Only 1 apple was classified as beeing rot.',
            'The batch conssits between the 5.000 and 10.000 apples.',
            'Batch size is between 5.000 and 10.000.',
            '80 apples are beeing checked',
            'From this batch 80 apples where checked'
            ]

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

#Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    # print("Sentence:", sentence)
    # print("Embedding:", embedding)
    print("")



















<h2>Comparing Sentence Similarities</h2>

In [71]:
#Sentences are encoded by calling model.encode()
emb1 = model.encode("Whats the class of the Apple batch?")
emb2 = model.encode("Can you tell me the class of this batch?")
emb3 = model.encode("I would like to know the classification of this batch?")
cos_sim = util.cos_sim(emb2, emb3)
print("Cosine-Similarity:", cos_sim)

Cosine-Similarity: tensor([[0.8866]])


In [72]:
all_sentence_combinations = []
for i in range(len(cos_sim)-1):
    for j in range(i+1, len(cos_sim)):
        all_sentence_combinations.append([cos_sim[i][j], i, j])

In [92]:
# sentences 

corpus = [
            'The Batch of apples has class 3.',
            'Apple class is 3.',
            'Classification of the batch of apples is 3.',
            '3 is the classificationnumber.',
            'Batchclass is 3.',
            'The batch has 4 blotched apples.',
            '4 blotched appels are in this batch.',
            'You have 3 scabbed apples in this batch.',
            'The bast has 3 apples classified as beeing scab.',
            'There are 72 healthy apples.', 
            'This batch conssits 72 healthy apples.',
            '1 apple was rot.',
            'Only 1 apple was classified as beeing rot.',
            'The batch conssits between the 5.000 and 10.000 apples.',
            'Batch size is between 5.000 and 10.000.',
            '80 apples are beeing checked',
            'From this batch 80 apples where checked'
          ]

#Encode all sentences
embeddings = model.encode(sentences)

#Compute cosine similarity between all pairs
cos_sim = util.cos_sim(embeddings, embeddings)

#Add all pairs to a list with their cosine similarity score
all_sentence_combinations = []
for i in range(len(cos_sim)-1):
    for j in range(i+1, len(cos_sim)):
        all_sentence_combinations.append([cos_sim[i][j], i, j])

#Sort list by the highest cosine similarity score
all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)

print("Top-5 most similar pairs:")
for score, i, j in all_sentence_combinations[0:5]:
    print("{} \t {} \t {:.4f}".format(sentences[i], sentences[j], cos_sim[i][j]))

Top-5 most similar pairs:
1 apple was rot. 	 Only 1 apple was classified as beeing rot. 	 0.8493
There are 72 healthy apples. 	 This batch conssits 72 healthy apples. 	 0.8482
The Batch of apples has class 3. 	 Classification of the batch of apples is 3.3 is the classificationnumber. 	 0.7915
The batch has 4 blotched apples. 	 The batch conssits between the 5.000 and 10.000 apples. 	 0.7888
The batch has 4 blotched apples. 	 This batch conssits 72 healthy apples. 	 0.7827


<h2>Semantic Search</h2>

In [93]:
# Corpus with example sentences
# corpus = ['What is the classification of the batch']
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = ['The apple batch classification']


# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = min(2, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], "(Score: {:.4f})".format(score),'\n')
        





Query: The apple batch classification

Top 5 most similar sentences in corpus:
Classification of the batch of apples is 3. (Score: 0.6819) 

The Batch of apples has class 3. (Score: 0.6466) 



In [86]:
    
    """
    # Alternatively, we can also use util.semantic_search to perform cosine similarty + topk
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)
    hits = hits[0]      #Get the hits for the first query
    for hit in hits:
        print(corpus[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))
    """
    
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)
    hits = hits[0]      #Get the hits for the first query
    for hit in hits:
        print(corpus[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))    


Classification of the batch of apples is 3.3 is the classificationnumber. (Score: 0.6661)
The Batch of apples has class 3. (Score: 0.6466)
The batch conssits between the 5.000 and 10.000 apples. (Score: 0.5876)
From this batch 80 apples where checked (Score: 0.5692)
The batch has 4 blotched apples. (Score: 0.5602)
