# Computing Sentence Embeddings
- SOURCE: https://www.sbert.net/examples/applications/computing-embeddings/README.html
- Can be used for shorter phrases as well as for longer texts with multiple sentences.
- If available, the model is automatically executed on the GPU.
- Input Sequence Length: for Transformer models like BERT / RoBERTa / DistilBERT etc. common value is 512 word pieces, or about 300-400 words (for English). Longer texts than this are truncated to the first x word pieces.

In [31]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model by name
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Inputs are handed over as a plain list of strings
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# A single encode() call embeds the whole list
embeddings = model.encode(sentences)

# Show each sentence next to its embedding, separated by a blank line
for text, vector in zip(sentences, embeddings):
    print("Sentence:", text)
    print("Embedding:", vector)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-2.14861855e-01  3.95722806e-01  4.69087064e-01  1.36177167e-01
  6.91138115e-03  4.29976344e-01  1.03049481e+00 -8.20223093e-02
  2.20890209e-01 -8.78799796e-01 -1.11538485e-01 -1.70815572e-01
 -4.95070249e-01 -3.25745434e-01 -1.68513238e-01 -7.61577010e-01
 -3.08237493e-01  1.28200427e-01  2.28901476e-01 -8.63927841e-01
  8.01118433e-01 -5.13606608e-01 -8.88549760e-02  8.67860198e-01
  8.12778056e-01 -2.80299008e-01  1.01604223e+00  5.12090325e-01
  5.16793549e-01 -1.59422293e-01  8.63559693e-02 -7.04056501e-01
 -4.15288091e-01 -1.02641266e-02  1.39428407e-01 -7.40844477e-03
 -2.75420044e-02 -1.06805229e+00 -1.14779401e+00 -4.22716022e-01
  2.93512315e-01 -3.92946512e-01  3.17559749e-01 -1.97936110e-02
 -5.53470016e-01  4.99471044e-03 -5.46878390e-02  5.82734108e-01
 -1.38893008e+00 -4.44052637e-01 -8.77214432e-01 -1.82058170e-01
  4.69049752e-01 -6.04763389e-01 -3.85194927e-01 -1.42895263e-02
 -4.25226

### Storing & Loading Embeddings

In [None]:
import pickle

model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

embeddings = model.encode(sentences)

# Write sentences and their embeddings to disk together, so they stay paired
payload = {'sentences': sentences, 'embeddings': embeddings}
with open('embeddings.pkl', "wb") as fOut:
    pickle.dump(payload, fOut, protocol=pickle.HIGHEST_PROTOCOL)

# Read the pair back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('embeddings.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)

stored_sentences = stored_data['sentences']
stored_embeddings = stored_data['embeddings']

- DistilBERT embeddings use a vector space of dimension 768

In [40]:
# Shape is (number of sentences, embedding dimension)
embeddings.shape

(3, 768)

# Semantic Textual Similarity
- from https://pypi.org/project/sentence-transformers/
- check here: https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [19]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Sentences that are compared against one another
sentences = [
    'The cat sits outside',
    'A man is playing guitar',
    'I love pasta',
    'The new movie is awesome',
    'The cat plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
    'Do you like pizza?',
]

# Embed all sentences, requesting a tensor result
embeddings = model.encode(sentences, convert_to_tensor=True)

# Cosine similarity of every sentence with every other sentence
cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)

# Collect each unordered pair once (i < j), skipping self-comparisons
num_sentences = len(cosine_scores)
pairs = [
    {'index': [i, j], 'score': cosine_scores[i][j]}
    for i in range(num_sentences - 1)
    for j in range(i + 1, num_sentences)
]

# Most similar pairs first
pairs.sort(key=lambda entry: entry['score'], reverse=True)

# Report the ten best-scoring pairs
for pair in pairs[:10]:
    i, j = pair['index']
    print(f"{sentences[i]:{40}} {sentences[j]:{40}} Score: {pair['score']:.4f}")

The new movie is awesome                 The new movie is so great                Score: 0.9816
The cat sits outside                     The cat plays in the garden              Score: 0.6247
I love pasta                             The new movie is so great                Score: 0.2605
I love pasta                             The new movie is awesome                 Score: 0.2526
I love pasta                             The cat plays in the garden              Score: 0.2455
I love pasta                             Do you like pizza?                       Score: 0.1997
The cat sits outside                     A woman watches TV                       Score: 0.1837
The cat plays in the garden              A woman watches TV                       Score: 0.1760
A man is playing guitar                  Do you like pizza?                       Score: 0.1080
A woman watches TV                       The new movie is so great                Score: 0.1008


# Semantic Search
- Semantic search seeks to improve search accuracy by understanding the content of the search query. In contrast to traditional search engines, which only find documents based on lexical matches, semantic search can also find synonyms.
- https://www.sbert.net/examples/applications/semantic-search/README.html

In [25]:
"""
This is a simple application for sentence embeddings: semantic search

We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.

This script outputs for various queries the top 5 most similar sentences in the corpus.
"""
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Corpus with example sentences
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'The girl is carrying a baby.',
          'A man is riding a horse.',
          'A woman is playing violin.',
          'Two men pushed carts through the woods.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'A cheetah is running behind its prey.'
          ]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = ['A man is eating pasta.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah chases prey on across a field.']


# Find the closest sentences of the corpus for each query sentence based on cosine similarity.
# torch.topk raises when k exceeds the number of scores, so never request more
# hits than the corpus contains (at most 5).
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    # Row 0 holds the scores of this single query against every corpus sentence
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores = cos_scores.cpu()

    # torch.topk yields (values, indices) of the top_k highest scores
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    # Header reflects the number of hits actually shown
    print(f"\nTop {top_k} most similar sentences in corpus:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], f"(Score: {score:.4f})")





Query: A man is eating pasta.

Top 5 most similar sentences in corpus:
A man is eating food. (Score: 0.5777)
A man is eating a piece of bread. (Score: 0.4986)
A man is riding a horse. (Score: 0.1581)
A man is riding a white horse on an enclosed ground. (Score: 0.1474)
Two men pushed carts through the woods. (Score: 0.0992)




Query: Someone in a gorilla costume is playing a set of drums.

Top 5 most similar sentences in corpus:
A monkey is playing drums. (Score: 0.6435)
A man is eating a piece of bread. (Score: 0.1719)
A man is eating food. (Score: 0.1240)
A man is riding a white horse on an enclosed ground. (Score: 0.0706)
A cheetah is running behind its prey. (Score: 0.0352)




Query: A cheetah chases prey on across a field.

Top 5 most similar sentences in corpus:
A cheetah is running behind its prey. (Score: 0.7769)
A man is riding a white horse on an enclosed ground. (Score: 0.2485)
A man is riding a horse. (Score: 0.2116)
A monkey is playing drums. (Score: 0.1820)
Two men p

# Text clustering: Fast Clustering
- https://www.sbert.net/examples/applications/clustering/README.html
- clustering algorithm tuned for large datasets (50k sentences in less than 5 seconds). In a large list of sentences it searches for local communities
- A local community is a set of highly similar sentences.
- You can configure the cosine-similarity threshold above which two sentences are considered similar. You can also specify the minimal size for a local community.
- This allows you to get either large coarse-grained cluster or small fine-grained clusters.

In [32]:
"""
This is a more complex example of performing clustering on a large-scale dataset.
This example finds local communities in a large set of sentences, i.e., groups of sentences that are highly
similar. You can freely configure the threshold for what is considered similar. A high threshold will
only find extremely similar sentences; a lower threshold will find more sentences that are less similar.
A second parameter is 'min_community_size': only communities with at least a certain number of sentences will be returned.
The method for finding the communities is extremely fast: clustering 50k sentences requires only 5 seconds (plus embedding computation).
In this example, we download a large set of questions from Quora and then find
similar questions in this set.
"""
from sentence_transformers import SentenceTransformer, util
import numpy as np
import os
import csv
import pickle
import time



def community_detection(embeddings, threshold=0.75, min_community_size=10, init_max_size=1000):
    """
    Fast community detection over a set of embeddings.

    A community is built around each embedding that has at least
    ``min_community_size`` entries (itself included) with cosine similarity
    >= ``threshold``; its members are all entries that reach the threshold
    against that embedding. Overlapping communities are then dropped,
    keeping the larger ones.

    :param embeddings: embeddings to cluster, in any form accepted by
        ``util.pytorch_cos_sim``.
    :param threshold: minimum cosine similarity for two entries to count as similar.
    :param min_community_size: minimum number of members a community must have.
    :param init_max_size: how many of the most similar entries are inspected
        first for each candidate; capped at the number of embeddings.
    :return: list of communities (lists of indices into ``embeddings``),
        largest first. When a community fits within ``init_max_size``, its
        members are ordered by decreasing similarity to the central entry
        (so the central entry normally comes first); otherwise they are in
        index order. Returns ``[]`` when there are fewer embeddings than
        ``min_community_size``.
    """
    if len(embeddings) == 0:
        return []

    # Compute cosine similarity scores
    cos_scores = util.pytorch_cos_sim(embeddings, embeddings)
    num_embeddings = len(cos_scores)

    # No community of the requested size can exist in a smaller corpus;
    # return early instead of letting topk() raise on an out-of-range k.
    if min_community_size > num_embeddings:
        return []

    # topk() rejects k larger than the row length, so cap the look-ahead window
    init_max_size = min(init_max_size, num_embeddings)

    # For every row: the min_community_size highest scores (sorted descending)
    top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)

    extracted_communities = []
    for i in range(len(top_k_values)):
        # The weakest of the top-k must still reach the threshold
        if top_k_values[i][-1] < threshold:
            continue

        # Only check the init_max_size most similar entries first
        top_val_large, top_idx_large = cos_scores[i].topk(k=init_max_size, largest=True)
        top_idx_large = top_idx_large.tolist()
        top_val_large = top_val_large.tolist()

        if top_val_large[-1] < threshold:
            # The window already contains every member: take entries until
            # the (descending) scores drop below the threshold.
            new_cluster = []
            for idx, val in zip(top_idx_large, top_val_large):
                if val < threshold:
                    break
                new_cluster.append(idx)
        else:
            # Window may be too small: scan the whole row (slow)
            new_cluster = [idx for idx, val in enumerate(cos_scores[i].tolist())
                           if val >= threshold]

        extracted_communities.append(new_cluster)

    # Largest cluster first
    extracted_communities.sort(key=len, reverse=True)

    # Step 2) Remove overlapping communities: keep a community only if none
    # of its members was already claimed by a larger (earlier) one.
    unique_communities = []
    extracted_ids = set()

    for community in extracted_communities:
        if extracted_ids.isdisjoint(community):
            unique_communities.append(community)
            extracted_ids.update(community)

    return unique_communities

In [28]:
# Model for computing sentence embeddings. We use one trained for similar-question detection.
model = SentenceTransformer('distilbert-base-nli-stsb-quora-ranking')

100%|███████████████████████████████████████████████████████████████████████████████| 245M/245M [01:06<00:00, 3.70MB/s]


In [33]:
# Location of the Quora Duplicate Questions Dataset
# (https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs);
# similar questions are searched for within it.
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"

# Local file name for the downloaded dataset
dataset_path = "quora_duplicate_questions.tsv"

# Upper bound on the corpus: at most 50k questions are used
max_corpus_size = 50000

# Cache file name includes the corpus size, so different sizes get different caches
embedding_cache_path = f'quora-embeddings-size-{max_corpus_size}.pkl'

In [34]:
# Reuse cached embeddings when the cache file exists; otherwise build the
# corpus from the dataset, encode it and write the cache.
if not os.path.exists(embedding_cache_path):
    # Download dataset if needed
    if not os.path.exists(dataset_path):
        print("Download dataset")
        util.http_get(url, dataset_path)

    # Get all unique sentences from the file (a set removes duplicates)
    corpus_sentences = set()
    # newline='' is what the csv module requires of files it reads, so that
    # newlines embedded in quoted fields are interpreted correctly.
    with open(dataset_path, encoding='utf8', newline='') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            # The size check follows each add so the cap is never exceeded
            corpus_sentences.add(row['question1'])
            if len(corpus_sentences) >= max_corpus_size:
                break

            corpus_sentences.add(row['question2'])
            if len(corpus_sentences) >= max_corpus_size:
                break

    # Freeze the order: embedding row i belongs to corpus_sentences[i]
    corpus_sentences = list(corpus_sentences)
    print("Encode the corpus. This might take a while")
    corpus_embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_numpy=True)

    # Sentences and embeddings are stored together so the pairing survives reloads
    print("Store file on disc")
    with open(embedding_cache_path, "wb") as fOut:
        pickle.dump({'sentences': corpus_sentences, 'embeddings': corpus_embeddings}, fOut)
else:
    print("Load pre-computed embeddings from disc")
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that was produced by this notebook.
    with open(embedding_cache_path, "rb") as fIn:
        cache_data = pickle.load(fIn)
        corpus_sentences = cache_data['sentences']
        corpus_embeddings = cache_data['embeddings']

Download dataset


100%|█████████████████████████████████████████████████████████████████████████████| 58.2M/58.2M [00:21<00:00, 2.75MB/s]


Encode the corpus. This might take a while


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1563.0), HTML(value='')))


Store file on disc


In [39]:
print("Start clustering")
start_time = time.time()

# Two parameters to tune:
#   min_community_size: only communities with at least this many sentences are kept (15 here)
#   threshold: cosine similarity a sentence pair must reach to count as similar (0.90 here)
clusters = community_detection(corpus_embeddings, threshold=0.90, min_community_size=15)


# Print every community: a header line, then its member sentences
for cluster_no, members in enumerate(clusters, start=1):
    print(f"\nCluster {cluster_no}, #{len(members)} Elements ")
    for sentence_id in members:
        print("\t", corpus_sentences[sentence_id])


# Elapsed time covers both the clustering and the printing above
elapsed = time.time() - start_time
print(f"Clustering done after {elapsed:.2f} sec")

Start clustering

Cluster 1, #46 Elements 
	 How could I improve my English?
	 How can I specifically improve my English?
	 How can I improve my english language?
	 Can I improve my English?
	 What should I do to improve my English ?
	 How can I improve in English?
	 How can improve my english knowledge?
	 How do I improve my English language?
	 I want to improve my English?
	 How can I improve my English vocabulary?
	 How can I improve my spoken English?
	 How will I improve my spoken English?
	 How can I improve my English speaking ability?
	 How can I improve English language?
	 How do I improve my English writing ability?
	 How I can improve my English communication?
	 How can I improve my communication in English?
	 How do I improve my pronunciation of English?
	 What is the best path I should take to improve my English?
	 How could I improve my English pronunciation?
	 What should I do to improve my spoken English?
	 How can I improve my English speaking .?
	 How can I improve my

## Other clustering approaches
### Simple clustering (k means)

In [29]:
"""
This is a simple application for sentence embeddings: clustering
Sentences are mapped to sentence embeddings and then k-mean clustering is applied.
"""
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

embedder = SentenceTransformer('distilroberta-base-paraphrase-v1')

# Example sentences to be grouped
corpus = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'A man is eating pasta.',
    'The girl is carrying a baby.',
    'The baby is carried by the woman',
    'A man is riding a horse.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah is running behind its prey.',
    'A cheetah chases prey on across a field.',
]
corpus_embeddings = embedder.encode(corpus)

# Fit k-means with a fixed number of clusters and read back one label per sentence
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

# Bucket the sentences by their assigned cluster label
clustered_sentences = [[] for _ in range(num_clusters)]
for sentence, cluster_id in zip(corpus, cluster_assignment):
    clustered_sentences[cluster_id].append(sentence)

# Print the clusters with 1-based numbering
for cluster_no, cluster in enumerate(clustered_sentences, start=1):
    print("Cluster ", cluster_no)
    print(cluster)
    print("")


100%|███████████████████████████████████████████████████████████████████████████████| 306M/306M [01:31<00:00, 3.32MB/s]


Cluster  1
['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.']

Cluster  2
['A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.']

Cluster  3
['A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']

Cluster  4
['The girl is carrying a baby.', 'The baby is carried by the woman']

Cluster  5
['A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.']



### agglomerative clustering

In [30]:
"""
This is a simple application for sentence embeddings: clustering
Sentences are mapped to sentence embeddings and then agglomerative clustering with a threshold is applied.
"""
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np

embedder = SentenceTransformer('distilroberta-base-paraphrase-v1')

# Example sentences to be grouped
corpus = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'A man is eating pasta.',
    'The girl is carrying a baby.',
    'The baby is carried by the woman',
    'A man is riding a horse.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah is running behind its prey.',
    'A cheetah chases prey on across a field.',
]
corpus_embeddings = embedder.encode(corpus)

# Scale every embedding (row) to unit length
row_norms = np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
corpus_embeddings = corpus_embeddings / row_norms

# Perform agglomerative clustering: no preset cluster count, a distance threshold instead.
# (Alternative settings noted earlier: affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

# Group sentences by label; a label's entry is created the first time it is seen
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences.setdefault(cluster_id, []).append(corpus[sentence_id])

# Print in order of first appearance, with 1-based cluster numbers
for cluster_id, members in clustered_sentences.items():
    print("Cluster ", cluster_id + 1)
    print(members)
    print("")

Cluster  1
['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.']

Cluster  3
['The girl is carrying a baby.', 'The baby is carried by the woman']

Cluster  4
['A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.']

Cluster  2
['A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.']

Cluster  5
['A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']

