In [86]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Sentence-embedding model used to turn every corpus sentence into a vector.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Small demo corpus: several groups of sentences that paraphrase each other.
corpus = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'A man is eating pasta.',
    'The girl is carrying a baby.',
    'The baby is carried by the woman',
    'A man is riding a horse.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah is running behind its prey.',
    'A cheetah chases prey on across a field.',
]
corpus_embeddings = embedder.encode(corpus)

# Divide each row by its own L2 norm so every embedding has unit length.
row_norms = np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
corpus_embeddings = corpus_embeddings / row_norms

In [87]:
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import normalize, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.manifold import TSNE, LocallyLinearEmbedding, SpectralEmbedding
from sklearn.decomposition import KernelPCA, SparsePCA, TruncatedSVD, PCA
import umap

In [88]:
# Project the sentence embeddings down to 2 components.
# NOTE: despite its name, `umap_` holds a LocallyLinearEmbedding projection,
# not a UMAP one; the name is kept because later cells refer to it.
lle_reducer = LocallyLinearEmbedding(n_components=2)
umap_ = lle_reducer.fit_transform(corpus_embeddings)
umap_

array([[ 0.36443274, -0.29390317],
       [ 0.35529439, -0.27825133],
       [ 0.31624302, -0.21947139],
       [-0.51966279, -0.36871482],
       [-0.51863686, -0.3710331 ],
       [ 0.18559921,  0.06771241],
       [ 0.15114132,  0.12046328],
       [-0.01771327,  0.22489182],
       [-0.02042242,  0.23651433],
       [-0.14851786,  0.4375068 ],
       [-0.14775748,  0.44428518]])

In [89]:
def _cosine_complete_clustering(n_clusters):
    """Build a cosine-distance, complete-linkage AgglomerativeClustering model.

    Tries the `metric` keyword first and falls back to the older `affinity`
    keyword when the installed scikit-learn rejects `metric`, so the notebook
    runs on both old and new releases.
    """
    try:
        # scikit-learn >= 1.2 names the distance argument `metric`.
        return AgglomerativeClustering(n_clusters=n_clusters, metric="cosine", linkage="complete")
    except TypeError:
        # Older releases only accept the (since removed) `affinity` spelling.
        return AgglomerativeClustering(n_clusters=n_clusters, affinity="cosine", linkage="complete")


def silhoutte(attempts, data=None):
    """Choose the cluster count with the highest mean silhouette score.

    For every k in ``range(2, attempts)`` the data is clustered with
    cosine-distance, complete-linkage agglomerative clustering and scored with
    ``silhouette_score``; the k with the best score is returned (the smallest
    such k on ties).

    Parameters
    ----------
    attempts : int
        Exclusive upper bound on the cluster counts to try.
    data : array-like, optional
        Samples to cluster, one row per sample. Defaults to the notebook-level
        ``umap_`` projection, which is what the original implementation used.

    Returns
    -------
    int
        The selected number of clusters.

    Raises
    ------
    ValueError
        If there is no candidate k to evaluate (``attempts <= 2`` or fewer
        than three samples).
    """
    if data is None:
        data = umap_  # fall back to the projection computed earlier in the notebook

    # The silhouette score needs 2 <= k <= n_samples - 1, so never try more
    # clusters than that regardless of how large `attempts` is.
    candidate_ks = range(2, min(attempts, len(data)))
    if len(candidate_ks) == 0:
        raise ValueError(
            "no cluster counts to evaluate: need attempts > 2 and at least 3 samples "
            "(got attempts=%r, n_samples=%d)" % (attempts, len(data))
        )

    scores_silhouette = []
    for k in candidate_ks:
        cluster_labels = _cosine_complete_clustering(k).fit(data).labels_
        # NOTE(review): the score uses silhouette_score's default metric while
        # the clustering uses cosine distance; kept as in the original so the
        # selected k does not change -- confirm whether metric="cosine" was intended.
        scores_silhouette.append(silhouette_score(data, cluster_labels))

    # list.index returns the first position of the maximum, i.e. the smallest best k.
    best_position = scores_silhouette.index(max(scores_silhouette))
    return candidate_ks[best_position]

In [90]:
# Exclusive upper bound on the cluster counts scored by `silhoutte`
# (it tries k = 2 .. MAX_CLUSTER_ATTEMPTS - 1).
MAX_CLUSTER_ATTEMPTS = 11
n_clusters = silhoutte(MAX_CLUSTER_ATTEMPTS)

In [91]:
# Perform agglomerative clustering (cosine distance, complete linkage) on the
# 2-D projection, using the cluster count selected by the silhouette search.
try:
    # scikit-learn >= 1.2 names the distance argument `metric`.
    clustering_model = AgglomerativeClustering(n_clusters=n_clusters, metric='cosine', linkage='complete')
except TypeError:
    # Older releases only accept the (since removed) `affinity` spelling.
    clustering_model = AgglomerativeClustering(n_clusters=n_clusters, affinity='cosine', linkage='complete')
clustering_model.fit(umap_)
# One integer cluster label per row of `umap_`, in the same order.
cluster_assignment = clustering_model.labels_

In [92]:

# Group the corpus sentences by the cluster label assigned to each of them.
# Keys appear in the order their label is first encountered.
clustered_sentences = {}
for position, label in enumerate(cluster_assignment):
    clustered_sentences.setdefault(label, []).append(corpus[position])

# Print every group; labels are shifted by one so numbering starts at 1.
for label, members in clustered_sentences.items():
    print("Cluster ", label + 1)
    print(members)
    print("")

Cluster  1
['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.']

Cluster  6
['The girl is carrying a baby.', 'The baby is carried by the woman']

Cluster  5
['A man is riding a horse.']

Cluster  4
['A man is riding a white horse on an enclosed ground.']

Cluster  2
['A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.']

Cluster  3
['A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']

