# Content

In this notebook, we will see a Clustering Algorithm in action.

We will start with an embedding model and some test words. We'll use the embedding model to convert the words into vectors, and then we will use the K-Means algorithm to cluster those words/vectors.

We will use Word2Vec for our embedding model

We will implement K-Means ourselves and use scikit-learn's AgglomerativeClustering and DBSCAN for comparison.

In [1]:
from IPython.display import clear_output

In [2]:
# Uncomment to install the pinned gdown release (the gdown CLI is used further down to download the embedding model).
# %pip install gdown==4.5

# Clear whatever this cell printed (e.g. the installer log) so the notebook stays tidy.
clear_output()

In [3]:
import random

import numpy as np
import gensim

from tqdm import tqdm

from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.metrics import pairwise_distances_argmin_min

## Downloading the embedding model

In [None]:
# Download the file behind this Google Drive id with the gdown CLI.
# NOTE(review): presumably this is the 'GoogleNews-vectors-negative300.bin.gz' archive loaded in the next cell — confirm.
!gdown https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM

In [4]:
# Parse the binary word2vec archive into a gensim KeyedVectors object.
# Be patient: this step takes a while.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

## Implementing the K-Means Clustering Algorithm

In [58]:
# We will keep the class API (method names and such) similar to scikit-learn's
class KMeans:
    """Small K-Means clusterer: k-means++ style seeding followed by Lloyd iterations.

    Parameters
    ----------
    n_clusters : int
        Number of clusters / centroids to fit.
    max_iter : int
        Upper bound on the number of assign/update iterations.
    tol : float
        Fitting stops once every centroid moved less than ``tol`` (Euclidean
        distance) in one iteration.
    random_state : int or None
        Seed for the centroid initialization. ``None`` draws fresh entropy.

    Attributes
    ----------
    centers : ndarray of shape (n_clusters, n_features) or None
        Fitted centroids; ``None`` until ``fit`` has been called.
    """

    def __init__(self, n_clusters=3, max_iter=300, tol=1e-4, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        self.centers = None

    def fit(self, X):
        """Fit the centroids to ``X`` (2-D array, one sample per row).

        Returns ``self`` so calls can be chained, like scikit-learn estimators.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or has fewer samples than ``n_clusters``.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"X must be a 2-D array of shape (n_samples, n_features), got ndim={X.ndim}"
            )
        if X.shape[0] < self.n_clusters:
            raise ValueError(
                f"n_samples={X.shape[0]} must be >= n_clusters={self.n_clusters}"
            )

        # A private generator yields the same stream np.random.seed(random_state)
        # would, but leaves the global NumPy RNG state untouched.
        rng = np.random.RandomState(self.random_state)

        self.centers = self._initialize_centroids(X, rng)

        for _iteration in range(self.max_iter):
            labels, _min_distances = self._assign_labels(X)
            new_centers = self._calculate_centers(X, labels)

            # Measure the movement first, then always keep the newest centers so
            # the final update is not thrown away when we stop.
            shift = np.linalg.norm(new_centers - self.centers, axis=1)
            self.centers = new_centers
            if np.all(shift < self.tol):
                break

        return self

    def _initialize_centroids(self, X, rng=None):
        """Pick the initial centroids from the rows of ``X`` (k-means++ seeding).

        The first centroid is drawn uniformly; every further one is drawn with
        probability proportional to the squared distance to its nearest
        already-chosen centroid. ``rng`` is any object exposing ``choice``;
        when omitted the global ``np.random`` functions are used.
        """
        if rng is None:
            rng = np.random

        n_samples, n_features = X.shape
        centroids = np.empty((self.n_clusters, n_features))
        centroids[0] = X[rng.choice(n_samples)]

        # Squared distance from every sample to its closest chosen centroid,
        # updated incrementally instead of rebuilding an (n, k, d) tensor.
        closest_sq = np.sum((X - centroids[0]) ** 2, axis=1)

        for k in range(1, self.n_clusters):
            total = closest_sq.sum()
            if total > 0:
                centroid_idx = rng.choice(n_samples, p=closest_sq / total)
            else:
                # Every sample coincides with a chosen centroid (duplicates):
                # the weights would be 0/0, so fall back to a uniform draw.
                centroid_idx = rng.choice(n_samples)
            centroids[k] = X[centroid_idx]
            closest_sq = np.minimum(
                closest_sq, np.sum((X - centroids[k]) ** 2, axis=1)
            )

        return centroids

    def _assign_labels(self, X):
        """Return ``(labels, min_distances)``: nearest centroid index and its distance per row."""
        labels, min_distances = pairwise_distances_argmin_min(X, self.centers)
        return labels, min_distances

    def _calculate_centers(self, X, labels):
        """Return the mean of each cluster's members.

        A cluster that received no samples keeps its previous centroid instead
        of becoming NaN (mean of an empty slice).
        """
        if self.centers is None:
            new_centers = np.zeros((self.n_clusters, X.shape[1]))
        else:
            new_centers = np.array(self.centers, dtype=float)  # copy, never alias
        for i in range(self.n_clusters):
            members = X[labels == i]
            if len(members):
                new_centers[i] = members.mean(axis=0)
        return new_centers

    def predict(self, X):
        """Return the index of the nearest fitted centroid for every row of ``X``.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if self.centers is None:
            raise ValueError("This KMeans instance is not fitted yet; call fit() first.")
        labels, _ = self._assign_labels(X)
        return labels


## Clustering the data

In [59]:
# A handful of words to cluster: five fruits, five animals and four places.
test_words = [
    "apple", "mango", "banana", "peach", "cherry",
    "monkey", "tiger", "lion", "deer", "dog",
    "london", "belgium", "beijing", "lahore",
]

# Lowercase every word before it is looked up in the embedding model.
test_words = list(map(str.lower, test_words))

# One embedding row per word, in the same order as test_words.
test_word_vectors = np.array([model[token] for token in test_words])

## K-Means Clustering

In [67]:
# Centroid initialization is random; pin the seed so re-running the notebook
# reproduces the same clusters (and the same cluster ids).
clustering_model = KMeans(n_clusters=3, tol=1e-2, random_state=42)
clustering_model.fit(test_word_vectors)

In [68]:
# Ask the fitted model which centroid each word vector is closest to.
assignments = clustering_model.predict(test_word_vectors)  # array with one cluster id per word (i.e. per word vector)

In [69]:
# Bucket the words by the cluster id assigned to them.
cluster_id_to_words = {}
for word, cluster_id in zip(test_words, assignments):
    cluster_id_to_words.setdefault(cluster_id, []).append(word)

# Print one line per cluster, in ascending id order, followed by a separator.
for cluster_id in sorted(cluster_id_to_words):
    print(f"Cluster {cluster_id}: {', '.join(cluster_id_to_words[cluster_id])}")
    print('-'*30)

Cluster 0: monkey, tiger, lion, deer, dog
------------------------------
Cluster 1: apple, mango, banana, peach, cherry
------------------------------
Cluster 2: london, belgium, beijing, lahore
------------------------------


## Agglomerative Clustering

In [70]:
# Cluster the same word vectors into 3 groups with scikit-learn's
# AgglomerativeClustering, leaving every other setting at its default.
clustering_model = AgglomerativeClustering(n_clusters=3)
clustering_model.fit(test_word_vectors)

In [71]:
# This estimator is not asked to predict; the labels found during fit are read from labels_.
assignments = clustering_model.labels_  # array with one cluster id per word (i.e. per word vector)

In [72]:
# Collect, for every cluster id, the words that were assigned to it.
cluster_id_to_words = {}
for word, cluster_id in zip(test_words, assignments):
    cluster_id_to_words[cluster_id] = cluster_id_to_words.get(cluster_id, []) + [word]

# Show the clusters in ascending id order, each followed by a dashed rule.
for cluster_id in sorted(cluster_id_to_words):
    print(
        f"Cluster {cluster_id}: {', '.join(cluster_id_to_words[cluster_id])}",
        '-'*30,
        sep='\n',
    )

Cluster 0: monkey, tiger, lion, deer, dog
------------------------------
Cluster 1: apple, mango, banana, peach, cherry
------------------------------
Cluster 2: london, belgium, beijing, lahore
------------------------------


## DBSCAN clustering

In [73]:
# Scale every word vector to unit L2 length (each row is divided by its own norm).
# With normalized vectors a smaller "eps" is enough for DBSCAN to give good results.
test_word_vectors_normalized = test_word_vectors / np.linalg.norm(
    test_word_vectors,
    axis=-1,
    keepdims=True,
)

In [74]:
# Sanity check: after normalization the first vector's length should be (approximately) 1.
np.linalg.norm(test_word_vectors_normalized[0])

0.99999994

In [75]:
# DBSCAN takes no cluster count; the grouping is driven by eps and min_samples.
# It is run on the unit-length vectors, and fit_predict returns one label per word vector.
clustering_model = DBSCAN(eps=1.1, min_samples=2)  # try playing around with these params
assignments = clustering_model.fit_predict(test_word_vectors_normalized)

In [76]:
# Display the raw label array: one entry per word, in the same order as test_words.
assignments

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2])

In [77]:
# Bucket the words by the label DBSCAN gave them.
cluster_id_to_words = {}
for word, cluster_id in zip(test_words, assignments):
    cluster_id_to_words.setdefault(cluster_id, []).append(word)

# DBSCAN uses the label -1 for points that belong to no cluster (noise), so
# report those under "Noise" instead of a misleading "Cluster -1". With other
# eps / min_samples values such points can show up.
for cluster_id in sorted(cluster_id_to_words):
    heading = "Noise" if cluster_id == -1 else f"Cluster {cluster_id}"
    print(f"{heading}: {', '.join(cluster_id_to_words[cluster_id])}")
    print('-'*30)

Cluster 0: apple, mango, banana, peach, cherry
------------------------------
Cluster 1: monkey, tiger, lion, deer, dog
------------------------------
Cluster 2: london, belgium, beijing, lahore
------------------------------
