### KMeans Clustering

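* Choose the number of clusters to create and the maximum number of iterations.
* Initialize the centroids by picking random samples from the data.
* Create clusters by assigning each sample to its nearest centroid, using Euclidean distance.
* Recalculate each centroid as the mean of the samples in its newly created cluster.
* Repeat the previous two steps (cluster assignment and centroid update) until convergence.
* Detect convergence using the distance between the old and new centroids.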
* Assign labels to samples.

In [13]:
import numpy as np
from collections import Counter

In [10]:
class KMeans:
    """Partition samples into ``k`` clusters by iterative centroid refinement.

    Centroids start as ``k`` distinct, randomly chosen samples. Each iteration
    assigns every sample to its nearest centroid (Euclidean distance) and then
    moves each centroid to the mean of the samples assigned to it. Iteration
    stops when the centroids no longer move or after ``iterations`` rounds.

    Args:
        k: Number of clusters to create.
        iterations: Maximum number of assign/update rounds.
    """

    def __init__(self, k=3, iterations=100):
        self.k = k
        self.iters = iterations

    def euclideanDistance(self, V1, V2):
        """Return the Euclidean distance between vectors ``V1`` and ``V2``."""
        return np.sqrt(np.sum((V1 - V2) ** 2))

    def createCluster(self, centroids):
        """Group the sample indices of ``self.X`` by nearest centroid.

        Args:
            centroids: Sequence of ``k`` centroid vectors.

        Returns:
            A list of ``k`` lists; entry ``j`` holds the row indices of the
            samples whose closest centroid is ``centroids[j]``. On a tie the
            lowest centroid index wins (``np.argmin`` returns the first
            minimum).
        """
        clusters = [[] for _ in range(self.k)]
        for i, sample in enumerate(self.X):
            distances = [self.euclideanDistance(sample, c) for c in centroids]
            clusters[int(np.argmin(distances))].append(i)
        return clusters

    def getNewCentroids(self, clusters):
        """Return the mean of each cluster's samples as a ``(k, n_features)`` array.

        Args:
            clusters: List of ``k`` lists of row indices into ``self.X``.

        Returns:
            Array whose row ``j`` is the mean of the samples in
            ``clusters[j]``. An empty cluster keeps its previous centroid
            from ``self.centroids`` instead of becoming NaN; if no previous
            centroids exist its row is left as zeros.
        """
        centroids = np.zeros((self.k, self.n_features))
        # While predict() runs, self.centroids still holds the centroids that
        # produced `clusters`, so it is the right fallback for empty clusters.
        previous = getattr(self, "centroids", None)
        for i, members in enumerate(clusters):
            if len(members) > 0:
                centroids[i] = np.mean(self.X[members], axis=0)
            elif previous is not None:
                centroids[i] = previous[i]
        return centroids

    def isConverged(self, old_centroids, new_centroids):
        """Return True when no centroid moved between two iterations."""
        distances = [
            self.euclideanDistance(old, new)
            for old, new in zip(old_centroids, new_centroids)
        ]
        return bool(sum(distances) == 0)

    def getLabels(self, clusters):
        """Return an array giving, for every sample, the index of its cluster.

        The array has length ``self.n_samples`` and NumPy's default float
        dtype, so labels come back as ``0.0``, ``1.0``, ...
        """
        nlabels = np.empty(self.n_samples)
        for i, cluster in enumerate(clusters):
            for sample_idx in cluster:
                nlabels[sample_idx] = i
        return nlabels

    def predict(self, X):
        """Cluster the rows of ``X`` and return one cluster label per row.

        Args:
            X: 2-D array-like of shape ``(n_samples, n_features)``.

        Returns:
            Float array of length ``n_samples`` with values in ``0..k-1``.

        Raises:
            ValueError: Propagated from ``np.random.choice`` when ``k``
                exceeds the number of samples (sampling without replacement).
        """
        self.X = np.asarray(X)
        self.n_samples, self.n_features = self.X.shape
        random_idx = np.random.choice(self.n_samples, self.k, replace=False)
        self.centroids = self.X[random_idx]

        for _ in range(self.iters):
            self.clusters = self.createCluster(self.centroids)
            old_centroids = self.centroids
            self.centroids = self.getNewCentroids(self.clusters)

            # The method must be *called*: a bare `self.isConverged` is a
            # bound method, which is always truthy and would stop the loop
            # after the very first iteration.
            if self.isConverged(old_centroids, self.centroids):
                break

        return self.getLabels(self.clusters)

In [15]:
if __name__ == "__main__":
    # Demo: cluster a synthetic blob dataset and print the size of each
    # predicted cluster.
    from sklearn.datasets import make_blobs

    X, y = make_blobs(
        n_samples=500, n_features=2, centers=3, shuffle=True, random_state=40
    )
    print(X.shape)

    # The number of distinct ground-truth labels is used as k.
    clusters = np.unique(y).size
    print(clusters)

    k = KMeans(clusters, 150)
    y_pred = k.predict(X)
    print(Counter(y_pred))

(500, 2)
3
Counter({2.0: 168, 0.0: 168, 1.0: 164})
