In [None]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist 

# One seed for the whole notebook. It fixes the generated blobs and also seeds
# NumPy's global RNG, which the np.random.choice calls in later cells draw from,
# so that Restart & Run All reproduces the same initial centroids and figures.
SEED = 42
np.random.seed(SEED)

centers = 3
X, true_labels = make_blobs(n_samples=100, centers=centers, random_state=SEED)

In [None]:
# Ground-truth blob label of every sample (second value returned by make_blobs).
true_labels

In [None]:
# Generated sample coordinates (first value returned by make_blobs).
X

In [None]:
# Distinct label values present in the ground truth.
np.unique(true_labels)

In [None]:
# Keep the distinct labels around for the indexing examples in the next cells.
u_labels = np.unique(true_labels)

In [None]:
# Take the first distinct label and build a boolean mask that is True for the
# samples carrying that label.
i = u_labels[0]
true_labels == i

In [None]:
# The full sample array again, for comparison with the masked slices that follow.
X

In [None]:
# Column 0 of the samples whose label equals i.
X[true_labels == i , 0]

In [None]:
# Column 1 of the samples whose label equals i.
X[true_labels == i , 1]

In [None]:
# Scatter the samples, one colour per ground-truth label.
u_labels = np.unique(true_labels)
for label in u_labels:
    members = true_labels == label
    plt.scatter(X[members, 0], X[members, 1], label=label)
plt.legend()
plt.show()

In [None]:
# Number of clusters used by the cells below.
k = 3
# Draw k distinct row indices of X (replace=False rules out repeats). This draw is
# only displayed; it is not stored.
np.random.choice(len(X), k, replace=False)

In [None]:
# Sample k distinct rows of X and use them as the starting centroids.
idx = np.random.choice(X.shape[0], size=k, replace=False)
centroids = X[idx]
centroids

In [None]:
# Row indices of X that were drawn as the initial centroids.
idx

In [None]:
# Ground-truth scatter with the randomly chosen starting centroids overlaid in red.
u_labels = np.unique(true_labels)
for label in u_labels:
    group = X[true_labels == label]
    plt.scatter(group[:, 0], group[:, 1], label=label)
plt.scatter(centroids[:, 0], centroids[:, 1], c="red")
plt.legend()
plt.show()

In [None]:
# Pairwise Euclidean distances between the samples in X and the current centroids.
distances = cdist(X, centroids, 'euclidean')
distances

In [None]:
# Sanity check: expect one row per sample and one column per centroid.
distances.shape

In [None]:
# Assign each sample to its nearest centroid: one argmin per row of the distance matrix.
clusters = np.array([row.argmin() for row in distances])
clusters

In [None]:
# The same nearest-centroid assignment in a single vectorised call
# (row-wise argmin over the distance matrix).
np.argmin(distances, axis=1)

In [None]:
# Example cluster index for the next two cells.
# NOTE: this rebinds idx, which until now held the sampled initial-centroid row indices.
idx = 1

In [None]:
# Samples currently assigned to cluster idx.
X[clusters==idx]

In [None]:
# Per-column mean of those samples, i.e. the updated centroid for cluster idx.
X[clusters==idx].mean(axis=0)

In [None]:
# Upper bound on the number of update/assignment rounds.
no_of_iterations = 1000

for _ in range(no_of_iterations):
    # Update step: move every centroid to the mean of the samples assigned to it.
    centroids = np.vstack([X[clusters == idx].mean(axis=0) for idx in range(k)])

    # Assignment step: attach every sample to its nearest centroid, using the
    # vectorised row-wise argmin instead of a Python loop over the rows.
    distances = cdist(X, centroids, 'euclidean')
    new_clusters = np.argmin(distances, axis=1)

    # If the assignment did not change, the next update would reproduce exactly
    # the same centroids, so the remaining rounds are redundant and can be skipped
    # without altering the final centroids/clusters.
    converged = np.array_equal(new_clusters, clusters)
    clusters = new_clusters
    if converged:
        break

In [None]:
# Centroids produced by the update loop in the previous cell.
centroids

In [None]:
# Ground-truth scatter with the current centroids overlaid in red.
u_labels = np.unique(true_labels)
for label in u_labels:
    selected = true_labels == label
    xs, ys = X[selected, 0], X[selected, 1]
    plt.scatter(xs, ys, label=label)
plt.scatter(centroids[:, 0], centroids[:, 1], c="red")
plt.legend()
plt.show()

In [None]:
class KMeans():
    """Minimal k-means clustering (alternating assignment and mean-update steps).

    Parameters
    ----------
    clusters_count : int
        Number of centroids to fit.
    iterations_count : int
        Maximum number of update/assignment rounds performed by ``fit``.
    """

    def __init__(self, clusters_count=2, iterations_count=10000):
        self.clusters_count = clusters_count
        self.iterations_count = iterations_count

    def fit(self, x):
        """Cluster the rows of ``x`` and return the centroids.

        Parameters
        ----------
        x : 2-D array-like
            One sample per row.

        Returns
        -------
        numpy.ndarray
            Array with ``clusters_count`` rows, one centroid per row.

        Raises
        ------
        ValueError
            Raised by ``np.random.choice`` when ``clusters_count`` exceeds the
            number of rows in ``x``.
        """
        x = np.asarray(x)

        # Initialise with distinct rows of x drawn from NumPy's global RNG.
        idx = np.random.choice(len(x), self.clusters_count, replace=False)
        centroids = x[idx, :]
        clusters = np.argmin(cdist(x, centroids, 'euclidean'), axis=1)

        for _ in range(self.iterations_count):
            # Iterate over this instance's own cluster count; the loop must not
            # depend on a notebook-level variable such as ``k``.
            updated = []
            for cluster_id in range(self.clusters_count):
                members = x[clusters == cluster_id]
                # The mean of an empty selection is NaN and would poison every
                # later distance, so an empty cluster keeps its previous centroid.
                updated.append(members.mean(axis=0) if len(members) else centroids[cluster_id])
            centroids = np.vstack(updated)

            new_clusters = np.argmin(cdist(x, centroids, 'euclidean'), axis=1)
            # An unchanged assignment is a fixed point: further rounds would
            # recompute the very same centroids, so stop early.
            converged = np.array_equal(new_clusters, clusters)
            clusters = new_clusters
            if converged:
                break

        return centroids

In [None]:
# Build the model with clusters_count=3 and iterations_count=1000.
clust = KMeans(3, 1000)

In [None]:
# fit() returns the learned centroids. Keep them, so that the plot in the next
# cell shows this model's result instead of the stale array left over from the
# manual loop in an earlier cell.
centroids = clust.fit(X)
centroids

In [None]:
# Ground-truth scatter with the notebook-level centroids overlaid in red.
u_labels = np.unique(true_labels)
for label in u_labels:
    in_group = true_labels == label
    plt.scatter(X[in_group, 0], X[in_group, 1], label=label)
plt.scatter(centroids[:, 0], centroids[:, 1], c="red")
plt.legend()
plt.show()

In [None]:
class KMeans():
    """Minimal k-means clustering with a fit/predict interface.

    Parameters
    ----------
    clusters_count : int
        Number of centroids to fit.
    iterations_count : int
        Maximum number of update/assignment rounds performed by ``fit``.

    Attributes
    ----------
    centroids : numpy.ndarray or None
        One centroid per row after ``fit``; ``None`` before.
    """

    def __init__(self, clusters_count=2, iterations_count=100000):
        self.clusters_count = clusters_count
        self.iterations_count = iterations_count
        self.centroids = None

    def fit(self, x):
        """Learn the centroids from the rows of ``x`` and return ``self``.

        Raises
        ------
        ValueError
            Raised by ``np.random.choice`` when ``clusters_count`` exceeds the
            number of rows in ``x``.
        """
        x = np.asarray(x)

        # Initialise with distinct rows of x drawn from NumPy's global RNG.
        idx = np.random.choice(len(x), self.clusters_count, replace=False)
        self.centroids = x[idx, :]
        clusters = self.predict(x)

        for _ in range(self.iterations_count):
            # Iterate over this instance's own cluster count; the loop must not
            # depend on a notebook-level variable such as ``k``.
            updated = []
            for cluster_id in range(self.clusters_count):
                members = x[clusters == cluster_id]
                # The mean of an empty selection is NaN and would poison every
                # later distance, so an empty cluster keeps its previous centroid.
                updated.append(members.mean(axis=0) if len(members) else self.centroids[cluster_id])
            self.centroids = np.vstack(updated)

            new_clusters = self.predict(x)
            # An unchanged assignment is a fixed point: further rounds would
            # recompute the very same centroids, so stop early.
            converged = np.array_equal(new_clusters, clusters)
            clusters = new_clusters
            if converged:
                break

        return self

    def predict(self, x):
        """Return, for each row of ``x``, the index of the nearest centroid.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if self.centroids is None:
            raise ValueError("KMeans.predict() called before fit()")
        distances = cdist(x, self.centroids, 'euclidean')
        return np.argmin(distances, axis=1)

In [None]:
# fit() returns the estimator itself, so construction and training can be chained.
clust = KMeans(clusters_count=3, iterations_count=1000).fit(X)
clust.predict(X)

In [None]:
# Cluster index the fitted model assigns to each sample.
pred = clust.predict(X)
# Rebind the notebook-level centroids to the model's, for the plot in the next cell.
centroids = clust.centroids

In [None]:
# Scatter the samples coloured by predicted cluster, with the centroids in red.
u_labels = np.unique(pred)
for cluster_id in u_labels:
    assigned = X[pred == cluster_id]
    plt.scatter(assigned[:, 0], assigned[:, 1], label=cluster_id)
plt.scatter(centroids[:, 0], centroids[:, 1], c="red")
plt.legend()
plt.show()

In [None]:
class KMeans():
    """Minimal k-means clustering that can plot intermediate states while fitting.

    Parameters
    ----------
    clusters_count : int
        Number of centroids to fit.
    iterations_count : int
        Maximum number of update/assignment rounds performed by ``fit``.
    plot_step : int
        Plot the current clustering on every round whose index is a multiple
        of this value (round 0 included) and once more when the assignment
        stops changing. ``0`` or ``None`` disables plotting.

    Attributes
    ----------
    centroids : numpy.ndarray or None
        One centroid per row after ``fit``; ``None`` before.
    """

    def __init__(self, clusters_count=2, iterations_count=100000, plot_step=10000000):
        self.clusters_count = clusters_count
        self.iterations_count = iterations_count
        self.centroids = None
        self.plot_step = plot_step

    def fit(self, x):
        """Learn the centroids from the rows of ``x`` and return ``self``.

        Plotting uses columns 0 and 1 of ``x``.

        Raises
        ------
        ValueError
            Raised by ``np.random.choice`` when ``clusters_count`` exceeds the
            number of rows in ``x``.
        """
        x = np.asarray(x)

        # Initialise with distinct rows of x drawn from NumPy's global RNG.
        idx = np.random.choice(len(x), self.clusters_count, replace=False)
        self.centroids = x[idx, :]
        points = self.predict(x)

        for step in range(self.iterations_count):
            # Iterate over this instance's own cluster count; the loop must not
            # depend on a notebook-level variable such as ``k``.
            updated = []
            for cluster_id in range(self.clusters_count):
                members = x[points == cluster_id]
                # The mean of an empty selection is NaN and would poison every
                # later distance, so an empty cluster keeps its previous centroid.
                updated.append(members.mean(axis=0) if len(members) else self.centroids[cluster_id])
            self.centroids = np.vstack(updated)

            new_points = self.predict(x)
            # An unchanged assignment is a fixed point: further rounds would
            # recompute the very same centroids.
            converged = np.array_equal(new_points, points)
            points = new_points

            # A falsy plot_step turns plotting off (and avoids ``step % 0``).
            if self.plot_step and (step % self.plot_step == 0 or converged):
                self._plot(x, points)
            if converged:
                break

        return self

    def predict(self, x):
        """Return, for each row of ``x``, the index of the nearest centroid.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if self.centroids is None:
            raise ValueError("KMeans.predict() called before fit()")
        distances = cdist(x, self.centroids, 'euclidean')
        return np.argmin(distances, axis=1)

    def _plot(self, x, assignment):
        """Scatter ``x`` coloured by ``assignment`` with the centroids in red."""
        # A dedicated loop variable keeps the drawing code from clobbering the
        # iteration counter of fit().
        for cluster_id in np.unique(assignment):
            members = assignment == cluster_id
            plt.scatter(x[members, 0], x[members, 1], label=cluster_id)
        plt.scatter(self.centroids[:, 0], self.centroids[:, 1], c="red")
        plt.legend()
        plt.show()

In [None]:
# Up to 10000 rounds, plotting according to plot_step=1000; then label every sample.
clust = KMeans(clusters_count=3, iterations_count=10000, plot_step=1000)
clust.fit(X)
clust.predict(X)

In [None]:
class KMeans():
    """Minimal k-means clustering with fit/predict and a plotting helper.

    Parameters
    ----------
    clusters_count : int
        Number of centroids to fit.
    iterations_count : int
        Maximum number of update/assignment rounds performed by ``fit``.
    plot_step : int
        Plot the current clustering on every round whose index is a multiple
        of this value (round 0 included) and once more when the assignment
        stops changing. ``0`` or ``None`` disables plotting.

    Attributes
    ----------
    centroids : numpy.ndarray or None
        One centroid per row after ``fit``; ``None`` before.
    """

    def __init__(self, clusters_count=2, iterations_count=100000, plot_step=10000000):
        self.clusters_count = clusters_count
        self.iterations_count = iterations_count
        self.centroids = None
        self.plot_step = plot_step

    def fit(self, x):
        """Learn the centroids from the rows of ``x`` and return ``self``.

        Raises
        ------
        ValueError
            Raised by ``np.random.choice`` when ``clusters_count`` exceeds the
            number of rows in ``x``.
        """
        x = np.asarray(x)

        # Initialise with distinct rows of x drawn from NumPy's global RNG.
        idx = np.random.choice(len(x), self.clusters_count, replace=False)
        self.centroids = x[idx, :]
        points = self.predict(x)

        for step in range(self.iterations_count):
            # Iterate over this instance's own cluster count; the loop must not
            # depend on a notebook-level variable such as ``k``.
            updated = []
            for cluster_id in range(self.clusters_count):
                members = x[points == cluster_id]
                # The mean of an empty selection is NaN and would poison every
                # later distance, so an empty cluster keeps its previous centroid.
                updated.append(members.mean(axis=0) if len(members) else self.centroids[cluster_id])
            self.centroids = np.vstack(updated)

            new_points = self.predict(x)
            # An unchanged assignment is a fixed point: further rounds would
            # recompute the very same centroids.
            converged = np.array_equal(new_points, points)
            points = new_points

            # A falsy plot_step turns plotting off (and avoids ``step % 0``).
            if self.plot_step and (step % self.plot_step == 0 or converged):
                self.plot(x, points)
            if converged:
                break

        return self

    def predict(self, x):
        """Return, for each row of ``x``, the index of the nearest centroid.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if self.centroids is None:
            raise ValueError("KMeans.predict() called before fit()")
        distances = cdist(x, self.centroids, 'euclidean')
        return np.argmin(distances, axis=1)

    def plot(self, x, cluster):
        """Scatter columns 0 and 1 of ``x`` coloured by ``cluster``, centroids in red.

        Parameters
        ----------
        x : numpy.ndarray
            Samples, one per row.
        cluster : numpy.ndarray
            Cluster index of each row of ``x``.
        """
        for cluster_id in np.unique(cluster):
            members = cluster == cluster_id
            plt.scatter(x[members, 0], x[members, 1], label=cluster_id)
        plt.scatter(self.centroids[:, 0], self.centroids[:, 1], c="red")
        plt.legend()
        plt.show()

In [None]:
# fit() returns the estimator itself, so it can be chained onto the constructor.
clust = KMeans(3, iterations_count=10000, plot_step=1000).fit(X)
clust.predict(X)

In [None]:
# Regenerate the dataset with cluster_std=4 (the first dataset did not set cluster_std).
# NOTE: this rebinds X and true_labels; the earlier cells must be re-run to get the
# first dataset back.
centers = 3
X, true_labels = make_blobs(n_samples=100, centers=centers, cluster_std=4, random_state=42)

In [None]:
# Scatter the regenerated samples, one colour per ground-truth label.
u_labels = np.unique(true_labels)
for label in u_labels:
    blob = X[true_labels == label]
    plt.scatter(blob[:, 0], blob[:, 1], label=label)
plt.legend()
plt.show()

In [None]:
# Fit a fresh model on the current X, then label every sample.
clust = KMeans(clusters_count=3, iterations_count=10000, plot_step=1000)
clust = clust.fit(X)
clust.predict(X)