In [1]:
#Let's start by randomly initializing K cluster centroids
import numpy as np
def initialize_centroids(X, K):
    """Choose K rows of X at random to act as the starting centroids.

    Rows are drawn without replacement whenever X has at least K rows, so
    two centroids never start on the same data row (a duplicated start
    leaves one cluster empty after the first assignment step). If K is
    larger than the number of rows, sampling falls back to drawing with
    replacement so a (K, n) result is still produced.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Data set, one sample per row.
    K : int
        Number of centroids to pick.

    Returns
    -------
    numpy.ndarray of shape (K, n), dtype float
        A copy of the selected rows.

    Raises
    ------
    ValueError
        If X has no rows and K > 0 (raised by ``np.random.choice``).
    """
    data = np.asarray(X, dtype=float)
    m, _ = np.shape(data)
    # Distinct rows when possible; repeats only when K exceeds the row count.
    row_ids = np.random.choice(m, size=K, replace=K > m)
    # Fancy indexing returns a fresh (K, n) array, not a view into X.
    return data[row_ids]

In [None]:
#Compute euclidean distance between two vectors
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between x1 and x2.

    The element-wise difference is squared, summed over all elements and
    the square root of that total is returned.
    """
    delta = x1 - x2
    squared_total = np.sum(delta * delta)
    return np.sqrt(squared_total)


In [None]:
def closest_centroid(x, centroids, K):
    """Return the index of the centroid that lies nearest to x.

    The distance from x to each of the first K entries of `centroids` is
    measured with `euclidean_distance`; the position of the smallest
    distance is returned (the first one when several are tied, as
    `np.argmin` does).
    """
    gaps = np.array(
        [euclidean_distance(centroids[k], x) for k in range(K)],
        dtype=float,
    )
    return np.argmin(gaps)

In [None]:
#create the clusters
def create_clusters(X, centroids, K):
    """Assign every row of X to its nearest centroid.

    Returns a 1-D float array with one entry per row of X; entry i holds
    the index returned by `closest_centroid` for row i.
    """
    n_rows, _ = np.shape(X)
    labels = [closest_centroid(X[row], centroids, K) for row in range(n_rows)]
    # Labels are stored as floats, matching an array created with np.empty(m).
    return np.array(labels, dtype=float)

In [None]:
#compute the means of the clusters
def compute_means(X, clusters, K):
    """Recompute each centroid as the mean of the rows assigned to it.

    A cluster that received no rows has no mean: ``np.mean`` over the empty
    selection would give a NaN centroid. Instead, an empty cluster is
    re-seeded with a randomly chosen row of X so that it can attract points
    again on the next assignment step.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Data set, one sample per row.
    clusters : array-like of shape (m,)
        Cluster index (0..K-1) for every row of X.
    K : int
        Number of clusters.

    Returns
    -------
    numpy.ndarray of shape (K, n), dtype float
        The updated centroids. Rows are NaN only when X itself has no rows.
    """
    data = np.asarray(X, dtype=float)
    labels = np.asarray(clusters)
    m, n = np.shape(data)
    centroids = np.empty((K, n))
    for i in range(K):
        members = data[labels == i]
        if members.shape[0]:
            centroids[i] = members.mean(axis=0)
        elif m:
            # Empty cluster: restart it from a random data row.
            centroids[i] = data[np.random.randint(m)]
        else:
            # No data at all, so there is nothing to average or re-seed from.
            centroids[i] = np.nan
    return centroids

In [None]:
def run_kmeans(X, K, max_iters):
    """Cluster the rows of X into K groups with the k-means algorithm.

    Starting from centroids chosen by `initialize_centroids`, the function
    alternates between assigning rows to their nearest centroid
    (`create_clusters`) and recomputing the centroids (`compute_means`).
    It stops as soon as an update leaves every centroid unchanged, or after
    `max_iters` rounds, whichever comes first.

    Parameters
    ----------
    X : array of shape (m, n)
        Data set, one sample per row.
    K : int
        Number of clusters.
    max_iters : int
        Upper bound on the number of assign/update rounds. When it is zero
        or negative, no update is performed and the rows are simply
        assigned to the initial centroids.

    Returns
    -------
    The cluster assignment produced by `create_clusters` in the last round
    that was run.
    """
    centroids = initialize_centroids(X, K)
    print(f"Inital centroids: {centroids}")
    if max_iters <= 0:
        # Without this guard the loop never runs and `clusters` would be
        # unbound at the final return.
        return create_clusters(X, centroids, K)
    for _ in range(max_iters):
        clusters = create_clusters(X, centroids, K)
        prev_centroids = centroids
        centroids = compute_means(X, clusters, K)
        # Converged when no coordinate of any centroid moved.
        diff = prev_centroids - centroids
        if not diff.any():
            return clusters
    return clusters

In [None]:
#testing:
from sklearn import datasets
X, y = datasets.make_blobs()
# run_kmeans expects (X, K, max_iters): the data first, then the number of
# clusters, then the cap on the number of iterations.
y_preds = run_kmeans(X, 3, 100)

In [None]:
import matplotlib.pyplot as plt
