## Homework 8 Question 2

In [1]:
import numpy as np

# Data set for this question: ten points, one per row, each with three
# integer coordinates.
data = np.array([
    [1, 1, 0],
    [1, 0, 1],
    [2, 0, 0],
    [1, 3, 1],
    [3, 3, 2],
    [3, 0, 0],
    [0, 2, 1],
    [2, 3, 0],
    [0, 0, 0],
    [2, 1, 3]
])

#### K-means Algorithm

In [2]:
def dist(p1, p2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        p1: first point; must support element-wise ``p1 - p2``
            (e.g. a NumPy array).
        p2: second point, with the same shape as ``p1``.

    Returns:
        The square root of the sum of squared coordinate differences.
    """
    offset = p1 - p2
    squared_length = np.sum(np.square(offset))
    return np.sqrt(squared_length)

def assign_clusters(data, centroids):
    """Group every point of ``data`` under its nearest centroid.

    Args:
        data: iterable of points.
        centroids: sequence of centroid points; its length fixes the number
            of clusters.

    Returns:
        A list with one inner list per centroid (same order as ``centroids``).
        Each inner list holds the points whose smallest ``dist`` is to that
        centroid, in the order they appear in ``data``. A centroid that is
        nearest to no point gets an empty list.
    """
    buckets = [[] for _ in range(len(centroids))]

    for point in data:
        # np.argmin picks the first index on ties, so equidistant points go
        # to the earliest-listed centroid.
        nearest = np.argmin([dist(point, c) for c in centroids])
        buckets[nearest].append(point)

    return buckets
        
def recompute_means(clusters):
    """Compute the centroid of each cluster.

    Args:
        clusters: list of clusters, each a list of points.

    Returns:
        A list, in the same order as ``clusters``, holding the
        coordinate-wise mean (``np.mean`` over axis 0) of each cluster.
    """
    return [np.mean(members, axis=0) for members in clusters]
        
def k_means(data, initial_centroids):
    """Cluster ``data`` with k-means, starting from ``initial_centroids``.

    Repeats two steps until an assignment pass reproduces the clusters of the
    previous pass: assign every point to its nearest centroid
    (``assign_clusters``), then move each centroid to the mean of its
    cluster (``recompute_means``).

    Args:
        data: iterable of points (e.g. the rows of a 2-D array).
        initial_centroids: array-like of starting centroids, one per cluster.

    Returns:
        A ``(centroids, clusters)`` tuple. ``centroids`` is the list of final
        centroids; ``clusters`` is a list of lists of points, in the same
        order as ``centroids``.
    """
    centroids = np.array(initial_centroids)

    prev_clusters = None
    while True:
        # Assignment step.
        clusters = assign_clusters(data, centroids)

        # Converged once no cluster's membership changed. Points are always
        # visited in the same order, so equal membership means equal lists.
        if prev_clusters is not None and all(
            np.array_equal(c1, c2) for c1, c2 in zip(clusters, prev_clusters)
        ):
            break

        # Update step. A cluster that attracted no points has no mean:
        # np.mean of an empty list is NaN, and a NaN centroid yields NaN
        # distances that corrupt every later assignment. Let an empty cluster
        # stand in as its own single member so its centroid stays put.
        populated = [
            cluster if len(cluster) > 0 else [centroid]
            for cluster, centroid in zip(clusters, centroids)
        ]
        centroids = recompute_means(populated)

        prev_clusters = clusters

    return centroids, clusters


In [3]:
# Initial centroids for run (a): two clusters, seeded from two of the data points.
initial_centroid_a = np.array([[1, 1, 0], [2, 3, 0]])
centroids, clusters = k_means(data, initial_centroid_a)

# Final centroids, one per line.
print("Centroids:", *centroids, sep="\n")

# Members of each cluster, listed in the same order as the centroids.
for members in clusters:
    print("Cluster:", *members, sep="\n")

Centroids:
[1.28571429 0.57142857 0.71428571]
[2. 3. 1.]
Cluster:
[1 1 0]
[1 0 1]
[2 0 0]
[3 0 0]
[0 2 1]
[0 0 0]
[2 1 3]
Cluster:
[1 3 1]
[3 3 2]
[2 3 0]


In [4]:
# Initial centroids for run (b): three clusters, seeded from three of the data points.
initial_centroid_b = np.array([[0, 0, 0], [3, 3, 2], [2, 0, 0]])
centroids, clusters = k_means(data, initial_centroid_b)

# Final centroids, one per line.
print("Centroids:", *centroids, sep="\n")

# Members of each cluster, listed in the same order as the centroids.
for members in clusters:
    print("Cluster:", *members, sep="\n")

Centroids:
[0.5  0.75 0.5 ]
[2.  2.5 1.5]
[2.5 0.  0. ]
Cluster:
[1 1 0]
[1 0 1]
[0 2 1]
[0 0 0]
Cluster:
[1 3 1]
[3 3 2]
[2 3 0]
[2 1 3]
Cluster:
[2 0 0]
[3 0 0]
