In [22]:
# Imports section
import numpy as np
import random
import math

In [23]:
# Input the number of clusters
print("Enter the value of k: ")
k = int(input())
# Fail fast with a clear message: a non-positive k would otherwise surface
# later as an obscure error when the initial centroids are sampled or when
# the nearest centroid is looked up in an empty centroid list.
if k < 1:
    raise ValueError("k must be a positive integer, got {}".format(k))

Enter the value of k: 
3


In [24]:
# Prepare random data: 15 points whose x and y coordinates are integers drawn
# from [0, 100).
# `np.int` was only an alias for the builtin `int` and has been removed from
# NumPy, so the builtin is used for the cast instead.
arr1 = np.random.randint(100, size=15).astype(int)
arr2 = np.random.randint(100, size=15).astype(int)
# Pair the coordinates up as (x, y) tuples so points are hashable.
data = list(zip(arr1, arr2))
print(data)

[(96, 50), (62, 42), (6, 10), (13, 73), (29, 36), (87, 27), (63, 27), (99, 74), (54, 62), (71, 21), (73, 65), (70, 21), (84, 79), (70, 76), (60, 87)]


In [25]:
# Choose k distinct random samples from data as the initial centroids.
# random.sample() requires a sequence (passing a set raises TypeError on
# Python 3.11+), so the points are de-duplicated into a list first;
# dict.fromkeys keeps the original order of `data`.
unique_points = list(dict.fromkeys(data))
# Raises ValueError if k exceeds the number of distinct points.
centroids = random.sample(unique_points, k=k)
print(centroids)

[(54, 62), (84, 79), (96, 50)]


In [26]:
# The function below checks whether the clusters formed before and after an
# iteration are the same.
# If they are, the main loop terminates.
def check_clusters_same(cluster1: dict, cluster2: dict) -> bool:
    """Return True when two clusterings are identical.

    Two clusterings are considered identical when they have exactly the same
    keys and, for every key, the same members regardless of member order.

    Args:
        cluster1: Mapping of a cluster key to the list of its members.
        cluster2: Mapping of a cluster key to the list of its members.

    Returns:
        True if both mappings describe the same clusters, False otherwise.
    """
    if cluster1.keys() != cluster2.keys():
        return False
    # sorted() returns a new list. list.sort() sorts in place and returns
    # None, so comparing its results would always compare None with None and
    # would also reorder the caller's lists.
    return all(sorted(cluster1[key]) == sorted(cluster2[key]) for key in cluster1)

def euclidean_distance(x1,y1,x2,y2):
    """Return the straight-line distance between (x1, y1) and (x2, y2).

    The result is rounded to two decimal places.
    """
    delta_x = x2 - x1
    delta_y = y2 - y1
    # Squaring discards the sign, so no absolute value is needed.
    squared_length = delta_x ** 2 + delta_y ** 2
    return round(math.sqrt(squared_length), 2)

# k-means main loop.
# Reads `data` (list of (x, y) tuples), `centroids` (list of (x, y) tuples,
# updated in place) and `k`. On exit `result` maps each centroid that owns at
# least one point to the list of points assigned to it.
result = {}
prev_result = {}
iteration = 0
# Repeat until the clustering is unchanged between two consecutive iterations.
while True:
    print("Iteration ", iteration, ":")
    prev_result = result
    result = {}

    # Assignment step: attach every data point to its closest centroid.
    # min() returns the first centroid with the smallest distance, i.e. ties
    # go to the centroid that appears earliest in `centroids`.
    for point in data:
        nearest = min(
            centroids,
            key=lambda center: euclidean_distance(
                point[0], point[1], center[0], center[1]
            ),
        )
        result.setdefault(nearest, []).append(point)

    print("Clustered dataset: ", result)

    # Check if the previous cluster and current cluster are same.
    if check_clusters_same(prev_result, result):
        break

    # Update step: move each centroid to the mean of the points in its cluster.
    new_centroids = []
    for center in centroids:
        points = result.get(center)
        if not points:
            # No point was closest to this centroid, so it has no entry in
            # `result`. Keep it where it is instead of failing on the lookup.
            new_centroids.append(center)
            continue
        x_mean = round(np.mean([p[0] for p in points]), 2)
        y_mean = round(np.mean([p[1] for p in points]), 2)
        new_centroids.append((x_mean, y_mean))
    # Replace the contents in place so the existing list object is reused.
    centroids[:] = new_centroids
    print("New Centroids: ", centroids)
    iteration = iteration + 1
    print("\n")

print("\n----- OUTPUT -----")
print("NO. OF CLUSTERS  : ", k)
print("CLUSTER CENTROIDS: ", result.keys())
print("CLUSTERED DATASET:", result)

Iteration  0 :
Clustered dataset:  {(96, 50): [(96, 50), (87, 27), (71, 21), (70, 21)], (54, 62): [(62, 42), (6, 10), (13, 73), (29, 36), (63, 27), (54, 62)], (84, 79): [(99, 74), (73, 65), (84, 79), (70, 76), (60, 87)]}
New Centroids:  [(37.83, 41.67), (77.2, 76.2), (81.0, 29.75)]


Iteration  1 :
Clustered dataset:  {(81.0, 29.75): [(96, 50), (62, 42), (87, 27), (63, 27), (71, 21), (70, 21)], (37.83, 41.67): [(6, 10), (13, 73), (29, 36), (54, 62)], (77.2, 76.2): [(99, 74), (73, 65), (84, 79), (70, 76), (60, 87)]}
New Centroids:  [(25.5, 45.25), (77.2, 76.2), (74.83, 31.33)]


Iteration  2 :
Clustered dataset:  {(74.83, 31.33): [(96, 50), (62, 42), (87, 27), (63, 27), (71, 21), (70, 21)], (25.5, 45.25): [(6, 10), (13, 73), (29, 36)], (77.2, 76.2): [(99, 74), (54, 62), (73, 65), (84, 79), (70, 76), (60, 87)]}
New Centroids:  [(16.0, 39.67), (73.33, 73.83), (74.83, 31.33)]


Iteration  3 :
Clustered dataset:  {(74.83, 31.33): [(96, 50), (62, 42), (87, 27), (63, 27), (71, 21), (70, 21)],