In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def assign(dataset, centroids):
    """
    Partition the samples by nearest centroid (squared Euclidean distance).

    Parameters
    ----------
    dataset : array-like of shape (N, D)
        Samples to assign, one per row.
    centroids : array-like of shape (K, D)
        Current centroids; a list of K length-D arrays is accepted.

    Returns
    -------
    list of np.ndarray
        K arrays; element k holds, in their original order, the rows of
        ``dataset`` whose nearest centroid is ``centroids[k]``. Every element
        is 2-D with shape (n_k, D), including clusters that received no
        samples, so column indexing on a cluster stays valid.

    Ties are resolved in favour of the lowest centroid index (``argmin``).

    Time complexity: O(K*N)
    """
    samples = np.asarray(dataset)
    centers = np.asarray(centroids)
    K = len(centers)

    if samples.size == 0:
        # Nothing to assign: hand back K empty clusters that still carry the
        # feature dimension whenever the centroids reveal it.
        width = centers.shape[1] if centers.ndim == 2 else 0
        return [np.empty((0, width), dtype=samples.dtype) for _ in range(K)]

    # Broadcasting (N, 1, D) - (1, K, D) yields every sample/centroid
    # difference at once; this replaces the per-sample np.tile and Python loop
    # at the cost of an O(N*K*D) temporary.
    offsets = samples[:, np.newaxis, :] - centers[np.newaxis, :, :]
    nearest = np.sum(offsets**2, axis=2).argmin(axis=1)

    # Boolean masks keep the original row order and, unlike building a list
    # and calling np.array on it, keep the (0, D) shape for empty clusters.
    return [samples[nearest == k] for k in range(K)]


def update(clusters, previous=None):
    """
    Recompute each centroid as the rounded per-feature mean of its cluster.

    Parameters
    ----------
    clusters : sequence of array-like
        One (n_k, D) collection of samples per cluster.
    previous : sequence of array-like, optional
        Centroids from the preceding iteration, aligned with ``clusters``.
        When given, a cluster with no samples keeps ``previous[k]`` (as a
        float array) instead of turning into NaN.

    Returns
    -------
    list of np.ndarray
        One centroid per cluster, each rounded to the nearest integer value
        (``ndarray.round``, which rounds halves to the nearest even value).

    Without ``previous``, an empty cluster still produces a NaN centroid (and
    NumPy's empty-mean RuntimeWarning), exactly as before.

    Time complexity: O(N)
    """
    centroids = []
    for k, members in enumerate(clusters):
        members = np.asarray(members)
        if members.size == 0 and previous is not None:
            # The mean of zero samples is undefined; reuse the last known
            # position so the centroid list stays finite and rectangular.
            centroids.append(np.asarray(previous[k], dtype=float))
        else:
            centroids.append(members.mean(axis=0).round())
    return centroids


def compare(centroids1, centroids2):
    """
    Report whether two centroid collections are identical.

    Parameters
    ----------
    centroids1, centroids2 : sequence of array-like
        Centroid collections to compare position by position.

    Returns
    -------
    bool
        True only when both collections have the same length and every pair
        of centroids has the same shape and the same values. NaN entries in
        matching positions count as equal, so a centroid that stays NaN from
        one iteration to the next does not prevent convergence from being
        detected.

    Time complexity: O(K)
    """
    # A differing number of centroids can never be "the same"; checking up
    # front also avoids indexing past the end of the shorter collection.
    if len(centroids1) != len(centroids2):
        return False
    return all(
        np.array_equal(first, second, equal_nan=True)
        for first, second in zip(centroids1, centroids2)
    )

In [None]:
# Clustering setup: number of clusters and the two features to cluster on.
K = 3
columns = ['track_popularity', 'key']

# Read the CSV, keep only the selected feature columns, and expose them as a
# NumPy matrix with one row per record.
df = pd.read_csv('./dataset/spotify.csv').loc[:, columns]
dataset = df.to_numpy()

# Deterministic initialisation: the first K row indices spaced len(df)//K
# apart (starting at row 0) provide the initial centroids.
seeds = list(range(0, len(df), len(df)//K))[:K]
centroids = dataset[seeds]

In [None]:
# Iterative centroid refinement: assign samples, recompute centroids, repeat
# until the centroids stop changing.
# NOTE: this cell was labelled "K-modes", but update() takes the rounded mean
# of each cluster rather than a mode, so this is k-means with rounded centroids.

# Safety cap so a non-converging run cannot spin forever.
MAX_ITER = 300

# Placeholder "previous" state that forces at least one comparison against
# the seed centroids.
old_centroids = [np.zeros(len(columns)) for _ in range(K)]
old_clusters = [[] for _ in range(K)]
new_centroids = centroids
new_clusters = assign(dataset, centroids)

t = 0
while not compare(old_centroids, new_centroids):
    if t >= MAX_ITER:
        print(f'Stopped after {MAX_ITER} iterations without convergence')
        break
    print(f'{t}/ old: {old_centroids[0]}..., new: {new_centroids[0]}...')
    t += 1
    old_centroids = new_centroids
    old_clusters = new_clusters
    # A cluster that lost all of its samples has no mean (update() yields NaN
    # for it); keep its previous centroid so the next assignment still works.
    new_centroids = [
        fresh if np.all(np.isfinite(fresh)) else stale
        for stale, fresh in zip(old_centroids, update(new_clusters))
    ]
    new_clusters = assign(dataset, new_centroids)
clusters = new_clusters
centroids = new_centroids

In [None]:
# Draw each cluster's samples as small dots (one scatter call per cluster, so
# each gets the next colour in the default cycle).
for members in clusters:
    plt.scatter(members[:, 0], members[:, 1], s=1)

# Overlay the centroids as larger black markers.
for center in centroids:
    plt.scatter(center[0], center[1], s=20, c='black')

# Axes are labelled with the two clustered feature names.
plt.xlabel(columns[0])
plt.ylabel(columns[1])
plt.show()