### Kaustav Vats (2016048)
### Topic: K-Means Clustering

In [147]:
import numpy as np
# from math import sqrt
from copy import deepcopy
from tqdm import tqdm_notebook as tqdm

In [171]:
class Point:
    """Label record for one data point: its name and the class it came from."""

    def __init__(self, name, point_class):
        # Name is the point's identifier, c is its class label.
        self.Name, self.c = name, point_class

    def __str__(self):
        # Rendered as "<class> <name>".
        return " ".join((self.c, self.Name))

    def getName(self):
        """Return the point's name."""
        return self.Name

    def getClass(self):
        """Return the point's class label."""
        return self.c
    
def load_data(filename, point_class):
    """Read whitespace-separated vectors from a text file.

    Each non-blank line is expected to hold a name followed by the
    numeric components of its vector.

    Args:
        filename: Path of the file to read.
        point_class: Class label attached to every record of this file.

    Returns:
        A list of ``(name, point_class, vector)`` tuples, where ``vector``
        is a 1-D NumPy array of floats.

    Raises:
        ValueError: If a component cannot be parsed as a float.
    """
    data = []
    # The context manager guarantees the handle is closed, even on a parse error.
    with open(filename, 'r') as handle:
        for line in handle:
            # split() without arguments tolerates repeated spaces and tabs.
            tokens = line.split()
            if not tokens:
                # A blank line would otherwise yield a nameless, empty vector.
                continue
            values = [float(token) for token in tokens[1:]]
            data.append((tokens[0], point_class, np.asarray(values)))
    return data

def pre_process(data):
    """Split loaded records into a feature matrix and a list of labels.

    Args:
        data: Sequence of ``(name, point_class, vector)`` records; the
            first record's vector length fixes the number of columns.

    Returns:
        ``(X, Y)`` where ``X`` has one row per record holding its vector
        and ``Y`` is the list of matching ``Point`` objects.
    """
    n_features = data[0][2].shape[0]
    X = np.zeros((len(data), n_features))
    for row, record in enumerate(data):
        X[row, :] = record[2][:]
    Y = [Point(record[0], record[1]) for record in data]
    return X, Y

def getEuclideanDistance(center, x):
    """Return the Euclidean distance from ``center`` to every row of ``x``.

    The difference ``center - x`` is squared, summed along axis 1 and
    square-rooted, giving one distance per row.
    """
    squared_diff = np.square(center - x)
    return np.sqrt(np.sum(squared_diff, axis=1))

def UpdateMean(k, label, x):
    """Recompute the centroid of each cluster as the mean of its members.

    Args:
        k: Number of clusters.
        label: 1-D array with the cluster index assigned to each row of ``x``.
        x: 2-D array with one sample per row.

    Returns:
        A list of ``k`` 1-D arrays of length ``x.shape[1]``. A cluster with
        no members gets an all-zero centroid.
    """
    label = np.asarray(label)
    centers = []
    for cluster in range(k):
        members = x[label == cluster]
        if members.shape[0] > 0:
            centers.append(np.mean(members, axis=0))
        else:
            # Same 1-D shape as the non-empty case so that every centroid
            # in the returned list is interchangeable and stackable.
            centers.append(np.zeros(x.shape[1]))
    return centers

def KMean(K, X, mini=-3.3685, maxi=1.9523, max_iter=1000):
    """Cluster the rows of ``X`` into ``K`` groups with K-Means.

    Args:
        K: Number of clusters.
        X: 2-D array with one sample per row.
        mini: Lower bound of the uniform range used to draw the initial
            centroids.
        maxi: Upper bound of that range.
        max_iter: Maximum number of assign/update rounds.

    Returns:
        A 1-D array holding the cluster index assigned to each row of ``X``.
    """
    n_points = X.shape[0]
    n_dims = X.shape[1]

    # -1 marks "not assigned yet", so the first comparison never matches.
    assignment = np.full(n_points, -1.0)
    previous = assignment

    # Initial centroids are drawn uniformly at random from [mini, maxi).
    centroids = [
        np.random.uniform(low=mini, high=maxi, size=(1, n_dims))
        for _ in range(K)
    ]

    for _ in tqdm(range(max_iter)):
        # Row c of the matrix holds the distance of every sample to centroid c.
        distances = np.zeros((K, n_points))
        for idx, centroid in enumerate(centroids):
            distances[idx, :] = getEuclideanDistance(centroid, X)

        assignment = np.argmin(distances, axis=0)
        centroids = UpdateMean(K, assignment, X)

        # Converged: no sample changed cluster since the previous round.
        if np.array_equal(assignment, previous):
            break
        previous = assignment
    return assignment

In [172]:
# Load every class file from the data directory; each file name doubles as
# the class label of the records it contains.
Classes = ['animals', 'countries', 'fruits', 'veggies']
dataPath = 'clustering-data/'
Data = []
for class_name in Classes:
    Data.extend(load_data(dataPath + class_name, class_name))
print("Data Points: {}".format(len(Data)))
print("Vector Length: {}".format(Data[0][2].shape))

Data Points: 329
Vector Length: (300,)


In [173]:
# Build the feature matrix and label list, then report the matrix shape and
# the overall value range of the features.
X, Y = pre_process(Data)
print(X.shape)
print("Min Val: {}, Max Val: {}".format(X.min(), X.max()))

(329, 300)
Min Val: -3.3685, Max Val: 1.9523


In [174]:
# Cluster the samples into 4 groups (one per entry in Classes); the returned
# label array is displayed as the cell output.
KMean(4, X, max_iter=10000)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

array([2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,