KMeans
===

Pattern Recognition Course MCS 2020 <br />
K-Means implementation from scratch <br />
Herbelin Ludovic <br />

In [2]:
import csv
import numpy as np
import random
from scipy.stats.mstats import gmean
import scipy.spatial as sp
from math import sqrt
from tqdm import tqdm, trange

In [3]:
# Number of clusters (k) requested from k-means.
K_CLUSTERS = 5

# Paths of the CSV datasets, relative to the working directory.
TEST_FILE = 'data/test.csv'
TRAIN_FILE = 'data/train.csv'

In [4]:
class Sample:
    """A data point together with the cluster it is currently assigned to.

    Attributes:
        cluster: identifier of the assigned cluster, stored exactly as passed
            in (the notebook passes ``None`` before any clustering has run).
        feature: the feature vector of the data point, stored exactly as
            passed in.
    """

    def __init__(self, cluster, feature):
        # Plain attribute storage: no copy and no validation is performed.
        self.feature = feature
        self.cluster = cluster

## Read dataset

Each row of the CSV file has the format `digit, pixel1, pixel2, pixel3, ...`: the first column is the digit label and the remaining columns are the pixel values.

In [5]:
def read_dataset(file_path):
    """Load a labelled dataset from a CSV file.

    Every row is expected to hold integers laid out as
    ``label, value1, value2, ...``.

    Args:
        file_path: path of the CSV file to read.

    Returns:
        A ``(samples, labels)`` tuple of integer numpy arrays. ``samples`` is
        the 2-D matrix made of every column but the first (one row per CSV
        row) and ``labels`` is the 1-D vector of first-column values. When the
        file holds no data row, both arrays are empty.

    Raises:
        ValueError: if a cell cannot be parsed as an integer or if the rows do
            not all have the same length.
    """
    # newline='' is what the csv module asks for so it can handle line endings itself.
    with open(file_path, 'r', newline='') as file:
        # Blank lines are returned as empty lists; drop them so the matrix stays rectangular.
        rows = [row for row in csv.reader(file) if row]

    # An empty file would otherwise produce a 1-D array and fail on the column slicing below.
    if not rows:
        return np.empty((0, 0), dtype=int), np.empty((0,), dtype=int)

    matrix = np.array(rows, dtype=int)
    return matrix[:, 1:], matrix[:, 0]

In [24]:
# Number of training rows kept for clustering.
N_TRAIN_SAMPLES = 100

train_features, train_labels = read_dataset(TRAIN_FILE)
# Slice before wrapping so that Sample objects are only built for the rows that are kept.
# NOTE(review): train_labels is not truncated and still covers every row of the file.
train_samples = [Sample(None, np.array(feature)) for feature in train_features[:N_TRAIN_SAMPLES]]

print(len(train_samples))

100


## KMeans function

def k_means(samples, k_clusters, max_iter=100):
    """Cluster ``samples`` with k-means, one sample at a time.

    NOTE: a later cell of this notebook defines ``k_means`` again with a
    vectorised distance computation; on "Run All" that later definition
    replaces this one.

    Args:
        samples: sequence of objects exposing a ``feature`` vector and a
            writable ``cluster`` attribute. ``cluster`` is overwritten with
            the index (``0 .. k_clusters - 1``) of the closest centre.
        k_clusters: number of clusters to build.
        max_iter: number of assignment / update rounds to run.

    Returns:
        A ``(k_clusters, n_features)`` float array holding the final centres.

    Raises:
        ValueError: raised by ``random.sample`` when ``k_clusters`` is
            negative or larger than ``len(samples)``.
    """
    # Start from k distinct samples picked at random. Centres are kept as
    # 1 x d matrices because cdist only accepts 2-D inputs.
    cluster_centers = [
        np.asarray(point.feature, dtype=float).reshape(1, -1)
        for point in random.sample(samples, k_clusters)
    ]

    for _ in trange(max_iter):
        # Assignment step: attach every sample to its closest centre.
        for xi in samples:
            xi_row = np.asarray(xi.feature, dtype=float).reshape(1, -1)
            cluster_distances = [
                sp.distance.cdist(xi_row, cluster_center, metric='euclidean')[0, 0]
                for cluster_center in cluster_centers
            ]
            # argmin returns the first minimum, so ties go to the lowest index.
            xi.cluster = int(np.argmin(cluster_distances))

        # Update step: move each centre to the arithmetic mean of its members.
        for cluster_index in range(k_clusters):
            cluster_samples = [
                sample.feature for sample in samples if sample.cluster == cluster_index
            ]
            # A cluster without members keeps its previous centre; the mean of
            # an empty list would be NaN.
            if cluster_samples:
                cluster_centers[cluster_index] = np.mean(cluster_samples, axis=0).reshape(1, -1)

    return np.vstack(cluster_centers)


In [42]:
def k_means(samples, k_clusters, max_iter=100):
    """Cluster ``samples`` with k-means using vectorised distance computation.

    Args:
        samples: sequence of objects exposing a ``feature`` vector and a
            writable ``cluster`` attribute. ``cluster`` is overwritten with
            the index (``0 .. k_clusters - 1``) of the closest centre.
        k_clusters: number of clusters to build.
        max_iter: maximum number of assignment / update rounds; the loop stops
            earlier once the assignments no longer change.

    Returns:
        A ``(k_clusters, n_features)`` float array holding the final centres.

    Raises:
        ValueError: raised by ``random.sample`` when ``k_clusters`` is
            negative or larger than ``len(samples)``.
    """
    # The feature matrix does not change between iterations, so build it once.
    features = np.array([sample.feature for sample in samples], dtype=float)

    # Start from k distinct samples picked at random.
    cluster_centers = np.array(
        [point.feature for point in random.sample(samples, k_clusters)], dtype=float
    )

    previous_assignments = None
    for _ in trange(max_iter):
        # Distance matrix M with M[i, j] = dist(features[i], cluster_centers[j]).
        cluster_distances = sp.distance.cdist(features, cluster_centers, metric='euclidean')

        # Closest centre per sample; argmin sends ties to the lowest index.
        assignments = cluster_distances.argmin(axis=1)

        # Unchanged assignments mean the centres computed in the previous round
        # are already the means of these clusters: nothing left to update.
        if previous_assignments is not None and np.array_equal(assignments, previous_assignments):
            break
        previous_assignments = assignments

        for sample, cluster_id in zip(samples, assignments):
            sample.cluster = int(cluster_id)

        # Move every centre to the arithmetic mean of its members.
        for cluster_index in range(k_clusters):
            members = assignments == cluster_index
            # A cluster without members keeps its previous centre; the mean of
            # an empty selection would be NaN and break the next cdist call.
            if members.any():
                cluster_centers[cluster_index] = features[members].mean(axis=0)

    return cluster_centers
            
        

## Main

In [43]:
# k_means draws its initial centres with random.sample, so seed Python's RNG
# to get the same clustering on every run of the notebook.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

print("starting clustering...")
k_means(train_samples, K_CLUSTERS, max_iter=10)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 665.93it/s]

starting clustering...



