KMeans
===

Pattern Recognition Course MCS 2020 <br />
K_Means implementation by hand <br />
Herbelin Ludovic <br />

In [1]:
import csv
import numpy as np
import random
from scipy.stats.mstats import gmean
import scipy.spatial as sp
from math import sqrt
from tqdm import tqdm, trange

In [2]:
TRAIN_FILE = 'data/train.csv'
TEST_FILE = 'data/test.csv'

K_CLUSTERS = 5

In [3]:
class Sample:
    def __init__(self, cluster, feature):
        self.cluster = cluster
        self.feature = feature

## Read dataset

The data are formatted as such : digit, pixel1, pixel2, pixel3, ...

In [4]:
def read_dataset(file_path):
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        data = list(reader)
        matrix = np.array(data, dtype = int)
        samples = matrix[:,1:]
        labels = matrix[:,0]

        return samples, labels

In [5]:
train_samples, train_labels = read_dataset(TRAIN_FILE)
# reshape the array as a matrix so we can use scipy's cdist
train_samples = [Sample(None, np.array(sample).reshape(1, -1)) for sample in train_samples]

print(len(train_samples))

26999


## KMeans function

In [6]:
def k_means(samples, k_clusters, max_iter=100):
    
    # assign centers as random points (temporarily)
    initial_centers = random.sample(samples, k_clusters)
    cluster_centers = []
    for point in initial_centers:
        cluster_centers.append(point.feature)

    for i in trange(max_iter):
        for xi in (sample for sample in samples):
            # compute distances to each cluster for this sample 
            cluster_distances = []
            for cluster_center in cluster_centers:
                try:
                    center_distance = sp.distance.cdist(xi.feature, cluster_center, metric='euclidean')
                    cluster_distances.append(center_distance)
                except:
                    print(xi.feature.shape)
                    print(cluster_center.shape)
            
            # get the cluster center with the minimal distance
            smallest_dist, cluster_index = min((value, index) for (index, value) in enumerate(cluster_distances))
            #closest_center = min(cluster_distances, key=cluster_distances.get)
            xi.cluster = cluster_index
            
        # recompute the center of the cluster
        for i in range(K_CLUSTERS):
            cluster_samples = [sample.feature[0] for sample in samples if sample.cluster == i]
            # compute the geometric mean and shape it as matrix so we can use scipy's cdist
            cluster_centers[i] = np.array(gmean(cluster_samples)).reshape(1, -1)
            print(cluster_samples[:2])
            print(gmean(cluster_samples))
            return


## Main

In [7]:
print("starting clustering...")
k_means(train_samples, K_CLUSTERS, max_iter=5)

  0%|                                                                                                                                          | 0/5 [00:00<?, ?it/s]

starting clustering...


  log_a = np.log(np.array(a, dtype=dtype))
  0%|                                                                                                                                          | 0/5 [00:05<?, ?it/s]

[array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  


