# Implementation of k-means clustering algorithm

This notebook contains an implementation of the k-means clustering algorithm and uses that implementation to cluster the iris flower dataset.

In [1]:
from math import fsum,sqrt
from pprint import pprint
from collections import defaultdict
from functools import partial
from random import sample
from sklearn.datasets import load_iris

In [2]:
iris = load_iris()
# Turn every row of iris.data into a tuple and collect the rows in a plain list
data = list(map(tuple, iris.data))

In [3]:
def dist(p,q):
  """Return the Euclidean distance between the points p and q.

  p and q are sequences of numbers. Coordinates are paired with zip, so if
  the two points differ in length the extra coordinates of the longer one
  are ignored.
  """
  # fsum adds the squared differences without accumulating rounding error
  squared_diffs = ((a - b) ** 2 for a, b in zip(p, q))
  return sqrt(fsum(squared_diffs))

In [4]:
def group_data(centroids, data):
  """Assign every point in data to its nearest centroid.

  Returns a defaultdict(list) that maps each centroid (converted to a tuple
  so it can be used as a key) to the list of points closest to it. A
  centroid that attracts no point does not appear in the result. When two
  centroids are equally close, the one listed first in centroids wins.
  """
  clusters = defaultdict(list)
  for point in data:
    nearest = min(centroids, key=lambda centroid: dist(point, centroid))
    clusters[tuple(nearest)].append(point)
  return clusters

In [5]:
def compute_centroids(groups):
  """Return one centroid per group: the feature-wise mean of the group's samples.

  groups is an iterable of groups, each group being a collection of
  equally long numeric samples. The result is a list with one centroid (a
  list of floats) per group, in the order the groups were given. An empty
  group yields an empty centroid.
  """
  def mean_point(members):
    # zip(*members) regroups the samples into one column per feature
    return [fsum(column)/len(column) for column in zip(*members)]

  return [mean_point(members) for members in groups]

In [6]:
def k_means(data,k=3,max_iterations=40):
  """Cluster data into k groups with the k-means algorithm.

  Starting from k randomly chosen distinct points, the function repeatedly
  assigns every point to its nearest centroid (group_data) and moves each
  centroid to the mean of its group (compute_centroids), until the
  centroids stop changing or max_iterations rounds have been run.

  Args:
    data: iterable of points; each point is a sequence of numbers.
    k: number of clusters to look for.
    max_iterations: upper bound on assignment/update rounds; must be >= 1.

  Returns:
    The mapping built by group_data in the last round: centroid (as a
    tuple) -> list of the points assigned to it. It can hold fewer than k
    entries if a centroid ends up with no points in a later round.

  Raises:
    ValueError: if max_iterations is smaller than 1, or (raised by
      random.sample) if k is negative or exceeds the number of distinct
      points in data.
  """
  if max_iterations < 1:
    # Without at least one round there is no grouping to return; previously
    # this case fell through to an UnboundLocalError on the return line.
    raise ValueError("max_iterations must be at least 1")
  data = list(data)
  # Seed from *distinct* points: two identical starting centroids tie on
  # every distance, min() always picks the first one, and the second
  # cluster would silently disappear. dict.fromkeys keeps first-seen order.
  distinct_points = list(dict.fromkeys(tuple(point) for point in data))
  centroids = sample(distinct_points, k)
  for _ in range(max_iterations):
    clustered_data = group_data(centroids, data)
    # compute_centroids yields lists while the seeds are tuples; normalise
    # to tuples so the convergence test compares like with like (a tuple
    # never equals a list, even with the same items).
    new_centroids = [tuple(c) for c in compute_centroids(clustered_data.values())]
    if new_centroids == centroids:
      break
    centroids = new_centroids
  return clustered_data

In [7]:
if __name__=="__main__":
  clustered_data = k_means(data, k=3)
  # Report each cluster: running number, centroid and number of members
  k = 0
  for k, c in enumerate(clustered_data, start=1):
    print("Cluster No.: ", k)
    print("Cluster centroid: ", c)
    # c is already the tuple key created by group_data, so it indexes directly
    print("Objects in cluster: ", len(clustered_data[c]))

Cluster No.:  1
Cluster centroid:  (4.803225806451613, 3.225806451612903, 1.4193548387096775, 0.20967741935483872)
Objects in cluster:  31
Cluster No.:  2
Cluster centroid:  (5.290909090909091, 3.5727272727272723, 1.7590909090909093, 0.40454545454545454)
Objects in cluster:  22
Cluster No.:  3
Cluster centroid:  (6.301030927835052, 2.88659793814433, 4.958762886597938, 1.6958762886597938)
Objects in cluster:  97
