In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from random import randrange
from copy import deepcopy
from PIL import Image
# Upper bound on the number of assignment/update passes k_means performs.
MAX_ITERATIONS = 100
import math

In [33]:
def euclidean_distance(p1, p2):
    """Return the Euclidean distance between two 3-component points.

    Only components 0, 1 and 2 of ``p1`` and ``p2`` are used (for an image
    pixel these are its first three channels).

    Each component is converted to ``float`` before subtracting. Pixels taken
    from an 8-bit image array are NumPy ``uint8`` values, and subtracting or
    squaring those directly wraps around modulo 256, which yields wrong
    distances (e.g. 0 vs 20 would not come out as 20).

    Parameters
    ----------
    p1, p2 : indexable of numbers
        Points with at least three components.

    Returns
    -------
    float
        The straight-line distance between the two points.

    Raises
    ------
    IndexError
        If either point has fewer than three components.
    """
    squared_sum = 0.0
    for axis in range(3):
        delta = float(p1[axis]) - float(p2[axis])
        squared_sum += delta * delta
    return math.sqrt(squared_sum)

In [34]:
def read_image(filename):
    """Open the image file at ``filename`` with PIL and return the image object."""
    image = Image.open(filename)
    return image

In [35]:
# Decode the source image into a NumPy array of pixel values. The bare
# `data.shape` expression makes the notebook display the array dimensions.
data = np.array(read_image("obama.png"))
data.shape

(280, 237, 3)

In [36]:
def get_new_k(arr, k, data):
    """Compute the centroid (mean colour) of cluster ``k``.

    Parameters
    ----------
    arr : sequence of sequences
        Book-keeping table with one row per pixel in row-major order;
        ``arr[i][k - 1] == 1`` marks pixel ``i`` as a member of cluster ``k``.
    k : int
        1-based cluster number.
    data : array-like
        Image indexed as ``data[row][col][channel]``. ``data.shape[1]`` is
        the image width, used to turn a flat pixel index into (row, col).

    Returns
    -------
    list of int
        Mean of each of the first three channels over the cluster members,
        rounded with the built-in ``round``. ``[0, 0, 0]`` when the cluster
        has no members.
    """
    width = data.shape[1]
    totals = [0, 0, 0]
    count = 0
    for index, flags in enumerate(arr):
        if flags[k - 1] != 1:
            continue
        # Exact integer arithmetic for the flat-index -> (row, col) mapping.
        row, col = divmod(index, width)
        # tolist() yields native Python numbers. Adding NumPy uint8 scalars to
        # the running sums directly can turn the accumulator into uint8 and
        # make it wrap around at 256.
        pixel = np.asarray(data[row][col]).tolist()
        for channel in range(3):
            totals[channel] += pixel[channel]
        count += 1

    if count == 0:
        # Empty cluster: nothing to average, report black.
        return totals
    return [round(total / count) for total in totals]
      

In [37]:
def clusterize_data(arr, k, data):
    """Group pixel coordinates by the cluster flagged in the book-keeping table.

    ``arr`` holds one row of ``k`` flags per pixel (row-major order); a flag
    equal to 1 in slot ``j`` puts that pixel into cluster ``j + 1``. The flat
    pixel index is turned into ``[row, col]`` using the image width
    ``data.shape[1]``.

    Returns a dict mapping each cluster number ``1..k`` to its list of
    ``[row, col]`` coordinates.
    """
    clusters = {label: [] for label in range(1, k + 1)}
    for flat_index, flags in enumerate(arr):
        for slot in range(k):
            if flags[slot] != 1:
                continue
            width = data.shape[1]
            img_row = flat_index // width
            img_col = flat_index - img_row * width
            clusters[slot + 1].append([img_row, img_col])
    return clusters

In [38]:
def k_means(k, data, max_iterations=None):
    """Cluster the pixels of an image into ``k`` colour groups.

    Centroids start as ``k`` randomly chosen pixels. Each pass assigns every
    pixel to its nearest centroid (ties go to the lowest cluster number) and
    then recomputes the centroids with ``get_new_k``. The loop stops as soon
    as a pass changes no assignment, or after ``max_iterations`` passes.

    Parameters
    ----------
    k : int
        Number of clusters; must be at least 1.
    data : array-like
        Image indexed as ``data[row][col][channel]`` and exposing ``.shape``.
    max_iterations : int, optional
        Cap on the number of passes. Defaults to the module-level
        ``MAX_ITERATIONS``, read at call time.

    Returns
    -------
    tuple
        ``(centroids, clusters)``: the list of centroid colours (index ``i``
        belongs to cluster ``i + 1``) and the dict produced by
        ``clusterize_data`` mapping cluster number to ``[row, col]`` pairs.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.

    Notes
    -----
    A cluster that ends up with no members gets whatever ``get_new_k``
    returns for an empty cluster as its next centroid.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    if max_iterations is None:
        max_iterations = MAX_ITERATIONS

    # Widen integer images once. 8-bit pixel values wrap around modulo 256
    # when subtracted or summed, which corrupts distances and colour averages.
    image = np.asarray(data)
    if np.issubdtype(image.dtype, np.integer):
        image = image.astype(np.int64)

    height, width = image.shape[0], image.shape[1]

    # Initialize centroids from randomly picked pixels (as plain Python numbers).
    centroids = []
    for _ in range(k):
        seed_row = randrange(0, height)
        seed_col = randrange(0, width)
        centroids.append(image[seed_row][seed_col].tolist())

    # Flatten the pixels once, row-major, as native Python numbers so the
    # per-pass distance computations do not repeat the conversion.
    pixels = [pixel.tolist() for line in image for pixel in line]

    # One-hot book keeper: row i flags the cluster pixel i currently belongs to.
    book_keeper = [[0] * k for _ in range(height * width)]

    for _ in range(max_iterations):
        # Track reassignments directly instead of deep-copying the whole
        # table every pass just to compare it afterwards.
        changed = False
        for index, pixel in enumerate(pixels):
            distances = [euclidean_distance(pixel, centroid) for centroid in centroids]
            nearest = int(np.argmin(distances))
            flags = book_keeper[index]
            if flags[nearest] != 1:
                for slot in range(k):
                    flags[slot] = 0
                flags[nearest] = 1
                changed = True

        centroids = [get_new_k(book_keeper, label, image) for label in range(1, k + 1)]

        if not changed:
            # Assignments are stable: converged.
            break

    return centroids, clusterize_data(book_keeper, k, image)
    

In [None]:
# Number of colour clusters to build; the repainting cell below reuses `k`.
k = 4
# Run the clustering: centroid colours plus the pixel coordinates of each cluster.
centroids, clusters = k_means(k, data)

In [40]:
# Repaint the image: every pixel takes the colour of its cluster's centroid.
# np.zeros (not np.empty) so a pixel missing from `clusters` comes out black
# instead of showing whatever bytes happened to be in uninitialised memory.
new_data = np.zeros((data.shape[0], data.shape[1], 3), dtype='uint8')
for i in range(k):
    for coords in clusters[i+1]:
        new_data[coords[0], coords[1]] = centroids[i]

new_image = Image.fromarray(new_data)
new_image.show()