<a href="https://colab.research.google.com/github/IngyBadawi/K_means-clustering/blob/main/K_means_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing dataset and loading it into the environment**

In [None]:
import matplotlib.pyplot as plt
import pickle
import shutil
import pandas as pd
import numpy as np
import random
from collections import defaultdict

In [None]:
# Download the CIFAR-10 python archive into the current working directory.
# -nc (no-clobber) skips the download when the archive is already present, so re-running
# this cell does not leave numbered duplicates such as cifar-10-python.tar.gz.1 behind.
!wget -nc "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"

--2021-01-04 01:02:01--  https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
Connecting to www.cs.toronto.edu (www.cs.toronto.edu)|128.100.3.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 170498071 (163M) [application/x-gzip]
Saving to: ‘cifar-10-python.tar.gz.1’


2021-01-04 01:02:07 (28.3 MB/s) - ‘cifar-10-python.tar.gz.1’ saved [170498071/170498071]



In [None]:
# Extract next to the downloaded archive (the current working directory) instead of a
# hard-coded absolute path: the archive is read from, and the batches are later loaded
# from, paths relative to the working directory.
shutil.unpack_archive("cifar-10-python.tar.gz", ".")

In [None]:
def unpickle(file):
    """Deserialize a single pickled file and return the stored object.

    The file is opened in binary mode and decoded with ``encoding='bytes'``.

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
    file, so only use this on archives obtained from a trusted source.
    """
    with open(file, mode='rb') as handle:
        return pickle.load(handle, encoding='bytes')

In [None]:
def load_cifar_10_data(data_dir, negatives=False):
    """
    Load the CIFAR-10 python batches found in ``data_dir``.

    Reads ``batches.meta``, ``data_batch_1`` .. ``data_batch_5`` and
    ``test_batch`` from ``data_dir``.

    Parameters
    ----------
    data_dir : str
        Directory containing the extracted batch files.
    negatives : bool, default False
        When True the image arrays are converted to ``np.float32``; otherwise
        they keep the dtype stored in the batch files.

    Returns
    -------
    tuple of 7 numpy arrays
        ``(train_data, train_filenames, train_labels,
        test_data, test_filenames, test_labels, label_names)``.
        Image arrays are laid out as ``(num_images, 32, 32, 3)``.
    """

    def _to_images(flat_rows):
        # Each row is reshaped to (3, 32, 32) and the channel axis is moved
        # last, giving (num_images, 32, 32, 3).
        images = flat_rows.reshape((len(flat_rows), 3, 32, 32)).transpose(0, 2, 3, 1)
        return images.astype(np.float32) if negatives else images

    # batches.meta holds the class names under b'label_names', e.g.
    # ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
    meta_data_dict = unpickle(data_dir + "/batches.meta")
    cifar_label_names = np.array(meta_data_dict[b'label_names'])

    # training data: every batch dict provides b'data' (ndarray),
    # b'filenames' (list) and b'labels' (list).
    train_batches = []
    cifar_train_filenames = []
    cifar_train_labels = []
    for i in range(1, 6):
        cifar_train_data_dict = unpickle(data_dir + "/data_batch_{}".format(i))
        train_batches.append(cifar_train_data_dict[b'data'])
        cifar_train_filenames += cifar_train_data_dict[b'filenames']
        cifar_train_labels += cifar_train_data_dict[b'labels']

    # Stack once after the loop instead of re-copying the growing array per batch.
    cifar_train_data = _to_images(np.vstack(train_batches))
    cifar_train_filenames = np.array(cifar_train_filenames)
    cifar_train_labels = np.array(cifar_train_labels)

    # test data: a single batch with the same keys as the training batches.
    cifar_test_data_dict = unpickle(data_dir + "/test_batch")
    cifar_test_data = _to_images(cifar_test_data_dict[b'data'])
    cifar_test_filenames = np.array(cifar_test_data_dict[b'filenames'])
    cifar_test_labels = np.array(cifar_test_data_dict[b'labels'])

    return cifar_train_data, cifar_train_filenames, cifar_train_labels, \
        cifar_test_data, cifar_test_filenames, cifar_test_labels, cifar_label_names

In [None]:
if __name__ == "__main__":
    # Load the dataset once and expose the resulting arrays as notebook-level
    # variables for the cells that follow.
    cifar_10_dir = 'cifar-10-batches-py'

    (train_data, train_filenames, train_labels,
     test_data, test_filenames, test_labels,
     label_names) = load_cifar_10_data(cifar_10_dir)

    

# **Initializing constants**

In [None]:
n = 30      # number of images
k = 3       # number of centroids
mx_it = 10  # maximum number of iterations
  

# **Helper functions**

In [None]:
def get_rand_centroids(n, k, data=None):
  """Pick ``k`` distinct random images among the first ``n`` as initial centroids.

  Parameters
  ----------
  n : int
      Images are sampled from indices ``0 .. n-1``.
  k : int
      Number of centroids to draw; ``random.sample`` raises ``ValueError``
      when ``k > n``.
  data : array-like, optional
      Image collection to sample from. Defaults to the notebook-level
      ``train_data``.

  Returns
  -------
  numpy.ndarray
      Float array of shape ``(k,) + image_shape`` holding copies of the
      selected images.
  """
  images = np.asarray(train_data if data is None else data)
  random_index = random.sample(range(0, n), k)
  centroids = np.zeros(shape=(k,) + images.shape[1:])
  for slot, image_index in enumerate(random_index):
      print('random index is ', image_index)
      # Use the sampled index, not the loop position: indexing with the
      # position would always return the first k images.
      centroids[slot] = images[image_index]
  return centroids

In [None]:
def get_dist(curr_image, centroid):
  """Return the L1 (sum of absolute differences) distance between two images.

  Both inputs are converted to float64 before subtracting, so two unsigned
  integer images cannot wrap around. The computation is vectorized and works
  for any pair of equally shaped arrays, not only 32x32x3.

  Parameters
  ----------
  curr_image, centroid : array-like
      Arrays of the same shape.

  Returns
  -------
  float
      Sum over all elements of ``|curr_image - centroid|``.
  """
  image = np.asarray(curr_image, dtype=np.float64)
  center = np.asarray(centroid, dtype=np.float64)
  return float(np.abs(image - center).sum())

In [None]:
import sys
def get_nearest_centroid(my_centroid, centroids):
  """Return the index of the centroid closest to ``my_centroid``.

  Distances come from ``get_dist``. Every centroid that was passed in is
  examined (the count is taken from ``centroids`` itself rather than from a
  notebook-level constant). On ties the lowest index wins, and ``0`` is
  returned when ``centroids`` is empty.

  Parameters
  ----------
  my_centroid : array-like
      The image to classify.
  centroids : sequence of array-like
      Candidate centroids.

  Returns
  -------
  int
      Position of the nearest centroid within ``centroids``.
  """
  mn = float("inf")
  best_centroid = 0
  for x, candidate in enumerate(centroids):
    dist = get_dist(my_centroid, candidate)
    if dist < mn:
      mn = dist
      best_centroid = x
  return best_centroid

In [None]:
def compare_centroids(old_centroids, new_centroids):
  """Tell whether two sets of centroids are exactly identical.

  The comparison is element-wise and exact (no tolerance), covers every
  centroid and every pixel that was passed in, and does not rely on any
  notebook-level constant.

  Parameters
  ----------
  old_centroids, new_centroids : array-like
      Centroid collections to compare.

  Returns
  -------
  int
      ``1`` when both have the same shape and all elements are equal,
      ``0`` otherwise.
  """
  return 1 if np.array_equal(old_centroids, new_centroids) else 0

# **K-Means Function**

In [None]:
def kmeans(n, k, centroids, data=None, max_iter=None):
  """Cluster the first ``n`` images around the given initial centroids.

  Each pass assigns every image to its nearest centroid (via
  ``get_nearest_centroid``) and then replaces each centroid with the mean of
  the images assigned to it. The loop stops when the centroids no longer
  change (``compare_centroids``) or after ``max_iter`` passes.

  Parameters
  ----------
  n : int
      Number of images (indices ``0 .. n-1``) to cluster.
  k : int
      Number of clusters; expected to match ``len(centroids)``. Kept for
      backward compatibility, the centroid count is taken from ``centroids``.
  centroids : array-like
      Initial centroids, one per cluster.
  data : array-like, optional
      Image collection. Defaults to the notebook-level ``train_data``.
  max_iter : int, optional
      Maximum number of passes. Defaults to the notebook-level ``mx_it``.

  Returns
  -------
  list
      ``[buckets, centroids]`` where ``buckets`` maps a centroid index to the
      list of image indices assigned to it in the last pass, and
      ``centroids`` is the float array of centroids computed in that pass.
  """
  images = train_data if data is None else data
  iteration_cap = mx_it if max_iter is None else max_iter
  centroids = np.asarray(centroids, dtype=np.float64)
  cntr = 0
  while True:
    cntr += 1

    # Assignment step: bucket every image index under its nearest centroid.
    new_buckets = defaultdict(list)
    for i in range(n):
      nearest_centroid = get_nearest_centroid(images[i], centroids)
      new_buckets[nearest_centroid].append(i)

    # Update step. Start from the previous centroids so that a cluster that
    # received no images keeps its position instead of collapsing to zeros.
    new_centroids = centroids.copy()
    for key, val in new_buckets.items():  # key: centroid index, val: image indices in that bucket
      print(key," : ",val)
      # Average the images that belong to the bucket (looked up through the
      # stored indices). np.mean accumulates integer input in float64, so
      # 8-bit pixel values cannot overflow while being summed.
      new_centroids[key] = np.mean([images[idx] for idx in val], axis=0)

    # Stop on convergence or once the allowed number of passes has been run.
    if compare_centroids(centroids, new_centroids) or cntr >= iteration_cap:
      break
    centroids = new_centroids
  return [new_buckets, new_centroids]

# **Running the code and getting output**

In [None]:
# Draw the initial centroids, run the clustering, then split the returned
# [buckets, centroids] pair into separately named variables.
rand_centroids = get_rand_centroids(n, k)
result = kmeans(n, k, rand_centroids)
buckets, final_centroids = result[0], result[1]

random index is  2
random index is  5
random index is  29
0  :  [0, 3, 5, 6, 7, 9, 10, 13, 14, 17, 19, 21, 22, 23, 24, 26, 27, 28]
1  :  [1, 4, 8, 15, 20]
2  :  [2, 11, 12, 16, 18, 25, 29]
2  :  [0, 5, 6, 21, 22, 26]
1  :  [1, 3, 4, 17]
0  :  [2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 23, 24, 25, 27, 28, 29]
1  :  [0, 1]
0  :  [2, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 18, 19, 20, 23, 25, 28, 29]
2  :  [3, 4, 5, 10, 17, 21, 22, 24, 26, 27]
1  :  [0, 1]
0  :  [2, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 23, 25, 27, 28, 29]
2  :  [3, 4, 5, 6, 7, 19, 21, 22, 24, 26]


In [None]:
import matplotlib.pyplot as plt
from PIL import Image as im
# Convert the centroids to 8-bit channels and write each one out as an RGB
# image file named centroid_<index>.png.
final_centroids = final_centroids.astype(np.uint8)
for i, centroid_pixels in enumerate(final_centroids):
  img = im.fromarray(centroid_pixels, 'RGB')
  name = 'centroid_' + str(i) + '.png'
  img.save(name)


final_centroids[ 0 ] =  [[[122 135 138]
  [118 129 133]
  [119 128 132]
  ...
  [127 137 137]
  [123 135 135]
  [121 132 134]]

 [[121 134 135]
  [121 131 132]
  [119 128 130]
  ...
  [132 142 139]
  [130 140 138]
  [124 135 134]]

 [[122 135 137]
  [121 132 133]
  [120 130 130]
  ...
  [130 138 134]
  [130 138 134]
  [123 132 130]]

 ...

 [[143 137 116]
  [140 134 110]
  [141 136 112]
  ...
  [ 94  95  74]
  [ 88  90  70]
  [ 95  95  77]]

 [[143 135 114]
  [142 135 112]
  [142 137 114]
  ...
  [ 99 100  79]
  [ 97  98  78]
  [ 98  99  80]]

 [[143 135 116]
  [142 135 116]
  [140 134 115]
  ...
  [102 102  83]
  [101 100  81]
  [103 101  82]]]
final_centroids[ 1 ] =  [[[106 119 125]
  [ 84  91  90]
  [ 77  76  69]
  ...
  [124 113  89]
  [119 107  86]
  [113 102  86]]

 [[ 78  90  94]
  [ 72  76  77]
  [ 71  66  59]
  ...
  [109  93  66]
  [ 98  81  56]
  [ 96  80  59]]

 [[ 82  89  92]
  [ 77  76  74]
  [ 82  71  60]
  ...
  [ 98  83  57]
  [ 94  77  52]
  [ 88  71  48]]

 ...

 [[1