# Comparison of Compression Methods

In [1]:
import numpy as np
import cv2
import matplotlib.pyplot as plt
%matplotlib inline

### Data loading


In [2]:
from Outils.dataloader import load_CIFAR10

# Load the raw CIFAR-10 data.
cifar10_dir = 'Dataset/cifar-10-batches-py'

# Drop arrays left over from a previous run of this cell before reloading,
# to avoid loading the data multiple times (which may cause memory issues).
try:
    del X_train, y_train
    del X_test, y_test
    print('Clear previously loaded data.')
except NameError:
    # `del` raises NameError when the names are not bound yet (first run).
    # Only that case is ignored; any other error (or an interrupt) propagates.
    pass

X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

# Stack the training and test images into one array (along the first axis).
X = np.concatenate([X_train, X_test])
# As a sanity check, we print out the size of the training and test data.
# print('Training data shape: ', X_train.shape)
# print('Training labels shape: ', y_train.shape)
# print('Test data shape: ', X_test.shape)
# print('Test labels shape: ', y_test.shape)

#### Data preprocessing

We have concatenated the training set and the test set into a single NumPy array. To use k-means, we have to flatten each image into a one-dimensional vector.

In [5]:
# Flatten every image into a single row: one row per image, all remaining
# dimensions collapsed into the second axis.
N = len(X)  # total number of images
X_reshaped = X.reshape(N, -1)

In [7]:
# Display the flattened array as the cell output for a quick visual check.
X_reshaped

array([[ 59.,  62.,  63., ..., 123.,  92.,  72.],
       [154., 177., 187., ..., 143., 133., 144.],
       [255., 255., 255., ...,  80.,  86.,  84.],
       ...,
       [ 20.,  15.,  12., ...,  25.,  20.,  47.],
       [ 25.,  40.,  12., ...,  92., 120.,  80.],
       [ 73.,  78.,  75., ...,  27.,  26.,  26.]])

In [8]:
# Rebind X to the flattened array. This is a second name for the same object,
# not a copy; the original un-flattened binding of X is lost from here on.
X = X_reshaped

In [11]:
# Element-wise extrema taken over axis 0, i.e. across all rows of X.
min_im = X.min(axis=0)
max_im = X.max(axis=0)

In [12]:
# Check the shape of the axis-0 minimum computed from X.
min_im.shape

(3072,)

In [13]:
# Boolean mask comparing each element's offset from the column minimum with
# its offset from the column maximum; column 0 is skipped on both sides.
# NOTE(review): if min_im/max_im are the axis-0 min/max of this same X, then
# X - min_im >= 0 >= X - max_im everywhere, so the mask is True except in
# columns where min == max. A "closer to max than to min" test would be
# (X - min_im) > (max_im - X) — confirm the intended semantics.
# The expression builds two full-size temporary arrays before comparing,
# which may be why the recorded run was interrupted.
dis = (X[:,1:] - min_im[1:]) > (X[:,1:] - max_im[1:])

KeyboardInterrupt: 

In [None]:
# Check the shape of the boolean mask `dis`.
dis.shape

## Image Clustering

In [3]:
from sklearn.cluster import  KMeans

In [6]:
# Confirm the shape of the flattened data that is passed to KMeans.
X_reshaped.shape

(60000, 3072)

Define the number of clusters

In [5]:
# Number of clusters; the later cells iterate over range(n_clusters).
n_clusters = 10

Fit the Kmeans model to our dataset

In [7]:
# Fit with the cluster count configured in `n_clusters`, so the model agrees
# with the later cells that iterate over range(n_clusters). The previous
# hard-coded k = 20 would have produced labels 10..19 that those loops never visit.
# random_state pins the centroid initialisation so reruns give the same clusters.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(X_reshaped)

KeyboardInterrupt: 

Compute the cluster index of every image and save it for later use

In [None]:
# Cluster index assigned by the fitted model to every row of X_reshaped.
clusters = kmeans.predict(X_reshaped)
# Name the file after the model's actual cluster count instead of a
# hard-coded "10", so the saved labels cannot be mislabelled.
np.save(f'./Clusters/Kmeans{kmeans.n_clusters}.npy', clusters)

Next, we build the sets of similar images produced by k-means: the `image_sets` dictionary maps each cluster index to the indices of the images in that cluster.

In [None]:
# For every cluster id, record where `clusters` equals that id.
# Each value is a tuple of index arrays (one array per dimension of the mask).
image_sets = {}
for k in range(n_clusters):
    image_sets[k] = np.nonzero(clusters == k)

In [None]:
# Display the cluster-id -> image-indices mapping as the cell output.
image_sets

{0: (array([    0,    19,    33, ..., 59992, 59993, 59996]),),
 1: (array([    3,    10,    13, ..., 59936, 59978, 59997]),),
 2: (array([    8,    36,    61, ..., 59976, 59979, 59995]),),
 3: (array([    4,    12,    25, ..., 59971, 59972, 59991]),),
 4: (array([   11,    18,    29, ..., 59969, 59973, 59975]),),
 5: (array([    2,    15,    16, ..., 59955, 59963, 59988]),),
 6: (array([    5,     9,    14, ..., 59983, 59985, 59994]),),
 7: (array([    6,     7,    21, ..., 59961, 59990, 59999]),),
 8: (array([   20,    41,    48, ..., 59960, 59980, 59986]),),
 9: (array([    1,    30,    39, ..., 59968, 59987, 59998]),)}

## Min Max Differential

In [None]:
from min_max_diff import *

In [None]:
# Run MMD_Encoder on the rows of X belonging to each cluster and store its
# three outputs, in order, under the keys 'max_<k>', 'min_<k>' and 'X_<k>'.
min_max_diff = {}
for k in range(n_clusters):
    enc_max, enc_min, enc_x = MMD_Encoder(X[image_sets[k]])
    min_max_diff[f'max_{k}'] = enc_max
    min_max_diff[f'min_{k}'] = enc_min
    min_max_diff[f'X_{k}'] = enc_x

In [None]:
# Persist the encoder outputs. min_max_diff is a dict, so np.save stores it as
# a pickled 0-d object array; reading it back requires
# np.load(path, allow_pickle=True).item().
np.save("./Saved_datas/min_max_diff.npy", min_max_diff)

In [None]:
from Outils.compression import *

# Call single_compression once per cluster on that cluster's 'X_<k>' entry,
# labelling each call with a method name of the form "MinMax<k>".
for k in range(n_clusters):
    method_name = f"MinMax{k}"
    single_compression(min_max_diff[f'X_{k}'], method=method_name)

KeyboardInterrupt: 