# CIFAR10 Dataset Compression

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from Outils.dataloader import load_CIFAR10
import matplotlib.pyplot as plt
%matplotlib inline

import os 
import glob

## Data Loading

In [2]:
from Outils.dataloader import load_CIFAR10

# Load the raw CIFAR-10 data.
cifar10_dir = 'Dataset/cifar-10-batches-py'

# Drop arrays left over from a previous run of this cell so the dataset is not
# held in memory twice while it is being reloaded.
try:
    del X_train, y_train
    del X_test, y_test
    print('Clear previously loaded data.')
except NameError:
    # Fresh kernel: the names do not exist yet, so there is nothing to free.
    # Only NameError is expected here; any other error should surface.
    pass

X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

# Stack the train and test splits into one array; the cluster label arrays
# loaded later are used as boolean masks over this combined array.
X = np.concatenate([X_train, X_test])
X.shape

(60000, 32, 32, 3)

## Loading the clusters

Here we load the cluster assignments produced by the KMeans clustering algorithm for different numbers of clusters into a dictionary, so that the configurations can be compared.

Note that the clusters are saved in the `.\Clusters\` folder.

In [3]:
def cluster_key(npy_path):
    """Return the dictionary key for a cluster file: its name without directory or extension.

    Example: ``"./Clusters/KMeans100.npy"`` -> ``"KMeans100"``.
    """
    return os.path.splitext(os.path.basename(npy_path))[0]


cluster_path = "./Clusters/*.npy"   # Path to all .npy files

# Key every label array by its complete file stem. A fixed-width slice of the
# path would cut off longer names (for example a three-digit cluster count) and
# could make two files collide on the same key.
# The matches are sorted so the dictionary order, and therefore the processing
# order of the later cells, does not depend on the file system.
clusters = {}
for f in sorted(glob.glob(cluster_path)):
    clusters[cluster_key(f)] = np.load(f)

clusters

{'KMeans10': array([ 8, 28, 14, ..., 45, 33, 79]),
 'KMeans20': array([17,  3,  0, ...,  6, 13,  1]),
 'KMeans30': array([28, 18, 27, ..., 13, 20, 11]),
 'KMeans40': array([13, 34, 11, ...,  9, 37, 19]),
 'KMeans50': array([26, 19,  3, ..., 11, 30, 45]),
 'KMeans60': array([16, 47, 55, ...,  6, 11, 21]),
 'KMeans70': array([46, 39, 32, ..., 28,  3, 47]),
 'KMeans80': array([ 8, 60, 59, ..., 48, 10, 61]),
 'KMeans90': array([74, 85,  6, ..., 50, 44, 34])}

## Apply Compression Methods

### Min Max Differential

In [4]:
from Outils.compression import single_compression
import min_max_diff_flatten as mmdf 
import cv2

### Get the image sets (clusters)

####  Prepare the directories for saving the compressed images

In [36]:
# Raw string: keeps the backslashes literal instead of relying on unrecognised
# escape sequences ("\D", "\C", ...), which newer Python versions warn about.
# NOTE(review): the blank after the last backslash is part of the path; the
# cells that write and measure the files spell it the same way, so it is kept.
mmdf_root = r".\Dataset\Compressed\mmdf_png\ "

# One output directory per clustering. The directory does not depend on the
# individual cluster ids, so it is prepared once per clustering.
for name in clusters:
    print(name)
    path = mmdf_root + name + "\\ "

    if os.path.exists(path):
        # Remove the files left behind by a previous run.
        for f in glob.glob(path + '*'):
            os.remove(f)
    else:
        os.mkdir(path)

KMeans40
KMeans50
KMeans60
KMeans70
KMeans80
KMeans90


In [37]:
# Raw string: keeps the backslashes literal instead of relying on unrecognised
# escape sequences, which newer Python versions warn about.
mmdf_root = r".\Dataset\Compressed\mmdf_png\ "

for name, cluster in clusters.items():
    print(name)
    # Same directory spelling as the cell that prepares the folders.
    path = mmdf_root + name + "\\ "

    for value in np.unique(cluster):
        # All images assigned to cluster id `value`.
        image_set = X[cluster == value]
        # The two extra return values are not called `min`/`max`: binding those
        # names in a notebook would hide the builtins for every later cell.
        # Presumably they are the per-cluster reference images of the encoder -
        # confirm against min_max_diff_flatten.
        compressed_set, min_image, max_image = mmdf.Encoder(image_set)

        # Each encoded item is reshaped to a 32x32 image before it is written.
        for i in range(compressed_set.shape[0]):
            cv2.imwrite(path + "cluster" + str(value) + 'im' + str(i) + ".png", compressed_set[i].reshape((32,32,-1)))

        cv2.imwrite(path + "min" + str(value) + ".png", min_image.reshape((32,32,-1)))
        cv2.imwrite(path + "max" + str(value) + ".png", max_image.reshape((32,32,-1)))


KMeans40
KMeans50
KMeans60
KMeans70
KMeans80
KMeans90


#### Get size of compressed images

In [38]:
# List the clusterings that are currently loaded, one key per line.
for key in clusters.keys():
    print(key)

KMeans40
KMeans50
KMeans60
KMeans70
KMeans80
KMeans90


In [39]:
# Raw string: keeps the backslashes literal instead of relying on unrecognised
# escape sequences, which newer Python versions warn about.
mmdf_root = r".\Dataset\Compressed\mmdf_png\ "

# np.save does not create missing directories.
os.makedirs("./Saved_datas", exist_ok=True)

for name in clusters:
    path = mmdf_root + name + "\\ "

    # Total size in bytes of every PNG written for this clustering.
    size = sum(os.path.getsize(f) for f in glob.glob(path + "*.png"))

    print(size)
    np.save("./Saved_datas/mmdf_" + name + "_png_size.npy", size)

151919381
152633064
153323341
154012987
154616893
155083185


## Min Max Predictive

In [5]:
import min_max_predictive as mmp

####  Prepare the directories for saving the compressed images

In [12]:
# Raw string: keeps the backslashes literal instead of relying on unrecognised
# escape sequences ("\D", "\C", ...), which newer Python versions warn about.
# NOTE(review): the blank after the last backslash is part of the path; the
# cells that write and measure the files spell it the same way, so it is kept.
mmp_root = r".\Dataset\Compressed\mmp_png\ "

# One output directory per clustering. The directory does not depend on the
# individual cluster ids, so it is prepared once per clustering.
for name in clusters:
    print(name)
    path = mmp_root + name + "\\ "

    if os.path.exists(path):
        # Remove the files left behind by a previous run.
        for f in glob.glob(path + '*'):
            os.remove(f)
    else:
        os.mkdir(path)

KMeans10
KMeans20
KMeans30
KMeans40
KMeans50
KMeans60
KMeans70
KMeans80
KMeans90


In [13]:
# Raw string: keeps the backslashes literal instead of relying on unrecognised
# escape sequences, which newer Python versions warn about.
mmp_root = r".\Dataset\Compressed\mmp_png\ "

for name, cluster in clusters.items():
    print(name)
    # Same directory spelling as the cell that prepares the folders.
    path = mmp_root + name + "\\ "

    for value in np.unique(cluster):
        # All images assigned to cluster id `value`.
        image_set = X[cluster == value]
        # The extra return values are not called `min`/`max`: binding those
        # names in a notebook would hide the builtins for every later cell.
        # Their exact meaning is defined by min_max_predictive - TODO confirm.
        compressed_set, level, min_image, max_image = mmp.Encoder(image_set)

        for i in range(compressed_set.shape[0]):
            cv2.imwrite(path + "cluster" + str(value) + 'im' + str(i) + ".png", compressed_set[i])

        # `level` is stored as .npy next to the images; the size cell counts
        # .npy files as part of the compressed output.
        np.save(path + "level" + str(value) + ".npy", level)
        cv2.imwrite(path + "min" + str(value) + ".png", min_image)
        cv2.imwrite(path + "max" + str(value) + ".png", max_image)


KMeans10
KMeans20
KMeans30
KMeans40
KMeans50
KMeans60
KMeans70
KMeans80
KMeans90


#### Get size of compressed images

In [14]:
# Raw string: keeps the backslashes literal instead of relying on unrecognised
# escape sequences, which newer Python versions warn about.
mmp_root = r".\Dataset\Compressed\mmp_png\ "

# np.save does not create missing directories.
os.makedirs("./Saved_datas", exist_ok=True)

for name in clusters:
    path = mmp_root + name + "\\ "

    # Total size in bytes of the PNG images plus the .npy side files written
    # for this clustering.
    size = sum(os.path.getsize(f) for f in glob.glob(path + "*.png"))
    size += sum(os.path.getsize(f) for f in glob.glob(path + "*.npy"))

    print(size)
    # Saved under the "mmp_" prefix: the "mmdf_" prefix belongs to the
    # Min Max Differential results and would be overwritten otherwise.
    np.save("./Saved_datas/mmp_" + name + "_png_size.npy", size)

334790159
332244375
332712277
333148966
333435277
333821840
334086773
334339200
334568023


## Combine with Delta Encoding

In [16]:
import delta

####  Prepare the directories for saving the compressed images

In [22]:
# Raw string: keeps the backslashes literal instead of relying on unrecognised
# escape sequences ("\D", "\C", ...), which newer Python versions warn about.
# NOTE(review): the blank after the last backslash is part of the path; the
# cells that write and measure the files spell it the same way, so it is kept.
delta_mmdf_root = r".\Dataset\Compressed\delta_mmdf_png\ "

# One output directory per clustering. The directory does not depend on the
# individual cluster ids, so it is prepared once per clustering.
for name in clusters:
    print(name)
    path = delta_mmdf_root + name + "\\ "

    if os.path.exists(path):
        # Remove the files left behind by a previous run.
        for f in glob.glob(path + '*'):
            os.remove(f)
    else:
        os.mkdir(path)

KMeans10
KMeans20
KMeans30
KMeans40
KMeans50
KMeans60
KMeans70
KMeans80
KMeans90


In [23]:
# Raw string: keeps the backslashes literal instead of relying on unrecognised
# escape sequences, which newer Python versions warn about.
delta_mmdf_root = r".\Dataset\Compressed\delta_mmdf_png\ "

for name, cluster in clusters.items():
    print(name)
    # Same directory spelling as the cell that prepares the folders.
    path = delta_mmdf_root + name + "\\ "

    for value in np.unique(cluster):
        # All images assigned to cluster id `value`, delta-encoded first and
        # then passed to the min/max differential encoder.
        image_set = X[cluster == value]
        delta_set = delta.Delta_Encoder(image_set)
        # The extra return values are not called `min`/`max`: binding those
        # names in a notebook would hide the builtins for every later cell.
        compressed_set, min_image, max_image = mmdf.Encoder(delta_set)

        for i in range(compressed_set.shape[0]):
            cv2.imwrite(path + "cluster" + str(value) + 'im' + str(i) + ".png", compressed_set[i])

        cv2.imwrite(path + "min" + str(value) + ".png", min_image)
        cv2.imwrite(path + "max" + str(value) + ".png", max_image)


KMeans10
KMeans20
KMeans30
KMeans40
KMeans50
KMeans60
KMeans70
KMeans80
KMeans90


#### Get size of compressed images

In [24]:
# Raw string: keeps the backslashes literal instead of relying on unrecognised
# escape sequences, which newer Python versions warn about.
delta_mmdf_root = r".\Dataset\Compressed\delta_mmdf_png\ "

# np.save does not create missing directories.
os.makedirs("./Saved_datas", exist_ok=True)

for name in clusters:
    path = delta_mmdf_root + name + "\\ "

    # Total size in bytes of every PNG written for this clustering.
    size = sum(os.path.getsize(f) for f in glob.glob(path + "*.png"))

    print(size)
    # NOTE(review): unlike the "mmdf_" files there is no underscore between the
    # prefix and the clustering name; kept as is because readers of these files
    # may depend on the exact name - confirm before renaming.
    np.save("./Saved_datas/delta_mmdf" + name + "_png_size.npy", size)

166271795
162732042
163828303
164291782
164934439
165173088
165587916
165915535
166092224


## Delta and PNG

####  Prepare the directories for saving the compressed images

In [29]:

# Raw string: keeps the backslashes literal instead of relying on unrecognised
# escape sequences ("\D", "\C", "\d"), which newer Python versions warn about.
# NOTE(review): the blank after the last backslash is part of the path; the
# size cell spells it the same way, so it is kept.
path_png = r".\Dataset\Compressed\delta_png\ "

if os.path.exists(path_png):
    # Remove the files left behind by a previous run.
    for f in glob.glob(path_png + '*'):
        os.remove(f)
else:
    os.mkdir(path_png)

# Delta-encode the whole dataset at once (no clustering involved here).
X_delta = delta.Delta_Encoder(X)

# Write one PNG per image, named after its index in X.
for i in range(X.shape[0]):
    cv2.imwrite(path_png + str(i) + ".png", X_delta[i,:,:,:])


#### Get size of compressed images

In [30]:

# Raw string: keeps the backslashes literal instead of relying on unrecognised
# escape sequences, which newer Python versions warn about.
path = r".\Dataset\Compressed\delta_png\ "

# Total size in bytes of every delta-encoded PNG.
size = sum(os.path.getsize(f) for f in glob.glob(path + "*.png"))

print(size)
# np.save does not create missing directories.
os.makedirs("./Saved_datas", exist_ok=True)
np.save("./Saved_datas/delta_png_size.npy", size)

95154655
