# DataPreprocessing

## imports

In [0]:
import glob
import pickle

import numpy as np
import tensorflow as tf
from PIL import Image
from skimage.color import rgb2lab, lab2rgb, rgb2gray, xyz2lab
from tensorflow.python.client import device_lib

## configs

In [0]:
# Connect to Google Drive by mounting it at /content/drive.
# force_remount=True is passed so that a fresh mount is requested on every run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [0]:
# Path of the dataset directory where the image files are stored.
# NOTE(review): the leading "..." is presumably a placeholder for the real
# Drive location -- replace it before running.
train_img_path = ".../datasets/dataset"

In [0]:
# Input size of the network.
SIZE = 224
# Batch size of the dataset: the image paths are grouped into chunks of at
# most this many entries.
BATCH = 400

In [0]:
# Print the number of available GPUs as reported by TensorFlow.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [0]:
# Get the names of the available GPUs.
def get_available_gpus():
    """Return the names of all local devices whose device type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

get_available_gpus()

## dataset generation

In [0]:
# Collect the paths of all .jpg images found in the "1" sub-folder of the
# dataset directory.
# glob returns its matches in arbitrary (filesystem-dependent) order, so the
# paths are sorted to make the batches built from this list reproducible
# across runs.
img_paths = sorted(glob.glob(train_img_path+'/1/*.jpg'))

In [0]:
# Split img_paths into consecutive chunks of at most BATCH paths each.
# Despite its name, img_paths_dict is a list of lists, not a dictionary.
img_paths_dict = [
    img_paths[start:start + BATCH]
    for start in range(0, len(img_paths), BATCH)
]

In [0]:
# Store img_paths_dict in a pickle file so it can be reloaded later (optional).
with open('.../variables/img_paths_dict.pickle', 'wb+') as paths_file:
    pickle.dump(img_paths_dict, paths_file)

# Report how many batches were written.
num_batches = len(img_paths_dict)
print(num_batches)

In [0]:
# Load a previously stored img_paths_dict (optional; only if it was stored).
with open('.../variables/img_paths_dict.pickle', 'rb') as paths_file:
    img_paths_dict = pickle.load(paths_file)

# Report how many batches were loaded.
num_batches = len(img_paths_dict)
print(num_batches)

In [0]:
# Convert every batch of image paths to Lab arrays and store each batch in
# its own pair of pickle files (X<i>.pickle / y<i>.pickle).
def _load_lab_pair(img_path):
    """Load one image and split it into a network input and target.

    Returns a tuple ``(X_l, y_l)`` of float16 arrays: ``X_l`` is the L
    channel of the image in Lab space and ``y_l`` holds the remaining two
    channels divided by 128.
    """
    # The context manager closes the underlying file once the pixels are read.
    with Image.open(img_path) as img:
        # NOTE(review): resizing via img.resize((SIZE, SIZE)) is left disabled
        # as in the original; stacking with np.array below therefore assumes
        # that all images already share the same dimensions -- confirm.
        # rgb2lab needs exactly three colour channels, so grayscale, RGBA,
        # palette or CMYK files are converted to RGB first.
        img_data = np.asarray(img.convert('RGB'))

    # Scale 8-bit pixel values to floats in [0, 1] before the colour conversion.
    img_data = img_data*1.0/255

    lab = rgb2lab(img_data)
    X_l = lab[:, :, 0].astype('float16')
    y_l = (lab[:, :, 1:] / 128).astype('float16')
    return X_l, y_l


for i, batch in enumerate(img_paths_dict):
    print("--- --- batch "+str(i)+"/"+str(len(img_paths_dict))+" --- ---")
    X = []
    y = []
    for j, img_path in enumerate(batch):
        print("--- image "+str(j)+"/"+str(len(batch))+" ---")
        X_l, y_l = _load_lab_pair(img_path)
        X.append(X_l)
        y.append(y_l)

    X = np.array(X)
    y = np.array(y)
    # Append a trailing axis of length 1 to X.
    X = X.reshape(X.shape+(1,))

    print(X.shape)
    print(y.shape)

    print("saving...")

    with open('.../variables/dataset/X'+str(i)+'.pickle', 'wb+') as f:
        pickle.dump(X, f)

    with open('.../variables/dataset/y'+str(i)+'.pickle', 'wb+') as f:
        pickle.dump(y, f)

    # Drop the references to the large arrays before the next batch is built.
    del X
    del y