In [1]:
import os
import numpy as np
from scipy import ndimage
import h5py
from sys import getsizeof

In [2]:
IMAGE_WIDTH = 256
IMAGE_HEIGHT = 256
CHANNEL = 3
PIXEL_DEPTH = 255.0

def load_image_from_folder(folder):
    image_files = os.listdir(folder)
    dataset = np.ndarray(shape=(len(image_files), IMAGE_WIDTH, IMAGE_HEIGHT, CHANNEL),
                         dtype=np.float32)
    
    index_image = 0
    for image in image_files:
        image_file = os.path.join(folder, image)
    
        try:
            image_data = (ndimage.imread(image_file).astype(float) - PIXEL_DEPTH/2) / PIXEL_DEPTH

            if image_data.shape != (IMAGE_WIDTH, IMAGE_HEIGHT, CHANNEL):
                raise Exception('Unexpected image shape: %s' % str(image_data.shape))

            dataset[index_image, :, :, :] = image_data
            index_image = index_image + 1

        except IOError as e:
            print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')
        
    print('Full dataset tensor:', dataset.shape)
    return dataset  

def store_in_file(folders, force=False):
    dataset_names = []
    for folder in folders:
        set_filename = folder + '.hdf5'
        dataset_names.append(set_filename)
        
        if os.path.exists(set_filename) and not force:
            print('%s already present - Skipping storing.' % set_filename)
        else:
            print('Storing %s.' % set_filename)
            dataset = load_image_from_folder(folder)
            try:
                f = h5py.File(set_filename, "w")
                f.create_dataset("ds", data=dataset)
                f.close()
            except Exception as e:
                print('Unable to save data to', set_filename, ':', e)
    
    return dataset_names

train_datasets = store_in_file(['./train/LuaCoDe', './train/LuaNga', './train/LuaTot'])

Storing ./train/LuaCoDe.hdf5.
('Full dataset tensor:', (7168, 256, 256, 3))
Storing ./train/LuaNga.hdf5.
('Full dataset tensor:', (20512, 256, 256, 3))
Storing ./train/LuaTot.hdf5.
('Full dataset tensor:', (17792, 256, 256, 3))


In [3]:
def make_placeholder(n_examples, img_width, img_height, channel):
    if n_examples:
        dataset = np.ndarray((n_examples, img_width, img_height, channel), dtype=np.float32)
        label = np.ndarray(n_examples, dtype=np.int32)
    else:
        dataset, label = None, None
    return dataset, label

def merge_datasets(hdf5_files, train_size, valid_size=0):
    num_classes = len(hdf5_files)
    train_dataset, train_labels = make_placeholder(train_size, IMAGE_WIDTH, IMAGE_HEIGHT, CHANNEL)
    valid_dataset, valid_labels = make_placeholder(valid_size, IMAGE_WIDTH, IMAGE_HEIGHT, CHANNEL)
  
    tsize_per_class = train_size // num_classes
    vsize_per_class = valid_size // num_classes   
    
    start_t, start_v = 0, 0
    end_t, end_v = tsize_per_class, vsize_per_class
    end_l = tsize_per_class + vsize_per_class 
    
    for label, hdf5_file in enumerate(hdf5_files):       
        try:
            f = h5py.File(hdf5_file, 'r')
            crop_set = f["ds"][...]
            np.random.shuffle(crop_set)
            if valid_dataset is not None:
                valid_crop = crop_set[:vsize_per_class, :, :, :]
                valid_dataset[start_v:end_v, :, :, :] = valid_crop
                valid_labels[start_v:end_v] = label
                start_v += vsize_per_class
                end_v += vsize_per_class
                    
            train_crop = crop_set[vsize_per_class:end_l, :, :, :]
            train_dataset[start_t:end_t, :, :, :] = train_crop
            train_labels[start_t:end_t] = label
            start_t += tsize_per_class
            end_t += tsize_per_class
            f.close()
        except Exception as e:
            print('Unable to process data from', hdf5_file, ':', e)

    
    return valid_dataset, valid_labels, train_dataset, train_labels
            
            
train_size = 18000
valid_size = 3000
#test_size = 10000

valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_datasets, train_size, valid_size)
#_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
#print('Testing:', test_dataset.shape, test_labels.shape)

('Training:', (18000, 256, 256, 3), (18000,))
('Validation:', (3000, 256, 256, 3), (3000,))


In [5]:
def randomize(dataset, labels):
    permutation = np.random.permutation(labels.shape[0])
    shuffled_dataset = dataset[permutation,:,:,:]
    shuffled_labels = labels[permutation]
    return shuffled_dataset, shuffled_labels
train_dataset, train_labels = randomize(train_dataset, train_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)
#test_dataset, test_labels = randomize(test_dataset, test_labels)

In [6]:
hdf5_file = "cropfield.hdf5"

try:
    f = h5py.File(hdf5_file, 'w')
#     save = {
#         'train_dataset': train_dataset,
#         'train_labels': train_labels,
#         'valid_dataset': valid_dataset,
#         'valid_labels': valid_labels,
#         'test_dataset': test_dataset,
#         'test_labels': test_labels,
#         }
    f.create_dataset("train_dataset", data=train_dataset)
    f.create_dataset("train_labels", data=train_labels)
    f.create_dataset("valid_dataset", data=valid_dataset)
    f.create_dataset("valid_labels", data=valid_labels)
    f.close()
except Exception as e:
    print('Unable to save data to', hdf5_file, ':', e)

In [7]:
statinfo = os.stat(hdf5_file)
print('Compressed hdf5 size:', statinfo.st_size/1048576/1024)

('Compressed hdf5 size:', 15)
