In [None]:
import os
from PIL import Image
import random

import h5py
import numpy as np
import matplotlib.pyplot as plt

from db_helper import get_files_paths_recursive

In [None]:
# Class label -> root directory holding the images of that class.
# Judging by the folder names, label 0 is "fake" and label 1 is "real".
categories_and_paths = {
    0: R"F:\master-thesis-databases\classification_db\fake",
    1: R"F:\master-thesis-databases\classification_db\real",
}

# Directory that receives the generated train.h5 / val.h5 files.
result_dir = R"C:\Users\Marcin\Dysk Google\masterDB"

In [None]:
def load_dataset_h5(path, dataset_name):
    """Read one dataset from an HDF5 file completely into memory.

    Args:
        path: Path of the HDF5 file; it is opened read-only.
        dataset_name: Key of the dataset inside the file.

    Returns:
        The result of slicing the dataset with ``[:]``, i.e. its full
        contents copied out of the file.

    Errors raised by h5py (unreadable file, unknown dataset name) propagate
    unchanged.
    """
    with h5py.File(path, "r") as hf:
        # Debug aid: show which datasets the file actually contains.
        print(hf.keys())
        # The slice materialises the data before the file is closed; the
        # context manager closes the file, so no explicit close() is needed.
        return hf[dataset_name][:]

In [None]:
def _write_image_batch(h5_path, dataset_name, batch, already_written):
    """Create the dataset (first batch) or extend it (later batches).

    Returns the total number of images stored after this batch.
    """
    np_batch = np.asarray(batch)
    total = already_written + len(np_batch)
    if already_written == 0:
        # First batch: (re)create the file and a dataset that can grow along
        # axis 0 while the per-image shape stays fixed.
        with h5py.File(h5_path, 'w') as hf:
            hf.create_dataset(dataset_name,
                              np_batch.shape,
                              data=np_batch,
                              maxshape=((None,) + np_batch.shape[1:]),
                              chunks=True)
    else:
        # Later batches: grow the dataset and fill the newly added rows.
        with h5py.File(h5_path, "a") as hf:
            dataset = hf[dataset_name]
            dataset.resize(total, axis=0)
            dataset[already_written:total] = np_batch
    return total


def convert_images_to_h5(img_paths, h5_path, dataset_name, img_in_cycle):
    """Store the images listed in ``img_paths`` as one growable HDF5 dataset.

    Images are loaded with PIL, converted to ``uint8`` arrays and written in
    batches of ``img_in_cycle`` so that only one batch is held in memory at a
    time. A final, shorter batch is written for any images left over after
    the last full cycle, so every input image ends up in the dataset.

    Args:
        img_paths: Iterable of image file paths, in the order they are stored.
        h5_path: Target HDF5 file. It is created/truncated when the first
            batch is written; with no input images nothing is written.
        dataset_name: Name of the dataset created inside the file.
        img_in_cycle: Number of images buffered before each write (>= 1).

    Raises:
        ValueError: If ``img_in_cycle`` is smaller than 1.

    Note:
        All images are stacked with ``np.asarray``, so they are assumed to
        share one shape; this function does not resize them.
    """
    if img_in_cycle < 1:
        raise ValueError(f"img_in_cycle must be >= 1, got {img_in_cycle}")

    print(f"Saving images as {dataset_name}")
    batch = []
    written = 0
    for index, img_path in enumerate(img_paths):
        if index % 100 == 0:
            print(f"Image number: {index}")
        # The context manager releases the underlying file handle as soon as
        # the pixel data has been copied into the array.
        with Image.open(img_path) as pil_image:
            batch.append(np.array(pil_image).astype(np.uint8))
        if len(batch) == img_in_cycle:
            written = _write_image_batch(h5_path, dataset_name, batch, written)
            batch.clear()

    # Flush the images that did not fill a complete cycle.
    if batch:
        _write_image_batch(h5_path, dataset_name, batch, written)

In [None]:
def convert_dataset_to_h5(img_paths, img_categories, h5_path, dataset_name, img_in_cycle=10):
    """Write images and their category labels into a fresh HDF5 file.

    Two datasets are created in ``h5_path``: ``'X_' + dataset_name`` (images,
    written by ``convert_images_to_h5``) and ``'y_' + dataset_name`` (labels).
    An existing file at ``h5_path`` is deleted first.

    Args:
        img_paths: Sequence of image file paths.
        img_categories: Sequence of labels, one per entry of ``img_paths``
            and in the same order.
        h5_path: Output HDF5 file path.
        dataset_name: Suffix used to build both dataset names.
        img_in_cycle: Number of images buffered per write.

    Raises:
        AssertionError: If there are not more images than ``img_in_cycle``.
        ValueError: If ``img_paths`` and ``img_categories`` differ in length.
    """
    print(f"To {h5_path} for dataset:{dataset_name}")
    assert len(img_paths) > img_in_cycle, "Database to small for specified saving cycle"
    # Validate before touching the disk: mismatched inputs would otherwise
    # produce image and label datasets that no longer line up.
    if len(img_paths) != len(img_categories):
        raise ValueError(
            f"Got {len(img_paths)} image paths but {len(img_categories)} categories"
        )

    if os.path.exists(h5_path):
        os.remove(h5_path)
    X_dataset_name = 'X_' + dataset_name
    y_dataset_name = 'y_' + dataset_name

    # add images
    convert_images_to_h5(img_paths, h5_path, X_dataset_name, img_in_cycle=img_in_cycle)
    # add categories
    np_img_categories = np.asarray(img_categories)
    with h5py.File(h5_path, 'a') as hf:
        # labels: fixed size, written in one go
        hf.create_dataset(y_dataset_name,
                          np_img_categories.shape,
                          data=np_img_categories,
                          maxshape=np_img_categories.shape)

In [None]:
def make_dataset_for_keras_h5(categories, result_directory, split_factor=0.8,
                              max_images_per_category=5000, seed=None):
    """Build shuffled ``train.h5`` and ``val.h5`` files from image folders.

    Args:
        categories: Mapping of category label -> directory to scan for images.
        result_directory: Directory in which ``train.h5`` and ``val.h5`` are
            written.
        split_factor: Fraction of the shuffled samples that goes to the
            training file; the remainder goes to the validation file.
        max_images_per_category: Upper bound on the number of files taken
            from each category (``None`` takes all of them).
        seed: Optional seed for the shuffle. With ``None`` the module-level
            ``random`` state is used, as before.

    Note:
        ``get_files_paths_recursive`` is assumed to return a list of
        ``(path, category)`` pairs - that is how its result is unpacked here.
    """
    print(f"Make dataset from: {categories} to {result_directory}")
    train_path = os.path.join(result_directory, 'train.h5')
    val_path = os.path.join(result_directory, 'val.h5')

    img_paths_and_categories = []
    for name, path in categories.items():
        found = get_files_paths_recursive(path, category=name)
        img_paths_and_categories += found[:max_images_per_category]

    # create shuffled lists
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(img_paths_and_categories)
    img_paths = [path for path, category in img_paths_and_categories]
    img_categories = [category for path, category in img_paths_and_categories]

    # create datasets
    train_val_border = int(split_factor * len(img_paths_and_categories))
    convert_dataset_to_h5(img_paths[:train_val_border], img_categories[:train_val_border], train_path, "train")
    convert_dataset_to_h5(img_paths[train_val_border:], img_categories[train_val_border:], val_path, "val")
       


In [None]:
# Build train.h5 and val.h5 inside result_dir from the configured category
# folders (default split_factor of 0.8).
make_dataset_for_keras_h5(categories_and_paths, result_dir)