# Writing the Tiny ImageNet dataset into HDF5 files

## Importing Libraries


In [None]:
from config import tiny_imagenet_config as config
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from compvis.datasets import HD5FDatasetWriter
from imutils import paths
import numpy as np
import json
import cv2
import os

In [None]:
# Collect every training image path. The class label of an image is the
# path component three positions from the end of its path, and the string
# labels are encoded with a LabelEncoder that is reused for the validation set.
trainPaths = list(paths.list_images(config.TRAIN_IMAGES))
le = LabelEncoder()
trainLabels = le.fit_transform(
    [imagePath.split(os.path.sep)[-3] for imagePath in trainPaths]
)

In [None]:
# Carve a held-out *test* set out of the training data. The split is
# stratified on the class labels and uses a fixed seed so it is reproducible;
# its size comes from config.NUM_TEST_IMAGES.
split = train_test_split(
    trainPaths,
    trainLabels,
    test_size=config.NUM_TEST_IMAGES,
    stratify=trainLabels,
    random_state=42,
)
# train_test_split returns [paths_train, paths_test, labels_train, labels_test]
trainPaths, testPaths = split[0], split[1]
trainLabels, testLabels = split[2], split[3]

In [None]:
# Load the validation "filename => class" mappings and use them to build the
# validation path and label lists. Each line of the mappings file is
# tab-separated; only the first two fields (filename, class id) are used.
# The file is read inside a `with` block so the handle is always closed.
with open(config.VAL_MAPPINGS) as mappingFile:
    M = mappingFile.read().strip().split("\n")
M = [r.split("\t")[:2] for r in M]
valPaths = [os.path.sep.join([config.VAL_IMAGES, m[0]]) for m in M]
# reuse the encoder fitted on the training labels so class ids agree
valLabels = le.transform([m[1] for m in M])


In [None]:
# Pair each split name with its image paths, its labels and the HDF5 file it
# should be written to, giving one (name, paths, labels, outputPath) tuple
# per split.
datasets = list(zip(
    ("train", "val", "test"),
    (trainPaths, valPaths, testPaths),
    (trainLabels, valLabels, testLabels),
    (config.TRAIN_HDF5, config.VAL_HDF5, config.TEST_HDF5),
))

In [None]:
# One independent, initially empty list per colour channel, used to collect
# RGB channel averages.
R, G, B = [], [], []

In [None]:
# Build one HDF5 file per (name, image paths, labels, output path) tuple.
# NOTE: the loop variable is deliberately called `imagePaths`, not `paths`,
# so it does not overwrite the `imutils.paths` module imported at the top of
# the notebook (which would break re-running the cell that lists the images).
for (dTypes, imagePaths, labels, outputPath) in datasets:
    # create the HDF5 writer, sized for every image of this split at 64x64x3
    print("[INFO] building {}...".format(outputPath))
    writer = HD5FDatasetWriter((len(imagePaths), 64, 64, 3), outputPath)
    try:
        # loop over the image paths and their labels in lockstep
        for (path, label) in zip(imagePaths, labels):
            # load the image from disk; cv2.imread signals failure by
            # returning None instead of raising, so check explicitly
            image = cv2.imread(path)
            if image is None:
                raise IOError("could not read image: {}".format(path))
            # if we are building the training dataset, compute the mean of
            # each channel in the image and update the respective lists
            # (OpenCV loads images in BGR order, hence the b, g, r unpacking)
            if dTypes == "train":
                (b, g, r) = cv2.mean(image)[:3]
                R.append(r)
                G.append(g)
                B.append(b)
            writer.add([image], [label])
    finally:
        # always close the current dataset, even if an image failed to load,
        # so the HDF5 file handle is not left open in the kernel
        writer.close()

In [None]:
# Average the collected channel means into a single R/G/B dictionary and
# serialize it to the JSON file at config.DATASET_MEAN. The file is written
# inside a `with` block so it is closed even if serialization fails.
print("[INFO] serializing means...")
D = {"R": np.mean(R), "G": np.mean(G), "B": np.mean(B)}
with open(config.DATASET_MEAN, "w") as f:
    f.write(json.dumps(D))