# Create dataset
Label the dataset and split into train and test data.

In [1]:
import os
import glob
import gc
from tqdm import tqdm
from datetime import datetime as dt
import numpy as np
np.random.seed(0)
import cv2
import matplotlib.pyplot as plt
%matplotlib inline
from pylab import *

In [4]:
def draw_pics(tensor, nb=0, template='{}', classnumber=None):
    """Plot images of a dataset tensor on a grid of at most 20 columns.

    Args:
        tensor : dataset tensor; image m is read as tensor[m, :, :, 0].
        nb -- int : maximum number of images to plot; 0 plots every image.
        template -- str : format template applied to classnumber to build
            the subplot titles.
        classnumber : value inserted into template; the same title is set
            on every subplot. No title is set when None.
    """
    total = tensor.shape[0]
    count = total if nb == 0 else min(nb, total)
    if count <= 0:
        # Nothing to draw: skip creating an empty figure / 0-column grid.
        return

    max_cols = 20
    n_cols = min(count, max_cols)
    # Integer ceiling division: add_subplot requires integral grid sizes
    # (a numpy float from floor() is rejected), and this avoids an empty
    # extra row when count is an exact multiple of max_cols.
    n_rows = -(-count // max_cols)

    fig = plt.figure(figsize=(16, 16))
    for m in range(count):
        axes = fig.add_subplot(n_rows, n_cols, m + 1)
        axes.imshow(tensor[m, :, :, 0], vmin=0, vmax=1, cmap='gray')

        if classnumber is not None:
            axes.title.set_text(template.format(classnumber))

In [5]:
def label_data(path):
    """Load every image under path and label it by its class directory.

    Each sub-directory of path must be named with the integer class id
    (leading zeros are accepted, e.g. "007" -> 7). Every image found in a
    sub-directory is read, converted to grayscale and resized to
    SIZE x SIZE.

    Args:
        path -- str : directory containing one sub-directory per class.

    Returns:
        data -- np.array : grayscale images, shape (N, SIZE, SIZE).
        labels -- np.array : integer class label of each image, shape (N,).

    Raises:
        ValueError : if a non-empty class directory name is not an integer.
    """
    import warnings

    data = []
    labels = []
    # os.path.join keeps the glob pattern valid on every platform.
    class_dirs = sorted(glob.glob(os.path.join(path, "*")))

    for class_dir in tqdm(class_dirs):
        img_paths = sorted(glob.glob(os.path.join(class_dir, "*")))
        if not img_paths:
            continue

        # The label is the directory name itself. int() already ignores
        # leading zeros, so no character stripping is needed (removing
        # every "0" would merge classes such as 1, 10 and 100).
        label = int(os.path.basename(class_dir))

        for img_path in img_paths:
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread returns None for files it cannot decode.
                warnings.warn(f"Skipping unreadable image: {img_path}")
                continue
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            img = cv2.resize(img, (SIZE, SIZE))

            data.append(img)
            labels.append(label)

    # The reshape keeps the (0, SIZE, SIZE) shape when no image was found.
    data = np.array(data).reshape(-1, SIZE, SIZE)
    labels = np.array(labels)

    return data, labels

In [6]:
def build_dataset(path, classes):
    """Build per-class train and test datasets from the images under path.

    The images returned by label_data are split 80/20 into train and test
    sets, given a trailing channel axis, and then grouped by class label.

    Args:
        path -- str : path for images, forwarded to label_data.
        classes -- int : number of classes; labels 1..classes are collected.

    Returns:
        dataset_train -- list : list of length classes; entry n-1 holds the
            training images labelled n, divided by 255, shape (?, H, W, 1).
        dataset_test -- list : same as dataset_train for the test images.
        x_train_origin -- np.array : unscaled training images, (?, H, W, 1).
        y_train_origin -- np.array : labels of x_train_origin.
        x_test_origin -- np.array : unscaled test images, (?, H, W, 1).
        y_test_origin -- np.array : labels of x_test_origin.
    """
    data, labels = label_data(path)

    x_train_origin, x_test_origin, y_train_origin, y_test_origin = train_test_split(data, labels, test_size=0.2)

    assert keras_backend.image_data_format() == 'channels_last'
    # Add the channel axis from the actual image size rather than a
    # hard-coded 224, so this stays consistent with label_data's output.
    x_train_origin = x_train_origin[..., np.newaxis]
    x_test_origin = x_test_origin[..., np.newaxis]

    dataset_train = []
    dataset_test = []

    # Group images by class and scale pixel values by 1/255. A boolean mask
    # selects each class in one vectorized step and keeps the (0, H, W, 1)
    # shape for classes that have no sample in a split.
    for n in tqdm(range(1, classes + 1)):
        dataset_train.append(x_train_origin[y_train_origin == n] / 255)
        dataset_test.append(x_test_origin[y_test_origin == n] / 255)

    return dataset_train, dataset_test, x_train_origin, y_train_origin, x_test_origin, y_test_origin