# Create dataset

1. Label the dataset.
2. Preprocess the images.
3. Make batches of triplets with anchor, positive and negative images.

In [1]:
import gc                                                   # garbage collection
import glob                                                 # iterate over directories
import math                                                 # mathematical functions
import os                                                   # portable path handling

import cv2                                                  # image processing
import matplotlib.pyplot as plt                             # image plots
import numpy as np                                          # linear algebra
import pandas as pd                                         # dataframes
from keras import backend as keras_backend                  # keras backend
from sklearn.model_selection import train_test_split        # dataset train test split
from tqdm import tqdm                                       # progress meter

np.random.seed(0)                                           # setting up numpy seed

In [2]:
# Side length in pixels: every image is resized to SIZE x SIZE.
SIZE = 224

## Preprocessing

In [3]:
def label_data(path):
    """Label every image under ``path`` with its class and save the labels as CSV.

    The dataset is expected to hold one sub-directory per class. Each file
    found inside a sub-directory is labeled with that sub-directory's name.
    The resulting table is also written to ``<path>_labels.csv``.

    Args:
        path -- str : path to dataset.

    Returns:
        df -- pd.DataFrame : dataframe with columns 'image' (file path) and
                             'class' (label), sorted by 'class'.
    """
    data = []
    class_label = []

    # os.path.join builds the pattern with the separator of the running
    # platform instead of a hard-coded backslash.
    for class_dir in sorted(glob.glob(os.path.join(path, "*"))):
        # The directory name is the label, wherever the dataset is stored.
        label = os.path.basename(class_dir)

        for img_path in sorted(glob.glob(os.path.join(class_dir, "*"))):
            data.append(img_path)
            class_label.append(label)

    df = pd.DataFrame({'image': data, 'class': class_label}).sort_values('class')
    df.to_csv(path+'_labels.csv', index=False)

    return df

In [4]:
def _stack_or_ragged(arrays):
    """Stack equally-shaped arrays into one ndarray, else keep them in an object array."""
    same_shape = len({arr.shape for arr in arrays}) <= 1
    if same_shape and all(arr.dtype != object for arr in arrays):
        return np.array(arrays)

    # Shapes differ (or an element is already ragged): a plain np.array()
    # call would fail, so hold each element as-is in an object array.
    ragged = np.empty(len(arrays), dtype=object)
    for idx, arr in enumerate(arrays):
        ragged[idx] = arr
    return ragged


def build_dataset(path):
    """Build dataset for training.

    Every image listed by ``label_data(path)`` is read, converted to
    grayscale, resized to SIZE x SIZE and scaled to the range 0..1.

    Args:
        path -- str : path to dataset.

    Returns:
        data -- np.ndarray : images grouped per class; ``data[c][0]`` holds
                             the genuine and ``data[c][1]`` the forged images
                             of class ``c``. A regular array of shape
                             (classes, 2, n, SIZE, SIZE) when all counts
                             match, otherwise an object array with the same
                             indexing.
        x_train_origin -- np.ndarray : training images, (?, SIZE, SIZE, 1).
        y_train_origin -- np.ndarray : class labels of the training images.
        x_test_origin -- np.ndarray : test images, (?, SIZE, SIZE, 1).
        y_test_origin -- np.ndarray : class labels of the test images.

    Raises:
        cv2.error : if an image file cannot be read.
    """
    groups = label_data(path).groupby('class')
    data = []
    X = []
    y = []

    # Sorting images by classes
    for name, group in tqdm(groups):
        genuine = []
        forged = []

        for img_path in group['image'].values:
            img = cv2.imread(img_path)
            if img is None:
                # imread signals failure by returning None; raise the same
                # exception type cvtColor would, but name the offending file.
                raise cv2.error(f"could not read image: {img_path}")
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            img = cv2.resize(img, (SIZE, SIZE))

            X.append(img)
            y.append(name)

            # The marker is searched in the whole path, not only the file name.
            if '-G-' in img_path or 'original' in img_path:
                genuine.append(img)
            else:
                forged.append(img)

        # Normalize values to 0..1; genuine and forged counts may differ.
        data.append(_stack_or_ragged([np.asarray(genuine)/255,
                                      np.asarray(forged)/255]))

    data = _stack_or_ragged(data)

    X = (np.array(X)/255).reshape(-1, SIZE, SIZE)
    y = np.array(y)

    x_train_origin, x_test_origin, y_train_origin, y_test_origin = train_test_split(X, y, test_size=0.2)

    assert keras_backend.image_data_format() == 'channels_last'
    # Add the trailing channel axis expected by a channels_last model.
    x_train_origin = x_train_origin.reshape(x_train_origin.shape[0], SIZE, SIZE, 1)
    x_test_origin = x_test_origin.reshape(x_test_origin.shape[0], SIZE, SIZE, 1)

    return data, x_train_origin, y_train_origin, x_test_origin, y_test_origin

In [5]:
def draw_pics(data, nb=0, template='{}', classnumber=None):
    """Plot the first image of ``data[0]`` and of ``data[1]`` side by side.

    Args:
        data : dataset; ``data[m][0]`` is drawn for m = 0 and m = 1.
        nb -- int : number of images to be plotted (not used by the
                    current implementation).
        template -- str : template for titles of subplots (not used by
                          the current implementation).
        classnumber : titles of subplots (not used by the current
                      implementation).
    """
    figure = plt.figure(figsize=(8,5))

    # One row, two columns; subplot positions are 1-based.
    for column in (1, 2):
        figure.add_subplot(1, 2, column)
        plt.axis("off")
        plt.imshow(data[column - 1][0], vmin=0, vmax=1, cmap='gray')

## Make triplet batches

In [6]:
def get_batch_hard(network, chosen_class, data):
    """Create a batch of APN triplets for one class, hardest triplets first.

    Anchor and positive are two different genuine images and the negative is
    a forged image, all drawn at random from ``chosen_class``. The triplets
    are then ordered by d(A,P) - d(A,N), computed on the embeddings returned
    by ``network``, largest value first.

    Args:
        network -- CNN : The Convolution Neural Network; its ``predict`` is
                         called on a (batch_size, h, w) array and is assumed
                         to return one embedding row per image.
        chosen_class -- int : The chosen class label (index into ``data``).
        data -- list : List containing all data for the dataset;
                       ``data[c][0]`` are the genuine and ``data[c][1]`` the
                       forged images of class ``c``.

    Returns:
        triplets -- list : List containing 3 arrays Anchor, Positive,
                           Negative of shape (batch_size, h, w).
    """
    genuine = data[chosen_class][0]
    forged = data[chosen_class][1]

    # Step 1 : pick a batch to study
    # ndarray.shape is (rows, columns), i.e. (height, width)
    h, w = genuine[0].shape

    # Anchors and positives are sampled from the first half of the genuine images
    nb_sample_available_for_class_AP = len(genuine) // 2

    # Length of forged images
    nb_sample_available_for_class_N = len(forged)

    batch_size = min(nb_sample_available_for_class_AP,
                     nb_sample_available_for_class_N)

    # initialize result
    studybatch = [np.zeros((batch_size, h, w)) for _ in range(3)]

    # Kept as a loop so the random draws happen in the same order per triplet
    for i in range(batch_size):
        # Pick two different random pics for this class => A and P
        idx_A, idx_P = np.random.choice(
            nb_sample_available_for_class_AP, size=2, replace=False)

        # Pick a random forged pic of the same class => N
        idx_N = np.random.randint(0, nb_sample_available_for_class_N)

        studybatch[0][i, :, :] = genuine[idx_A, :, :]
        studybatch[1][i, :, :] = genuine[idx_P, :, :]
        studybatch[2][i, :, :] = forged[idx_N, :, :]

    # Step 2 : compute the loss with current network : d(A,P)-d(A,N).
    # The alpha parameter is omitted here since we only want to order them.
    A = network.predict(studybatch[0])
    P = network.predict(studybatch[1])
    N = network.predict(studybatch[2])

    studybatchloss = np.sum(np.square(A-P), axis=1) - np.sum(np.square(A-N), axis=1)

    # Step 3 : Sort by loss (highest first)
    selection = np.argsort(studybatchloss)[::-1]

    triplets = [
        studybatch[0][selection, :, :],
        studybatch[1][selection, :, :],
        studybatch[2][selection, :, :]
    ]

    return triplets

In [7]:
def get_batch_all(network, data):
    """Create one batch of APN triplets covering every class.

    ``get_batch_hard`` is called once per class, in class order, and the
    per-class batches are joined along the first axis.

    Args:
        network -- CNN : The Convolution Neural Network.
        data -- list : list containing all data for the dataset.

    Returns:
        triplets -- list : containing 3 arrays Anchor, Positive, Negative;
                           the first axis is the sum of the per-class
                           batch sizes.

    Raises:
        IndexError : if ``data`` contains no class.
    """
    classes = len(data)
    if classes == 0:
        raise IndexError("data must contain at least one class")

    per_class = [get_batch_hard(network, chosen_class, data)
                 for chosen_class in range(classes)]

    # A single concatenate per part avoids re-copying the growing batch
    # once for every class.
    return [np.concatenate([batch[part] for batch in per_class])
            for part in range(3)]

In [8]:
def draw_triplets(tripletbatch, nbmax=None):
    """Display the three images of each triplet in the batch.

    One figure is created per triplet, with the anchor, positive and
    negative images next to each other.

    Args:
        tripletbatch -- list : 3 arrays (anchors, positives, negatives),
                               each indexed as [row, :, :].
        nbmax -- int/None : maximum number of triplets to be displayed,
                            if set to None will show all triplets.
    """
    labels = ["Anchor", "Positive", "Negative"]

    available = tripletbatch[0].shape[0]
    nbrows = available if nbmax is None else min(nbmax, available)

    for row in range(nbrows):
        fig = plt.figure(figsize=(16, 2))

        for i, label in enumerate(labels):
            subplot = fig.add_subplot(1, 3, i+1)
            plt.axis("off")
            plt.imshow(tripletbatch[i][row, :, :], vmin=0, vmax=1, cmap='gray')
            subplot.title.set_text(label)