# Dataset library

In [1]:
import glob
import os
import random
import numpy as np
import struct
import math
import sys
import concurrent.futures
import time

from PIL import Image
from threading import Thread
from multiprocessing.pool import ThreadPool

# Flush threshold, in bytes, for the pixel buffer used by
# generate_binary_dataset_from_image. -1 means "not set yet": that function
# then sets it to the byte size of the first image it reads.
buffer_size = -1

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while driving a notebook progress bar.

    Args:
        sequence: Iterable to walk through. If it has no ``len()`` and
            ``size`` is not given, the total is shown as ``?``.
        every: Refresh the widgets every ``every`` items. When omitted and
            the size is known, it defaults to 1 for up to 200 items and to
            ``int(size / 200)`` otherwise.
        size: Total number of items; looked up with ``len(sequence)`` when
            omitted.
        name: Label shown in front of the counter.

    Yields:
        Every item of ``sequence``, unchanged and in order.

    Raises:
        AssertionError: If the size is unknown and ``every`` was not given.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if unknown_length:
        # No total available: show a full bar in "info" style.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    display(VBox(children=[label, progress]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            refresh = count == 1 or count % every == 0
            if refresh and unknown_length:
                label.value = '{name}: {index} / ?'.format(
                    name=name,
                    index=count
                )
            elif refresh:
                progress.value = count
                label.value = u'{name}: {index} / {size}'.format(
                    name=name,
                    index=count,
                    size=size
                )
            yield item
    except:
        # Deliberately bare: any interruption of the generator turns the bar
        # red before the exception is propagated unchanged.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = count
        label.value = "{name}: {index}".format(
            name=name,
            index=str(count or '?')
        )

### This function creates an index of the data with its associated labels

In [3]:
def create_index_file(source, labels, dest_data, dest_label, separator=","):
    """Write an index of the files matching ``source`` and their labels.

    Every path returned by ``glob.iglob(source, recursive=True)`` is appended
    to ``dest_data``. The label of its folder is appended to ``dest_label``.
    Each entry is followed by ``separator``.

    The folder is the second component of the path. Paths are split on
    backslashes first; when that yields a single component they are split on
    forward slashes instead, so '/'-separated paths work too.

    Args:
        source: Glob pattern of the files to index.
        labels: Mapping from folder name to label value.
        dest_data: Path of the file receiving the file names.
        dest_label: Path of the file receiving the labels.
        separator: String written after every entry.

    Returns:
        The number of files indexed.

    Raises:
        IndexError: If a matched path has fewer than two components.
        KeyError: If a folder name is missing from ``labels``.
    """
    dataset_size = 0
    with open(dest_data, 'w') as data_file, open(dest_label, 'w') as label_file:
        for filename in glob.iglob(source, recursive=True):
            parts = filename.split('\\')
            if len(parts) < 2:
                parts = filename.split('/')
            folder = parts[1]
            data_file.write(filename + separator)
            label_file.write(str(labels[folder]) + separator)
            dataset_size += 1
    # Report sizes only after the files are closed, so buffered data is
    # already on disk and the numbers are accurate.
    print("Create ", dest_data, " : ", os.path.getsize(dest_data), " bytes.")
    print("Create ", dest_label, " : ", os.path.getsize(dest_label), " bytes.")
    return dataset_size

### This function creates a shuffled version of the index/label files

In [4]:
def create_shuffle_index_file(source_data, source_label, dest_data, dest_label, separator=","):
    """Write shuffled copies of an index file and its label file.

    Both source files are read as ``separator``-terminated lists. The entries
    are paired by position and shuffled together with ``random.shuffle``, so
    every image keeps its label. The results go to ``dest_data`` and
    ``dest_label``.

    Args:
        source_data: Path of the index file to read.
        source_label: Path of the label file to read.
        dest_data: Path of the shuffled index file to write.
        dest_label: Path of the shuffled label file to write.
        separator: String that terminates every entry.

    Returns:
        The number of (image, label) pairs written; 0 for an empty dataset.
    """
    with open(source_data, 'r') as image_file:
        images = image_file.read().split(separator)
    images.pop()  # drop the empty string after the trailing separator
    with open(source_label, 'r') as label_file:
        labels = label_file.read().split(separator)
    labels.pop()

    pairs = list(zip(images, labels))
    random.shuffle(pairs)

    with open(dest_data, 'w') as image_out, open(dest_label, 'w') as label_out:
        for image, label in pairs:
            image_out.write(image + separator)
            label_out.write(label + separator)

    # Sizes are read after closing so that all buffered data is counted.
    print("Create ", dest_data, " : ", os.path.getsize(dest_data), " bytes.")
    print("Create ", dest_label, " : ", os.path.getsize(dest_label), " bytes.")
    return len(pairs)

### Get an array of data from index/label file with specified size and offset

In [5]:
def get_data_from_index_file(filename, number, offset, separator=","):
    """Return a window of entries from a ``separator``-terminated index file.

    Args:
        filename: Path of the index or label file.
        number: Maximum number of entries to return; -1 means "all entries".
        offset: Index of the first entry to return.
        separator: String that terminates every entry in the file.

    Returns:
        A list with the entries ``offset`` .. ``offset + number - 1``, cut
        short at the end of the file. It is empty when ``offset`` is past the
        last entry.
    """
    # The context manager closes the handle as soon as the content is read.
    with open(filename, 'r') as index_file:
        entries = index_file.read().split(separator)
    entries.pop()  # drop the empty string after the trailing separator
    if number == -1:
        number = len(entries)
    end = min(offset + number, len(entries))
    return [entries[i] for i in range(offset, end)]

### Write header for binary image dataset

In [6]:
def write_header_image(file, nb_images, nb_columns, nb_rows):
    """Write the 16-byte header of a binary image dataset to ``file``.

    The header is four big-endian 32-bit integers in IDX order: the magic
    number 2051, the number of images, the number of rows, then the number
    of columns. Rows are written before columns because that is the order
    ``read_image_data`` reads them back in.

    Args:
        file: Binary file object opened for writing.
        nb_images: Number of images stored in the file.
        nb_columns: Width of every image, in pixels.
        nb_rows: Height of every image, in pixels.

    Raises:
        OverflowError: If a value is negative or does not fit in 4 bytes.
    """
    # All fields are encoded before the first write so that an invalid value
    # cannot leave a partial header behind.
    fields = [
        value.to_bytes(4, byteorder='big')
        for value in (2051, nb_images, nb_rows, nb_columns)
    ]
    for field in fields:
        file.write(field)

### Write header for binary label dataset

In [7]:
def write_header_label(file, nb_labels):
    """Write the 8-byte header of a binary label dataset to ``file``.

    The header is two big-endian 32-bit integers: the magic number 2049
    followed by the number of labels.

    Args:
        file: Binary file object opened for writing.
        nb_labels: Number of labels stored in the file.
    """
    fields = [value.to_bytes(4, byteorder='big') for value in (2049, nb_labels)]
    for field in fields:
        file.write(field)

### Create binary image dataset from index file

In [8]:
def generate_binary_dataset_from_image(path, number, offset=0, file_number=""):
    """Pack the images listed in an index file into one binary ``.ubyte`` file.

    The image paths come from ``get_data_from_index_file(path, number,
    offset)``. The output path is ``path`` up to its first ``.``, followed by
    ``file_number`` and ``.ubyte``. The header is written with
    ``write_header_image``, using the size of the first image. The raw pixel
    bytes of every image follow, 3 bytes per pixel for 'RGB' and 1 byte per
    pixel for 'L'.

    Pixel data is buffered and flushed once it reaches the module-level
    ``buffer_size``. If ``buffer_size`` is still -1 it is set to the byte
    size of the first image processed.

    Args:
        path: Path of the index file listing the images.
        number: Number of images to pack (-1 for all).
        offset: Index of the first image to pack.
        file_number: Suffix inserted before the ``.ubyte`` extension.

    Returns:
        None. When an image has a mode other than 'RGB' or 'L', a message is
        printed and the function returns early; the output file is still
        closed.
    """
    global buffer_size
    filenames = get_data_from_index_file(path, number, offset)
    filename = path.split('.')[0] + file_number + '.ubyte'
    header_written = False
    pending = bytearray()
    # The context manager guarantees the output is flushed and closed, even
    # on the early return below or when an image fails to load.
    with open(filename, 'wb') as out:
        # NOTE(review): the purpose of this one-second pause is not visible
        # here; it is kept unchanged - confirm whether it is still needed.
        time.sleep(1)
        for f in log_progress(filenames, every=100):
            with Image.open(f, 'r') as image:
                width, height = image.size
                if not header_written:
                    write_header_image(out, len(filenames), width, height)
                    header_written = True
                if image.mode not in ('RGB', 'L'):
                    print("Unknown mode: %s" % image.mode)
                    return None
                # Raw bytes of the image: R, G, B per pixel for 'RGB', one
                # byte per pixel for 'L'.
                raw = image.tobytes()
            if buffer_size == -1:
                buffer_size = len(raw)
            pending += raw
            if len(pending) >= buffer_size:
                out.write(pending)
                pending = bytearray()
        if pending:
            out.write(pending)

### Create binary label dataset from label file

In [9]:
def generate_binary_dataset_from_label(path, number, offset=0, file_number="", label_size=1):
    """Pack the labels listed in a label file into one binary ``.ubyte`` file.

    The labels come from ``get_data_from_index_file(path, number, offset)``.
    The output path is ``path`` up to its first ``.``, followed by
    ``file_number`` and ``.ubyte``. It holds the header written by
    ``write_header_label`` and then every label as a big-endian integer of
    ``label_size`` bytes.

    Args:
        path: Path of the label file.
        number: Number of labels to pack (-1 for all).
        offset: Index of the first label to pack.
        file_number: Suffix inserted before the ``.ubyte`` extension.
        label_size: Number of bytes used to encode each label.

    Raises:
        ValueError: If a label is not an integer literal.
        OverflowError: If a label does not fit in ``label_size`` bytes.
    """
    labels = get_data_from_index_file(path, number, offset)
    filename = path.split('.')[0] + file_number + '.ubyte'
    # Encode first, so an invalid label fails before the output is created.
    data = [int(label).to_bytes(label_size, byteorder='big') for label in labels]
    # The context manager flushes and closes the output file.
    with open(filename, 'wb') as out:
        write_header_label(out, len(labels))
        for d in log_progress(data, every=100):
            out.write(d)

### Generate a single binary dataset chunk

In [10]:
def generate_single_chunck(path, number, offset, type_file=0, file_number="", label_size=1):
    """Generate one binary chunk from an index file or a label file.

    Args:
        path: Path of the index or label file.
        number: Number of entries to pack into the chunk.
        offset: Index of the first entry to pack.
        type_file: 0 to build an image chunk; any other value builds a
            label chunk.
        file_number: Suffix inserted before the ``.ubyte`` extension.
        label_size: Number of bytes per label (label chunks only).
    """
    if type_file == 0:
        generate_binary_dataset_from_image(path, number, offset, file_number)
    else:
        # Forward the caller's label_size rather than a hard-coded 1.
        generate_binary_dataset_from_label(path, number, offset, file_number, label_size=label_size)

def generate_single_chunck_thread(path, number, offset, thread_name, type_file=0, file_number="", label_size=1):
    """Generate one chunk and return a one-line report about it.

    Wraps ``generate_single_chunck`` for use as a worker task.

    Args:
        path: Path of the index or label file.
        number: Number of entries to pack into the chunk.
        offset: Index of the first entry to pack.
        thread_name: Identifier of the worker, used only in the report.
        type_file: 0 for an image chunk, anything else for a label chunk.
        file_number: Suffix inserted before the ``.ubyte`` extension.
        label_size: Number of bytes per label (label chunks only).

    Returns:
        A message naming the worker, the file written and its size in bytes.
    """
    generate_single_chunck(path, number, offset, type_file, file_number, label_size)
    chunk_path = path.split('.')[0] + file_number + '.ubyte'
    byte_count = os.path.getsize(chunk_path)
    report_parts = [
        "Create file with thread n°",
        str(thread_name),
        " : ",
        chunk_path,
        ' (',
        str(byte_count),
        " bytes).",
    ]
    return "".join(report_parts)

### Generate binary dataset

In [11]:
def generate_chunck_dataset_thread(path, size, nb_thread, max_iteration=-1, type_file=0):
    """Split a dataset into chunks of ``size`` entries, written by worker threads.

    Chunk ``j`` covers the entries starting at offset ``j * size`` and is
    written to ``<path up to its first '.'>_<j>.ubyte``. Chunks are submitted
    in batches of ``nb_thread``; each batch is awaited before the next one
    starts. The report returned by each worker, or the exception it raised,
    is printed.

    Args:
        path: Path of the index or label file.
        size: Number of entries per chunk.
        nb_thread: Number of worker threads, and number of chunks per batch.
        max_iteration: Maximum number of chunks to generate; -1 means as
            many as needed to cover the whole file.
        type_file: 0 for image chunks, anything else for label chunks.
    """
    total = len(get_data_from_index_file(path, -1, 0))
    # Number of chunks needed to cover every entry; the last may be partial.
    nb_chunks = math.ceil(total / size)
    if max_iteration != -1:
        nb_chunks = min(nb_chunks, max_iteration)
    base = path.split('.')[0]
    with concurrent.futures.ThreadPoolExecutor(max_workers=nb_thread) as executor:
        for batch_start in range(0, nb_chunks, nb_thread):
            batch_end = min(batch_start + nb_thread, nb_chunks)
            # Map every future to the position of its worker in the batch.
            tasks = {}
            for t, j in enumerate(range(batch_start, batch_end)):
                print("Writing in " + base + "_" + str(j) + ".ubyte has started...")
                future = executor.submit(
                    generate_single_chunck_thread,
                    path, size, j * size, t, type_file, "_" + str(j),
                )
                tasks[future] = t
            for future in concurrent.futures.as_completed(tasks):
                task = tasks[future]
                try:
                    data = future.result()
                except Exception as exc:
                    # Best effort: report the failure and keep going.
                    print('%r generated an exception: %s' % (task, exc))
                else:
                    print(data)
        
def generate_chunck_dataset(path, size, max_iteration=-1, type_file=0):
    """Split a dataset into consecutive chunks of ``size`` entries.

    Chunk ``j`` covers the entries starting at offset ``j * size`` and is
    written to ``<path up to its first '.'>_<j>.ubyte``. Progress is printed
    after every chunk.

    Args:
        path: Path of the index or label file.
        size: Number of entries per chunk.
        max_iteration: Maximum number of chunks to generate; -1 means as
            many as needed to cover the whole file.
        type_file: 0 for image chunks, anything else for label chunks.
    """
    total = len(get_data_from_index_file(path, -1, 0))
    # Number of chunks needed to cover every entry; the last may be partial.
    nb_chunks = math.ceil(total / size)
    if max_iteration != -1:
        nb_chunks = min(nb_chunks, max_iteration)
    base = path.split('.')[0]
    for j in range(nb_chunks):
        generate_single_chunck(path, size, j * size, type_file, "_" + str(j))
        pa = base + "_" + str(j) + '.ubyte'
        done = j + 1
        print("Create file (" + str(done) + "/" + str(nb_chunks) + ") : " + pa + ' (', os.path.getsize(pa), " bytes).")
        print(str((done / nb_chunks) * 100) + " %")

In [12]:
def generate_dataset(path, number, offset=0, type_file=0):
    """Generate a single binary dataset file and print its size.

    Args:
        path: Path of the index or label file.
        number: Number of entries to pack.
        offset: Index of the first entry to pack.
        type_file: 0 for an image dataset, anything else for a label dataset.
    """
    generate_single_chunck(path, number, offset, type_file)
    output_path = path.split('.')[0] + '.ubyte'
    byte_count = os.path.getsize(output_path)
    print("Create file : " + output_path + "(" + str(byte_count) + " bytes).")

In [13]:
def read_number(file, size):
    """Read ``size`` bytes from ``file`` as an unsigned big-endian integer.

    Args:
        file: Binary file object positioned at the number.
        size: Number of bytes to read.

    Returns:
        The decoded integer.
    """
    raw = file.read(size)
    return int.from_bytes(raw, byteorder='big', signed=False)

def read_label_data(filename, number_label, offset):
    """Read a window of one-byte labels from a binary label file.

    The file must start with the magic number 0x00000801, followed by the
    number of labels as a big-endian 32-bit integer and then one byte per
    label.

    Args:
        filename: Path of the binary label file.
        number_label: Maximum number of labels to read.
        offset: Index of the first label to read.

    Returns:
        A ``numpy.uint8`` array with the requested labels, cut short at the
        end of the file. Two special cases are kept for existing callers:
        ``0`` when the magic number does not match, and the list
        ``[number_of_items, 0]`` when ``offset`` is past the last label.
    """
    header_size = 8
    magic_number = (0x00000801).to_bytes(4, byteorder='big')
    with open(filename, 'rb') as file:
        if file.read(4) != magic_number:
            print("This isn't a label file!")
            return 0
        number_of_items = int.from_bytes(file.read(4), byteorder='big', signed=False)
        if offset >= number_of_items:
            return [number_of_items, 0]
        if number_label + offset > number_of_items:
            number_label = number_of_items - offset
        # One byte per label: the window starts `offset` bytes after the header.
        file.seek(header_size + offset)
        buf = file.read(number_label)
    return np.frombuffer(buf, dtype=np.uint8)

def normalise_number(number, minimum, maximum):
    """Rescale ``number`` linearly so ``minimum`` maps to 0 and ``maximum`` to 1.

    Args:
        number: Value to rescale.
        minimum: Value mapped to 0.
        maximum: Value mapped to 1.

    Returns:
        ``(number - minimum) / (maximum - minimum)``.
    """
    shifted = number - minimum
    span = maximum - minimum
    return shifted / span

def read_image_data(filename, depth, number_images, offset):
    """Read a window of images from a binary image file.

    The file must start with the magic number 0x00000803, followed by three
    big-endian 32-bit integers (number of images, rows, columns) and then
    the pixel data, ``rows * columns * depth`` bytes per image.

    Args:
        filename: Path of the binary image file.
        depth: Number of bytes (channels) per pixel.
        number_images: Maximum number of images to read.
        offset: Index of the first image to read.

    Returns:
        A ``numpy.uint8`` array of shape ``(images, rows, columns, depth)``
        with the requested images, cut short at the end of the file. Two
        special cases are kept for existing callers: ``0`` when the magic
        number does not match, and the list
        ``[number_of_items, rows, columns, 0]`` when ``offset`` is past the
        last image.
    """
    header_size = 16
    magic_number = (0x00000803).to_bytes(4, byteorder='big')
    # The context manager closes the file on every return path.
    with open(filename, 'rb') as file:
        if file.read(4) != magic_number:
            print("This isn't an image file!")
            return 0
        number_of_items, number_of_rows, number_of_columns = (
            int.from_bytes(file.read(4), byteorder='big', signed=False)
            for _ in range(3)
        )
        if offset >= number_of_items:
            return [number_of_items, number_of_rows, number_of_columns, 0]
        if number_images + offset > number_of_items:
            number_images = number_of_items - offset
        image_size = number_of_rows * number_of_columns
        file.seek(header_size + image_size * offset * depth)
        buf = file.read(number_images * image_size * depth)
    data = np.frombuffer(buf, dtype=np.uint8)
    return data.reshape(number_images, number_of_rows, number_of_columns, depth)

In [14]:
def create_dataset_darknet(imgs, labels, anchorsX, anchorY, width, height, separator=','):
    """Write one annotation ``.txt`` file next to every indexed image.

    For each path listed in ``imgs``, a file with the same name but a
    ``.txt`` extension is created. It holds a single line:
    ``<label> <anchorsX> <anchorY> <width> <height>``. The label is the
    entry at the same position in ``labels``.

    Args:
        imgs: Path of the ``separator``-terminated image index file.
        labels: Path of the ``separator``-terminated label file.
        anchorsX: Second field of every annotation line.
        anchorY: Third field of every annotation line.
        width: Fourth field of every annotation line.
        height: Fifth field of every annotation line.
        separator: String that terminates every entry in both files.

    Raises:
        IndexError: If ``labels`` holds fewer entries than ``imgs``.
    """
    with open(imgs, 'r') as image_index:
        image_paths = image_index.read().split(separator)
    image_paths.pop()  # drop the empty string after the trailing separator
    with open(labels, 'r') as label_index:
        label_values = label_index.read().split(separator)
    label_values.pop()

    for i, image_path in enumerate(image_paths):
        annotation_path = os.path.splitext(image_path)[0] + '.txt'
        line = '{0} {1} {2} {3} {4}'.format(label_values[i], anchorsX, anchorY, width, height)
        with open(annotation_path, 'w') as f:
            f.write(line)