## Columbia University
### ECBM E4040 Neural Networks and Deep Learning. Fall 2019.

# Dataprocess 

In this file, we are going to implement: 

(1) Read data

(2) Process data

(3) Convert to specific type for using

(4) Start and output
<br><br>
#### PS: In every several lines we create comments for better understanding the functions.

In [1]:
import os
import numpy as np
import h5py
import random
from PIL import Image
import tensorflow as tf
import json

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def preprocess_train(train_path, train_mat, train_tfrecords, val_tfrecords):
    num_total = 33402
    num_train = 30062
    num_val = 3340
    writer_train = tf.python_io.TFRecordWriter(train_tfrecords)
    writer_val = tf.python_io.TFRecordWriter(val_tfrecords)

    num_train_c = num_train
    num_val_c = num_val
    # Load paths to each images
    images_path = tf.gfile.Glob(os.path.join(train_path, '*.png'))


    # read .mat file 
    f = h5py.File(train_mat, 'r')
    for i in range(1, num_total):
        if i % 100 == 0:
            print ('(%d/%d) ' % (i, num_total))

        # Final goal: tf.train.Example containing image(input) and length, digits(ground truth)
        # Pick up paths one by one from the list 
        image_path = images_path[i-1]
        # Extract each image's index, for example get 3974 from './data/train/3975.png' 
        index = int(image_path.split('/')[-1].split('.')[0]) - 1
        #Extract information of corresponding bbox of each image
        bbox = {}
        item = f['digitStruct']['bbox'][index].item()
        for key in ['label', 'left', 'top', 'width', 'height']:
            attr = f[item][key]
            if len(attr) > 1:
                vals = []
                for j in range(len(attr)):               
                    vals.append(f[attr.value[j].item()].value[0][0])
            else:
                vals.append(attr.value[0][0])

            bbox[key] = vals

        # Get labels from the dict
        labels = bbox['label']
        length = len(labels)
        # Transfer labels into list of length 5 digits, for example digits = [1, 2, 0, 10 ,10] when labels_of_digits = [1, 2, 0]
        digits = [10, 10, 10, 10, 10]   # digit 10 represents no digit
        # If the length of the labels is bigger than five, discard this example
        if length > 5:
            # Substract 1 from the number of data 
            if i <= num_val:
                num_val_c -= 1
            else:
                num_train_c -= 1
            continue
        # Transfer labels into list of length 5 digits, for example digits = [1, 2, 0, 10 ,10] when labels_of_digits = [1, 2, 0]
        for idx, label in enumerate(labels):  
            if label == 10: # label 10 is essentially digit zero
                digits[idx] = 0  
            else:
                digits[idx] = int(label) 

        # Get blue boxes' positions and dimensions from the dict
        # *map() function returns a list of the results after applying the given function to each item of a given iterable 
        # This statement might have the same function of attrs_left = attrs['left'], attrs_top = attrs['top], ...
        bbox_left, bbox_top, bbox_width, bbox_height = map(lambda x: [int(i) for i in x], [bbox['left'], bbox['top'], bbox['width'], bbox['height']])
        min_left, min_top, max_right, max_bottom = (min(bbox_left),
                                                    min(bbox_top),
                                                    max(map(lambda x, y: x + y, bbox_left, bbox_width)),
                                                    max(map(lambda x, y: x + y, bbox_top, bbox_height)))
        c_x, c_y, max_side = ((min_left + max_right) / 2.0,
                                        (min_top + max_bottom) / 2.0,
                                        max(max_right - min_left, max_bottom - min_top))
        rbox_left, rbox_top, rbox_width, rbox_height = (c_x - max_side / 2.0,
                                                        c_y - max_side / 2.0,
                                                        max_side,
                                                        max_side)
        cropped_left, cropped_top, cropped_width, cropped_height = (int(round(rbox_left - 0.15 * rbox_width)),
                                                                    int(round(rbox_top - 0.15 * rbox_height)),
                                                                    int(round(rbox_width * 1.3)),
                                                                    int(round(rbox_height * 1.3)))

        # Crop the image by the enlarged blue box and resize the cropped image to 64x64 and turn it into Python bytes
        image = Image.open(image_path)
        image = image.crop([cropped_left, cropped_top, cropped_left + cropped_width, cropped_top + cropped_height])
        image = np.array(image.resize([64, 64])).tobytes()
        
        # Put image(input) and length, digits(ground truth) into a tf.train.Example
        # tf.train.Example isn’t a normal Python class, but a protocol buffer.
        # Later use a tf.python_io.TFRecordWriter to write it to disk.
        image_featured = tf.train.Feature(bytes_list=tf.train.BytesList(value=[image]))
        length_featured = tf.train.Feature(int64_list=tf.train.Int64List(value=[length]))
        digits_featured = tf.train.Feature(int64_list=tf.train.Int64List(value=digits))

        example = tf.train.Example(features=tf.train.Features(feature={
            'image': image_featured,
            'length': length_featured,
            'digits': digits_featured
        }))
        if i <= num_val:
            writer_val.write(example.SerializeToString())
        else:
            writer_train.write(example.SerializeToString())
    
    num_train = num_train_c
    num_val = num_val_c
    writer_train.close()
    writer_val.close()
    return num_train, num_val




In [5]:
def preprocess_test(test_path, test_mat, test_tfrecords):
    num_test = 13068
    writer_test = tf.python_io.TFRecordWriter(test_tfrecords)


    # Load paths to each images
    images_path = tf.gfile.Glob(os.path.join(test_path, '*.png'))

    # read .mat file 
    f = h5py.File(test_mat, 'r')
    for i in range(1, num_test):
        if i % 100 == 0:
            print ('(%d/%d) ' % (i, num_test))

        # Final goal: tf.train.Example containing image(input) and length, digits(ground truth)
        image_path = images_path[i-1]
        index = int(image_path.split('/')[-1].split('.')[0]) - 1
        #Extract information of corresponding bbox of each image
        bbox = {}
        item = f['digitStruct']['bbox'][index].item()
        for key in ['label', 'left', 'top', 'width', 'height']:
            attr = f[item][key]
            if len(attr) > 1:
                vals = []
                for j in range(len(attr)):               
                    vals.append(f[attr.value[j].item()].value[0][0])
            else:
                vals.append(attr.value[0][0])

            bbox[key] = vals


        labels = bbox['label']
        length = len(labels)
        # Transfer labels into list of length 5 digits, for example digits = [1, 2, 0, 10 ,10] when labels_of_digits = [1, 2, 0]
        digits = [10, 10, 10, 10, 10]   # digit 10 represents no digit
        if length > 5:
            num_test -= 1
            continue
        for idx, label in enumerate(labels):  
            if label == 10: # label 10 is essentially digit zero
                digits[idx] = 0  
            else:
                digits[idx] = int(label) 

        # Get blue boxes' positions and dimensions from the dict
        # *map() function returns a list of the results after applying the given function to each item of a given iterable 
        # This statement might have the same function of attrs_left = attrs['left'], attrs_top = attrs['top], ...
        bbox_left, bbox_top, bbox_width, bbox_height = map(lambda x: [int(i) for i in x], [bbox['left'], bbox['top'], bbox['width'], bbox['height']])
        min_left, min_top, max_right, max_bottom = (min(bbox_left),
                                                    min(bbox_top),
                                                    max(map(lambda x, y: x + y, bbox_left, bbox_width)),
                                                    max(map(lambda x, y: x + y, bbox_top, bbox_height)))
        c_x, c_y, max_side = ((min_left + max_right) / 2.0,
                                        (min_top + max_bottom) / 2.0,
                                        max(max_right - min_left, max_bottom - min_top))
        rbox_left, rbox_top, rbox_width, rbox_height = (c_x - max_side / 2.0,
                                                        c_y - max_side / 2.0,
                                                        max_side,
                                                        max_side)
        cropped_left, cropped_top, cropped_width, cropped_height = (int(round(rbox_left - 0.15 * rbox_width)),
                                                                    int(round(rbox_top - 0.15 * rbox_height)),
                                                                    int(round(rbox_width * 1.3)),
                                                                    int(round(rbox_height * 1.3)))

        image = Image.open(image_path)
        image = image.crop([cropped_left, cropped_top, cropped_left + cropped_width, cropped_top + cropped_height])
        image = np.array(image.resize([64, 64])).tobytes()

        image_featured = tf.train.Feature(bytes_list=tf.train.BytesList(value=[image]))
        length_featured = tf.train.Feature(int64_list=tf.train.Int64List(value=[length]))
        digits_featured = tf.train.Feature(int64_list=tf.train.Int64List(value=digits))

        example = tf.train.Example(features=tf.train.Features(feature={
            'image': image_featured,
            'length': length_featured,
            'digits': digits_featured
        }))
        writer_test.write(example.SerializeToString())
        

    writer_test.close()
    return num_test

In [6]:
# Set up paths to raw data
train_path = './data/train'
test_path = './data/test'
train_mat = './data/train/digitStruct.mat'
test_mat = './data/test/digitStruct.mat'

# Define paths to store processed data
train_tfrecords = os.path.join('./data', 'train.tfrecords')
val_tfrecords = os.path.join('./data', 'val.tfrecords')
test_tfrecords = os.path.join('./data', 'test.tfrecords')
meta_file = os.path.join('./data', 'meta.json')

num_train, num_val = preprocess_train(train_path, train_mat, train_tfrecords, val_tfrecords)
num_test = preprocess_test(test_path, test_mat, test_tfrecords)

print('Number of training examples', num_train)
print('Number of validation examples', num_val)
print('Number of test examples', num_test)

# Write numbers of training, validation, test data into meta file for further call
with open(path_to_tfrecords_meta_file, 'w') as f:
        content = {
            'num_examples': {
                'train': num_train,
                'val': num_val,
                'test': num_test
            }
        }
        json.dump(content, f)

(100/13050) 
(200/13031) 
(300/13020) 
(400/12993) 
(500/12976) 


KeyboardInterrupt: 