# Data preparation

Run this notebook to reshuffle the train, validation and test datasets. Then repeat the training and testing procedure. If the model's results do not change significantly, the model has not overfitted.

In [1]:
%load_ext autoreload
%autoreload 2
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from sklearn.model_selection import train_test_split
from skimage import transform, color

In [3]:
from matplotlib import pyplot as plt
import numpy as np
import cv2
import os
import pickle as pickle
from copy import copy
from collections import Counter
import pandas as pd
from itertools import count

ModuleNotFoundError: No module named 'pandas'

In [None]:
image_count = 0


def _ellipse_to_bbox(a, b, phi, center_x, center_y, shape):
    """Return the axis-aligned box around a rotated ellipse, clipped to the image.

    The ellipse is parametrised as
        x(t) = center_x + a*cos(t)*cos(phi) - b*sin(t)*sin(phi)
        y(t) = center_y + b*sin(t)*cos(phi) + a*cos(t)*sin(phi)
    with ``phi`` in radians. ``shape`` is ``(n_rows, n_cols)`` of the image.

    Returns ``[y_min, x_min, y_max, x_max]`` as Python ints, each clamped to
    ``[0, n_rows]`` (y values) or ``[0, n_cols]`` (x values).
    """
    # Parameter values where dx/dt = 0 and dy/dt = 0, i.e. the horizontal and
    # vertical extremes of the ellipse.
    t_x = np.arctan2(-b * np.tan(phi), a)
    x_diff = np.abs(a * np.cos(t_x) * np.cos(phi) - b * np.sin(t_x) * np.sin(phi))
    t_y = np.arctan2(b, a * np.tan(phi))
    y_diff = np.abs(b * np.sin(t_y) * np.cos(phi) + a * np.cos(t_y) * np.sin(phi))

    # Round outwards so the box always contains the whole ellipse.
    corners = [
        np.floor(center_y - y_diff),
        np.floor(center_x - x_diff),
        np.ceil(center_y + y_diff),
        np.ceil(center_x + x_diff),
    ]
    limits = (shape[0], shape[1], shape[0], shape[1])
    return [min(max(int(c), 0), limit) for c, limit in zip(corners, limits)]


def fold(n_fold):
    """Parse one FDDB ellipse list and convert its ellipses to bounding boxes.

    Reads ``data/FDDB-folds/FDDB-fold-<NN>-ellipseList.txt``. Each record is an
    image name line, a line holding the number of faces, and one line per face
    with six numbers: ``a b phi center_x center_y flag``.

    Parameters
    ----------
    n_fold : int
        Fold number; formatted with two digits into the file name.

    Returns
    -------
    fnames : list of str
        Image names (without the ``.jpg`` extension) in file order.
    bboxes : list of list
        One ``[image_index, y_min, x_min, y_max, x_max, n_rows, n_cols]`` entry
        per face, where ``n_rows, n_cols`` are the first two dimensions of the
        loaded image.

    Side effects
    ------------
    Increments the module-level ``image_count`` once per parsed image, so
    ``image_index`` keeps counting across successive calls.
    """
    global image_count
    fnames, bboxes = [], []
    list_path = "data/FDDB-folds/FDDB-fold-{n_fold:02d}-ellipseList.txt".format(n_fold=n_fold)

    with open(list_path, "r") as fin:
        try:
            for line in fin:
                fname = line.strip()
                if not fname:
                    # Blank lines (e.g. a trailing newline) are not image names.
                    continue

                shape = imread("data/originalPics/" + fname + ".jpg").shape[:2]
                n_faces = int(next(fin))

                image_bboxes = []
                for _ in range(n_faces):
                    a, b, phi, center_x, center_y, _flag = (float(c) for c in next(fin).split())
                    bbox = _ellipse_to_bbox(a, b, phi, center_x, center_y, shape)
                    image_bboxes.append([image_count, *bbox, *shape])

                # Commit the record only once it has been read completely, so
                # that positions in `fnames` always match `image_count`.
                fnames.append(fname)
                bboxes.extend(image_bboxes)
                image_count += 1
        except StopIteration:
            # The file ended in the middle of a record; the partial record is dropped.
            pass

    return fnames, bboxes

In [None]:
# fold() numbers images through the global counter, so restart it together
# with the accumulators to keep this cell re-runnable.
image_count = 0
fnames = []
bboxes = []

for n_fold in range(1, 11):
    _fnames, _bboxes = fold(n_fold)
    fnames += _fnames
    bboxes += _bboxes

# One row per face: [image_index, y_min, x_min, y_max, x_max, n_rows, n_cols].
bboxes = np.asarray(bboxes, dtype=int)

In [None]:
# Geometry of the converted dataset.
FACE_SIZE = 32        # target average side of a face box after rescaling, in pixels
TR = 8                # tolerated deviation of any face side from FACE_SIZE
MIN_IMAGE_SIDE = 40   # smallest accepted side of a rescaled image
MAX_IMAGE_SIDE = 176  # largest accepted side; rescaled images are pasted onto a 176x176 canvas

convert_scales = []

# np.unique yields the image ids in ascending order, so the result does not
# depend on set iteration order.
for image_index in np.unique(bboxes[:, 0]):
    image_bboxes = bboxes[bboxes[:, 0] == image_index]
    # Height and width of every face box of this image.
    bbox_sizes = image_bboxes[:, (3, 4)] - image_bboxes[:, (1, 2)]
    avg_size = bbox_sizes.mean()
    if avg_size <= 0:
        # Only degenerate boxes: no meaningful scale exists (avoids dividing by zero).
        continue
    rescale = FACE_SIZE / avg_size

    converted_bbox_sizes = bbox_sizes * rescale
    converted_image_size = image_bboxes[0, -2:] * rescale

    # Keep the image only if every face ends up close to FACE_SIZE and the
    # rescaled image itself stays within the accepted size range.
    if (converted_bbox_sizes.min() >= FACE_SIZE - TR and
        converted_bbox_sizes.max() <= FACE_SIZE + TR and
        converted_image_size.min() >= MIN_IMAGE_SIDE and
        converted_image_size.max() <= MAX_IMAGE_SIDE
       ):
        convert_scales.append([image_index, rescale])

In [None]:
# Number of images that passed the scale filter.
len(convert_scales)

In [None]:
convert_bboxes = []

# Saving fails when the target directory is missing, so create it up front.
os.makedirs("data/convertedPics", exist_ok=True)

for image_index, rescale in convert_scales:
    image = imread("data/originalPics/" + fnames[image_index] + ".jpg")
    if image.ndim == 2:  # image is gray
        image = transform.rescale(image, rescale, mode="reflect")
        image = color.gray2rgb(image)
    else:
        # Rescale every colour channel as its own 2-D image and stack them back.
        # NOTE(review): how rescale() treats the trailing axis of a 3-D array
        # depends on the scikit-image version (it may be scaled like a spatial
        # axis unless a channel argument is given); working per channel avoids
        # relying on that default.
        image = np.dstack([
            transform.rescale(image[..., channel], rescale, mode="reflect")
            for channel in range(image.shape[2])
        ])

    # Paste the rescaled picture into the top-left corner of a black 176x176 canvas.
    converted_image = np.zeros((176, 176, 3))
    converted_image[:image.shape[0], :image.shape[1]] = image
    imsave("data/convertedPics/" + str(image_index) + ".png", converted_image)

    # All columns after the image id (box corners and image size) are scaled
    # by the same factor as the picture.
    convert_bboxes.append(bboxes[bboxes[:, 0] == image_index, 1:] * rescale)

In [None]:
# Flatten the per-image lists into one integer table whose first column is the
# id of the image each (rescaled) row belongs to.
convert_bboxes = np.vstack([
    np.column_stack([np.full(len(scaled_rows), image_index), scaled_rows]).astype(int)
    for scaled_rows, (image_index, _scale) in zip(convert_bboxes, convert_scales)
])

In [None]:
# Unique ids of the converted images, in ascending order.
image_indeces = sorted(set(convert_bboxes[:, 0]))

# 20% of the images are held out for testing; 25% of the remaining 80%
# (i.e. 20% of the total) become the validation set, leaving about 60% for training.
# No random_state is passed, so the split is not fixed by this cell.
trainval_indeces, test_indeces = train_test_split(image_indeces, test_size=0.2)
train_indeces, val_indeces = train_test_split(trainval_indeces, test_size=0.25)

def extract_images(image_indeces, convert_bboxes):
    """Select the rows of the given images and renumber them from zero.

    Parameters
    ----------
    image_indeces : sequence of int
        Image ids to extract, in the order they should be numbered.
    convert_bboxes : 2-D numpy array
        Table whose first column holds an image id.

    Returns
    -------
    fnames : list of str
        ``"convertedPics/<image id>.png"`` for every requested image.
    bboxes : 2-D numpy array
        Rows of ``convert_bboxes`` belonging to the requested images, grouped
        by image, with the first column replaced by the position of the image
        in ``image_indeces``. For an empty ``image_indeces`` an array with zero
        rows and the same number of columns as ``convert_bboxes`` is returned.
        The input array is left unchanged.
    """
    fnames = ["convertedPics/{image_index}.png".format(image_index=image_index)
              for image_index in image_indeces]

    if not fnames:
        # np.vstack raises on an empty list, so build the empty result directly.
        return fnames, np.empty((0, convert_bboxes.shape[1]), dtype=convert_bboxes.dtype)

    result_bboxes = []
    for i, image_index in enumerate(image_indeces):
        # Boolean-mask indexing returns a copy, so the caller's array is not modified.
        part_bboxes = convert_bboxes[convert_bboxes[:, 0] == image_index]
        part_bboxes[:, 0] = i
        result_bboxes.append(part_bboxes)

    return fnames, np.vstack(result_bboxes)

# Build the file list and the renumbered box table of every split, each from
# its image ids in ascending order.
(train_fnames, train_bboxes), (val_fnames, val_bboxes), (test_fnames, test_bboxes) = (
    extract_images(sorted(split_indeces), convert_bboxes)
    for split_indeces in (train_indeces, val_indeces, test_indeces)
)

In [None]:
# Annotated images that are not part of the converted set.
original_indeces = sorted(set(bboxes[:, 0]).difference(image_indeces))

original_fnames = []
original_bboxes = []
for image_index in original_indeces:
    original_fnames += ["originalPics/" + fnames[image_index] + ".jpg"]
    # Keep every column except the old image id.
    original_bboxes += [bboxes[bboxes[:, 0] == image_index, 1:]]

# Prepend the new, zero-based image number to the rows of each image and merge
# everything into a single integer table.
original_bboxes = np.vstack([
    np.column_stack([np.full(len(image_rows), new_index), image_rows]).astype(int)
    for new_index, image_rows in enumerate(original_bboxes)
])

In [None]:
# One image path per line.
with open("data/original_fnames.csv", "w") as fout:
    for fname in original_fnames:
        fout.write(fname + "\n")

# Boxes are stored as plain nested lists.
# NOTE(review): protocol 2 is presumably chosen so the file can also be read
# from Python 2 - confirm with the consumers of this file.
with open("data/original_bboxes.pkl", "wb") as fout:
    fout.write(pickle.dumps(original_bboxes.tolist(), protocol=2))

In [None]:
# File lists first: one image path per line for every split.
for csv_path, split_fnames in (
    ("data/train_fnames.csv", train_fnames),
    ("data/val_fnames.csv", val_fnames),
    ("data/test_fnames.csv", test_fnames),
):
    with open(csv_path, "w") as fout:
        for fname in split_fnames:
            print(fname, file=fout)

# Then the box tables, pickled as plain nested lists with protocol 2.
for pkl_path, split_bboxes in (
    ("data/train_bboxes.pkl", train_bboxes),
    ("data/val_bboxes.pkl", val_bboxes),
    ("data/test_bboxes.pkl", test_bboxes),
):
    with open(pkl_path, "wb") as fout:
        pickle.dump(split_bboxes.tolist(), fout, protocol=2)