This notebook preprocesses an existing dataset of face images and saves the preprocessed images as new files in a destination folder.

Imports

In [21]:
import math
from typing import List, Tuple
import os
import random
from PIL import Image
import numpy as np
import cv2

Variables

In [22]:
# Root folder of the source dataset (absolute, machine-specific path).
dataset_source_folder = "C:/Users/tmayet/Documents/datasets/300W/"
# Names of the two sub-folders of the source folder that are listed and read below.
indoor  = "01_Indoor"
outdoor = "02_Outdoor"

# Folder in which the preprocessed arrays and images are written.
dataset_destination_folder = "C:/Users/tmayet/Documents/datasets/300W_64/"

# When True, the indoor and outdoor file lists are shuffled before being processed.
shuffle = True

How one (image, points) pair of data is transformed into `np.ndarray`:

In [23]:
def getPointFromFile(y_file: str) -> List[Tuple[float, float]]:
    """Read the 2-D points stored in a text file.

    Every line made of exactly two whitespace-separated numbers is parsed as
    one ``(first, second)`` point. Any other line (for instance a header line
    or a lone brace) is skipped.

    Args:
        y_file: Path of the UTF-8 encoded text file to read.

    Returns:
        The parsed points as tuples of floats, in file order. The list is
        empty when no line matches.
    """
    values: List[Tuple[float, float]] = []
    with open(y_file, encoding="utf-8") as f:
        for line in f:
            # split() without argument tolerates tabs, repeated spaces and
            # leading/trailing whitespace, which split(" ") turned into extra
            # fields and therefore into silently dropped points.
            fields = line.split()
            if len(fields) != 2:
                continue
            try:
                values.append((float(fields[0]), float(fields[1])))
            except ValueError:
                # Two fields that are not both numbers (e.g. "key: value"):
                # not a point, skip it.
                continue
    return values

In [24]:
def get_number(filename: str) -> int:
    """Return the integer written between the last '_' and the following '.'.

    For example ``"indoor_001.png"`` gives ``1``.
    """
    # Keep what follows the last underscore (the whole name when there is none).
    after_underscore = filename.rpartition('_')[2]
    # Keep what precedes the first dot of that remainder.
    digits = after_underscore.partition('.')[0]
    return int(digits)

# List the content of the indoor and outdoor folders, ordered by the number found in each file name
files_indoor = sorted(os.listdir(os.path.join(dataset_source_folder, indoor)), key=get_number)
files_outdoor = sorted(os.listdir(os.path.join(dataset_source_folder, outdoor)), key=get_number)
print(f"{len(files_indoor)=}")
print(f"{len(files_outdoor)=}")

# Split each listing into image files and point files (the sorted order is kept)
files_indoor_img = [name for name in files_indoor if '.png' in name]
files_indoor_pts = [name for name in files_indoor if '.pts' in name]

files_outdoor_img = [name for name in files_outdoor if '.png' in name]
files_outdoor_pts = [name for name in files_outdoor if '.pts' in name]

print(f"{len(files_indoor_img)=}")
print(f"{len(files_indoor_pts)=}")

print(f"{len(files_outdoor_img)=}")
print(f"{len(files_outdoor_pts)=}")

# Pair the i-th image with the i-th point file and tag each pair with its origin
files_indoor = [
    (img_name, pts_name, 'indoor')
    for img_name, pts_name in zip(files_indoor_img, files_indoor_pts)
]
files_outdoor = [
    (img_name, pts_name, 'outdoor')
    for img_name, pts_name in zip(files_outdoor_img, files_outdoor_pts)
]


len(files_indoor)=600
len(files_outdoor)=600
len(files_indoor_img)=300
len(files_indoor_pts)=300
len(files_outdoor_img)=300
len(files_outdoor_pts)=300


In [25]:
def save_data(
        iteration: int,
        x_array: np.ndarray,
        y_array: np.ndarray,
        original: int,
        type_door: str
) -> None:
    """Write one preprocessed sample into ``dataset_destination_folder``.

    Three files are written, all sharing the prefix
    ``iter_{iteration}_original_{original}_type_{type_door}``:
    ``<prefix>_x_numpy`` and ``<prefix>_y_numpy`` through ``np.save`` (which
    appends the ``.npy`` extension), and ``<prefix>_x.jpeg``, a preview of
    ``x_array`` converted to an 8-bit RGB image.

    Args:
        iteration: Index of the sample in the processing order.
        x_array: Image array to save.
        y_array: Array saved next to the image (the points of the sample).
        original: Number of the sample in its original file name.
        type_door: Origin tag of the sample, used verbatim in the file names.
    """
    # Neither np.save nor Image.save creates missing folders: make sure the
    # destination exists so that the first run on a fresh machine does not fail.
    os.makedirs(dataset_destination_folder, exist_ok=True)

    # Save as numpy array
    prefix = f'iter_{iteration}_original_{original}_type_{type_door}'
    x_save = os.path.join(dataset_destination_folder, f'{prefix}_x')
    y_save = os.path.join(dataset_destination_folder, f'{prefix}_y')

    np.save(x_save+"_numpy", x_array)
    np.save(y_save+"_numpy", y_array)

    # Save as image
    x_image = Image.fromarray(x_array.astype(np.uint8)).convert('RGB')
    x_image.save(x_save+".jpeg")

In [26]:
def preprocessing_image(x_file, y_file, global_var) -> tuple[np.ndarray, List[Tuple[float, float]], dict]:
    """Crop an image to a square around its points and resize it to 64x64.

    The integer bounding box of the points read from ``y_file`` is extended to
    a square, shifted so that it starts inside the image, and used to crop the
    image. If the square is larger than the image along one axis, the image is
    zero-padded on its bottom/right side so that the crop is really square
    (otherwise the slice would be silently truncated, the resize would distort
    the image and the returned points would no longer match it).

    Args:
        x_file: Path of the image file, read with ``cv2.imread``.
        y_file: Path of the point file, read with ``getPointFromFile``.
        global_var: Dictionary returned as is; it is neither read nor modified.

    Returns:
        A tuple ``(x, y, global_var)`` where ``x`` is the 64x64 RGB crop,
        ``y`` is the list of points expressed relative to the top-left corner
        of the crop and divided by the side of the crop, and ``global_var`` is
        the unchanged input dictionary.

    Raises:
        FileNotFoundError: If the image cannot be read.
        ValueError: If no point is found, or if the points span an empty region.
    """
    x = cv2.imread(x_file)
    if x is None:
        # cv2.imread reports a missing/unreadable file by returning None
        raise FileNotFoundError(f"Could not read image file: {x_file}")
    x = cv2.cvtColor(x.astype(np.uint8), cv2.COLOR_BGR2RGB)

    y: List[Tuple[float, float]] = getPointFromFile(y_file)
    if not y:
        raise ValueError(f"No point could be read from: {y_file}")

    img_h, img_w = x.shape[0], x.shape[1]

    # Integer bounding box of the points (first coordinate = width axis,
    # second coordinate = height axis)
    xs = [point[0] for point in y]
    ys = [point[1] for point in y]
    max_height = math.ceil (max(ys))
    min_height = math.floor(min(ys))
    max_width  = math.ceil (max(xs))
    min_width  = math.floor(min(xs))

    height = max_height - min_height
    width  = max_width  - min_width

    # region if region is not square, make it square:
    # the shorter side is extended on both ends (the odd pixel goes to the bottom/right)
    diff = abs(height-width)
    if height<width:
        top_add = diff//2
        bottom_add = diff-top_add
        left_add = 0
        right_add = 0
    else:
        top_add = 0
        bottom_add = 0
        left_add = diff//2
        right_add = diff-left_add

    max_height += bottom_add
    min_height -= top_add
    max_width += right_add
    min_width -= left_add
    # endregion

    # since the region is square, the other dimension is the same
    scale_factor = max_height-min_height
    if scale_factor <= 0:
        raise ValueError(f"The points of {y_file} span an empty region")

    # region fix if the square region is outside the image
    # (the region is shifted, never shrunk, so it stays square)
    if max_height>img_h:
        diff = max_height-img_h
        min_height -= diff
        max_height = img_h

    if min_height<0:
        max_height += abs(min_height)
        min_height = 0

    if max_width>img_w:
        diff = max_width-img_w
        min_width -= diff
        max_width = img_w

    if min_width<0:
        max_width += abs(min_width)
        min_width = 0
    # endregion

    # region pad the image when the square is larger than the image itself
    # At this point min_height and min_width are >= 0, so the region can only
    # overflow on the bottom/right side.
    pad_bottom = max(0, max_height-img_h)
    pad_right  = max(0, max_width-img_w)
    if pad_bottom or pad_right:
        x = np.pad(x, ((0, pad_bottom), (0, pad_right), (0, 0)), mode="constant", constant_values=0)
    # endregion

    x = x[min_height:max_height, min_width:max_width, :]

    # resize the image to 64, 64
    x = cv2.resize(x, dsize=(64, 64))

    # recenter the data on the top-left corner of the crop
    y = [(_x-min_width, _y-min_height) for _x, _y in y]
    # rescale the data by the side of the crop
    y = [(_x/scale_factor, _y/scale_factor) for _x, _y in y]

    return  x, y, global_var

# Seed of the shuffle below. A fixed seed makes a fresh "Restart & Run All"
# produce the same processing order, hence the same `iter_<n>` output file
# names, instead of adding differently named copies to the destination folder.
shuffle_seed = 0

# Passed to and returned by preprocessing_image for every sample
global_var = dict(
    max_width=-1,
    max_height=-1,
    min_height=99999,
    min_width=99999,
)
if shuffle:
    # Dedicated generator: the state of the global `random` module is left alone
    shuffle_rng = random.Random(shuffle_seed)
    shuffle_rng.shuffle(files_indoor)
    shuffle_rng.shuffle(files_outdoor)

# Interleave the samples: indoor, outdoor, indoor, outdoor, ...
files = []
for indoor_sample, outdoor_sample in zip(files_indoor, files_outdoor):
    files.append(indoor_sample)
    files.append(outdoor_sample)
# If one list is longer than the other, its remaining samples are appended
# instead of raising an IndexError or being silently dropped.
n_paired = min(len(files_indoor), len(files_outdoor))
files.extend(files_indoor[n_paired:])
files.extend(files_outdoor[n_paired:])

for i, (image_filename, points_filename, type_mode) in enumerate(files):
    initial_number = get_number(image_filename)

    # Sub-folder of the source dataset the sample comes from
    ins = indoor if type_mode == 'indoor' else outdoor

    path_x = os.path.join(dataset_source_folder, ins, image_filename)
    path_y = os.path.join(dataset_source_folder, ins, points_filename)

    x, y, global_var = preprocessing_image(path_x, path_y, global_var)
    y = np.array(y)
    save_data(i, x, y, initial_number, type_mode)

print("END")

END


In [27]:
# `global_var` is only passed through preprocessing_image (returned as received),
# so this prints the values it was initialised with.
print(global_var)


{'max_width': -1, 'max_height': -1, 'min_height': 99999, 'min_width': 99999}
