# Imports

In [4]:
import os
import math
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# ---
import sys; sys.path.append('../')
from commons.dataset import *
import modules.img_normalizer

# Constants

In [5]:
# Source (raw faces) and destination (normalized faces) dataset descriptors.
# NOTE(review): DATASETS_PATHS is not defined in this notebook; presumably it is
# provided by the `commons.dataset` star import -- TODO confirm.
DATASET_INPUT = DATASETS_PATHS.faces
DATASET_OUTPUT = DATASETS_PATHS.norm_faces

# NOTE(review): IMAGE_SIZE is not referenced by any cell visible in this notebook.
IMAGE_SIZE = 220

# Fraction of the dataset assigned to each split.
TRAIN_PERCENT = .65  # .7 (alternate value noted by the author)
VAL_PERCENT = .2
TEST_PERCENT = .15  # .1 (alternate value noted by the author)

# The splitting cell only uses TRAIN_PERCENT and the TEST/VAL ratio, so if the
# three fractions do not sum to 1 the effective val/test proportions silently
# differ from the values declared here. Fail early instead.
assert math.isclose(TRAIN_PERCENT + VAL_PERCENT + TEST_PERCENT, 1.0), \
    'TRAIN_PERCENT + VAL_PERCENT + TEST_PERCENT must sum to 1'

# Forwarded as `random_state` to train_test_split. With None the shuffle is not
# seeded, so every run produces a different split; set an int for reproducibility.
RANDOM_STATE = None

# Load dataset

In [6]:
# Read the input dataset description (CSV) into a DataFrame; later cells use
# its `path` and `setid` columns.
dataset = pd.read_csv(filepath_or_buffer=DATASET_INPUT.info)

# Apply filters

## Test filters

In [None]:
# Visual sanity check: display three randomly sampled images after normalization.
# NOTE(review): `normalizer` is not defined in any visible cell; presumably it is
# provided by `commons.dataset` or `modules.img_normalizer` -- TODO confirm.
plt.figure()
for position, image_path in enumerate(dataset.path.sample(3), start=1):
    plt.subplot(1, 3, position)
    # Convert the BGR image returned by cv2.imread to RGB before normalizing/displaying.
    rgb_image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    plt.imshow(normalizer(rgb_image))
    plt.axis('off')

## Apply filters

In [None]:
# Running index used to name the output files; reset here so that re-running
# the cell regenerates the same file names.
count = 0

# cv2.imwrite does not create missing directories, so make sure the target exists.
os.makedirs(DATASET_OUTPUT.data, exist_ok=True)


def process(entry):
    """Normalize one image and write it to the output dataset folder.

    Parameters
    ----------
    entry : row of `dataset`; only its `path` attribute is read.

    Returns
    -------
    str
        Path of the normalized PNG written under ``DATASET_OUTPUT.data``.

    Raises
    ------
    FileNotFoundError
        If the source image cannot be read.
    OSError
        If the normalized image cannot be written.
    """
    # `count` is rebound below; without this declaration Python treats it as a
    # local variable and the first read raises UnboundLocalError.
    global count

    # cv2.imread returns None (no exception) when the file is missing/unreadable.
    bgr_image = cv2.imread(entry.path)
    if bgr_image is None:
        raise FileNotFoundError(f'Could not read image: {entry.path}')

    # NOTE(review): `normalizer` is not defined in any visible cell; presumably
    # it comes from `commons.dataset` or `modules.img_normalizer` -- TODO confirm.
    # It is fed an RGB image and its result is converted back to BGR for imwrite.
    normalized_rgb = normalizer(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB))

    # Zero-padded index ("007.png"); the previous `{count:3}` padded with spaces.
    output_path = os.path.join(DATASET_OUTPUT.data, f'{count:03}.png')
    if not cv2.imwrite(output_path, cv2.cvtColor(normalized_rgb, cv2.COLOR_RGB2BGR)):
        raise OSError(f'Could not write image: {output_path}')

    count += 1
    return output_path


# Same rows/columns as the input dataset, with `path` pointing at the normalized
# images (previously an empty frame was written, i.e. a header-only CSV).
norm_faces_dataset = dataset.assign(
    path=[process(entry) for _, entry in dataset.iterrows()])
norm_faces_dataset.to_csv(DATASET_OUTPUT.info, index=False)

## Split data into training, validation and test groups

In [None]:
# Stage 1: carve out the training split, stratified on `setid`; `tmp` holds the
# remaining rows (validation + test).
# NOTE(review): this splits `dataset` (the input faces), not `norm_faces_dataset`
# -- confirm that is intended.
train_dataset, tmp = train_test_split(
    dataset,
    train_size=TRAIN_PERCENT,
    stratify=dataset.setid,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Stage 2: divide the remainder between validation and test. TEST_PERCENT is a
# fraction of the whole dataset, so it is rescaled to a fraction of `tmp`.
val_dataset, test_dataset = train_test_split(
    tmp,
    test_size=TEST_PERCENT / (TEST_PERCENT + VAL_PERCENT),
    stratify=tmp.setid,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# The three splits must account for every row of the dataset.
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == len(dataset)


def _print_split_summary(label, split, total_rows):
    """Print row count, min-max rows per `setid`, and share of the dataset for one split.

    `label` is the already tab-aligned line prefix, `split` the split DataFrame,
    `total_rows` the number of rows in the full dataset.
    """
    rows_per_setid = split.setid.value_counts()
    print(f"{label}{len(split)}  \t{rows_per_setid.min()}-{rows_per_setid.max()} \t{len(split)/total_rows:.5f}")


print('\t\tsetid\tmin-max\tproportion')
_print_split_summary("> train: \t", train_dataset, len(dataset))
_print_split_summary("> val: \t\t", val_dataset, len(dataset))
_print_split_summary("> test: \t", test_dataset, len(dataset))

		setid	min-max	proportion
> train: 	287  	9-10 	0.65977
> val: 		87  	3-3 	0.20000
> test: 	61  	2-3 	0.14023


In [None]:
# Leakage check: for each pair of splits, count how many paths of the first
# split also appear in the second. Only `False` should show up in the counts.
for first_split, second_split in (
    (train_dataset, test_dataset),
    (train_dataset, val_dataset),
    (val_dataset, test_dataset),
):
    shared_paths = first_split.path.isin(second_split.path)
    print(shared_paths.value_counts(), end='\n\n')

path
False    287
Name: count, dtype: int64

path
False    287
Name: count, dtype: int64

path
False    87
Name: count, dtype: int64



## Save / load dataset slices (train, val, test)

## Save

In [None]:
# Persist each split as its own CSV, without the index column.
# NOTE(review): `DATASETS_PATHS.norm_faces.info` is joined with a file name here
# (i.e. treated as a directory), while other cells pass `.info` directly to
# read_csv/to_csv as a CSV file path -- confirm which one it is.
for csv_name, split_frame in (
    ('train_dataset.csv', train_dataset),
    ('val_dataset.csv', val_dataset),
    ('test_dataset.csv', test_dataset),
):
    split_frame.to_csv(os.path.join(DATASETS_PATHS.norm_faces.info, csv_name), index=False)

## Load

In [None]:
# Reload the three splits from their CSV files, in train / val / test order.
# NOTE(review): `DATASETS_PATHS.norm_faces.info` is joined with a file name here
# (i.e. treated as a directory), while other cells pass `.info` directly to
# read_csv/to_csv as a CSV file path -- confirm which one it is.
train_dataset, val_dataset, test_dataset = (
    pd.read_csv(os.path.join(DATASETS_PATHS.norm_faces.info, csv_name))
    for csv_name in ('train_dataset.csv', 'val_dataset.csv', 'test_dataset.csv')
)