# Data Preprocessing and Data Splitting for Datasets (5 datasets are used)


The first data set: https://www.kaggle.com/datasets/mariafrenti/age-prediction

The second data set: https://www.kaggle.com/datasets/frabbisw/facial-age/data

The third data set: https://www.kaggle.com/datasets/mostafaebrahiem/egyptian-kids-faces

The fourth data set: https://www.kaggle.com/datasets/tunguz/1-million-fake-faces?select=1m_faces_00_01_02_03

The fifth data set: https://www.kaggle.com/datasets/xhlulu/flickrfaceshq-dataset-nvidia-resized-256px


### Importing Required Libraries

In [2]:
import cv2
import os



## Data Preprocessing

#### Step 1 - cropping:

In [2]:
# Crop every detected face out of each image in `folder_path` and save the
# crops into `output_folder`.

# Load the pre-trained frontal-face Haar cascade that ships with OpenCV
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

# Path to the folder containing the images
folder_path = "0-3"  # Replace it for each needed class with the actual folder path

# Output folder to save the processed images
# (by default the same folder, so crops are written next to the originals)
output_folder = "0-3"  # Replace it for each needed class
os.makedirs(output_folder, exist_ok=True)

# Fraction of the detected box added on each side of the face.
# Adjust this factor to control the zoom level (0.05 , 0.1), based on whether
# or not the faces are near and clear in the images.
expand_factor = 0.05

# os.listdir returns a snapshot, so crops written during the loop are not
# picked up again within the same run.
for filename in os.listdir(folder_path):
    # Process image files only; compare case-insensitively so that
    # names such as "IMG_01.JPG" are not skipped.
    if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue

    img_path = os.path.join(folder_path, filename)

    # Load the image
    img = cv2.imread(img_path, cv2.IMREAD_COLOR)

    # cv2.imread signals an unreadable/corrupted file by returning None
    if img is None:
        print(f"Failed to load or corrupted: {img_path}")
        continue  # Skip to the next image

    img_height, img_width = img.shape[:2]

    # Detect faces in the image
    faces = face_cascade.detectMultiScale(img, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

    base_filename, _ = os.path.splitext(filename)

    # Iterate through detected faces
    for i, (x, y, w, h) in enumerate(faces):
        # Expand the box by the same margin on every side, then clamp each
        # edge to the image so the crop never reaches outside the picture.
        pad_x = int(w * expand_factor)
        pad_y = int(h * expand_factor)
        left = max(int(x) - pad_x, 0)
        top = max(int(y) - pad_y, 0)
        right = min(int(x) + int(w) + pad_x, img_width)
        bottom = min(int(y) + int(h) + pad_y, img_height)

        # Crop the detected face from the image (original color)
        face_img = img[top:bottom, left:right]

        # The first face keeps the "<name>_cropped.jpg" name; additional
        # faces get an index suffix so they do not overwrite each other.
        if i == 0:
            output_filename = f"{base_filename}_cropped.jpg"
        else:
            output_filename = f"{base_filename}_cropped_{i}.jpg"

        # Save the cropped face image and report a failed write
        output_path = os.path.join(output_folder, output_filename)
        if not cv2.imwrite(output_path, face_img):
            print(f"Failed to write: {output_path}")


# Note: images in which no face is detected are neither cropped nor moved; they stay unchanged in the source folder.

#### Step 2 - Resizing:

In [3]:

def is_image_file(filename):
    """Return True when `filename` ends with a known image extension.

    The comparison ignores letter case, so "photo.JPG" and "photo.jpg"
    are treated the same way.
    """
    lowered = filename.lower()
    return any(
        lowered.endswith(suffix)
        for suffix in (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff")
    )

def resize_image(input_path, output_path, target_size):
    """
    Resize an image to the target size using OpenCV.

    Problems are reported with a printed message instead of an exception:
    an unreadable input, an empty image, or a failed write of the result.

    Args:
        input_path (str): Path to the input image file.
        output_path (str): Path to save the resized image.
        target_size (tuple): A tuple (width, height) specifying the target size.

    Returns:
        None
    """
    image = cv2.imread(input_path)

    # cv2.imread returns None when the file cannot be read or decoded
    if image is None:
        print(f"Error: Failed to read {input_path}")
        return

    if image.size == 0:
        print(f"Warning: {input_path} is an empty image.")
        return

    resized_image = cv2.resize(image, target_size)

    # cv2.imwrite reports failure through its return value; surface it
    # rather than silently leaving the output missing or stale.
    if not cv2.imwrite(output_path, resized_image):
        print(f"Error: Failed to write {output_path}")

# Source folder holding the images of one class
dataset_dir = "12+"   # Replace it for each needed class

# Destination folder for the resized images
output_dir = "12+" #Replace it for each needed class

# Every image is resized to 224x224 pixels
target_size = (224, 224)

# Make sure the destination exists before anything is written into it
os.makedirs(output_dir, exist_ok=True)

# Snapshot of the entries currently in the source folder
file_list = os.listdir(dataset_dir)

# Output paths of all images that were handed to resize_image
resized_image_filenames = []

for filename in file_list:
    # Entries without an image extension are ignored
    if not is_image_file(filename):
        continue

    # Same file name on both sides; only the folder differs
    input_path = os.path.join(dataset_dir, filename)
    output_path = os.path.join(output_dir, filename)

    resize_image(input_path, output_path, target_size)
    resized_image_filenames.append(output_path)

print("Resizing complete.")

Resizing complete.


## Data Splitting

### It uses a deterministic splitting strategy based on the order of the images within each class label directory.


So the splitting for each of the class labels 0-3, 4+, 9+, 12+, 17+ is:

80% of the data for training

10% of the data for validation

10% of the data for testing



In [6]:
# Split each class folder under `dataset_dir` into train/validation/test
# sub-folders by moving the image files.

# Define your dataset directory and class labels
dataset_dir = 'Faces_images'
class_labels = ['0-3','4+', '9+', '12+', '17+']  # class labels

# Define data split ratios for class_labels
train_split = 0.8 # 80% of the data for training
val_split = 0.1  # 10% of the data for validation
test_split = 0.1 # 10% of the data for testing (implicitly: whatever remains)

# Create train, validation, and test directories
train_dir = os.path.join(dataset_dir, 'train')
val_dir = os.path.join(dataset_dir, 'validation')
test_dir = os.path.join(dataset_dir, 'test')

# Subdirectory creation: one folder per class label inside every split
for split_dir in (train_dir, val_dir, test_dir):
    for label in class_labels:
        os.makedirs(os.path.join(split_dir, label), exist_ok=True)

# Split data into train, validation, and test directories for class_labels
for label in class_labels:
    label_dir = os.path.join(dataset_dir, label)

    # os.listdir returns entries in arbitrary order; sorting by name makes
    # the split identical on every machine and every run.
    images = sorted(os.listdir(label_dir))
    num_images = len(images)

    # Index up to which images are used for training (first 80%)
    train_split_idx = int(num_images * train_split)
    # Index up to which images are used for validation (next 10%)
    val_split_idx = int(num_images * (train_split + val_split))

    train_images = images[:train_split_idx]  # indices 0 .. train_split_idx - 1
    val_images = images[train_split_idx:val_split_idx]  # train_split_idx .. val_split_idx - 1
    test_images = images[val_split_idx:]  # everything that remains

    # Move images to their respective directories
    for split_dir, split_images in (
        (train_dir, train_images),
        (val_dir, val_images),
        (test_dir, test_images),
    ):
        for image in split_images:
            src = os.path.join(label_dir, image)
            dest = os.path.join(split_dir, label, image)
            os.rename(src, dest)

# Delete old directories (empty once every image has been moved)
for label in class_labels:
    label_dir = os.path.join(dataset_dir, label)
    os.rmdir(label_dir)

print("Splitting complete")

Splitting complete


#### So, after running this code, the dataset is organized into three main directories, each containing subdirectories for class labels, with images appropriately split into training, validation, and test sets according to the specified ratios.
#### This organization is typically done to ensure each class has more than half real face images, as well as being a step before training a machine learning model on the dataset.
#### This predictability can be useful for reproducibility in research or when you want to ensure that the data split remains constant across different runs of the code.