# Dataset Builder for FER2013+ (Utility Module)

**Description**: This module processes the FER2013+ dataset, detecting faces and landmarks, and saving the processed data into .npz files for training and testing.

## Libraries

In [1]:
import json
import os
from collections import defaultdict
from pathlib import Path

import cv2
import numpy as np
from tqdm import tqdm

from preprocessing.face_detection import FaceDetectorMP
from preprocessing.landmark_detection import LandmarkDetectorMP
from utils.logger import get_logger

In [2]:
# Logger setup: a single module-level logger obtained from the project's
# get_logger helper (called with this module's __name__). The processing
# functions defined further down report their progress through it.
logger = get_logger(__name__)

## Functions

In [3]:
def count_images_per_class(dataset_dir):
    """
    Count the number of image files per class in a dataset directory.

    Every immediate subdirectory of ``dataset_dir`` is treated as one class.
    Only regular files located directly inside a class directory are counted;
    nested directories (for example ``.ipynb_checkpoints``) are ignored so
    they cannot inflate the totals. Files are not filtered by extension.

    A summary sorted from the smallest to the largest class is printed as a
    side effect.

    Args:
        dataset_dir (str or Path): Path to the 'train' or 'test' dataset folder.
    Returns:
        dict: Dictionary mapping class names to number of files, inserted in
            sorted class-directory order.
    Raises:
        OSError: If ``dataset_dir`` cannot be listed (e.g. FileNotFoundError
            when it does not exist).
    """
    dataset_dir = Path(dataset_dir)
    class_counts = {}

    for class_dir in sorted(dataset_dir.iterdir()):
        if class_dir.is_dir():
            # Count files only: a subdirectory is not an image.
            class_counts[class_dir.name] = sum(
                1 for entry in class_dir.iterdir() if entry.is_file()
            )

    # Show the results ordered from the least to the most populated class
    print("Conteo de imágenes por clase:")
    for class_name, count in sorted(class_counts.items(), key=lambda x: x[1]):
        print(f"  - {class_name}: {count} imágenes")

    return class_counts

In [4]:
# Per-class image counts for the raw training split
train_dir = "../data/fer2013plus/train"
train_class_count = count_images_per_class(dataset_dir=train_dir)

Conteo de imágenes por clase:
  - contempt: 165 imágenes
  - disgust: 191 imágenes
  - fear: 652 imágenes
  - anger: 2466 imágenes
  - sadness: 3514 imágenes
  - surprise: 3562 imágenes
  - happiness: 7528 imágenes
  - neutral: 10308 imágenes


In [5]:
# Per-class image counts for the raw test split
test_dir = "../data/fer2013plus/test"
test_class_count = count_images_per_class(dataset_dir=test_dir)

Conteo de imágenes por clase:
  - contempt: 51 imágenes
  - disgust: 57 imágenes
  - fear: 167 imágenes
  - anger: 644 imágenes
  - sadness: 856 imágenes
  - surprise: 900 imágenes
  - happiness: 1827 imágenes
  - neutral: 2597 imágenes


El desbalance de clases es muy pronunciado; se opta por descartar las clases con baja representación, pues no se tiene certeza de cuántas imágenes sí lograrán ser detectadas.
Clases a eliminar: `contempt`, `disgust` y `fear`.

In [6]:
def _extract_landmark_vector(image_path, detector, landmarker):
    """
    Run the read -> detect -> crop -> landmarks -> vector pipeline on one image.

    Args:
        image_path (str): Path of the image file to process.
        detector: Object exposing ``detect_faces(image)`` and
            ``crop_face(image, detection_result)``.
        landmarker: Object exposing ``detect_landmarks(cropped)`` and
            ``extract_landmark_vector(landmark_result)``.

    Returns:
        tuple: ``(vector, None)`` on success, or ``(None, reason)`` where
            ``reason`` names the first stage that returned ``None``.
    """
    image = cv2.imread(image_path)
    if image is None:
        return None, "failed_read"

    detection_result = detector.detect_faces(image)
    if detection_result is None:
        return None, "no_face_detected"

    cropped = detector.crop_face(image, detection_result)
    if cropped is None:
        return None, "failed_crop"

    landmark_result = landmarker.detect_landmarks(cropped)
    if landmark_result is None:
        return None, "no_landmarks_detected"

    vector = landmarker.extract_landmark_vector(landmark_result)
    if vector is None:
        return None, "vector_extraction_failed"

    return vector, None


def process_partition(partition_dir, detector, landmarker, balance_train=False):
    """
    Process a single partition of the dataset (train or test).

    Each allowed class directory is scanned in sorted order and every file in
    it is passed through the face/landmark pipeline. Files that fail at any
    stage are skipped and tallied by reason in the log output.

    Label indices are fixed by the alphabetical order of the allowed classes
    (anger=0, happiness=1, neutral=2, sadness=3, surprise=4). They do not
    depend on which classes happen to yield valid samples, so the train and
    test partitions always share the same encoding.

    Args:
        partition_dir (str): Path to the partition directory (train or test).
        detector (FaceDetectorMP): Face detection instance.
        landmarker (LandmarkDetectorMP): Landmark detection instance.
        balance_train (bool): Whether to apply class balancing (only for train).
            Balancing keeps the first ``min_samples`` vectors of every class
            (in sorted filename order), where ``min_samples`` is the size of
            the smallest class that produced at least one valid sample.

    Returns:
        Tuple[np.ndarray, np.ndarray]: Processed features (float32) and labels
            (int32) as numpy arrays. Both are empty when no image could be
            processed.

    Raises:
        FileNotFoundError: If ``partition_dir`` does not exist (raised by
            ``os.listdir``).
    """
    # Allowed classes for FER2013+ (underrepresented classes removed).
    # Removed: "contempt", "disgust", "fear"
    allowed_classes = {"anger", "sadness", "surprise", "happiness", "neutral"}
    # Stable class -> integer label assignment, independent of directory contents
    label_map = {name: index for index, name in enumerate(sorted(allowed_classes))}

    partition_name = Path(partition_dir).name
    logger.info(f"Processing partition: {Path(partition_dir).as_posix()}")

    # Per-class accumulators; only classes with at least one valid sample are stored
    class_vectors = {}  # class_name -> list of vectors
    class_labels = {}   # class_name -> list of label indices
    skipped_total = defaultdict(int)

    # Iterate over each entry of the partition directory in sorted order
    for label_name in sorted(os.listdir(partition_dir)):
        # Skip entries that are not one of the allowed classes
        if label_name not in allowed_classes:
            logger.info(f"Skipping class '{label_name}' (not in allowed list)")
            continue

        # Skip allowed names that are not directories
        label_path = os.path.join(partition_dir, label_name)
        if not os.path.isdir(label_path):
            continue

        # Sorted listing keeps the processing (and balancing) order deterministic
        images = sorted(os.listdir(label_path))
        logger.info(f"Processing class '{label_name}' with {len(images)} images")
        vectors = []
        skipped_counts = defaultdict(int)

        for image_name in tqdm(images, desc=label_name, leave=False):
            vector, skip_reason = _extract_landmark_vector(
                os.path.join(label_path, image_name), detector, landmarker
            )
            if skip_reason is not None:
                skipped_counts[skip_reason] += 1
                continue
            vectors.append(vector)

        # Log the results for this class and fold them into the global tally
        logger.info(f"Processed {len(vectors)} valid images for class '{label_name}'")
        for k, v in skipped_counts.items():
            logger.info(f"  Skipped {k}: {v}")
            skipped_total[k] += v

        if vectors:
            class_vectors[label_name] = vectors
            class_labels[label_name] = [label_map[label_name]] * len(vectors)
        else:
            logger.info(
                f"No valid samples for class '{label_name}'; "
                f"label {label_map[label_name]} will be absent from {partition_name}"
            )

    if balance_train and class_vectors:
        # Truncate every class to the size of the smallest one
        min_samples = min(len(v) for v in class_vectors.values())
        logger.info(f"Balancing classes to {min_samples} samples each")

        all_features = []
        all_labels = []
        for class_name in class_vectors:
            all_features.extend(class_vectors[class_name][:min_samples])
            all_labels.extend(class_labels[class_name][:min_samples])
    else:
        # No balancing (or nothing to balance): concatenate everything as is
        all_features = [v for vectors in class_vectors.values() for v in vectors]
        all_labels = [l for labels in class_labels.values() for l in labels]

    # Convert lists to numpy arrays
    features_array = np.array(all_features, dtype=np.float32)
    labels_array = np.array(all_labels, dtype=np.int32)

    logger.info(f"Final sample count for {partition_name}: {len(features_array)}")
    logger.info("Global skipped summary:")
    for k, v in skipped_total.items():
        logger.info(f"  {k}: {v}")

    return features_array, labels_array


In [7]:
def build_dataset(data_dir="../data/fer2013plus", output_dir="../data/processed"):
    """
    Process the 'train' and 'test' partitions found under ``data_dir`` and
    store each one as a compressed ``<partition>.npz`` file in ``output_dir``.

    Class balancing is requested for the training partition only. The output
    directory is created when it does not exist yet.

    Args:
        data_dir (str): Path to the root directory containing 'train' and 'test' partitions.
        output_dir (str): Path to the directory where processed dataset will be saved.
    """
    os.makedirs(output_dir, exist_ok=True)

    # One detector pair is shared by both partitions
    face_detector = FaceDetectorMP()
    landmark_detector = LandmarkDetectorMP()

    for split in ("train", "test"):
        split_dir = os.path.join(data_dir, split)
        npz_path = os.path.join(output_dir, f"{split}.npz")

        # Only the training split is balanced
        features, labels = process_partition(
            split_dir, face_detector, landmark_detector, split == "train"
        )

        # Arrays are stored under the keys "features" and "labels"
        np.savez_compressed(npz_path, features=features, labels=labels)
        logger.info(
            f"Saved {Path(split_dir).as_posix()} dataset with {len(features)} samples to: {Path(npz_path).as_posix()}"
        )

    logger.info("Dataset building completed.")


In [8]:
# Build the dataset with the default locations: reads the 'train' and 'test'
# partitions under ../data/fer2013plus and writes train.npz / test.npz to
# ../data/processed. This runs the full detection pipeline on every image.
build_dataset()

[INFO] Processing partition: ../data/fer2013plus/train
[INFO] Processing class 'anger' with 2466 images
[INFO] Processed 1598 valid images for class 'anger'      
[INFO]   Skipped no_landmarks_detected: 568
[INFO]   Skipped no_face_detected: 300
[INFO] Skipping class 'contempt' (not in allowed list)
[INFO] Skipping class 'disgust' (not in allowed list)
[INFO] Skipping class 'fear' (not in allowed list)
[INFO] Processing class 'happiness' with 7528 images
[INFO] Processed 6606 valid images for class 'happiness'      
[INFO]   Skipped no_landmarks_detected: 704
[INFO]   Skipped no_face_detected: 218
[INFO] Processing class 'neutral' with 10308 images
[INFO] Processed 8428 valid images for class 'neutral'        
[INFO]   Skipped no_landmarks_detected: 1463
[INFO]   Skipped no_face_detected: 417
[INFO] Processing class 'sadness' with 3514 images
[INFO] Processed 2343 valid images for class 'sadness'      
[INFO]   Skipped no_face_detected: 366
[INFO]   Skipped no_landmarks_detected: 805
[

In [9]:
# Integer label -> class name. The indices follow the alphabetical order of
# the five classes kept by process_partition.
label_map = {
    0: "anger",
    1: "happiness",
    2: "neutral",
    3: "sadness",
    4: "surprise"
}

# NOTE: JSON object keys are always strings, so the integer keys above are
# written as "0".."4"; cast them back with int() when loading the file.
label_map_path = "../data/processed/label_map.json"
with open(label_map_path, "w", encoding="utf-8") as f:
    json.dump(label_map, f, indent=2)
logger.info(f"Label map saved to {label_map_path}")

[INFO] Label map saved to ../data/processed/label_map.json


In [None]:
# Sanity check of the saved training split. The context manager closes the
# underlying .npz archive once its arrays have been copied into a plain dict,
# so no file handle is left open and `data[...]` keeps working afterwards.
with np.load("../data/processed/train.npz") as npz_file:
    data = {key: npz_file[key] for key in npz_file.files}

print(data["features"].shape)  # (n_samples, n_features)
print(data["labels"].shape)    # (n_samples,)
print(np.bincount(data["labels"]))  # Per-class sample counts; equal when balanced

(7990, 1434)
(7990,)
[1598 1598 1598 1598 1598]
