**Dataset Link:** https://www.kaggle.com/datasets/shuvoalok/raf-db-dataset

In [6]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
import shutil

In [7]:
# RAF-DB uses its own 1-based class ids; this table names each of them.
RAF_DB_Label_Mapping = {
    1: 'surprise',
    2: 'fear',
    3: 'disgust',
    4: 'happy',
    5: 'sad',
    6: 'angry',
    7: 'neutral',
}
# Translation table: RAF-DB class id -> global class id (key of Global_Label_Mapping).
Updated_RAF_DB_Label_Mapping = {
    1: 5,
    2: 2,
    3: 1,
    4: 3,
    5: 4,
    6: 0,
    7: 6,
}
# Global class ids (0-based) and their display names.
Global_Label_Mapping = dict(enumerate(
    ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral', 'Contempt']
))
# Reverse lookup: display name -> global class id.
Global_Label_Mapping_Inverse = dict(
    zip(Global_Label_Mapping.values(), Global_Label_Mapping.keys())
)

In [8]:
def _save_png_and_record(orig_path, out_dir, split_name, idx, raf_label, records):
    """Re-save one source image as PNG in out_dir and append its label row to records."""
    raf_label = int(raf_label)
    emotion = RAF_DB_Label_Mapping[raf_label]
    filename = f"{split_name}_{idx}_{emotion}.png"

    # Use a context manager so the source file handle is released right away
    # instead of accumulating one open handle per image.
    with Image.open(orig_path) as img:
        img.save(os.path.join(out_dir, filename))

    # The CSV always stores the INTEGER global class id, for every split.
    records.append({'filename': filename, 'label': Updated_RAF_DB_Label_Mapping[raf_label]})


def process_split(base_dir, rafdb_root, original_dir, split_name, target_dir, df=None):
    """
    Convert one dataset split to PNG files and write its labels.csv.

    Images are read from <rafdb_root>/DATASET/<original_dir>/<raf_label>/ and
    written to <base_dir>/<target_dir>/ as
    "<split_name>_<index>_<emotion>.png". A "labels.csv" with the columns
    'filename' and 'label' is written next to them, ordered by <index>;
    'label' is the integer global class id (Updated_RAF_DB_Label_Mapping).

    base_dir: Root of the restructured output dataset
    rafdb_root: Root of the original RAF-DB download
    original_dir: Sub-folder of DATASET to read images from (train/test)
    split_name: Prefix for output filenames (train/val/test)
    target_dir: Output directory, relative to base_dir
    df: Optional DataFrame with 'image' and 'label' columns selecting which
        images of original_dir to process; its index is used as <index>.
        When None, every image in every label folder is processed and
        <index> is a running counter.
    """
    out_dir = os.path.join(base_dir, target_dir)
    os.makedirs(out_dir, exist_ok=True)
    source_dir = os.path.join(rafdb_root, "DATASET", original_dir)
    records = []

    if df is not None:
        # Only the rows listed in df are processed (validation/test subsets).
        for idx, row in df.iterrows():
            # Normalise to int so the folder name is "3" even if the CSV
            # column was parsed as float or string.
            raf_label = int(row['label'])
            orig_path = os.path.join(source_dir, str(raf_label), row['image'])
            _save_png_and_record(orig_path, out_dir, split_name, idx, raf_label, records)
    else:
        # Walk every label folder. Listings are sorted because os.listdir
        # returns entries in arbitrary order, which would make the index
        # assigned to each image differ between runs/machines.
        idx = 0
        for label in sorted(os.listdir(source_dir)):
            label_dir = os.path.join(source_dir, label)
            if not os.path.isdir(label_dir):
                continue

            for img_file in sorted(os.listdir(label_dir)):
                orig_path = os.path.join(label_dir, img_file)
                _save_png_and_record(orig_path, out_dir, split_name, idx, label, records)
                idx += 1

    # Explicit columns keep the CSV header intact even when the split is empty.
    labels = pd.DataFrame(records, columns=['filename', 'label'])
    if not labels.empty:
        # Order rows by the numeric index embedded in the filename.
        labels['sort_key'] = labels['filename'].str.extract(r'_(\d+)_', expand=False).astype(int)
        labels = labels.sort_values('sort_key', kind='stable').drop(columns='sort_key')

    labels.to_csv(os.path.join(out_dir, "labels.csv"), index=False)

In [20]:
def copy_images_to_label_subdirectories(base_dir):
    """
    For each subdirectory (train, test, validation) in base_dir,
    copy images into per-class subdirectories derived from their filename.

    Expected filename format: {usecase}_{index}_{label}.{ext}

    The {label} part is an emotion name; it is capitalized and looked up in
    Global_Label_Mapping_Inverse, and the file is copied (not moved) into
    <base_dir>/<usage>/<global class id>/. Files whose name does not match
    the format, or whose label is not a known emotion, are reported and
    skipped. Missing usage directories are reported and skipped as well.

    base_dir: Root directory containing the train/test/validation folders
    """
    # Define the usage folders.
    usage_dirs = ['train', 'test', 'validation']

    for usage in usage_dirs:
        usage_path = os.path.join(base_dir, usage)
        if not os.path.isdir(usage_path):
            print(f"Directory {usage_path} does not exist. Skipping.")
            continue

        # Process each file in the usage folder (sub-directories, including
        # the class folders created below, are ignored).
        for filename in os.listdir(usage_path):
            file_path = os.path.join(usage_path, filename)
            if not os.path.isfile(file_path):
                continue

            # Parse the filename. We expect at least 3 parts separated by '_'
            parts = filename.split('_')
            if len(parts) < 3:
                print(f"Filename {filename} does not match expected format. Skipping.")
                continue

            # The label is the last part, with the file extension removed.
            label_name, _ = os.path.splitext(parts[-1])
            label_id = Global_Label_Mapping_Inverse.get(label_name.capitalize())
            if label_id is None:
                # Without this guard the file would land in a bogus "None"
                # class folder.
                print(f"Filename {filename} has unknown label '{label_name}'. Skipping.")
                continue

            # Create the class subdirectory if it doesn't exist.
            label_dir = os.path.join(usage_path, str(label_id))
            os.makedirs(label_dir, exist_ok=True)

            # Copy the image (with metadata) into the class subdirectory.
            shutil.copy2(file_path, os.path.join(label_dir, filename))

In [None]:
# Where the original RAF-DB download lives and where the restructured copy goes
rafdb_root = "RAF-DB"
base_dir = "RAF-DB_Structured_Aligned"
os.makedirs(base_dir, exist_ok=True)

# Training split: every image found under DATASET/train
process_split(
    base_dir,
    rafdb_root,
    original_dir="train",
    split_name="train",
    target_dir="train",
)

# Label tables shipped with RAF-DB
test_labels = pd.read_csv(os.path.join(rafdb_root, "test_labels.csv"))
train_labels = pd.read_csv(os.path.join(rafdb_root, "train_labels.csv"))

# Halve the original test set into validation/test, keeping the class
# proportions equal in both halves (fixed seed for reproducibility)
val_df, test_df = train_test_split(
    test_labels,
    stratify=test_labels['label'],
    test_size=0.5,
    random_state=42,
)

# Validation half, read from DATASET/test and written to "validation"
process_split(
    base_dir,
    rafdb_root,
    original_dir="test",
    split_name="val",
    target_dir="validation",
    df=val_df,
)

# Test half, read from DATASET/test and written to "test"
process_split(
    base_dir,
    rafdb_root,
    original_dir="test",
    split_name="test",
    target_dir="test",
    df=test_df,
)

print("Dataset restructuring complete!")
copy_images_to_label_subdirectories(base_dir)