In [48]:
from torch.utils.data import Dataset
from torchvision import transforms
import torch
import numpy as np
import os
from PIL import Image
import glob

class MultimodalEmotionDataset(Dataset):
    """Dataset yielding (face frame sequence, MFCC matrix, class index) triples.

    Every record of ``data_array`` is unpacked as ``(mfcc, folder_name, label)``.
    ``folder_name`` names a sub-directory of ``face_root`` holding the face
    images for that sample; ``sequence_len`` of them are loaded per item.
    """

    # File suffixes (compared case-insensitively) that count as face images.
    _IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")

    def __init__(self, data_array, face_root, label_map, transform=None, sequence_len=5):
        """
        data_array: [(mfcc, folder_name, label), ...]
        face_root: path to the folder containing subfolders with face images
        label_map: {'happy': 0, ...}
        transform: torchvision transform applied to each PIL image; if it is
            omitted (or does not return a tensor) the image is converted with
            ``transforms.ToTensor()`` so the frames can still be stacked
        sequence_len: number of face frames to load per sample (must be >= 1)

        Raises:
            ValueError: if ``sequence_len`` is smaller than 1.
        """
        if sequence_len < 1:
            raise ValueError(f"sequence_len must be >= 1 (got {sequence_len})")
        self.data = data_array
        self.face_root = face_root
        self.label_map = label_map
        self.transform = transform
        self.sequence_len = sequence_len

    def __len__(self):
        """Number of records in ``data_array``."""
        return len(self.data)

    @staticmethod
    def _evenly_spaced_indices(total, count):
        """Return ``count`` distinct indices spread over ``0 .. total - 1``.

        The first and last index are always included when ``count > 1``.
        Requires ``total >= count >= 1``; integer arithmetic keeps the result
        strictly increasing, so no frame is picked twice.
        """
        if count == 1:
            return [0]
        return [(i * (total - 1)) // (count - 1) for i in range(count)]

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            face_tensor: frames stacked along a new leading axis, i.e.
                ``(sequence_len, *frame_shape)``; with an RGB image and a
                tensor-producing transform that is ``(sequence_len, 3, H, W)``.
            mfcc_tensor: ``mfcc.T`` as float32 with a leading singleton axis.
                NOTE(review): the original comment states (1, 40, T), which
                assumes ``mfcc`` is stored as (T, 40) - confirm.
            label_tensor: scalar ``torch.long`` holding ``label_map[label]``.

        Raises:
            ValueError: if the sample folder holds fewer than ``sequence_len``
                images.
            KeyError: if ``label`` is missing from ``label_map``.
        """
        mfcc, folder_name, label = self.data[idx]

        # Collect every image in the sample folder; sorting the full paths gives
        # a deterministic (lexicographic) frame order.
        face_folder = os.path.join(self.face_root, folder_name)
        face_files = sorted(
            os.path.join(face_folder, fname)
            for fname in os.listdir(face_folder)
            if fname.lower().endswith(self._IMAGE_EXTENSIONS)
        )
        if len(face_files) < self.sequence_len:
            raise ValueError(f"Not enough face images in {face_folder} (found {len(face_files)}, expected {self.sequence_len})")

        # Sample across the WHOLE folder. The previous `files[::step][:n]`
        # degenerated to "the first n frames" whenever len(files) < 2 * n.
        frame_indices = self._evenly_spaced_indices(len(face_files), self.sequence_len)

        face_sequence = []
        for frame_idx in frame_indices:
            # convert() returns a fully loaded copy, so the file handle can be
            # released immediately instead of lingering until garbage collection.
            with Image.open(face_files[frame_idx]) as raw_img:
                img = raw_img.convert("RGB")
            if self.transform:
                img = self.transform(img)
            if not torch.is_tensor(img):
                # torch.stack needs tensors; cover transform=None and
                # transforms that hand back a PIL image.
                img = transforms.ToTensor()(img)
            face_sequence.append(img)

        face_tensor = torch.stack(face_sequence)

        mfcc_tensor = torch.tensor(mfcc.T, dtype=torch.float32).unsqueeze(0)

        label_tensor = torch.tensor(self.label_map[label], dtype=torch.long)

        return face_tensor, mfcc_tensor, label_tensor


In [49]:
# Location of the preprocessed sample table, relative to the notebook's working directory.
dataset_path = r"../data/Processed/final_dataset.npy"
# allow_pickle=True unpickles arbitrary Python objects stored in the file -
# only load files produced by this project's own preprocessing.
data_array = np.load(file=dataset_path, allow_pickle=True)

In [52]:
# Resize every face image to 224x224 and convert it to a tensor.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# Emotion codes in the source data:
# 01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised.
# Class indices are (code - 1), i.e. 0..7: PyTorch classification losses expect
# targets in [0, num_classes), so the former 1..8 mapping put 'surprised' out of
# range for an 8-way output layer.
label_map = {'neutral': 0, 'calm': 1, 'happy': 2, 'sad': 3, 'angry': 4, 'fearful': 5, 'disgust': 6, 'surprised': 7}

# Dataset over ALL samples, loading a single face frame per sample.
dataset = MultimodalEmotionDataset(
    data_array,
    face_root="../data/Processed/faces/",
    label_map=label_map,
    transform=transform,
    sequence_len=1
)

In [54]:
from sklearn.model_selection import train_test_split

# Stage 1: keep 70% for training and hold out 30%, preserving the per-label proportions.
all_labels = [label for (_mfcc, _folder, label) in data_array]
train_data, valtest_data = train_test_split(
    data_array,
    test_size=0.3,
    random_state=42,
    stratify=all_labels,
)

# Stage 2: cut the held-out 30% in half -> validation and test, again stratified by label.
valtest_labels = [label for (_mfcc, _folder, label) in valtest_data]
val_data, test_data = train_test_split(
    valtest_data,
    test_size=0.5,
    random_state=42,
    stratify=valtest_labels,
)

In [55]:
# Face frames live under ../data/Processed, next to final_dataset.npy. The former
# "data/faces" did not match the root used for the full `dataset`, so the split
# datasets looked for frames in a different directory.
FACE_ROOT = "../data/Processed/faces/"

# NOTE(review): these use the class default sequence_len (5) while the full
# `dataset` is built with sequence_len=1 - confirm which one the model expects.
train_dataset, val_dataset, test_dataset = (
    MultimodalEmotionDataset(split, face_root=FACE_ROOT, label_map=label_map, transform=transform)
    for split in (train_data, val_data, test_data)
)

In [56]:
from torch.utils.data import DataLoader

# One batch size for all three loaders; only the training loader reshuffles.
loader_batch_size = 32

train_loader = DataLoader(dataset=train_dataset, batch_size=loader_batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=loader_batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=loader_batch_size, shuffle=False)

In [57]:
import pickle

# Persist the training split in the current working directory.
with open("train_split.pkl", mode="wb") as split_file:
    pickle.dump(train_data, split_file)