In [1]:
from torch.utils.data import Dataset
from torchvision import transforms
import torch
import numpy as np
import os
from PIL import Image
import glob
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms
from tqdm import tqdm
import torchvision.models as models


class MultimodalEmotionDataset(Dataset):
    """Pairs each MFCC sample with a short sequence of face frames and a label.

    Every entry of ``data_array`` is unpacked as ``(mfcc, folder_name, label)``;
    the frames are read from ``face_root/folder_name``.
    """

    # Lower-cased file extensions that count as face frames.
    IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")

    def __init__(self, data_array, face_root, label_map, transform=None, sequence_len=5):
        """
        data_array: [(mfcc, folder_name, label), ...]
        face_root: path to the folder containing subfolders with face images
        label_map: {'happy': 0, ...}
        transform: torchvision transform applied to every frame; the frames are
            combined with torch.stack, so it has to return tensors of one shape
        sequence_len: number of face frames to load per sample (at least 1)

        Raises ValueError if sequence_len is smaller than 1.
        """
        if sequence_len < 1:
            # Fail early instead of dividing by zero on the first item access.
            raise ValueError(f"sequence_len must be at least 1 (got {sequence_len})")
        self.data = data_array
        self.face_root = face_root
        self.label_map = label_map
        self.transform = transform
        self.sequence_len = sequence_len

    def __len__(self):
        """Number of samples in ``data_array``."""
        return len(self.data)

    @staticmethod
    def _evenly_spaced_indices(num_items, num_samples):
        """Return ``num_samples`` increasing indices spread over ``range(num_items)``.

        Expects ``1 <= num_samples <= num_items``. A fixed integer stride would
        bunch the picks at the start whenever ``num_items`` is not a multiple of
        ``num_samples`` (9 items, 5 samples -> stride 1 -> the first five items);
        scaling each index individually keeps the picks spread out.
        """
        return [i * num_items // num_samples for i in range(num_samples)]

    def __getitem__(self, idx):
        """Load one sample.

        Returns ``(face_tensor, mfcc_tensor, label_tensor)`` where
        ``face_tensor`` stacks ``sequence_len`` transformed frames.

        Raises ValueError if the sample's folder holds fewer than
        ``sequence_len`` images.
        """
        mfcc, folder_name, label = self.data[idx]

        # Collect the frame paths in a stable (sorted) order.
        face_folder = os.path.join(self.face_root, folder_name)
        face_files = sorted(
            os.path.join(face_folder, fname)
            for fname in os.listdir(face_folder)
            if fname.lower().endswith(self.IMAGE_EXTENSIONS)
        )
        if len(face_files) < self.sequence_len:
            raise ValueError(f"Not enough face images in {face_folder} (found {len(face_files)}, expected {self.sequence_len})")

        # Select evenly spaced frames across the folder
        picks = self._evenly_spaced_indices(len(face_files), self.sequence_len)
        selected_files = [face_files[i] for i in picks]

        face_sequence = []
        for file in selected_files:
            # convert() returns a loaded copy, so the file handle can be
            # released right away instead of lingering until garbage collection.
            with Image.open(file) as raw:
                img = raw.convert("RGB")
            if self.transform:
                img = self.transform(img)
            face_sequence.append(img)

        # Stack into shape: (sequence_len, 3, H, W)
        face_tensor = torch.stack(face_sequence)

        # Transpose and add a leading channel axis.
        # NOTE(review): the result is (1, 40, T) only if mfcc is a (T, 40) array -- confirm.
        mfcc_tensor = torch.tensor(mfcc.T, dtype=torch.float32).unsqueeze(0)

        label_tensor = torch.tensor(self.label_map[label], dtype=torch.long)

        return face_tensor, mfcc_tensor, label_tensor


In [2]:
# Path of the preprocessed dataset; being relative, it is resolved against the
# kernel's working directory.
dataset_path = r"../data/Processed/final_dataset.npy"
# SECURITY: allow_pickle=True makes NumPy unpickle object arrays, and unpickling
# can execute arbitrary code -- only load files from a trusted source.
# NOTE(review): rows are presumably (mfcc, folder_name, label) triples, which is
# how MultimodalEmotionDataset unpacks them -- confirm against the preprocessing.
data_array = np.load(dataset_path, allow_pickle=True)

In [3]:
# Resize every face frame to 224x224 and convert it to a tensor.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])
# The source data codes emotions as 01 = neutral, 02 = calm, 03 = happy, 04 = sad,
# 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised.
# Class indices are those codes minus one: nn.CrossEntropyLoss expects targets in
# [0, num_classes - 1], so with 8 classes a 1-based index of 8 is out of bounds.
label_map = {'neutral': 0, 'calm': 1, 'happy': 2, 'sad': 3, 'angry': 4, 'fearful': 5, 'disgust': 6, 'surprised': 7}

# Dataset over all samples, loading a single face frame per sample.
dataset = MultimodalEmotionDataset(
    data_array,
    face_root="../data/Processed/faces/",
    label_map=label_map,
    transform=transform,
    sequence_len=1
)

In [4]:
from sklearn.model_selection import train_test_split

# Stage 1: keep 70 % for training and hold out 30 %, stratified by emotion label.
train_data, valtest_data = train_test_split(
    data_array,
    stratify=[emotion for _, _, emotion in data_array],
    test_size=0.3,
    random_state=42,
)

# Stage 2: cut the held-out part in half -> validation and test sets.
val_data, test_data = train_test_split(
    valtest_data,
    stratify=[emotion for _, _, emotion in valtest_data],
    test_size=0.5,
    random_state=42,
)

In [5]:
# One Dataset per split (default sequence_len of 5 frames per sample).
# Face frames are read from "../data/Processed/faces/", the root used everywhere
# else in this notebook; the previous "data/faces" did not match that layout.
train_dataset = MultimodalEmotionDataset(train_data, face_root="../data/Processed/faces/", label_map=label_map, transform=transform)
val_dataset = MultimodalEmotionDataset(val_data, face_root="../data/Processed/faces/", label_map=label_map, transform=transform)
test_dataset = MultimodalEmotionDataset(test_data, face_root="../data/Processed/faces/", label_map=label_map, transform=transform)

In [6]:
from torch.utils.data import DataLoader

# Batches of 32; only the training split is shuffled, validation and test keep
# the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [7]:
from torchvision.models import ResNet18_Weights

class VisualBackbone(nn.Module):
    """Encodes a sequence of face frames into a single embedding.

    Each frame is passed through a ResNet-18 with its last child module
    removed; the per-frame features are averaged over the sequence and then
    projected to ``embed_dim`` by a linear layer.
    """

    def __init__(self, embed_dim=256):
        super().__init__()
        pretrained = models.resnet18(weights=ResNet18_Weights.DEFAULT)
        # Drop the final child of the network and keep the rest as the trunk.
        trunk_layers = list(pretrained.children())
        trunk_layers.pop()
        self.cnn = nn.Sequential(*trunk_layers)  # output: (B, 512, 1, 1)
        self.fc = nn.Linear(512, embed_dim)

    def forward(self, x_seq):
        """Map x_seq of shape (B, T, 3, H, W) to embeddings of shape (B, embed_dim)."""
        batch, frames, channels, height, width = x_seq.shape
        # Fold time into the batch axis so the CNN sees ordinary image batches.
        flat_frames = x_seq.view(batch * frames, channels, height, width)
        per_frame = self.cnn(flat_frames)  # (B*T, 512, 1, 1)
        per_clip = per_frame.view(batch, frames, -1)
        pooled = per_clip.mean(dim=1)  # average over time
        return self.fc(pooled)  # (B, embed_dim)

class AudioBranch(nn.Module):
    """Encodes an MFCC sequence with a 2-layer bidirectional LSTM.

    The final hidden states of the top layer (both directions) are
    concatenated and projected to ``embed_dim``.
    """

    def __init__(self, embed_dim=256):
        super().__init__()
        hidden = 128
        self.lstm = nn.LSTM(
            input_size=40,
            hidden_size=hidden,
            num_layers=2,
            bidirectional=True,
            batch_first=True,
        )
        # Forward and backward states are concatenated, hence twice the width.
        self.fc = nn.Linear(2 * hidden, embed_dim)

    def forward(self, x):
        """Map x of shape (B, 1, 40, T) to embeddings of shape (B, embed_dim)."""
        frames = x.squeeze(1).transpose(1, 2)  # (B, T, 40)
        _, (final_hidden, _) = self.lstm(frames)  # final_hidden: (4, B, 128)
        # The last two entries are the two directions of the top layer.
        last_fwd, last_bwd = final_hidden[-2], final_hidden[-1]
        summary = torch.cat((last_fwd, last_bwd), dim=1)  # (B, 256)
        return self.fc(summary)  # (B, embed_dim)

class MultimodalEmotionRecognizer(nn.Module):
    """Classifier over concatenated face and audio embeddings.

    Both branches produce ``embed_dim``-sized vectors; they are joined along
    the feature axis and fed to a small two-layer head with dropout.
    """

    def __init__(self, num_classes=8, embed_dim=256):
        super().__init__()
        self.visual_branch = VisualBackbone(embed_dim)
        self.audio_branch = AudioBranch(embed_dim)
        head_layers = [
            nn.Linear(2 * embed_dim, 128),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(128, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, face_seq, mfcc):
        """Return class logits for a batch of (face sequence, MFCC) pairs."""
        embeddings = (
            self.visual_branch(face_seq),  # (B, embed_dim)
            self.audio_branch(mfcc),       # (B, embed_dim)
        )
        return self.classifier(torch.cat(embeddings, dim=1))

In [8]:
# Training configuration read by main().
image_size = 224
face_dir = "../data/Processed/faces/"
# Class indices must be 0-based: the model emits len(label_map) logits and
# nn.CrossEntropyLoss requires targets in [0, num_classes - 1]. The former
# 1..8 numbering pushed 'surprised' out of range and left index 0 unused.
label_map = {'neutral': 0, 'calm': 1, 'happy': 2, 'sad': 3, 'angry': 4, 'fearful': 5, 'disgust': 6, 'surprised': 7}
batch_size = 32
num_classes = len(label_map)
learning_rate = 0.001
num_epochs = 10
def train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader``.

    Returns ``(epoch_loss, epoch_acc)``: the batch losses weighted by batch
    size and divided by the number of samples seen, and the fraction of
    samples whose arg-max prediction equals the label.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for batch in tqdm(loader, desc="Train", leave=False):
        faces, mfccs, labels = batch
        faces = faces.to(device)
        mfccs = mfccs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(faces, mfccs)  # [B, num_classes]
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        # Running statistics, weighted by the actual size of this batch.
        n_batch = labels.size(0)
        loss_sum += batch_loss.item() * n_batch
        n_correct += logits.argmax(dim=1).eq(labels).sum().item()
        n_seen += n_batch

    return loss_sum / n_seen, n_correct / n_seen

def validate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without computing gradients.

    Returns ``(epoch_loss, epoch_acc)``: the batch losses weighted by batch
    size and divided by the number of samples seen, and the fraction of
    samples whose arg-max prediction equals the label.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Valid", leave=False):
            faces, mfccs, labels = batch
            faces = faces.to(device)
            mfccs = mfccs.to(device)
            labels = labels.to(device)

            logits = model(faces, mfccs)
            batch_loss = criterion(logits, labels)

            # Running statistics, weighted by the actual size of this batch.
            n_batch = labels.size(0)
            loss_sum += batch_loss.item() * n_batch
            n_correct += logits.argmax(dim=1).eq(labels).sum().item()
            n_seen += n_batch

    return loss_sum / n_seen, n_correct / n_seen

def main(num_workers=4):
    """Train the multimodal model and checkpoint it whenever validation accuracy improves.

    Reads the module-level configuration (image_size, face_dir, label_map,
    batch_size, num_classes, learning_rate, num_epochs) and the train_data /
    val_data splits.

    num_workers: worker processes for both DataLoaders.
        NOTE(review): with a Dataset class defined inside the notebook,
        worker processes can stall on platforms that spawn rather than fork --
        pass 0 if the progress bar never advances.
    """
    # 1. Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    # 2. Datasets & Loaders
    transform = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor()
    ])
    train_ds = MultimodalEmotionDataset(
        train_data, face_root=face_dir,
        label_map=label_map, transform=transform,
        sequence_len=5)
    # The dataset needs the raw (mfcc, folder_name, label) rows of the
    # validation split (val_data), not the already-wrapped val_dataset object.
    val_ds = MultimodalEmotionDataset(
        val_data, face_root=face_dir,
        label_map=label_map, transform=transform,
        sequence_len=5)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # 3. Model, Loss, Optimizer
    model = MultimodalEmotionRecognizer(num_classes=num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    best_val_acc = 0.0
    os.makedirs("checkpoints", exist_ok=True)

    # 4. Epoch loop
    for epoch in range(1, num_epochs + 1):
        print(f"\nEpoch {epoch}/{num_epochs}")

        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
        val_loss,   val_acc   = validate(model, val_loader, criterion, device)

        print(f"  Train — loss: {train_loss:.4f}, acc: {train_acc:.4f}")
        print(f"  Valid — loss: {val_loss:.4f}, acc: {val_acc:.4f}")

        # 5. Checkpoint: a new file is written each time validation accuracy
        #    improves; earlier checkpoints are left in place.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            ckpt_path = f"checkpoints/best_epoch{epoch:02d}_acc{val_acc:.3f}.pth"
            torch.save(model.state_dict(), ckpt_path)
            print(f"  Saved best model to {ckpt_path}")

    print("\nTraining complete. Best valid acc:", best_val_acc)

In [None]:
# Start training; main() prints the device, per-epoch metrics and checkpoint paths.
main()

Using device: cpu

Epoch 1/10


Train:   0%|          | 0/54 [00:00<?, ?it/s]