In [2]:
# preprocess_frames.py
import os
import cv2
import numpy as np

# Directory configuration
data_dir = './'  # Root directory containing the RAVDESS dataset
output_dir = './processed_frames'  # Directory where the extracted face images are saved

# Create the output directory up front; exist_ok makes re-running this cell safe.
os.makedirs(output_dir, exist_ok=True)

# Frame extraction
_FACE_CASCADE = None  # Haar cascade, loaded lazily and shared by every extract_frame call


def _get_face_cascade():
    """Return the frontal-face Haar cascade, loading it from disk only once."""
    global _FACE_CASCADE
    if _FACE_CASCADE is None:
        cascade = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        )
        if cascade.empty():
            raise RuntimeError("Failed to load haarcascade_frontalface_default.xml")
        _FACE_CASCADE = cascade
    return _FACE_CASCADE


def extract_frame(video_path, frame_size=(64, 64)):
    """Return the face crop from the first frame of a video in which a face is detected.

    Frames are read in order; the first frame with at least one detection is
    used. When several boxes are detected in that frame the largest one is
    taken.

    Args:
        video_path: Path of the video file to read.
        frame_size: ``(width, height)`` passed to ``cv2.resize``.

    Returns:
        A ``uint8`` array of shape ``(height, width, 3)`` in OpenCV's BGR
        channel order (no colour conversion is applied here). If the video
        cannot be read or no face is found in any frame, an all-zero array of
        the same shape is returned.

    Raises:
        RuntimeError: If the Haar cascade file cannot be loaded.
    """
    face_cascade = _get_face_cascade()
    width, height = frame_size
    cap = cv2.VideoCapture(video_path)
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, 1.1, 5)
            if len(faces) > 0:
                # Prefer the largest box over whichever detection happens to be first.
                x, y, w, h = max(faces, key=lambda box: box[2] * box[3])
                return cv2.resize(frame[y:y + h, x:x + w], frame_size)
    finally:
        # Release the capture on every exit path, including exceptions.
        cap.release()
    # cv2.resize yields (height, width, channels); keep the fallback consistent with it.
    return np.zeros((height, width, 3), dtype=np.uint8)

# Walk the dataset and extract one face image per video
emotions_map = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
    '05': 'angry', '06': 'fear', '07': 'disgust', '08': 'surprise'
}

image_paths, labels = [], []

# sorted() makes the traversal order deterministic; os.listdir order is arbitrary,
# and any later seeded train/test split depends on the order of these lists.
for actor_folder in sorted(os.listdir(data_dir)):
    actor_path = os.path.join(data_dir, actor_folder)
    if not (os.path.isdir(actor_path)
            and actor_folder.startswith(('Video_Speech_Actor_', 'Video_Song_Actor_'))):
        continue
    for subfolder in sorted(os.listdir(actor_path)):
        subfolder_path = os.path.join(actor_path, subfolder)
        if not (os.path.isdir(subfolder_path) and subfolder.startswith('Actor_')):
            continue
        for file in sorted(os.listdir(subfolder_path)):
            if not file.endswith('.mp4'):
                continue
            # File names are dash-separated and the third field is the emotion code;
            # skip names that do not have that field instead of raising IndexError.
            name_fields = file.split('-')
            emotion = emotions_map.get(name_fields[2]) if len(name_fields) > 2 else None
            if not emotion:
                continue

            video_path = os.path.join(subfolder_path, file)
            frame = extract_frame(video_path)

            # Save the image. extract_frame returns the crop in OpenCV's BGR order,
            # which is exactly what cv2.imwrite expects, so no colour conversion is
            # applied (converting here would swap the red and blue channels on disk).
            output_path = os.path.join(output_dir, os.path.splitext(file)[0] + '.jpg')
            if not cv2.imwrite(output_path, frame):
                print(f"Warning: could not write {output_path}; skipping {video_path}")
                continue

            image_paths.append(output_path)
            labels.append(emotion)

# Save the image paths and their labels for the training cell
import pickle
with open("image_paths_labels.pkl", "wb") as f:
    pickle.dump((image_paths, labels), f)


In [3]:
# train_from_images.py
import os
import pickle

import cv2
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

# Load the list of image paths and labels from file
# NOTE(review): pickle.load can execute arbitrary code; only load a file produced by this notebook.
with open("image_paths_labels.pkl", "rb") as f:
    image_paths, labels = pickle.load(f)

# Encode the string labels as integer class ids stored in a long tensor
le = LabelEncoder()
labels_encoded = torch.tensor(le.fit_transform(labels), dtype=torch.long)

# Build the preprocessing transform applied to every image
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    # Mean 0.5 and std 0.5 for each of the 3 channels
    transforms.Normalize([0.5]*3, [0.5]*3)
])

# Dataset backed by the face images saved on disk
class PreprocessedDataset(Dataset):
    """Dataset that reads pre-extracted face images from disk.

    Args:
        image_paths: Sequence of image file paths.
        labels: Sequence of labels, index-aligned with ``image_paths``.
        transform: Optional callable applied to the RGB image array.

    Raises:
        ValueError: If ``image_paths`` and ``labels`` differ in length.
    """

    def __init__(self, image_paths, labels, transform=None):
        if len(image_paths) != len(labels):
            raise ValueError(
                f"image_paths ({len(image_paths)}) and labels ({len(labels)}) "
                "must have the same length"
            )
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        Raises:
            FileNotFoundError: If the image cannot be read from disk.
        """
        path = self.image_paths[idx]
        img = cv2.imread(path)
        # cv2.imread signals a missing or unreadable file by returning None instead
        # of raising; fail here with the offending path rather than inside cvtColor.
        if img is None:
            raise FileNotFoundError(f"Could not read image: {path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if self.transform:
            img = self.transform(img)
        return img, self.labels[idx]

# Split into train/test sets: 30% test, stratified by label, fixed random seed
# NOTE(review): the split is made per image, so images taken from the same
# Actor_ folder may end up in both sets — confirm this is intended.
train_paths, test_paths, train_labels, test_labels = train_test_split(
    image_paths, labels_encoded, test_size=0.3, random_state=42, stratify=labels_encoded
)

train_dataset = PreprocessedDataset(train_paths, train_labels, transform)
test_dataset = PreprocessedDataset(test_paths, test_labels, transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)  # ⚡ Added num_workers
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)

# CNN model
class FacialEmotionCNN(nn.Module):
    """Small CNN classifier: three conv/ReLU/max-pool stages and a two-layer head.

    Each stage halves the spatial resolution, and the first linear layer expects
    64 channels of 8x8 features, i.e. a 3-channel 64x64 input.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        layers = []
        in_channels = 3
        # Feature extractor: 3 -> 16 -> 32 -> 64 channels
        for out_channels in (16, 32, 64):
            layers += [
                nn.Conv2d(in_channels, out_channels, 3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ]
            in_channels = out_channels
        # Classifier head
        layers += [
            nn.Flatten(),
            nn.Linear(64*8*8, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, num_classes),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for a batch of images."""
        logits = self.model(x)
        return logits

# Training setup
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output logit per class found by the label encoder.
model = FacialEmotionCNN(num_classes=len(le.classes_)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_model(model, loader, criterion, optimizer, epochs=300, save_path="facial_emotion_cnn.pth"):
    """Train ``model`` and save its ``state_dict`` to ``save_path``.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a notebook-level ``device`` variable.

    Args:
        model: Network to train (switched to training mode).
        loader: Iterable of ``(inputs, labels)`` batches; iterated once per epoch.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the model's parameters.
        epochs: Number of passes over ``loader``.
        save_path: File the ``state_dict`` is written to.

    Raises:
        ValueError: If ``loader`` yields no samples during an epoch.
        KeyboardInterrupt: Re-raised after the current weights have been saved
            (saving is skipped when no optimizer step has run yet).
    """
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    def _save_checkpoint():
        torch.save(model.state_dict(), save_path)
        print(f"✅ Model saved to {save_path}")

    model.train()
    stepped = False
    try:
        for epoch in range(epochs):
            total, correct, running_loss = 0, 0, 0
            for inputs, targets in loader:
                inputs, targets = inputs.to(run_device), targets.to(run_device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                stepped = True

                # running_loss is the sum of the per-batch losses, not their mean.
                running_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += (predicted == targets).sum().item()

            if total == 0:
                raise ValueError("loader yielded no samples; nothing to train on")
            print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss:.4f}, Acc: {100*correct/total:.2f}%")
    except KeyboardInterrupt:
        # Stopping a long run by hand should not throw away the progress made so far.
        if stepped:
            _save_checkpoint()
        raise

    _save_checkpoint()

def evaluate_model(model, loader):
    """Print and return the classification accuracy of ``model`` on ``loader``.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a notebook-level ``device`` variable. The model
    is left in evaluation mode.

    Args:
        model: Network to evaluate.
        loader: Iterable of ``(inputs, labels)`` batches.

    Returns:
        Accuracy as a percentage in the range 0-100.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(run_device), targets.to(run_device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

    if total == 0:
        raise ValueError("loader yielded no samples; cannot compute accuracy")
    accuracy = 100 * correct / total
    print(f"🎯 Test Accuracy: {accuracy:.2f}%")
    return accuracy

# Train on the training loader, then report accuracy on the test loader
train_model(model, train_loader, criterion, optimizer)
evaluate_model(model, test_loader)


Epoch 1/100, Loss: 220.3968, Acc: 16.20%
Epoch 2/100, Loss: 213.7566, Acc: 22.03%
Epoch 3/100, Loss: 187.5016, Acc: 34.32%
Epoch 4/100, Loss: 168.2174, Acc: 40.36%
Epoch 5/100, Loss: 150.7216, Acc: 47.44%
Epoch 6/100, Loss: 139.7144, Acc: 51.52%
Epoch 7/100, Loss: 132.7163, Acc: 52.91%
Epoch 8/100, Loss: 121.8824, Acc: 57.78%
Epoch 9/100, Loss: 114.7458, Acc: 59.18%
Epoch 10/100, Loss: 111.4091, Acc: 60.11%
Epoch 11/100, Loss: 102.6077, Acc: 62.91%
Epoch 12/100, Loss: 94.8069, Acc: 66.26%
Epoch 13/100, Loss: 93.8960, Acc: 65.53%
Epoch 14/100, Loss: 89.4664, Acc: 67.42%
Epoch 15/100, Loss: 82.6344, Acc: 70.16%
Epoch 16/100, Loss: 81.0951, Acc: 70.43%
Epoch 17/100, Loss: 79.8578, Acc: 71.12%
Epoch 18/100, Loss: 72.8094, Acc: 74.13%
Epoch 19/100, Loss: 73.0655, Acc: 72.96%
Epoch 20/100, Loss: 70.1888, Acc: 74.21%
Epoch 21/100, Loss: 67.4310, Acc: 75.29%
Epoch 22/100, Loss: 63.5845, Acc: 77.42%
Epoch 23/100, Loss: 63.7853, Acc: 76.72%
Epoch 24/100, Loss: 62.5287, Acc: 76.98%
Epoch 25/100, 

In [1]:
# 1. IMPORT LIBRARIES
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torchvision.transforms as transforms

# 2. EXTRACT FRAME FUNCTION (small output size)
_FACE_CASCADE = None  # Haar cascade, loaded lazily and shared by every extract_frame call


def _get_face_cascade():
    """Return the frontal-face Haar cascade, loading it from disk only once."""
    global _FACE_CASCADE
    if _FACE_CASCADE is None:
        cascade = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        )
        if cascade.empty():
            raise RuntimeError("Failed to load haarcascade_frontalface_default.xml")
        _FACE_CASCADE = cascade
    return _FACE_CASCADE


def extract_frame(video_path, frame_size=(64, 64)):
    """Return the RGB face crop from the first frame of a video in which a face is detected.

    Frames are read in order; the first frame with at least one detection is
    used. When several boxes are detected in that frame the largest one is
    taken.

    Args:
        video_path: Path of the video file to read.
        frame_size: ``(width, height)`` passed to ``cv2.resize``.

    Returns:
        A ``uint8`` array of shape ``(height, width, 3)`` converted from BGR to
        RGB. If the video cannot be read or no face is found in any frame, an
        all-zero array of the same shape is returned.

    Raises:
        RuntimeError: If the Haar cascade file cannot be loaded.
    """
    face_cascade = _get_face_cascade()
    width, height = frame_size
    cap = cv2.VideoCapture(video_path)
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, 1.1, 5)
            if len(faces) > 0:
                # Prefer the largest box over whichever detection happens to be first.
                x, y, w, h = max(faces, key=lambda box: box[2] * box[3])
                face = cv2.resize(frame[y:y + h, x:x + w], frame_size)
                return cv2.cvtColor(face, cv2.COLOR_BGR2RGB)
    finally:
        # Release the capture on every exit path, including exceptions.
        cap.release()
    # cv2.resize yields (height, width, channels); keep the fallback consistent with it.
    return np.zeros((height, width, 3), dtype=np.uint8)

# 3. CUSTOM DATASET
class RAVDESSDataset(Dataset):
    """Dataset that yields one face frame per video via ``extract_frame``.

    Extraction decodes the video and runs face detection, so by default the
    extracted frame of each sample is kept in memory after its first access
    and reused afterwards instead of being recomputed on every access.

    Args:
        video_paths: Sequence of video file paths.
        labels: Sequence of labels, index-aligned with ``video_paths``.
        transform: Optional callable applied to the extracted frame.
        cache_frames: Keep extracted frames in memory (default ``True``). The
            cache lives in this object, so it is not shared between separate
            copies of the dataset.

    Raises:
        ValueError: If ``video_paths`` and ``labels`` differ in length.
    """

    def __init__(self, video_paths, labels, transform=None, cache_frames=True):
        if len(video_paths) != len(labels):
            raise ValueError(
                f"video_paths ({len(video_paths)}) and labels ({len(labels)}) "
                "must have the same length"
            )
        self.video_paths = video_paths
        self.labels = labels
        self.transform = transform
        self._frame_cache = {} if cache_frames else None

    def __len__(self):
        return len(self.video_paths)

    def _load_frame(self, idx):
        """Return the extracted frame for ``idx``, using the cache when enabled."""
        if self._frame_cache is None:
            return extract_frame(self.video_paths[idx])
        key = int(idx)
        if key not in self._frame_cache:
            self._frame_cache[key] = extract_frame(self.video_paths[idx])
        # Hand out a copy so an in-place transform cannot corrupt the cached frame.
        return self._frame_cache[key].copy()

    def __getitem__(self, idx):
        frame = self._load_frame(idx)
        if self.transform:
            frame = self.transform(frame)
        return frame, self.labels[idx]

# 4. LOAD DATASET (RAVDESS)
data_dir = './'  # Directory containing the dataset
emotions_map = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
    '05': 'angry', '06': 'fear', '07': 'disgust', '08': 'surprise'
}

video_paths, labels = [], []
# sorted() makes the traversal order deterministic; os.listdir order is arbitrary,
# and the seeded split below is only reproducible for a fixed input order.
for actor_folder in sorted(os.listdir(data_dir)):
    actor_path = os.path.join(data_dir, actor_folder)
    if not (os.path.isdir(actor_path)
            and actor_folder.startswith(('Video_Speech_Actor_', 'Video_Song_Actor_'))):
        continue
    for subfolder in sorted(os.listdir(actor_path)):
        subfolder_path = os.path.join(actor_path, subfolder)
        if not (os.path.isdir(subfolder_path) and subfolder.startswith('Actor_')):
            continue
        for file in sorted(os.listdir(subfolder_path)):
            if not file.endswith('.mp4'):
                continue
            # File names are dash-separated and the third field is the emotion code;
            # skip names that do not have that field instead of raising IndexError.
            name_fields = file.split('-')
            emotion = emotions_map.get(name_fields[2]) if len(name_fields) > 2 else None
            if emotion:
                video_paths.append(os.path.join(subfolder_path, file))
                labels.append(emotion)

# Fail with a clear message here rather than deep inside train_test_split.
if not video_paths:
    raise RuntimeError(f"No labelled .mp4 files found under {data_dir!r}")

# 5. ENCODE LABELS
le = LabelEncoder()
labels_encoded = torch.tensor(le.fit_transform(labels), dtype=torch.long)

# 6. TRANSFORMS
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

# 7. SPLIT DATASET
train_paths, test_paths, train_labels, test_labels = train_test_split(
    video_paths, labels_encoded, test_size=0.3, random_state=42, stratify=labels_encoded
)
train_dataset = RAVDESSDataset(train_paths, train_labels, transform)
test_dataset = RAVDESSDataset(test_paths, test_labels, transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 8. CNN MODEL (lighter)
class FacialEmotionCNN(nn.Module):
    """Small CNN classifier: three conv/ReLU/max-pool stages and a two-layer head.

    Each stage halves the spatial resolution, and the first linear layer expects
    64 channels of 8x8 features, i.e. a 3-channel 64x64 input.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        layers = []
        in_channels = 3
        # Feature extractor: 3 -> 16 -> 32 -> 64 channels
        for out_channels in (16, 32, 64):
            layers += [
                nn.Conv2d(in_channels, out_channels, 3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ]
            in_channels = out_channels
        # Classifier head
        layers += [
            nn.Flatten(),
            nn.Linear(64*8*8, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, num_classes),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for a batch of images."""
        logits = self.model(x)
        return logits

# 9. TRAINING
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output logit per class found by the label encoder.
model = FacialEmotionCNN(num_classes=len(le.classes_)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_model(model, loader, criterion, optimizer, epochs=100, save_path="facial_emotion_cnn.pth"):
    """Train ``model`` and save its ``state_dict`` to ``save_path``.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a notebook-level ``device`` variable.

    Args:
        model: Network to train (switched to training mode).
        loader: Iterable of ``(inputs, labels)`` batches; iterated once per epoch.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the model's parameters.
        epochs: Number of passes over ``loader``.
        save_path: File the ``state_dict`` is written to.

    Raises:
        ValueError: If ``loader`` yields no samples during an epoch.
        KeyboardInterrupt: Re-raised after the current weights have been saved
            (saving is skipped when no optimizer step has run yet).
    """
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    def _save_checkpoint():
        torch.save(model.state_dict(), save_path)
        print(f"Model saved to {save_path}")

    model.train()
    stepped = False
    try:
        for epoch in range(epochs):
            total, correct, running_loss = 0, 0, 0
            for inputs, targets in loader:
                inputs, targets = inputs.to(run_device), targets.to(run_device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                stepped = True

                # running_loss is the sum of the per-batch losses, not their mean.
                running_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += (predicted == targets).sum().item()

            if total == 0:
                raise ValueError("loader yielded no samples; nothing to train on")
            print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss:.4f}, Acc: {100*correct/total:.2f}%")
    except KeyboardInterrupt:
        # Stopping a long run by hand should not throw away the progress made so far.
        if stepped:
            _save_checkpoint()
        raise

    # Save the final weights once all epochs have completed
    _save_checkpoint()

# Run training on the training loader; no epoch count is passed, so the function's default applies.
train_model(model, train_loader, criterion, optimizer)

# 10. EVALUATE
def evaluate_model(model, loader):
    """Print and return the classification accuracy of ``model`` on ``loader``.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a notebook-level ``device`` variable. The model
    is left in evaluation mode.

    Args:
        model: Network to evaluate.
        loader: Iterable of ``(inputs, labels)`` batches.

    Returns:
        Accuracy as a percentage in the range 0-100.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(run_device), targets.to(run_device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

    if total == 0:
        raise ValueError("loader yielded no samples; cannot compute accuracy")
    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")
    return accuracy

# Report accuracy on the test loader.
evaluate_model(model, test_loader)


KeyboardInterrupt: 