In [1]:
# 1. IMPORT LIBRARIES
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization

# 2. HELPER FUNCTION: Load audio and convert to Mel-spectrogram
def extract_mel_spectrogram(file_path, n_mels=128, max_len=128):
    y, sr = librosa.load(file_path, sr=22050)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    # Pad or crop to fixed size
    if mel_db.shape[1] < max_len:
        pad_width = max_len - mel_db.shape[1]
        mel_db = np.pad(mel_db, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mel_db = mel_db[:, :max_len]

    return mel_db

# 3. LOAD DATASET (RAVDESS)
data_dir = './audio/'  # Change this to your folder
emotions_map = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
    '05': 'angry', '06': 'fear', '07': 'disgust', '08': 'surprise'
}

X = []
y = []

for root, _, files in os.walk(data_dir):
    for file in files:
        if file.endswith('.wav'):
            emotion_code = file.split('-')[2]
            emotion = emotions_map.get(emotion_code)
            if emotion:  # skip unknown emotion codes
                file_path = os.path.join(root, file)
                mel = extract_mel_spectrogram(file_path)
                X.append(mel)
                y.append(emotion)

# 4. PREPROCESS INPUTS
X = np.array(X)
X = X[..., np.newaxis]  # Add channel dimension
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_cat = to_categorical(y_encoded)

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=0.3, random_state=42, stratify=y_cat)

# 5. BUILD 2D CNN MODEL
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 1)),
    MaxPooling2D((2, 2)),
    BatchNormalization(),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    BatchNormalization(),

    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    BatchNormalization(),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(np.unique(y)), activation='softmax')  # Output layer
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# 6. TRAIN
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2)

# 7. EVALUATE
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_acc:.2f}")

2025-04-12 22:44:14.789435: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-12 22:44:15.169439: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744472655.323674    1393 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744472655.377700    1393 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744472655.698381    1393 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Epoch 1/30
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 195ms/step - accuracy: 0.1988 - loss: 4.6988 - val_accuracy: 0.1541 - val_loss: 57.4326
Epoch 2/30
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 251ms/step - accuracy: 0.2559 - loss: 1.8724 - val_accuracy: 0.1599 - val_loss: 8.7219
Epoch 3/30
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 192ms/step - accuracy: 0.2822 - loss: 1.7963 - val_accuracy: 0.2384 - val_loss: 3.0889
Epoch 4/30
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 185ms/step - accuracy: 0.3090 - loss: 1.6978 - val_accuracy: 0.2326 - val_loss: 2.0273
Epoch 5/30
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 202ms/step - accuracy: 0.3240 - loss: 1.6081 - val_accuracy: 0.2849 - val_loss: 1.8223
Epoch 6/30
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 258ms/step - accuracy: 0.3815 - loss: 1.5968 - val_accuracy: 0.3692 - val_loss: 1.6560
Epoch 7/30
[1m43/43[0m

In [3]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# 1. Helper Function: Extract Mel-spectrogram + Normalize
def extract_mel_spectrogram(file_path, n_mels=256, max_len=256):
    y, sr = librosa.load(file_path, sr=22050)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    # Normalize to [0, 1]
    mel_db = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min())
    # Pad or crop
    if mel_db.shape[1] < max_len:
        pad_width = max_len - mel_db.shape[1]
        mel_db = np.pad(mel_db, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mel_db = mel_db[:, :max_len]
    return mel_db

# 2. Load Dataset (RAVDESS)
data_dir = './audio/'  # Change this to your folder
emotions_map = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
    '05': 'angry', '06': 'fear', '07': 'disgust', '08': 'surprise'
}

X = []
y = []

for root, _, files in os.walk(data_dir):
    for file in files:
        if file.endswith('.wav'):
            emotion_code = file.split('-')[2]
            emotion = emotions_map.get(emotion_code)
            if emotion:
                file_path = os.path.join(root, file)
                mel = extract_mel_spectrogram(file_path)
                X.append(mel)
                y.append(emotion)

# 3. Preprocess Inputs
X = np.array(X)
X = X[..., np.newaxis]  # Add channel dimension
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_cat = to_categorical(y_encoded)

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=0.3, random_state=42, stratify=y_cat)

# 4. Build Improved CNN Model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(256, 256, 1)),
    MaxPooling2D((2, 2)),
    BatchNormalization(),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    BatchNormalization(),

    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    BatchNormalization(),

    Conv2D(256, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    BatchNormalization(),

    Flatten(),
    Dense(256, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.5),
    Dense(len(np.unique(y)), activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# 5. Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

# 6. Train
history = model.fit(X_train, y_train, epochs=50, batch_size=16, validation_split=0.2,
                    callbacks=[early_stopping, lr_scheduler])

# 7. Evaluate
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_acc:.2f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 584ms/step - accuracy: 0.1550 - loss: 20.1828 - val_accuracy: 0.0843 - val_loss: 9.0619 - learning_rate: 0.0010
Epoch 2/50
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 549ms/step - accuracy: 0.1471 - loss: 7.1244 - val_accuracy: 0.0843 - val_loss: 8.2460 - learning_rate: 0.0010
Epoch 3/50
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 518ms/step - accuracy: 0.1531 - loss: 4.9224 - val_accuracy: 0.0959 - val_loss: 5.7542 - learning_rate: 0.0010
Epoch 4/50
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 577ms/step - accuracy: 0.1592 - loss: 4.0115 - val_accuracy: 0.0988 - val_loss: 4.0502 - learning_rate: 0.0010
Epoch 5/50
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 521ms/step - accuracy: 0.1511 - loss: 3.4835 - val_accuracy: 0.1686 - val_loss: 3.0768 - learning_rate: 0.0010
Epoch 6/50
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

KeyboardInterrupt: 

In [2]:
import tensorflow as tf

# Kiểm tra danh sách các thiết bị khả dụng
physical_devices = tf.config.list_physical_devices('GPU')
print("GPU devices:", physical_devices)

# Kiểm tra xem TensorFlow có đang dùng GPU không
if len(physical_devices) > 0:
    print("CUDA is available! TensorFlow is using GPU.")
else:
    print("CUDA is not available. TensorFlow is using CPU.")

2025-04-12 23:42:43.568387: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-12 23:42:43.919420: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744476164.060262    6292 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744476164.098863    6292 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744476164.438467    6292 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

GPU devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
CUDA is available! TensorFlow is using GPU.


In [5]:
import tensorflow as tf

# In thông tin build của TensorFlow
print("TensorFlow version:", tf.__version__)
print("CUDA version:", tf.sysconfig.get_build_info().get('cuda_version'))
print("cuDNN version:", tf.sysconfig.get_build_info().get('cudnn_version'))

TensorFlow version: 2.19.0
CUDA version: 12.5.1
cuDNN version: 9


In [1]:
# 1. IMPORT LIBRARIES
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 2. HELPER FUNCTION: Load audio and convert to Mel-spectrogram
def extract_mel_spectrogram(file_path, n_mels=128, max_len=128):
    y, sr = librosa.load(file_path, sr=22050)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    # Pad or crop to fixed size
    if mel_db.shape[1] < max_len:
        pad_width = max_len - mel_db.shape[1]
        mel_db = np.pad(mel_db, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mel_db = mel_db[:, :max_len]

    return mel_db

# 3. LOAD DATASET (RAVDESS)
data_dir = './audio/'  # Change this to your folder
emotions_map = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
    '05': 'angry', '06': 'fear', '07': 'disgust', '08': 'surprise'
}

X = []
y = []

for root, _, files in os.walk(data_dir):
    for file in files:
        if file.endswith('.wav'):
            emotion_code = file.split('-')[2]
            emotion = emotions_map.get(emotion_code)
            if emotion:  # skip unknown emotion codes
                file_path = os.path.join(root, file)
                mel = extract_mel_spectrogram(file_path)
                X.append(mel)
                y.append(emotion)

# 4. PREPROCESS INPUTS
X = np.array(X)
X = X[..., np.newaxis]  # Add channel dimension
X = X.transpose(0, 3, 1, 2)  # PyTorch expects [batch, channels, height, width]
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Convert to PyTorch tensors
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y_encoded, dtype=torch.long)

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Create DataLoaders
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 5. BUILD 2D CNN MODEL
class EmotionCNN(nn.Module):
    def __init__(self, num_classes):
        super(EmotionCNN, self).__init__()
        self.model = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.BatchNorm2d(32),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.BatchNorm2d(64),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.BatchNorm2d(128),

            nn.Flatten(),
            nn.Linear(128 * 16 * 16, 128),  # Adjust based on input size
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        return self.model(x)

# Initialize model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EmotionCNN(num_classes=len(np.unique(y))).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# 6. TRAIN
def train_model(model, train_loader, criterion, optimizer, num_epochs=30):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        epoch_loss = running_loss / len(train_loader)
        epoch_acc = 100 * correct / total
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%")

    model_path = "facial_emotion_audio.pth"
    torch.save(model.state_dict(), model_path)
    print(f"Model saved to {model_path}")

train_model(model, train_loader, criterion, optimizer)

# 7. EVALUATE
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")
    return accuracy

evaluate_model(model, test_loader)

Epoch 1/30, Loss: 2.6047, Accuracy: 15.62%
Epoch 2/30, Loss: 1.9649, Accuracy: 18.30%
Epoch 3/30, Loss: 1.9370, Accuracy: 22.38%
Epoch 4/30, Loss: 1.8772, Accuracy: 25.52%
Epoch 5/30, Loss: 1.9120, Accuracy: 23.19%
Epoch 6/30, Loss: 1.8686, Accuracy: 25.12%
Epoch 7/30, Loss: 1.8629, Accuracy: 24.94%
Epoch 8/30, Loss: 1.8007, Accuracy: 29.66%
Epoch 9/30, Loss: 1.7501, Accuracy: 33.57%
Epoch 10/30, Loss: 1.6702, Accuracy: 38.23%
Epoch 11/30, Loss: 1.6437, Accuracy: 39.22%
Epoch 12/30, Loss: 1.5547, Accuracy: 41.32%
Epoch 13/30, Loss: 1.4688, Accuracy: 43.36%
Epoch 14/30, Loss: 1.3625, Accuracy: 47.03%
Epoch 15/30, Loss: 1.2882, Accuracy: 49.53%
Epoch 16/30, Loss: 1.2483, Accuracy: 50.29%
Epoch 17/30, Loss: 1.1120, Accuracy: 55.24%
Epoch 18/30, Loss: 1.0394, Accuracy: 60.14%
Epoch 19/30, Loss: 0.9884, Accuracy: 60.49%
Epoch 20/30, Loss: 0.9484, Accuracy: 62.30%
Epoch 21/30, Loss: 0.8702, Accuracy: 64.69%
Epoch 22/30, Loss: 0.8115, Accuracy: 66.78%
Epoch 23/30, Loss: 0.7703, Accuracy: 69.4

70.3804347826087