In [None]:
# ravdess_emotion_classifier.py

import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# -------------------------------
# Step 1: Dataset preparation
# -------------------------------
# Root directory that is walked for the extracted RAVDESS .wav files.
DATA_FOLDER = "/Users/aditya/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1"
# Sample rate handed to librosa.load when reading audio.
SAMPLE_RATE = 22050
# Number of MFCC frames every example is padded/truncated to.
MAX_MFCC_LEN = 100

# RAVDESS emotion names in ID order: ID "01" is the first entry, "08" the last.
_EMOTION_NAMES = (
    "neutral",
    "calm",
    "happy",
    "sad",
    "angry",
    "fearful",
    "disgust",
    "surprised",
)

# Two-digit RAVDESS emotion ID -> zero-based class index (all eight emotions are kept).
EMOTION_MAP = {
    f"{index + 1:02d}": index for index, _name in enumerate(_EMOTION_NAMES)
}

def extract_features(file_path, n_mfcc=40):
    """Load an audio file and return its MFCCs with a fixed number of frames.

    The audio is loaded at ``SAMPLE_RATE`` and converted to MFCCs. Axis 1 of
    the result is then zero-padded at the end, or cut off, so that it is
    exactly ``MAX_MFCC_LEN`` long.

    Args:
        file_path: Path of the audio file to read.
        n_mfcc: Number of MFCC coefficients requested from librosa.

    Returns:
        The MFCC array with ``MAX_MFCC_LEN`` entries along axis 1.
    """
    signal, rate = librosa.load(file_path, sr=SAMPLE_RATE)
    coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)

    missing = MAX_MFCC_LEN - coeffs.shape[1]
    if missing <= 0:
        # Long enough already: keep only the leading frames.
        return coeffs[:, :MAX_MFCC_LEN]
    # Too short: append zero-valued frames at the end.
    return np.pad(coeffs, pad_width=((0, 0), (0, missing)), mode='constant')

# Custom Dataset
class AudioDataset(Dataset):
    """Dataset that turns audio file paths into MFCC tensors on access.

    Features are computed by ``extract_features`` every time an item is
    requested; nothing is cached.
    """

    def __init__(self, file_list, labels):
        """Keep the audio paths and their labels (used index-aligned)."""
        self.labels = labels
        self.file_list = file_list

    def __len__(self):
        """Return the number of audio files."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Return ``(mfcc, label)`` for the example at position ``idx``.

        ``mfcc`` is a float32 tensor with a leading channel dimension added
        for the CNN; ``label`` is an int64 scalar tensor.
        """
        path = self.file_list[idx]
        features = torch.tensor(extract_features(path), dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return features.unsqueeze(0), target

# -------------------------------
# Step 2: Prepare file paths and labels
# -------------------------------
# Collect one path per distinct recording together with its class index.
audio_files = []
labels = []
seen_names = set()  # file names already collected, used to drop duplicate copies

for root, dirs, files in os.walk(DATA_FOLDER):
    # Sort in place so the traversal order (and therefore the seeded split
    # below) does not depend on the filesystem's listing order.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith(".wav"):
            continue
        # The recorded run of this script reports 2880 examples, twice the
        # 1440 recordings of the RAVDESS speech set, which indicates that the
        # download holds a second copy of every file. RAVDESS file names are
        # unique per recording (the last field is the actor), so keep only
        # the first path seen for each name; otherwise copies of the same
        # clip land in both train and validation and inflate the accuracy.
        if file in seen_names:
            continue
        # filename format: Actor_XX/03-01-03-01-02-01-12.wav
        parts = file.split('-')
        if len(parts) < 3:
            # Not a RAVDESS-style name; there is no emotion field to read.
            continue
        emotion_id = parts[2]  # third number in filename indicates emotion
        if emotion_id in EMOTION_MAP:
            seen_names.add(file)
            audio_files.append(os.path.join(root, file))
            labels.append(EMOTION_MAP[emotion_id])

print(f"Total examples: {len(audio_files)}")

if not audio_files:
    # Fail with a clear message instead of an opaque error from the split.
    raise FileNotFoundError(f"No RAVDESS .wav files found under {DATA_FOLDER}")

# Train/validation split; stratify so both parts keep the class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    audio_files,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

train_dataset = AudioDataset(X_train, y_train)
val_dataset = AudioDataset(X_val, y_val)

# Only the training data is shuffled; validation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# -------------------------------
# Step 3: Define model (simple CNN)
# -------------------------------
class CNNEmotionClassifier(nn.Module):
    """Small two-convolution CNN that classifies a single-channel MFCC map.

    Expects input of shape ``(batch, 1, n_mfcc, max_len)`` and returns raw
    class scores (logits) of shape ``(batch, num_classes)``.

    Args:
        num_classes: Number of output classes.
        n_mfcc: Height of the input map (number of MFCC coefficients).
        max_len: Width of the input map (number of frames). ``None`` means
            "use the module-level ``MAX_MFCC_LEN``", which was the previously
            hard-coded behaviour.
    """

    def __init__(self, num_classes=8, n_mfcc=40, max_len=None):
        super().__init__()
        if max_len is None:
            max_len = MAX_MFCC_LEN
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        # Both convolutions keep the spatial size (3x3 kernel, padding 1) and
        # the pooling layer is applied once, halving height and width with
        # floor division, so the flattened size is
        # 32 * (n_mfcc // 2) * (max_len // 2).
        self.fc1 = nn.Linear(32 * (n_mfcc // 2) * (max_len // 2), 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Compute class logits for a batch of MFCC maps."""
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.pool(x)
        x = self.relu(self.bn2(self.conv2(x)))
        x = x.view(x.size(0), -1)  # flatten everything except the batch axis
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# -------------------------------
# Step 4: Training
# -------------------------------
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output per entry of EMOTION_MAP; .to() returns the module itself.
model = CNNEmotionClassifier(num_classes=len(EMOTION_MAP)).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    # ---- training pass: one optimizer step per batch ----
    model.train()
    total_loss = 0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for mfccs, labels_batch in progress:
        mfccs = mfccs.to(device)
        labels_batch = labels_batch.to(device)
        optimizer.zero_grad()
        loss = criterion(model(mfccs), labels_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch losses.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Avg Train Loss: {avg_loss:.4f}")

    # ---- validation pass: accuracy over the whole validation loader ----
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for mfccs, labels_batch in val_loader:
            mfccs = mfccs.to(device)
            labels_batch = labels_batch.to(device)
            outputs = model(mfccs)
            preds = outputs.argmax(dim=1)
            correct += torch.eq(preds, labels_batch).sum().item()
            total += labels_batch.size(0)
    val_acc = correct / total
    print(f"Epoch {epoch+1} | Validation Accuracy: {val_acc:.4f}\n")

# -------------------------------
# Step 5: Save model
# -------------------------------
# Only the weights (state_dict) of the model after the last epoch are stored.
model_path = "saved_models/ravdess_cnn_model.pt"
os.makedirs("saved_models", exist_ok=True)
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

# -------------------------------
# Step 6: Prediction helper
# -------------------------------
def predict(file_path, model, device):
    """Return the RAVDESS emotion ID predicted for one audio file.

    Args:
        file_path: Path of the audio file to classify.
        model: Trained classifier; it is switched to eval mode here.
        device: Device the input tensor is moved to before the forward pass.

    Returns:
        The key of ``EMOTION_MAP`` (e.g. ``'07'``) whose position in the map
        equals the predicted class index.
    """
    model.eval()
    features = torch.tensor(extract_features(file_path), dtype=torch.float32)
    # Prepend batch and channel axes so the model sees a batch of one.
    batch = features[None, None].to(device)
    with torch.no_grad():
        logits = model(batch)
    class_index = logits.argmax(dim=1).item()
    emotion_ids = list(EMOTION_MAP)
    return emotion_ids[class_index]

# Example usage
if __name__ == "__main__":
    # Smoke test: classify the first validation file and print the result.
    test_file = X_val[0]
    predicted = predict(test_file, model, device)
    print(f"Predicted emotion for {test_file}: {predicted}")


Total examples: 2880


Epoch 1: 100%|██████████| 144/144 [00:57<00:00,  2.49it/s]


Epoch 1 | Avg Train Loss: 2.0675
Epoch 1 | Validation Accuracy: 0.3819



Epoch 2: 100%|██████████| 144/144 [00:46<00:00,  3.10it/s]


Epoch 2 | Avg Train Loss: 1.4546
Epoch 2 | Validation Accuracy: 0.4635



Epoch 3: 100%|██████████| 144/144 [00:54<00:00,  2.63it/s]


Epoch 3 | Avg Train Loss: 0.9218
Epoch 3 | Validation Accuracy: 0.6927



Epoch 4: 100%|██████████| 144/144 [00:42<00:00,  3.42it/s]


Epoch 4 | Avg Train Loss: 0.4935
Epoch 4 | Validation Accuracy: 0.8264



Epoch 5: 100%|██████████| 144/144 [00:40<00:00,  3.55it/s]


Epoch 5 | Avg Train Loss: 0.2246
Epoch 5 | Validation Accuracy: 0.8993



Epoch 6: 100%|██████████| 144/144 [00:38<00:00,  3.77it/s]


Epoch 6 | Avg Train Loss: 0.0855
Epoch 6 | Validation Accuracy: 0.9340



Epoch 7: 100%|██████████| 144/144 [00:37<00:00,  3.81it/s]


Epoch 7 | Avg Train Loss: 0.0335
Epoch 7 | Validation Accuracy: 0.9201



Epoch 8: 100%|██████████| 144/144 [00:37<00:00,  3.86it/s]


Epoch 8 | Avg Train Loss: 0.0149
Epoch 8 | Validation Accuracy: 0.9410



Epoch 9: 100%|██████████| 144/144 [00:37<00:00,  3.86it/s]


Epoch 9 | Avg Train Loss: 0.0075
Epoch 9 | Validation Accuracy: 0.9410



Epoch 10: 100%|██████████| 144/144 [00:36<00:00,  3.91it/s]


Epoch 10 | Avg Train Loss: 0.0053
Epoch 10 | Validation Accuracy: 0.9375

Model saved to saved_models/ravdess_cnn_model.pt
Predicted emotion for /Users/aditya/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1/Actor_17/03-01-07-02-01-01-17.wav: 07


In [None]:
# ravdess_emotion_classifier.py

import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# -------------------------------
# Step 1: Dataset preparation
# -------------------------------
# Root directory that is walked for the extracted RAVDESS .wav files.
DATA_FOLDER = "/Users/aditya/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1"
# Sample rate handed to librosa.load when reading audio.
SAMPLE_RATE = 22050
# Number of MFCC frames every example is padded/truncated to.
MAX_MFCC_LEN = 100

# Binary target derived from the two-digit RAVDESS emotion IDs:
# 1 = enthusiastic, 0 = not enthusiastic.
_ENTHUSIASTIC_IDS = ("03", "08")  # happy, surprised
_NOT_ENTHUSIASTIC_IDS = ("01", "04", "05", "06", "07", "02")  # neutral, sad, angry, fear, disgust, calm
ENTHUSIASM_MAP = {
    **dict.fromkeys(_ENTHUSIASTIC_IDS, 1),
    **dict.fromkeys(_NOT_ENTHUSIASTIC_IDS, 0),
}


def extract_features(file_path, n_mfcc=40):
    """Compute fixed-width MFCC features for one audio file.

    The file is read at ``SAMPLE_RATE``; the MFCC array is then brought to
    exactly ``MAX_MFCC_LEN`` entries along axis 1 by truncating extra frames
    or by appending zeros.

    Args:
        file_path: Path of the audio file to read.
        n_mfcc: Number of MFCC coefficients requested from librosa.

    Returns:
        The MFCC array with ``MAX_MFCC_LEN`` entries along axis 1.
    """
    waveform, rate = librosa.load(file_path, sr=SAMPLE_RATE)
    features = librosa.feature.mfcc(y=waveform, sr=rate, n_mfcc=n_mfcc)

    n_frames = features.shape[1]
    if n_frames >= MAX_MFCC_LEN:
        # Enough frames: drop everything past the fixed width.
        return features[:, :MAX_MFCC_LEN]
    # Too few frames: zero-pad at the end of the frame axis only.
    padding = ((0, 0), (0, MAX_MFCC_LEN - n_frames))
    return np.pad(features, pad_width=padding, mode='constant')

# Custom Dataset
class AudioDataset(Dataset):
    """Map-style dataset yielding ``(mfcc, label)`` pairs for audio files.

    MFCCs are produced by ``extract_features`` on every access rather than
    being precomputed.
    """

    def __init__(self, file_list, labels):
        """Remember the audio paths and the label belonging to each path."""
        self.file_list, self.labels = file_list, labels

    def __len__(self):
        """Return how many audio files the dataset holds."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Return the feature tensor and label tensor for index ``idx``.

        The features are float32 with a channel dimension inserted in front
        for the CNN; the label is an int64 scalar tensor.
        """
        raw = extract_features(self.file_list[idx])
        sample = torch.tensor(raw, dtype=torch.float32)
        sample = sample.unsqueeze(0)  # add channel dim for CNN
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample, target

# -------------------------------
# Step 2: Prepare file paths and labels
# -------------------------------
# Collect one path per distinct recording together with its binary
# enthusiasm label (1 = enthusiastic, 0 = not enthusiastic).
audio_files = []
labels = []
seen_names = set()  # file names already collected, used to drop duplicate copies

for root, dirs, files in os.walk(DATA_FOLDER):
    # Sort in place so the traversal order (and therefore the seeded split
    # below) does not depend on the filesystem's listing order.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith(".wav"):
            continue
        # The recorded output of the first cell reports 2880 examples for
        # this folder, twice the 1440 recordings of the RAVDESS speech set,
        # which indicates that the download holds a second copy of every
        # file. RAVDESS file names are unique per recording, so keep only the
        # first path seen for each name; otherwise copies of the same clip
        # land in both train and validation and inflate the accuracy.
        if file in seen_names:
            continue
        # filename format: Actor_XX/03-01-03-01-02-01-12.wav
        parts = file.split('-')
        if len(parts) < 3:
            # Not a RAVDESS-style name; there is no emotion field to read.
            continue
        emotion_id = parts[2]  # third number in filename indicates emotion
        # Look the label up in ENTHUSIASM_MAP, the map this cell defines
        # (EMOTION_MAP is not defined in this cell).
        if emotion_id in ENTHUSIASM_MAP:
            seen_names.add(file)
            audio_files.append(os.path.join(root, file))
            labels.append(ENTHUSIASM_MAP[emotion_id])

print(f"Total examples: {len(audio_files)}")

if not audio_files:
    # Fail with a clear message instead of an opaque error from the split.
    raise FileNotFoundError(f"No RAVDESS .wav files found under {DATA_FOLDER}")

# Train/validation split; stratify because only two of the eight emotions
# map to the positive class, so the target is imbalanced.
X_train, X_val, y_train, y_val = train_test_split(
    audio_files,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

train_dataset = AudioDataset(X_train, y_train)
val_dataset = AudioDataset(X_val, y_val)

# Only the training data is shuffled; validation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# -------------------------------
# Step 3: Define model (simple CNN)
# -------------------------------
class CNNEmotionClassifier(nn.Module):
    """Small two-convolution CNN that classifies a single-channel MFCC map.

    Expects input of shape ``(batch, 1, n_mfcc, max_len)`` and returns raw
    class scores (logits) of shape ``(batch, num_classes)``.

    Args:
        num_classes: Number of output classes.
        n_mfcc: Height of the input map (number of MFCC coefficients).
        max_len: Width of the input map (number of frames). ``None`` means
            "use the module-level ``MAX_MFCC_LEN``", which was the previously
            hard-coded behaviour.
    """

    def __init__(self, num_classes=8, n_mfcc=40, max_len=None):
        super().__init__()
        if max_len is None:
            max_len = MAX_MFCC_LEN
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        # Both convolutions keep the spatial size (3x3 kernel, padding 1) and
        # the pooling layer is applied once, halving height and width with
        # floor division, so the flattened size is
        # 32 * (n_mfcc // 2) * (max_len // 2).
        self.fc1 = nn.Linear(32 * (n_mfcc // 2) * (max_len // 2), 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Compute class logits for a batch of MFCC maps."""
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.pool(x)
        x = self.relu(self.bn2(self.conv2(x)))
        x = x.view(x.size(0), -1)  # flatten everything except the batch axis
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# -------------------------------
# Step 4: Training
# -------------------------------
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Size the output layer from the labels actually collected above (class
# indices are zero-based), so it always matches the targets given to
# CrossEntropyLoss. EMOTION_MAP is not defined in this cell.
num_classes = max(labels) + 1
model = CNNEmotionClassifier(num_classes=num_classes)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    # ---- training pass: one optimizer step per batch ----
    model.train()
    total_loss = 0
    for mfccs, labels_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        mfccs, labels_batch = mfccs.to(device), labels_batch.to(device)
        optimizer.zero_grad()
        outputs = model(mfccs)
        loss = criterion(outputs, labels_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch losses.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Avg Train Loss: {avg_loss:.4f}")

    # ---- validation pass: accuracy over the whole validation loader ----
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for mfccs, labels_batch in val_loader:
            mfccs, labels_batch = mfccs.to(device), labels_batch.to(device)
            outputs = model(mfccs)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels_batch).sum().item()
            total += labels_batch.size(0)
    val_acc = correct / total
    print(f"Epoch {epoch+1} | Validation Accuracy: {val_acc:.4f}\n")

# -------------------------------
# Step 5: Save model
# -------------------------------
# Only the weights (state_dict) of the model after the last epoch are stored.
# NOTE(review): this is the same path the first cell saves its 8-class
# emotion model to, so running this cell replaces that file - confirm that
# is intended or pick a separate file name.
os.makedirs("saved_models", exist_ok=True)
torch.save(model.state_dict(), "saved_models/ravdess_cnn_model.pt")
print("Model saved to saved_models/ravdess_cnn_model.pt")

# -------------------------------
# Step 6: Prediction helper
# -------------------------------
def predict(file_path, model, device):
    """Return the enthusiasm label predicted for one audio file.

    Args:
        file_path: Path of the audio file to classify.
        model: Trained binary classifier; it is switched to eval mode here.
        device: Device the input tensor is moved to before the forward pass.

    Returns:
        ``"enthusiastic"`` for class 1 or ``"not enthusiastic"`` for class 0,
        following the values used in ``ENTHUSIASM_MAP``.

    Raises:
        IndexError: If the model predicts a class index other than 0 or 1.
    """
    # Position in this tuple == class index. Decoding through the keys of the
    # label map would be wrong here: several emotion IDs share each class, so
    # key order says nothing about the class index.
    class_names = ("not enthusiastic", "enthusiastic")

    model.eval()
    mfcc = extract_features(file_path)
    # Prepend batch and channel axes so the model sees a batch of one.
    mfcc = torch.tensor(mfcc, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(mfcc)
        pred = torch.argmax(output, dim=1).item()
    return class_names[pred]

# Example usage
if __name__ == "__main__":
    # Smoke test: run the predictor on the first validation file.
    test_file = X_val[0]
    result = predict(test_file, model, device)
    print(f"Predicted emotion for {test_file}: {result}")
