In [1]:
!pip install librosa pandas scikit-learn torch torchaudio

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
import IPython.display as ipd
import random

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


cuda


**Step 1: Load AND ENCODE  Data + FEATURE EXTRACTION**

In [5]:

df = pd.read_csv('/content/drive/MyDrive/Audio_Detection_Tone/tts_training_data.csv')  # <-- Change to your dataset path

audio_paths = df['audio_path'].tolist()
emotions = df['emotion'].tolist()

# Encode emotions to numbers
emotion_labels = sorted(list(set(emotions)))
emotion2idx = {emotion: idx for idx, emotion in enumerate(emotion_labels)}
idx2emotion = {idx: emotion for emotion, idx in emotion2idx.items()}
y = [emotion2idx[emotion] for emotion in emotions]

# ------------------------
def extract_features(path, n_mfcc=40):
    try:
        y, sr = librosa.load(path, sr=None)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        mfcc = np.mean(mfcc.T, axis=0)  # Average over time
    except Exception as e:
        print(f"Error loading {path}: {e}")
        mfcc = np.zeros(n_mfcc)
    return mfcc

In [11]:
print("Emotion to Index Mapping:")
for idx, emotion in idx2emotion.items():
    print(f"{idx}: {emotion}")


Emotion to Index Mapping:
0: Angry
1: Disgusted
2: Fearful
3: Happy
4: Neutral
5: Sad
6: Suprised


**Custom Dataset **

In [6]:
class AudioDataset(Dataset):
    def __init__(self, audio_paths, labels):
        self.audio_paths = audio_paths
        self.labels = labels

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        #Only extract feature when needed (not before!)
        feature = extract_features(self.audio_paths[idx])
        label = self.labels[idx]
        return torch.tensor(feature, dtype=torch.float32), torch.tensor(label, dtype=torch.long)


In [7]:
import os

missing_files = [path for path in audio_paths if not os.path.exists(path)]
print(f"Missing files: {len(missing_files)}")
for path in missing_files:
    print(path)


Missing files: 0


In [8]:
audio_paths, y = zip(*[(path, label) for path, label in zip(audio_paths, y) if os.path.exists(path)])


**Train/Val/Test Split with stratification**

In [9]:
# Train/Val/Test Split with stratification
X_train, X_temp, y_train, y_temp = train_test_split(
    audio_paths, y, test_size=0.3, shuffle=True, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, shuffle=True, stratify=y_temp, random_state=42
)

# Create datasets
train_dataset = AudioDataset(X_train, y_train)
val_dataset = AudioDataset(X_val, y_val)
test_dataset = AudioDataset(X_test, y_test)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


**Build THE Model **

In [None]:

class EmotionClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, num_classes):
        super(EmotionClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_dim2, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x

input_dim = 40  # 40 MFCCs
hidden_dim1 = 128
hidden_dim2 = 64
num_classes = len(emotion_labels)

model = EmotionClassifier(input_dim, hidden_dim1, hidden_dim2, num_classes)

**Train the Model **

In [None]:
# ------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

n_epochs = 30  # More epochs since you have bigger dataset

for epoch in range(n_epochs):
    model.train()
    running_loss = 0.0

    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for features, labels in val_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Epoch [{epoch+1}/{n_epochs}] "
          f"Train Loss: {running_loss/len(train_loader):.4f} "
          f"Val Loss: {val_loss/len(val_loader):.4f} "
          f"Val Accuracy: {100 * correct / total:.2f}%")

Epoch [1/30] Train Loss: 1.6996 Val Loss: 1.6531 Val Accuracy: 35.92%
Epoch [2/30] Train Loss: 1.4666 Val Loss: 1.3651 Val Accuracy: 45.29%
Epoch [3/30] Train Loss: 1.3491 Val Loss: 1.2953 Val Accuracy: 47.42%
Epoch [4/30] Train Loss: 1.2850 Val Loss: 1.2675 Val Accuracy: 49.14%
Epoch [5/30] Train Loss: 1.2427 Val Loss: 1.1927 Val Accuracy: 51.95%
Epoch [6/30] Train Loss: 1.1958 Val Loss: 1.2376 Val Accuracy: 51.07%
Epoch [7/30] Train Loss: 1.1759 Val Loss: 1.1791 Val Accuracy: 53.41%
Epoch [8/30] Train Loss: 1.1447 Val Loss: 1.2160 Val Accuracy: 51.43%
Epoch [9/30] Train Loss: 1.1296 Val Loss: 1.1721 Val Accuracy: 51.59%
Epoch [10/30] Train Loss: 1.1020 Val Loss: 1.1223 Val Accuracy: 55.75%
Epoch [11/30] Train Loss: 1.0866 Val Loss: 1.1455 Val Accuracy: 51.90%
Epoch [12/30] Train Loss: 1.0760 Val Loss: 1.1204 Val Accuracy: 55.54%
Epoch [13/30] Train Loss: 1.0626 Val Loss: 1.1046 Val Accuracy: 54.71%
Epoch [14/30] Train Loss: 1.0466 Val Loss: 1.0879 Val Accuracy: 57.37%
Epoch [15/30] T

** Final Evaluation on Test Set + save the model **

In [None]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for features, labels in test_loader:
        features, labels = features.to(device), labels.to(device)
        outputs = model(features)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'\nFinal Test Accuracy: {100 * correct / total:.2f}%')

# ------------------------
torch.save(model.state_dict(), 'emotion_classifier_final.pth')
print("Model saved as 'emotion_classifier_final.pth'.")




Final Test Accuracy: 60.82%
Model saved as 'emotion_classifier_final.pth'.


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Assume y_true and y_pred are already filled from your model evaluation

# Define your labels
emotion_labels = ['angry', 'happy', 'neutral', 'sad', 'fearful', 'disgust', 'surprised']

# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Now for each class, calculate TP, FP, FN, TN
for idx, label in enumerate(emotion_labels):
    TP = cm[idx, idx]
    FP = cm[:, idx].sum() - TP
    FN = cm[idx, :].sum() - TP
    TN = cm.sum() - (TP + FP + FN)

    print(f"\nClass: {label}")
    print(f"  True Positive (TP): {TP}")
    print(f"  False Positive (FP): {FP}")
    print(f"  False Negative (FN): {FN}")
    print(f"  True Negative (TN): {TN}")



Class: angry
  True Positive (TP): 189
  False Positive (FP): 56
  False Negative (FN): 96
  True Negative (TN): 1355

Class: happy
  True Positive (TP): 124
  False Positive (FP): 86
  False Negative (FN): 128
  True Negative (TN): 1358

Class: neutral
  True Positive (TP): 134
  False Positive (FP): 79
  False Negative (FN): 134
  True Negative (TN): 1349

Class: sad
  True Positive (TP): 175
  False Positive (FP): 115
  False Negative (FN): 117
  True Negative (TN): 1289

Class: fearful
  True Positive (TP): 147
  False Positive (FP): 138
  False Negative (FN): 89
  True Negative (TN): 1322

Class: disgust
  True Positive (TP): 198
  False Positive (FP): 160
  False Negative (FN): 81
  True Negative (TN): 1257

Class: surprised
  True Positive (TP): 77
  False Positive (FP): 18
  False Negative (FN): 7
  True Negative (TN): 1594


**BETTER MODEL**

In [12]:


# Define upgraded model
class EmotionClassifierV2(nn.Module):
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, num_classes):
        super(EmotionClassifierV2, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.bn1 = nn.BatchNorm1d(hidden_dim1)
        self.dropout1 = nn.Dropout(0.3)

        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.bn2 = nn.BatchNorm1d(hidden_dim2)
        self.dropout2 = nn.Dropout(0.3)

        self.fc3 = nn.Linear(hidden_dim2, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.dropout1(x)

        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.dropout2(x)

        x = self.fc3(x)
        return x

# Model parameters
input_dim = 40  # 40 MFCC features
hidden_dim1 = 128
hidden_dim2 = 64
num_classes = len(emotion_labels)  # assume you already have emotion_labels

# Instantiate model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EmotionClassifierV2(input_dim, hidden_dim1, hidden_dim2, num_classes).to(device)

# Loss, Optimizer, Scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)

# Training
num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_train_loss = running_loss / len(train_loader)

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in val_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * correct / total

    scheduler.step(avg_val_loss)

    print(f'Epoch [{epoch+1}/{num_epochs}], '
          f'Train Loss: {avg_train_loss:.4f}, '
          f'Val Loss: {avg_val_loss:.4f}, '
          f'Val Accuracy: {val_accuracy:.2f}%')

# Final Evaluation on Test Set
model.eval()
correct = 0
total = 0
all_preds = []
all_labels = []

with torch.no_grad():
    for features, labels in test_loader:
        features, labels = features.to(device), labels.to(device)
        outputs = model(features)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print(f'\nFinal Test Accuracy: {100 * correct / total:.2f}%')

# Save model
torch.save(model.state_dict(), '/content/drive/MyDrive/Audio_Detection_Tone/emotion2_classifier2_final2.pth')
print("Model saved as 'emotion2_classifier2_final2.pth'.")




Epoch [1/50], Train Loss: 1.5467, Val Loss: 1.3025, Val Accuracy: 49.30%
Epoch [2/50], Train Loss: 1.3451, Val Loss: 1.2073, Val Accuracy: 54.14%
Epoch [3/50], Train Loss: 1.2760, Val Loss: 1.1542, Val Accuracy: 55.28%
Epoch [4/50], Train Loss: 1.2359, Val Loss: 1.1292, Val Accuracy: 54.61%
Epoch [5/50], Train Loss: 1.2038, Val Loss: 1.1077, Val Accuracy: 56.22%
Epoch [6/50], Train Loss: 1.1769, Val Loss: 1.0907, Val Accuracy: 57.63%
Epoch [7/50], Train Loss: 1.1595, Val Loss: 1.0665, Val Accuracy: 58.09%
Epoch [8/50], Train Loss: 1.1522, Val Loss: 1.0665, Val Accuracy: 57.73%
Epoch [9/50], Train Loss: 1.1417, Val Loss: 1.0615, Val Accuracy: 58.25%
Epoch [10/50], Train Loss: 1.1276, Val Loss: 1.0463, Val Accuracy: 59.14%
Epoch [11/50], Train Loss: 1.1246, Val Loss: 1.0886, Val Accuracy: 57.11%
Epoch [12/50], Train Loss: 1.1149, Val Loss: 1.0267, Val Accuracy: 59.76%
Epoch [13/50], Train Loss: 1.1084, Val Loss: 1.0670, Val Accuracy: 58.36%
Epoch [14/50], Train Loss: 1.1134, Val Loss: 1.

**SAMPLES**

In [15]:


# Display 5 random test examples with audio + prediction
num_samples = 5
indices = random.sample(range(len(X_test)), num_samples)

print("\nShowing some test predictions with actual audio...\n")
model.eval()

for idx in indices:
    audio_path = X_test[idx]
    true_label = y_test[idx]

    # Load audio and extract features the same way as during training
    waveform, sr = torchaudio.load(audio_path)
    mfcc_transform = torchaudio.transforms.MFCC(sample_rate=sr, n_mfcc=40)
    mfcc = mfcc_transform(waveform).squeeze(0).mean(dim=1)

    input_tensor = mfcc.unsqueeze(0).to(device)
    output = model(input_tensor)
    pred_label = torch.argmax(output, dim=1).item()

    print(f"🔊 Audio: {audio_path}")
    print(f"✅ True Emotion: {idx2emotion[true_label]}")
    print(f"🧠 Predicted Emotion: {idx2emotion[pred_label]}")
    ipd.display(ipd.Audio(audio_path))
    print("-----------------------------------------------------\n")


Showing some test predictions with actual audio...

🔊 Audio: /content/drive/MyDrive/Emotions/Happy/n17.wav
✅ True Emotion: Happy
🧠 Predicted Emotion: Angry


-----------------------------------------------------

🔊 Audio: /content/drive/MyDrive/Emotions/Neutral/YAF_voice_neutral.wav
✅ True Emotion: Neutral
🧠 Predicted Emotion: Angry


-----------------------------------------------------

🔊 Audio: /content/drive/MyDrive/Emotions/Sad/1077_IWL_SAD_XX.wav
✅ True Emotion: Sad
🧠 Predicted Emotion: Neutral


-----------------------------------------------------

🔊 Audio: /content/drive/MyDrive/Emotions/Disgusted/OAF_long_disgust.wav
✅ True Emotion: Disgusted
🧠 Predicted Emotion: Disgusted


-----------------------------------------------------

🔊 Audio: /content/drive/MyDrive/Emotions/Sad/1012_IOM_SAD_XX.wav
✅ True Emotion: Sad
🧠 Predicted Emotion: Neutral


-----------------------------------------------------

