In [9]:
#Importing the necessary modules
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from PIL import Image

import torch
import torch.nn as nn
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader

Emotions used:

01 -> neutral

02 -> happy

03 -> sad

04 -> angry

05 -> disgust

In [10]:
import os

print("CHECK 1: Does the folder exist?")
print(os.path.exists("/kaggle/input/ravdess-emotional-speech-audio/ravdess"))

print("\nCHECK 2: List contents of /kaggle/input/")
print(os.listdir("/kaggle/input"))

print("\nCHECK 3: List contents of ravdess-emotional-speech-audio folder")
print(os.listdir("/kaggle/input/ravdess-emotional-speech-audio"))

print("\nCHECK 4: List contents of ravdess folder")
print(os.listdir("/kaggle/input/ravdess-emotional-speech-audio/ravdess"))


CHECK 1: Does the folder exist?
True

CHECK 2: List contents of /kaggle/input/
['ravdess-emotional-speech-audio', 'fine-tune-emotion-dataset']

CHECK 3: List contents of ravdess-emotional-speech-audio folder
['ravdess']

CHECK 4: List contents of ravdess folder
['Actor_02', 'Actor_17', 'Actor_05', 'Actor_16', 'Actor_21', 'Actor_01', 'Actor_11', 'Actor_20', 'Actor_08', 'Actor_15', 'Actor_06', 'Actor_12', 'Actor_23', 'Actor_24', 'Actor_22', 'Actor_04', 'Actor_19', 'Actor_10', 'Actor_09', 'Actor_14', 'Actor_03', 'Actor_13', 'Actor_18', 'Actor_07']


In [11]:
emotion_map = {
    "01": "neutral",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "07": "disgust"
}

Extracting all the emotion names as ML cannot work with words.

In [12]:
classes = list(emotion_map.values())
class_to_idx = {c: i for i, c in enumerate(classes)}

Audio to Mel Spectrogram

In [13]:
def audio_to_mel_image(path):
    y, sr = librosa.load(path, sr=18050) # sr indicates sampling rate, y indicates audio waveform (1D array) , Fs = 18.05 KHz.

    mel = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=1024, hop_length=256, n_mels=128 ) # window_size = 1024.....converts raw audio into a visual time-freq plot.
    mel_db = librosa.power_to_db(mel, ref=np.max) # convert signal to dB unit for better representation and stability.

    mel_norm = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min()) # normalise the range between 0 and 1 for input simplicity.
    mel_img = (mel_norm * 255).astype(np.uint8) # convert to pixels(0-255).

    img = Image.fromarray(mel_img).convert("RGB") # ResNet50 needs RGB images.
    img = img.resize((224, 224)) # input requirement.
    return img

Old labels → used for the signature model

New labels → used for the ResNet50 model

In [14]:
# During initialization, RAVDESSDataset scans the RAVDESS folder, picks only .wav files whose emotion code is ni target set, and stores (audio_path, label_index) pairs in self.samples.
# When the DataLoader asks for item i, __getitem__ looks up the path and label, and then calls audio_to_mel_image(audio_path) to actually build the input for the model.

    
class RAVDESSDataset(Dataset):                                     # creating a custom PyTorch Dataset class that the DataLoader can use.
    
    def __init__(self, root_dir, transform=None):                  # root_dir → path to RAVDESS audio folder
                                                                   # transform → any optional image transforms (resize, normalize, etc.)
        
        self.samples = []                                          # stores all (audioPath,labels) in a single list.
        self.transform = transform

        for actor in os.listdir(root_dir):                         # List all items in the main RAVDESS folder.
                                                                   # Each "Actor_xy" is one folder.
            
            actor_folder = os.path.join(root_dir, actor)           # Create the full path......
            if not os.path.isdir(actor_folder):                    # skip anything that is not a folder.
                continue

            for file in os.listdir(actor_folder):                  # List all files in the folder.
                if file.endswith(".wav"):                          # Only select the .wav files.
                    emo_code = file.split("-")[2]                  # Extract the emotion number(0,1,2,etc).

                    # USE ONLY OUR 5 TARGET EMOTIONS
                    if emo_code in emotion_map:                         # Check condition for the presence of the emotion number in emotion map
                        full_path = os.path.join(actor_folder, file)
                        label = class_to_idx[emotion_map[emo_code]]     # number conversion.
                        self.samples.append((full_path, label))         # add (audioPath,label) into the list created above.

    def __len__(self):                                                  # returns total no. of samples.
        return len(self.samples)

    def __getitem__(self, idx):                                         # selects sample at the idx index.
        audio_path, label = self.samples[idx]                           # Retrieve file path + label.
        img = audio_to_mel_image(audio_path)                            # creating the MEL-Spectrogram.

        if self.transform:
            img = self.transform(img)

        return img, label

Layer Freezing


In [15]:
def build_resnet50(num_classes):
    model = models.resnet50(weights="IMAGENET1K_V1")                    # Creates the ResNet 50 model and loads weights that were trained on the ImageNet dataset
                                                                        # This helps in the detection of textures,edges,shapes,etc.

    for name, param in model.named_parameters():
        if "layer4" in name:                                            # allow last block to train
            param.requires_grad = True
        else:
            param.requires_grad = False


    # Replace final FC layer
    model.fc = nn.Linear(2048, num_classes)                             # mapping from 2048 till num_classes.
    return model                                                        # returns the model.

Pre-Training

In [16]:
def pretrain_resnet50_ravdess(ravdess_path, epochs=8, batch_size=16):                       # ravdess_path: folder with the RAVDESS audio.

                                                                                                      # epochs, batch_size, lr: training hyperparameters.

    transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
    ])                                                     # transformation before feeding it to the model.

    dataset = RAVDESSDataset(ravdess_path, transform)                                                 # scans ravdess_path, filters files by emotion, and will return (mel_image, label) with the transform applied.
    
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)                                 # Wrapping the dataset in a DataLoader that will feed data in mini‑batches.

    model = build_resnet50(num_classes=len(classes))                                                  # Builds ResNet‑50 model with a final linear layer sized to no. of emotion classes.

    weights = torch.tensor([1.5, 1.0, 1.0, 1.0, 1.5])  # boost neutral & disgust
    criterion = nn.CrossEntropyLoss(weight=weights)    # Loss Function.
    optimizer = torch.optim.Adam([
    {"params": model.fc.parameters(), "lr": 5e-3},
    {"params": model.layer4.parameters(), "lr": 1e-4},
    ])
                                                       # Uses the Adam optimizer so that only the final layer will be trained.

    print("\n===== STARTING RESNET50 PRETRAINING (5 EMOTIONS) =====\n")

    for epoch in range(epochs):                                                                       # calculating losses and no. of correct predictions for the looping epochs.
        total_loss = 0
        correct = 0

        for imgs, labels in loader:
            optimizer.zero_grad()
            outputs = model(imgs)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            correct += (outputs.argmax(dim=1) == labels).sum().item()

        accuracy = correct / len(dataset) * 100
        print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss:.3f} | Accuracy: {accuracy:.2f}%")

    torch.save(model.state_dict(), "ravdess_resnet50_5emotion_pretrained.pt")
    print("\n[SAVED] ravdess_resnet50_5emotion_pretrained.pt\n")

    return model
    
RAVDESS_ROOT = "/kaggle/input/ravdess-emotional-speech-audio/ravdess"

pretrain_resnet50_ravdess(RAVDESS_ROOT)    


===== STARTING RESNET50 PRETRAINING (5 EMOTIONS) =====

Epoch 1/8 | Loss: 101.034 | Accuracy: 39.70%
Epoch 2/8 | Loss: 30.036 | Accuracy: 79.98%
Epoch 3/8 | Loss: 13.958 | Accuracy: 90.62%
Epoch 4/8 | Loss: 10.041 | Accuracy: 93.17%
Epoch 5/8 | Loss: 12.944 | Accuracy: 92.94%
Epoch 6/8 | Loss: 12.717 | Accuracy: 93.63%
Epoch 7/8 | Loss: 15.227 | Accuracy: 91.67%
Epoch 8/8 | Loss: 14.257 | Accuracy: 93.17%

[SAVED] ravdess_resnet50_5emotion_pretrained.pt



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

Fine Tuning


In [20]:
# ============================================
# 1. IMPORTS
# ============================================
from sklearn.metrics.pairwise import cosine_similarity


# ============================================
# 2. DATASET PATH (CHANGE ONLY IF NEEDED)
# ============================================
DATASET_ROOT = "/kaggle/input/fine-tune-emotion-dataset/finetune_dataset"

emotion_labels = ["Neutral", "Happy", "Sad", "Anger", "Disgust"]
label_to_idx = {e:i for i,e in enumerate(emotion_labels)}

print("Dataset root:", DATASET_ROOT)
print("Folders:", os.listdir(DATASET_ROOT))


# ============================================
# 3. MEL SPECTROGRAM
# ============================================
def audio_to_mel_image(path):
    y, sr = librosa.load(path, sr=18050)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024,
                                         hop_length=256, n_mels=128)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    mel_norm = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min())
    mel_img = (mel_norm * 255).astype(np.uint8)

    img = Image.fromarray(mel_img).convert("RGB")
    img = img.resize((224,224))
    return img


# ============================================
# 4. TRANSFORMS + RESNET50 FEATURE EXTRACTOR
# ============================================
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406],
                         std=[0.229,0.224,0.225])
])

class ResNet50_Extractor(nn.Module):
    def __init__(self):
        super().__init__()
        net = models.resnet50(weights="IMAGENET1K_V1")
        self.feature_extractor = nn.Sequential(*list(net.children())[:-1])

    def forward(self, x):
        with torch.no_grad():
            z = self.feature_extractor(x)
        return z.view(z.size(0), -1)   # 2048-dim


resnet = ResNet50_Extractor().eval()


# ============================================
# 5. EXTRACT EMBEDDINGS FROM YOUR DATASET
# ============================================
all_embeddings = []
all_labels = []

print("Extracting embeddings...")

for emotion in emotion_labels:
    folder = os.path.join(DATASET_ROOT, emotion)
    for file in os.listdir(folder):
        if file.endswith(".wav"):
            path = os.path.join(folder, file)

            img = audio_to_mel_image(path)
            img = transform(img).unsqueeze(0)

            emb = resnet(img)[0].numpy()

            all_embeddings.append(emb)
            all_labels.append(label_to_idx[emotion])

all_embeddings = np.array(all_embeddings)
all_labels = np.array(all_labels)

print("Total samples:", len(all_embeddings))


# ============================================
# 6. COMPUTE EMOTION SIGNATURES
# ============================================
print("Computing signatures...")

signatures = {}
for emotion in emotion_labels:
    idxs = np.where(all_labels == label_to_idx[emotion])[0]
    signatures[emotion] = np.mean(all_embeddings[idxs], axis=0)

print("Signatures ready.")


# ============================================
# 7. HYBRID FEATURE CREATION
# ============================================
def hybrid_vector(embedding):
    sims = []
    for e in emotion_labels:
        sim = cosine_similarity([embedding], [signatures[e]])[0][0]
        sims.append(sim)
    return np.concatenate([embedding, sims])   # 1024 + 5 = 1053


X_hybrid = np.array([hybrid_vector(e) for e in all_embeddings])
y = all_labels

input_dim = X_hybrid.shape[1]
print("Hybrid feature dimension:", input_dim)


# ============================================
# 8. HYBRID CLASSIFIER
# ============================================
class HybridClassifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256,128),
            nn.ReLU(),
            nn.Linear(128,5)
        )

    def forward(self, x):
        return self.net(x)


clf = HybridClassifier(input_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(clf.parameters(), lr=1e-3)

X_tensor = torch.tensor(X_hybrid, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)


# ============================================
# 9. TRAINING LOOP
# ============================================
print("\nTraining classifier...\n")

for epoch in range(60):
    optimizer.zero_grad()
    out = clf(X_tensor)
    loss = criterion(out, y_tensor)
    loss.backward()
    optimizer.step()

    acc = (out.argmax(1) == y_tensor).float().mean().item() * 100
    print(f"Epoch {epoch+1}/60 | Loss={loss:.4f} | Acc={acc:.2f}%")


# ============================================
# 10. SAVE FINAL MODEL
# ============================================
torch.save({
    "model_state": clf.state_dict(),
    "signatures": signatures
}, "/kaggle/working/HYBRID_FINAL_MODEL.pt")

print("\n[SAVED] HYBRID_FINAL_MODEL.pt")


# ============================================
# 11. INFERENCE FUNCTION
# ============================================
def predict_emotion(audio_path):
    # Step 1: embedding
    img = audio_to_mel_image(audio_path)
    img = transform(img).unsqueeze(0)
    emb = resnet(img)[0].numpy()

    # Step 2: similarity scores
    sims = []
    for e in emotion_labels:
        sim = cosine_similarity([emb], [signatures[e]])[0][0]
        sims.append(sim)

    # Step 3: hybrid vector
    vec = np.concatenate([emb, sims])
    x = torch.tensor(vec, dtype=torch.float32).unsqueeze(0)

    # Step 4: classify
    clf.eval()
    with torch.no_grad():
        out = clf(x)
        idx = out.argmax(1).item()
        confidence = torch.softmax(out,1).max().item()

    return emotion_labels[idx], confidence


# ============================================
# 12. TEST PREDICTION
# ============================================
# Example:
#test_file = "/kaggle/input/fine-tune-emotion-dataset/happy/sample1.wav"
#motion, conf = predict_emotion(test_file)
#print("Predicted:", emotion, "| Confidence:", round(conf*100,2), "%")


Dataset root: /kaggle/input/fine-tune-emotion-dataset/finetune_dataset
Folders: ['Neutral', 'Sad', 'Disgust', 'Happy', 'Anger']
Extracting embeddings...
Total samples: 17
Computing signatures...
Signatures ready.
Hybrid feature dimension: 2053

Training classifier...

Epoch 1/60 | Loss=1.6235 | Acc=0.00%
Epoch 2/60 | Loss=1.5536 | Acc=29.41%
Epoch 3/60 | Loss=1.4716 | Acc=64.71%
Epoch 4/60 | Loss=1.3714 | Acc=82.35%
Epoch 5/60 | Loss=1.2678 | Acc=82.35%
Epoch 6/60 | Loss=1.1529 | Acc=82.35%
Epoch 7/60 | Loss=1.0332 | Acc=82.35%
Epoch 8/60 | Loss=0.9120 | Acc=100.00%
Epoch 9/60 | Loss=0.7895 | Acc=100.00%
Epoch 10/60 | Loss=0.6698 | Acc=100.00%
Epoch 11/60 | Loss=0.5566 | Acc=100.00%
Epoch 12/60 | Loss=0.4519 | Acc=100.00%
Epoch 13/60 | Loss=0.3597 | Acc=100.00%
Epoch 14/60 | Loss=0.2803 | Acc=100.00%
Epoch 15/60 | Loss=0.2138 | Acc=100.00%
Epoch 16/60 | Loss=0.1599 | Acc=100.00%
Epoch 17/60 | Loss=0.1173 | Acc=100.00%
Epoch 18/60 | Loss=0.0847 | Acc=100.00%
Epoch 19/60 | Loss=0.0605 | 