# **Speech Emotion Recognition using Wav2Vec2**

This notebook implements a speech emotion recognition system using
transfer learning with a pretrained Wav2Vec2 model.

> Note: This project was developed as part of a Deep Learning course assignment.


## ***Environment Setup***




In [None]:
try:
    import transformers
    import safetensors
except ImportError:
    !pip install -q transformers safetensors

import os
import numpy as np
import pandas as pd
import librosa
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import random
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import warnings
warnings.filterwarnings("ignore")

# Global seed reused by the RNG setup and by the train/validation split
SEED = 42
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed to every generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    # GPU present: seed all CUDA devices and switch cuDNN to its
    # deterministic mode with benchmark autotuning turned off.
    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

seed_everything(SEED)

# Run on the GPU whenever PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Compute Device: {device}")

# Configuration
CONFIG = dict(
    SAMPLE_RATE=16000,  # sampling rate audio is loaded at
    MAX_DURATION=4,  # multiplied by SAMPLE_RATE to get the fixed clip length
    MODEL_NAME="facebook/wav2vec2-base",  # pretrained checkpoint identifier
    BATCH_SIZE=8,
    BASE_DIR="/content/kaggle_emotion_8actors/kaggle_emotion_8actors",
)

# Initialize Processor
processor = Wav2Vec2Processor.from_pretrained(CONFIG["MODEL_NAME"])

Compute Device: cuda


## ***Data Loading & Preprocessing Strategy***


Emotion labels are mapped to integer classes for training.


In [None]:
# Define paths
AUDIO_DIR = os.path.join(CONFIG["BASE_DIR"], "audio")
TRAIN_CSV = os.path.join(CONFIG["BASE_DIR"], "train.csv")
TEST_CSV = os.path.join(CONFIG["BASE_DIR"], "test.csv")

# Load metadata
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)


# Standardize Labels
label_map = {
    "neutral": 0, "calm": 1, "happy": 2, "sad": 3,
    "angry": 4, "fearful": 5, "disgust": 6, "surprised": 7
}

# Text labels are converted to class ids; a numeric column is taken to be
# already encoded with the ids used in label_map.
if pd.api.types.is_numeric_dtype(train_df["label"]):
    encoded_labels = train_df["label"]
else:
    encoded_labels = train_df["label"].map(label_map)

# Fail fast on missing or unknown labels. Silently encoding them as -1 would
# only surface later as an out-of-range target inside the loss function.
invalid_rows = ~encoded_labels.isin(list(label_map.values()))
if invalid_rows.any():
    offending = sorted(train_df.loc[invalid_rows, "label"].astype(str).unique())
    raise ValueError(
        f"{TRAIN_CSV} contains {int(invalid_rows.sum())} row(s) with labels "
        f"outside label_map: {offending}"
    )
train_df["label"] = encoded_labels.astype(int)

# No re-balancing happens in this cell, so the message only reports the encoding step
print(f"Labels encoded. Training size: {len(train_df)}")

Class imbalance fixed. New training size: 384


# ***Custom Dataset with Augmentation***

In [None]:
class Wav2VecDataset(Dataset):
    """Dataset yielding fixed-length waveforms prepared for Wav2Vec2.

    Each item is read from ``audio_dir`` using the file name stored in the
    ``Id`` column, optionally augmented, padded or truncated to
    ``SAMPLE_RATE * MAX_DURATION`` samples, and passed through the
    module-level ``processor``.

    Args:
        df: Metadata frame with an ``Id`` column and, for labelled data, a
            ``label`` column holding integer class ids.
        audio_dir: Directory that contains the audio files.
        augment: When true, randomly apply noise injection and time shifting.
    """

    def __init__(self, df, audio_dir, augment=False):
        self.df = df.reset_index(drop=True)
        self.audio_dir = audio_dir
        self.augment = augment
        # Every clip is forced to this many samples
        self.target_len = CONFIG["SAMPLE_RATE"] * CONFIG["MAX_DURATION"]

    def __len__(self):
        """Return the number of rows in the metadata frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the processed waveform for row ``idx``.

        Returns:
            ``(input_values, label)`` when the frame has a ``label`` column,
            otherwise only ``input_values``.
        """
        row = self.df.iloc[idx]
        path = os.path.join(self.audio_dir, row['Id'])

        # Robust Loading: keep the best-effort fallback to one second of
        # silence, but catch only ordinary errors (not KeyboardInterrupt /
        # SystemExit) and report the file so bad clips are visible.
        # print() is used because this notebook globally silences warnings.
        try:
            y, _ = librosa.load(path, sr=CONFIG["SAMPLE_RATE"])
        except Exception as exc:
            print(f"[Wav2VecDataset] Could not load {path!r} ({exc!r}); using silence instead.")
            y = np.zeros(CONFIG["SAMPLE_RATE"])

        # Augmentation
        if self.augment:
            # 1. Gaussian Noise, scaled relative to the clip's peak value
            if np.random.rand() > 0.5:
                noise_amp = 0.005 * np.random.uniform() * np.amax(y)
                y = y + noise_amp * np.random.normal(size=y.shape[0])

            # 2. Time Shifting by up to +/-0.1 s; np.roll wraps samples around
            if np.random.rand() > 0.5:
                shift_len = int(np.random.uniform(low=-0.1, high=0.1) * CONFIG["SAMPLE_RATE"])
                y = np.roll(y, shift_len)

        # Pad or Truncate
        if len(y) > self.target_len:
            y = y[:self.target_len]
        else:
            y = np.pad(y, (0, self.target_len - len(y)), 'constant')

        # Feature Extraction
        inputs = processor(
            y, sampling_rate=CONFIG["SAMPLE_RATE"], return_tensors="pt", padding=True
        )
        # Drop the leading batch dimension added by return_tensors="pt"
        input_values = inputs.input_values.squeeze(0)

        if "label" in self.df.columns:
            return input_values, torch.tensor(row["label"], dtype=torch.long)
        return input_values

# Stratified Split
train_split, val_split = train_test_split(
    train_df, test_size=0.15, random_state=SEED, stratify=train_df['label']
)

# Addressing Class Imbalance
# Neutral has 26 samples, others 52
# Labels are integer class ids by this point, so the neutral rows must be
# selected by id; comparing against the string 'neutral' matches nothing.
neutral_id = label_map["neutral"]

# Duplicate neutral clips in the training fold only, and only after the split,
# so a duplicated clip can never appear in both training and validation data.
train_split = pd.concat(
    [train_split, train_split[train_split['label'] == neutral_id]], axis=0
).reset_index(drop=True)

# Apply the same duplication to the full frame
neutral_samples = train_df[train_df['label'] == neutral_id]
train_df = pd.concat([train_df, neutral_samples], axis=0).reset_index(drop=True)

# Initialize DataLoaders
train_loader = DataLoader(Wav2VecDataset(train_split, AUDIO_DIR, augment=True), batch_size=CONFIG["BATCH_SIZE"], shuffle=True)
val_loader = DataLoader(Wav2VecDataset(val_split, AUDIO_DIR, augment=False), batch_size=CONFIG["BATCH_SIZE"], shuffle=False)

**Custom Dataset and Audio Augmentation**

This dataset class handles:
- Audio loading and resampling
- Padding and truncation
- Data augmentation (noise injection, time shifting)


# ***Model Architecture & Transfer Learning***

In [None]:
print("Loading Pre-trained Wav2Vec 2.0...")
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    CONFIG["MODEL_NAME"],
    num_labels=8,
    use_safetensors=True
).to(device)

# Freeze CNN Feature Extractor
# freeze_feature_extractor() is the deprecated name of freeze_feature_encoder();
# prefer the current method and fall back only on older transformers releases.
if hasattr(model, "freeze_feature_encoder"):
    model.freeze_feature_encoder()
else:
    model.freeze_feature_extractor()

# Optimizer & Loss
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=1e-2)
# Halve the learning rate when the monitored value stops improving for 2 epochs
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, factor=0.5)

# Weighted Loss
# One weight per class id (0-7); classes 2 and 5 are weighted slightly higher
class_weights = torch.tensor([1.0, 1.0, 1.2, 1.0, 1.0, 1.2, 1.0, 1.0]).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

Loading Pre-trained Wav2Vec 2.0...


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# ***Training Loop***

In [None]:
EPOCHS = 30
best_acc = 0.0
print("Starting Validation Training Pipeline...")

for epoch in range(EPOCHS):
    # ---- optimisation pass over the training split ----
    model.train()
    train_correct = 0
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        logits = model(batch_x).logits
        criterion(logits, batch_y).backward()
        optimizer.step()

        # Count correct predictions from the logits computed before the update
        hits = logits.argmax(1) == batch_y
        train_correct += hits.sum().item()

    # ---- evaluation pass over the validation split (no gradients) ----
    model.eval()
    val_correct, val_loss = 0, 0
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            logits = model(batch_x).logits
            val_loss += criterion(logits, batch_y).item()
            val_correct += (logits.argmax(1) == batch_y).sum().item()

    # Accuracies are per-sample; the scheduler monitors the mean per-batch loss
    train_acc = train_correct / len(train_split)
    val_acc = val_correct / len(val_split)
    scheduler.step(val_loss / len(val_loader))

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    # Checkpoint only when validation accuracy strictly improves
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), "best_wav2vec.pth")

print(f"\nBest Validation Accuracy: {best_acc:.4f}")

Starting Validation Training Pipeline...
Epoch 1/30 | Train Acc: 0.1258 | Val Acc: 0.1379
Epoch 2/30 | Train Acc: 0.2331 | Val Acc: 0.2759
Epoch 3/30 | Train Acc: 0.2975 | Val Acc: 0.3103
Epoch 4/30 | Train Acc: 0.3926 | Val Acc: 0.3448
Epoch 5/30 | Train Acc: 0.4908 | Val Acc: 0.4655
Epoch 6/30 | Train Acc: 0.5184 | Val Acc: 0.4483
Epoch 7/30 | Train Acc: 0.6135 | Val Acc: 0.5345
Epoch 8/30 | Train Acc: 0.6810 | Val Acc: 0.5345
Epoch 9/30 | Train Acc: 0.7393 | Val Acc: 0.7069
Epoch 10/30 | Train Acc: 0.7883 | Val Acc: 0.7759
Epoch 11/30 | Train Acc: 0.8589 | Val Acc: 0.8276
Epoch 12/30 | Train Acc: 0.8681 | Val Acc: 0.7931
Epoch 13/30 | Train Acc: 0.9172 | Val Acc: 0.7586
Epoch 14/30 | Train Acc: 0.9387 | Val Acc: 0.8103
Epoch 15/30 | Train Acc: 0.9479 | Val Acc: 0.8793
Epoch 16/30 | Train Acc: 0.9632 | Val Acc: 0.8448
Epoch 17/30 | Train Acc: 0.9264 | Val Acc: 0.8621
Epoch 18/30 | Train Acc: 0.9724 | Val Acc: 0.8103
Epoch 19/30 | Train Acc: 0.9847 | Val Acc: 0.8276
Epoch 20/30 | Trai

**Train–Validation Split and Class Imbalance Handling**

A stratified split is used, and class imbalance is mitigated
using sample duplication and weighted loss.


# ***Evaluation & Inference***

*Before deploying the model, I have to verify its performance. I load the best weights and generate a Confusion Matrix to inspect class-specific errors.*

In [None]:
# Load Best Weights
model.load_state_dict(torch.load("best_wav2vec.pth", map_location=device))
model.eval()

# Collect validation predictions
y_true, y_pred = [], []
with torch.no_grad():
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        y_pred.extend(outputs.logits.argmax(1).cpu().numpy())
        y_true.extend(labels.cpu().numpy())

# Pass the class ids explicitly so names stay aligned with ids even when a
# class is absent from the validation fold or from the predictions.
class_ids = list(label_map.values())
class_names = list(label_map.keys())

print("\nClassification Report:")
print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))

# Generate Confusion Matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Validation Confusion Matrix")
plt.tight_layout()
plt.show()

# Generate Submission 1 (Validation Model)
test_ds = Wav2VecDataset(test_df, AUDIO_DIR, augment=False)
# shuffle=False keeps predictions in the same order as the rows of test_df
test_loader = DataLoader(test_ds, batch_size=CONFIG["BATCH_SIZE"], shuffle=False)

preds_list = []
with torch.no_grad():
    for inputs in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)
        preds_list.extend(outputs.logits.argmax(1).cpu().numpy())

# Map predicted class ids back to their emotion names
inv_label_map = {v: k for k, v in label_map.items()}
pred_labels = [inv_label_map[int(p)] for p in preds_list]

submission = pd.DataFrame({"Id": test_df["Id"], "label": pred_labels})
submission.to_csv("submission_wav2vec_val.csv", index=False)
print("submission_wav2vec_val.csv generated successfully!")


Classification Report:
              precision    recall  f1-score   support

     neutral       1.00      0.75      0.86         4
        calm       1.00      1.00      1.00         7
       happy       1.00      0.86      0.92         7
         sad       0.89      1.00      0.94         8
       angry       1.00      1.00      1.00         8
     fearful       1.00      1.00      1.00         8
     disgust       1.00      1.00      1.00         8
   surprised       0.89      1.00      0.94         8

    accuracy                           0.97        58
   macro avg       0.97      0.95      0.96        58
weighted avg       0.97      0.97      0.96        58

submission_wav2vec_val.csv generated successfully!


# **Production Training (Full Dataset)**

*To achieve maximum performance, I perform a Production Run. I combine the Training and Validation sets to utilize 100% of the available data. I initialize the model with the weights from the previous step (best_wav2vec.pth) and fine-tune it for a few epochs at a very low learning rate. This adapts the model to the full data distribution without destroying the learned features.*

In [None]:
print("\n Starting Production Phase")
full_train_ds = Wav2VecDataset(train_df, AUDIO_DIR, augment=True)
production_loader = DataLoader(full_train_ds, batch_size=CONFIG["BATCH_SIZE"], shuffle=True)

# Reload Model
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    CONFIG["MODEL_NAME"], num_labels=8, use_safetensors=True
).to(device)

if os.path.exists("best_wav2vec.pth"):
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine
    model.load_state_dict(torch.load("best_wav2vec.pth", map_location=device))
    print("Loaded best validation weights. Fine-tuning...")

# freeze_feature_extractor() is the deprecated name of freeze_feature_encoder();
# prefer the current method and fall back only on older transformers releases.
if hasattr(model, "freeze_feature_encoder"):
    model.freeze_feature_encoder()
else:
    model.freeze_feature_extractor()

# Lower Learning Rate for Fine-Tuning
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6, weight_decay=1e-2)
# Gradients from ACCUM_STEPS consecutive batches are summed before each update
ACCUM_STEPS = 4

print("Starting Production Training (10 Epochs)...")

num_batches = len(production_loader)
for epoch in range(10):
    model.train()
    running_loss = 0.0
    optimizer.zero_grad()

    for i, (inputs, labels) in enumerate(production_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        # Scale so the accumulated gradient is the mean over the window
        loss = criterion(outputs.logits, labels) / ACCUM_STEPS
        loss.backward()

        # Also step on the final batch: otherwise the gradients of a trailing
        # partial window are wiped by the next epoch's zero_grad() unused.
        is_last_batch = (i + 1) == num_batches
        if (i + 1) % ACCUM_STEPS == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Undo the scaling so the logged value is the plain per-batch loss
        running_loss += loss.item() * ACCUM_STEPS

    print(f"Epoch {epoch+1} | Prod Loss: {running_loss/len(production_loader):.4f}")

# Final Submission Generation
# Reuses test_loader and inv_label_map created in the evaluation cell
model.eval()
preds_list = []
with torch.no_grad():
    for inputs in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)
        preds_list.extend(outputs.logits.argmax(1).cpu().numpy())

final_labels = [inv_label_map[int(p)] for p in preds_list]
submission_final = pd.DataFrame({"Id": test_df["Id"], "label": final_labels})
submission_final.to_csv("submission_production_final.csv", index=False)
print("submission_production_final.csv generated!")


 Starting Production Phase


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded best validation weights. Fine-tuning...
Starting Production Training (10 Epochs)...
Epoch 1 | Prod Loss: 0.1683
Epoch 2 | Prod Loss: 0.1447
Epoch 3 | Prod Loss: 0.1300
Epoch 4 | Prod Loss: 0.1275
Epoch 5 | Prod Loss: 0.1192
Epoch 6 | Prod Loss: 0.1235
Epoch 7 | Prod Loss: 0.1086
Epoch 8 | Prod Loss: 0.1317
Epoch 9 | Prod Loss: 0.1010
Epoch 10 | Prod Loss: 0.0930
submission_production_final.csv generated!


# ***References***



[1] A. Baevski et al., “wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations,”
in Advances in Neural Information Processing Systems (NeurIPS), 2020.  
https://arxiv.org/abs/2006.11477

[2] Livingstone, S. R., & Russo, F. A., “The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS),”
PLOS ONE, 2018.  
https://doi.org/10.1371/journal.pone.0196391

[3] Kaggle Dataset: *Speech Emotion Recognition (RAVDESS)*.  
Available at: https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio

[4] Hugging Face Transformers Documentation.  
https://huggingface.co/docs/transformers/model_doc/wav2vec2
