# Step 1: Import Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from dataclasses import dataclass
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
# from transformers import WavLMForSequenceClassification, WavLMProcessor

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import pandas as pd
import librosa
import torchaudio
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, Wav2Vec2Processor, TrainingArguments, Trainer
from datasets import Dataset
import torch
from torch import nn
from torch.amp import autocast, GradScaler
from typing import Dict, Any, Union


  from .autonotebook import tqdm as notebook_tqdm


# Step 2: Data Loading and Preparation
## 2.1: Load the Data

In [2]:
def load_data_ravdess(data_dir):
    emotion_labels = {
        '01': 'neutral',
        '02': 'calm',
        '03': 'happy',
        '04': 'sad',
        '05': 'angry',
        '06': 'fearful',
        '07': 'disgust',
        '08': 'surprised'
    }
    file_list = []
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if file.endswith('.wav'):
                parts = file.split('-')
                emotion = emotion_labels.get(parts[2])
                file_list.append({'file_path': os.path.join(root, file), 'emotion': emotion})
    return pd.DataFrame(file_list)

def load_data_cremad(data_dir):
    emotion_labels = {
        'ANG': 'angry',
        'DIS': 'disgust',
        'FEA': 'fearful',
        'HAP': 'happy',
        'NEU': 'neutral',
        'SAD': 'sad'
    }
    file_list = []
    for file in os.listdir(data_dir):
        if file.endswith('.wav'):
            parts = file.split('_')
            emotion = emotion_labels.get(parts[2])
            if emotion:
                file_list.append({'file_path': os.path.join(data_dir, file), 'emotion': emotion})
    return pd.DataFrame(file_list)

def load_data_tess(data_dir):
    emotion_map = {
        'angry': 'angry',
        'disgust': 'disgust',
        'fear': 'fearful',
        'happy': 'happy',
        'ps': 'surprised',  # Pleasant surprise
        'sad': 'sad',
        'neutral': 'neutral'
    }
    file_list = []
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if file.endswith('.wav'):
                emotion = file.split('_')[2].split('.')[0]
                emotion_label = emotion_map.get(emotion)
                if emotion_label:
                    file_list.append({'file_path': os.path.join(root, file), 'emotion': emotion_label})
    return pd.DataFrame(file_list)

def load_data_savee(data_dir):
    emotion_map = {
        'a': 'angry',
        'd': 'disgust',
        'f': 'fearful',
        'h': 'happy',
        'n': 'neutral',
        'sa': 'sad',
        'su': 'surprised'
    }
    file_list = []
    for file in os.listdir(data_dir):
        if file.endswith('.wav'):
            emotion_code = file.split('_')[1][:2]
            emotion_label = emotion_map.get(emotion_code)
            if emotion_label:
                file_list.append({'file_path': os.path.join(data_dir, file), 'emotion': emotion_label})
    return pd.DataFrame(file_list)



In [3]:
ravdess_df = load_data_ravdess('data/RAVDESS')
cremad_df = load_data_cremad('data/CREMA-D')
tess_df = load_data_tess('data/TESS')
savee_df = load_data_savee('data/SAVEE')

# Combine datasets
data_df = pd.concat([ravdess_df, cremad_df, tess_df, savee_df], ignore_index=True)



## 2.2: Encode the labels

In [4]:
# Standardize emotion labels
emotion_list = ['angry', 'disgust', 'fearful', 'happy', 'neutral', 'sad', 'surprised', 'calm']

# Handle any missing emotions in datasets
data_df = data_df[data_df['emotion'].isin(emotion_list)]

# Encode labels
label_encoder = LabelEncoder()
label_encoder.fit(emotion_list)
data_df['label'] = label_encoder.transform(data_df['emotion'])
num_classes = len(label_encoder.classes_)


In [5]:
# save the data_df
data_df.to_csv('data_df.csv', index=False)

In [20]:
data_df.head()

Unnamed: 0,file_path,emotion,label
0,data/RAVDESS/Actor_05/03-01-08-02-01-01-05.wav,surprised,7
1,data/RAVDESS/Actor_05/03-01-06-02-01-01-05.wav,fearful,3
2,data/RAVDESS/Actor_05/03-01-06-02-01-02-05.wav,fearful,3
3,data/RAVDESS/Actor_05/03-01-02-01-02-02-05.wav,calm,1
4,data/RAVDESS/Actor_05/03-01-05-01-01-02-05.wav,angry,0


In [25]:
# group by emotion
emotion_counts = data_df['emotion'].value_counts().sort_index()
emotion_counts

emotion
angry        1863
calm          192
disgust      1863
fearful      1863
happy        1863
neutral      1583
sad          1923
surprised     652
Name: count, dtype: int64

## 2.3: Data Splitting

In [6]:
train_df, temp_df = train_test_split(data_df, test_size=0.3, stratify=data_df['label'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

print(f'Training samples: {len(train_df)}') 
print(f'Validation samples: {len(val_df)}')
print(f'Test samples: {len(test_df)}')


Training samples: 8261
Validation samples: 1770
Test samples: 1771


In [None]:
train_df.head()

Unnamed: 0,file_path,emotion,label
4834,data/CREMA-D/1051_MTI_ANG_XX.wav,angry,0
5876,data/CREMA-D/1036_IOM_DIS_XX.wav,disgust,2
4623,data/CREMA-D/1069_ITH_ANG_XX.wav,angry,0
7472,data/CREMA-D/1039_IOM_FEA_XX.wav,fearful,3
452,data/RAVDESS/Actor_07/03-01-02-01-01-02-07.wav,calm,1


# Step 3: Baseline Models
## 3.1: Feature Extraction for Baseline Models
### 3.1.1: Data augmentation

In [None]:
def load_audio(file_path, target_sr=16000):
    y, sr = librosa.load(file_path, sr=None)  # Load with original sampling rate
    if sr != target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        sr = target_sr
    return y, sr

def normalize_audio(y):
    rms = np.sqrt(np.mean(y**2))
    if rms > 0:
        y_normalized = y / rms
    else:
        y_normalized = y
    return y_normalized

def augment_audio(y, sr):
    augmented_data = []
    
    # Original
    augmented_data.append(y)
    
    # Add noise
    noise = np.random.randn(len(y))
    y_noise = y + 0.005 * noise
    augmented_data.append(y_noise)
    
    # Time stretching
    y_stretch = librosa.effects.time_stretch(y, rate=0.9)
    augmented_data.append(y_stretch)
    y_stretch = librosa.effects.time_stretch(y, rate=1.1)
    augmented_data.append(y_stretch)
    
    # Pitch shifting
    y_shift = librosa.effects.pitch_shift(y, sr=sr, n_steps=2)
    augmented_data.append(y_shift)
    y_shift = librosa.effects.pitch_shift(y, sr=sr, n_steps=-2)
    augmented_data.append(y_shift)
    
    # Reverberation (simple simulation)
    y_reverb = librosa.effects.preemphasis(y)
    augmented_data.append(y_reverb)
    
    return augmented_data


### 3.1.2: Feature extraction

In [None]:

def extract_features(file_path, n_mfcc=40, target_sr=16000, augment=False):
    y, sr = load_audio(file_path, target_sr=target_sr)
    y = normalize_audio(y)
    
    if augment:
        augmented_audios = augment_audio(y, sr)
    else:
        augmented_audios = [y]
    
    features = []
    for augmented_y in augmented_audios:
        # Ensure consistent length by trimming or padding
        max_length = target_sr * 3  # 3 seconds
        # Trim or pad audio
        if len(augmented_y) > max_length:
            augmented_y = augmented_y[:max_length]
        else:
            augmented_y = np.pad(augmented_y, (0, max_length - len(augmented_y)), mode='constant')
        
        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=augmented_y, sr=sr, n_mfcc=n_mfcc)
        mfccs = np.mean(mfccs.T, axis=0)
        
        # Extract additional features if needed
        chroma = librosa.feature.chroma_stft(y=augmented_y, sr=sr)
        chroma = np.mean(chroma.T, axis=0)
        spectral_contrast = librosa.feature.spectral_contrast(y=augmented_y, sr=sr)
        spectral_contrast = np.mean(spectral_contrast.T, axis=0)
        
        # Concatenate all features
        feature_vector = np.concatenate([mfccs, chroma, spectral_contrast])
        features.append(feature_vector)
    
    return features


## 3.1.3: Extract Features for All Samples

In [None]:
def extract_features_for_df(df):
    features = []
    for idx, row in tqdm(df.iterrows(), total=df.shape[0], desc='Extracting features'):
        feature = extract_features(row['file_path'], augment=False)
        features.append(feature)
    return np.array(features)

In [None]:

# Extract features
X_train = extract_features_for_df(train_df)
X_val = extract_features_for_df(val_df)
X_test = extract_features_for_df(test_df)

# Get labels
y_train = train_df['label'].values
y_val = val_df['label'].values
y_test = test_df['label'].values


In [None]:
# reshape data
X_train = np.array(X_train).reshape(len(X_train), -1)
X_val = np.array(X_val).reshape(len(X_val), -1)
X_test = np.array(X_test).reshape(len(X_test), -1)

In [None]:
# Print the shape of the feature arrays
print(f'X_train shape: {X_train.shape}')
print(f'X_val shape: {X_val.shape}')
print(f'X_test shape: {X_test.shape}')

# Print a sample of the feature arrays
print('Sample features from X_train:')
print(X_train[0])

print('Sample features from X_val:')
print(X_val[0])

print('Sample features from X_test:')
print(X_test[0])

### 3.1.4: Feature Scaling

In [None]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform it
X_train = scaler.fit_transform(X_train)

# Transform the validation data using the fitted scaler
X_val = scaler.transform(X_val)

# Transform the test data using the fitted scaler
X_test = scaler.transform(X_test)


## 3.2: Train and Evaluate Baseline Models

### 3.2.1: Logistic Regression

In [None]:
# Initialize model
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# Train model
lr_model.fit(X_train, y_train)

# Predict on validation set
y_val_pred_lr = lr_model.predict(X_val)

# Evaluate
val_accuracy_lr = accuracy_score(y_val, y_val_pred_lr)
val_f1_lr = f1_score(y_val, y_val_pred_lr, average='weighted')

print(f'Logistic Regression - Validation Accuracy: {val_accuracy_lr:.4f}, F1 Score: {val_f1_lr:.4f}')


### 3.4.2: Random Forest

In [None]:
# Initialize model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train model
rf_model.fit(X_train, y_train)

# Predict on validation set
y_val_pred_rf = rf_model.predict(X_val)

# Evaluate
val_accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
val_f1_rf = f1_score(y_val, y_val_pred_rf, average='weighted')

print(f'Random Forest - Validation Accuracy: {val_accuracy_rf:.4f}, F1 Score: {val_f1_rf:.4f}')


In [None]:
import random

import IPython.display as ipd

# Select random samples
num_samples = 5
random_indices = random.sample(range(len(test_df)), num_samples)
sample_df = test_df.iloc[random_indices]

# Extract features for the selected samples
sample_features = np.array([extract_features(row['file_path'])[0] for _, row in sample_df.iterrows()])
sample_features = scaler.transform(sample_features)

# Get true labels and predictions
sample_labels = sample_df['label'].values
sample_predictions_lr = lr_model.predict(sample_features)
sample_predictions_rf = rf_model.predict(sample_features)

# Display the samples
for i, (index, row) in enumerate(sample_df.iterrows()):
    print(f"Sample {i+1}:")
    print(f"Path: {row['file_path']}")
    print(f"True Label: {label_encoder.inverse_transform([sample_labels[i]])[0]}")
    print(f"Logistic Regression Prediction: {label_encoder.inverse_transform([sample_predictions_lr[i]])[0]}")
    print(f"Random Forest Prediction: {label_encoder.inverse_transform([sample_predictions_rf[i]])[0]}")
    ipd.display(ipd.Audio(row['file_path']))
    print("\n")

In [None]:
# Logistic Regression
y_test_pred_lr = lr_model.predict(X_test)
test_accuracy_lr = accuracy_score(y_test, y_test_pred_lr)
test_f1_lr = f1_score(y_test, y_test_pred_lr, average='weighted')
print(f'Logistic Regression - Test Accuracy: {test_accuracy_lr:.4f}, F1 Score: {test_f1_lr:.4f}')

# Random Forest
y_test_pred_rf = rf_model.predict(X_test)
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)
test_f1_rf = f1_score(y_test, y_test_pred_rf, average='weighted')
print(f'Random Forest - Test Accuracy: {test_accuracy_rf:.4f}, F1 Score: {test_f1_rf:.4f}')


# Step 4: Advanced Models

## 4.1: Prepare Data

In [8]:
from ser_wav2vec import SpeechEmotionRecognition

In [9]:
df_train_val = pd.concat([train_df, val_df], ignore_index=True)
# save the df_train_val
df_train_val.to_csv('df_train_val.csv', index=False)

In [12]:
args = {
    'batch_size': 32,
    'lr': 1e-4,
    'epochs': 20,
    'gradient_accumulation_steps': 4,
    'checkpoint_dir': 'checkpoints',
    'checkpoint': None
}

In [13]:
ser = SpeechEmotionRecognition(
    df=df_train_val,
    model_name='facebook/wav2vec2-base',
    batch_size=args['batch_size'],
    lr=args['lr'],
    num_epochs=args['epochs'],
    checkpoint_dir=args['checkpoint_dir'],
    gradient_accumulation_steps=args['gradient_accumulation_steps'],
    checkpoint_path=args['checkpoint']
)


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
ser.train()

Epoch 1/20
----------


Training: 100%|██████████| 251/251 [01:57<00:00,  2.13it/s]


Training Loss: 1.5015
Checkpoint saved at checkpoints/model_epoch_1.pt


Validation: 100%|██████████| 63/63 [00:07<00:00,  8.03it/s]


Validation Loss: 1.0209, Validation Accuracy: 0.6567
Epoch 2/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.10it/s]


Training Loss: 0.8850
Checkpoint saved at checkpoints/model_epoch_2.pt


Validation: 100%|██████████| 63/63 [00:07<00:00,  8.01it/s]


Validation Loss: 0.8070, Validation Accuracy: 0.7205
Epoch 3/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.09it/s]


Training Loss: 0.6303
Checkpoint saved at checkpoints/model_epoch_3.pt


Validation: 100%|██████████| 63/63 [00:07<00:00,  7.99it/s]


Validation Loss: 0.6323, Validation Accuracy: 0.7867
Epoch 4/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.10it/s]


Training Loss: 0.4623
Checkpoint saved at checkpoints/model_epoch_4.pt


Validation: 100%|██████████| 63/63 [00:07<00:00,  8.02it/s]


Validation Loss: 0.6393, Validation Accuracy: 0.7912
Epoch 5/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.11it/s]


Training Loss: 0.3920
Checkpoint saved at checkpoints/model_epoch_5.pt


Validation: 100%|██████████| 63/63 [00:07<00:00,  8.00it/s]


Validation Loss: 0.7181, Validation Accuracy: 0.7723
Epoch 6/20
----------


Training: 100%|██████████| 251/251 [01:58<00:00,  2.11it/s]


Training Loss: 0.3157
Checkpoint saved at checkpoints/model_epoch_6.pt


Validation: 100%|██████████| 63/63 [00:07<00:00,  8.02it/s]


Validation Loss: 0.6136, Validation Accuracy: 0.8216
Epoch 7/20
----------


Training: 100%|██████████| 251/251 [01:58<00:00,  2.11it/s]


Training Loss: 0.2933
Checkpoint saved at checkpoints/model_epoch_7.pt


Validation: 100%|██████████| 63/63 [00:07<00:00,  8.01it/s]


Validation Loss: 0.6619, Validation Accuracy: 0.8067
Epoch 8/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.11it/s]


Training Loss: 0.2063
Checkpoint saved at checkpoints/model_epoch_8.pt


Validation: 100%|██████████| 63/63 [00:07<00:00,  8.00it/s]


Validation Loss: 0.6066, Validation Accuracy: 0.8186
Epoch 9/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.10it/s]


Training Loss: 0.1536
Checkpoint saved at checkpoints/model_epoch_9.pt


Validation: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]


Validation Loss: 0.6812, Validation Accuracy: 0.8142
Epoch 10/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.11it/s]


Training Loss: 0.1474
Checkpoint saved at checkpoints/model_epoch_10.pt


Validation: 100%|██████████| 63/63 [00:07<00:00,  8.02it/s]


Validation Loss: 0.8501, Validation Accuracy: 0.7877
Epoch 11/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.10it/s]


Training Loss: 0.1191
Checkpoint saved at checkpoints/model_epoch_11.pt


Validation: 100%|██████████| 63/63 [00:08<00:00,  7.78it/s]


Validation Loss: 0.6991, Validation Accuracy: 0.8226
Epoch 12/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.10it/s]


Training Loss: 0.1260
Checkpoint saved at checkpoints/model_epoch_12.pt


Validation: 100%|██████████| 63/63 [00:08<00:00,  7.78it/s]


Validation Loss: 0.6654, Validation Accuracy: 0.8201
Epoch 13/20
----------


Training: 100%|██████████| 251/251 [02:00<00:00,  2.09it/s]


Training Loss: 0.1095
Checkpoint saved at checkpoints/model_epoch_13.pt


Validation: 100%|██████████| 63/63 [00:08<00:00,  7.78it/s]


Validation Loss: 0.8650, Validation Accuracy: 0.7828
Epoch 14/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.11it/s]


Training Loss: 0.0977
Checkpoint saved at checkpoints/model_epoch_14.pt


Validation: 100%|██████████| 63/63 [00:08<00:00,  7.80it/s]


Validation Loss: 0.7575, Validation Accuracy: 0.8211
Epoch 15/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.10it/s]


Training Loss: 0.0833
Checkpoint saved at checkpoints/model_epoch_15.pt


Validation: 100%|██████████| 63/63 [00:08<00:00,  7.70it/s]


Validation Loss: 0.7452, Validation Accuracy: 0.8236
Epoch 16/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.09it/s]


Training Loss: 0.0959
Checkpoint saved at checkpoints/model_epoch_16.pt


Validation: 100%|██████████| 63/63 [00:08<00:00,  7.83it/s]


Validation Loss: 0.7283, Validation Accuracy: 0.8196
Epoch 17/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.10it/s]


Training Loss: 0.1068
Checkpoint saved at checkpoints/model_epoch_17.pt


Validation: 100%|██████████| 63/63 [00:08<00:00,  7.79it/s]


Validation Loss: 0.7090, Validation Accuracy: 0.8191
Epoch 18/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.10it/s]


Training Loss: 0.0798
Checkpoint saved at checkpoints/model_epoch_18.pt


Validation: 100%|██████████| 63/63 [00:08<00:00,  7.82it/s]


Validation Loss: 0.8337, Validation Accuracy: 0.8201
Epoch 19/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.10it/s]


Training Loss: 0.0685
Checkpoint saved at checkpoints/model_epoch_19.pt


Validation: 100%|██████████| 63/63 [00:08<00:00,  7.84it/s]


Validation Loss: 0.7212, Validation Accuracy: 0.8336
Epoch 20/20
----------


Training: 100%|██████████| 251/251 [01:59<00:00,  2.10it/s]


Training Loss: 0.0532
Checkpoint saved at checkpoints/model_epoch_20.pt


Validation: 100%|██████████| 63/63 [00:08<00:00,  7.79it/s]

Validation Loss: 0.7962, Validation Accuracy: 0.8226





In [15]:
ser_best = SpeechEmotionRecognition(
    df=df_train_val,
    model_name='facebook/wav2vec2-base',
    batch_size=args['batch_size'],
    lr=args['lr'],
    num_epochs=args['epochs'],
    checkpoint_dir=args['checkpoint_dir'],
    gradient_accumulation_steps=args['gradient_accumulation_steps'],
    checkpoint_path='checkpoints/model_epoch_19.pt'
)



Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model from checkpoint: checkpoints/model_epoch_19.pt


In [16]:
ser_best.evaluate_test_set(test_df)

Testing: 100%|██████████| 56/56 [00:06<00:00,  8.15it/s]

Test Accuracy: 0.8154, Test F1 Score: 0.8146



