# Step 1: Import Libraries

In [2]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from dataclasses import dataclass
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
# from transformers import WavLMForSequenceClassification, WavLMProcessor

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')


# Step 2: Data Loading and Preparation
## 2.1: Load the Data

In [3]:
def load_data_ravdess(data_dir):
    emotion_labels = {
        '01': 'neutral',
        '02': 'calm',
        '03': 'happy',
        '04': 'sad',
        '05': 'angry',
        '06': 'fearful',
        '07': 'disgust',
        '08': 'surprised'
    }
    file_list = []
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if file.endswith('.wav'):
                parts = file.split('-')
                emotion = emotion_labels.get(parts[2])
                file_list.append({'path': os.path.join(root, file), 'emotion': emotion})
    return pd.DataFrame(file_list)

def load_data_cremad(data_dir):
    emotion_labels = {
        'ANG': 'angry',
        'DIS': 'disgust',
        'FEA': 'fearful',
        'HAP': 'happy',
        'NEU': 'neutral',
        'SAD': 'sad'
    }
    file_list = []
    for file in os.listdir(data_dir):
        if file.endswith('.wav'):
            parts = file.split('_')
            emotion = emotion_labels.get(parts[2])
            if emotion:
                file_list.append({'path': os.path.join(data_dir, file), 'emotion': emotion})
    return pd.DataFrame(file_list)

def load_data_tess(data_dir):
    emotion_map = {
        'angry': 'angry',
        'disgust': 'disgust',
        'fear': 'fearful',
        'happy': 'happy',
        'ps': 'surprised',  # Pleasant surprise
        'sad': 'sad',
        'neutral': 'neutral'
    }
    file_list = []
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if file.endswith('.wav'):
                emotion = file.split('_')[2].split('.')[0]
                emotion_label = emotion_map.get(emotion)
                if emotion_label:
                    file_list.append({'path': os.path.join(root, file), 'emotion': emotion_label})
    return pd.DataFrame(file_list)

def load_data_savee(data_dir):
    emotion_map = {
        'a': 'angry',
        'd': 'disgust',
        'f': 'fearful',
        'h': 'happy',
        'n': 'neutral',
        'sa': 'sad',
        'su': 'surprised'
    }
    file_list = []
    for file in os.listdir(data_dir):
        if file.endswith('.wav'):
            emotion_code = file.split('_')[1][:2]
            emotion_label = emotion_map.get(emotion_code)
            if emotion_label:
                file_list.append({'path': os.path.join(data_dir, file), 'emotion': emotion_label})
    return pd.DataFrame(file_list)



In [4]:
ravdess_df = load_data_ravdess('data/RAVDESS')
cremad_df = load_data_cremad('data/CREMA-D')
tess_df = load_data_tess('data/TESS')
savee_df = load_data_savee('data/SAVEE')

# Combine datasets
data_df = pd.concat([ravdess_df, cremad_df, tess_df, savee_df], ignore_index=True)



## 2.2: Encode the labels

In [5]:
# Standardize emotion labels
emotion_list = ['angry', 'disgust', 'fearful', 'happy', 'neutral', 'sad', 'surprised', 'calm']

# Handle any missing emotions in datasets
data_df = data_df[data_df['emotion'].isin(emotion_list)]

# Encode labels
label_encoder = LabelEncoder()
label_encoder.fit(emotion_list)
data_df['label'] = label_encoder.transform(data_df['emotion'])
num_classes = len(label_encoder.classes_)


## 2.3: Data Splitting

In [6]:
train_df, temp_df = train_test_split(data_df, test_size=0.3, stratify=data_df['label'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

print(f'Training samples: {len(train_df)}')
print(f'Validation samples: {len(val_df)}')
print(f'Test samples: {len(test_df)}')


Training samples: 8261
Validation samples: 1770
Test samples: 1771


# Step 3: Baseline Models
## 3.1: Feature Extraction for Baseline Models
### 3.1.1: Data augmentation

In [None]:
def load_audio(file_path, target_sr=16000):
    y, sr = librosa.load(file_path, sr=None)  # Load with original sampling rate
    if sr != target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        sr = target_sr
    return y, sr

def normalize_audio(y):
    rms = np.sqrt(np.mean(y**2))
    if rms > 0:
        y_normalized = y / rms
    else:
        y_normalized = y
    return y_normalized

def augment_audio(y, sr):
    augmented_data = []
    
    # Original
    augmented_data.append(y)
    
    # Add noise
    noise = np.random.randn(len(y))
    y_noise = y + 0.005 * noise
    augmented_data.append(y_noise)
    
    # Time stretching
    y_stretch = librosa.effects.time_stretch(y, rate=0.9)
    augmented_data.append(y_stretch)
    y_stretch = librosa.effects.time_stretch(y, rate=1.1)
    augmented_data.append(y_stretch)
    
    # Pitch shifting
    y_shift = librosa.effects.pitch_shift(y, sr=sr, n_steps=2)
    augmented_data.append(y_shift)
    y_shift = librosa.effects.pitch_shift(y, sr=sr, n_steps=-2)
    augmented_data.append(y_shift)
    
    # Reverberation (simple simulation)
    y_reverb = librosa.effects.preemphasis(y)
    augmented_data.append(y_reverb)
    
    return augmented_data


### 3.1.2: Feature extraction

In [None]:

def extract_features(file_path, n_mfcc=40, target_sr=16000, augment=False):
    y, sr = load_audio(file_path, target_sr=target_sr)
    y = normalize_audio(y)
    
    if augment:
        augmented_audios = augment_audio(y, sr)
    else:
        augmented_audios = [y]
    
    features = []
    for augmented_y in augmented_audios:
        # Ensure consistent length by trimming or padding
        max_length = target_sr * 3  # 3 seconds
        # Trim or pad audio
        if len(augmented_y) > max_length:
            augmented_y = augmented_y[:max_length]
        else:
            augmented_y = np.pad(augmented_y, (0, max_length - len(augmented_y)), mode='constant')
        
        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=augmented_y, sr=sr, n_mfcc=n_mfcc)
        mfccs = np.mean(mfccs.T, axis=0)
        
        # Extract additional features if needed
        chroma = librosa.feature.chroma_stft(y=augmented_y, sr=sr)
        chroma = np.mean(chroma.T, axis=0)
        spectral_contrast = librosa.feature.spectral_contrast(y=augmented_y, sr=sr)
        spectral_contrast = np.mean(spectral_contrast.T, axis=0)
        
        # Concatenate all features
        feature_vector = np.concatenate([mfccs, chroma, spectral_contrast])
        features.append(feature_vector)
    
    return features


## 3.1.3: Extract Features for All Samples

In [None]:
def extract_features_for_df(df):
    features = []
    for idx, row in tqdm(df.iterrows(), total=df.shape[0], desc='Extracting features'):
        feature = extract_features(row['path'], augment=True)
        features.append(feature)
    return np.array(features)

In [None]:

# Extract features
X_train = extract_features_for_df(train_df)
X_val = extract_features_for_df(val_df)
X_test = extract_features_for_df(test_df)

# Get labels
y_train = train_df['label'].values
y_val = val_df['label'].values
y_test = test_df['label'].values


In [None]:
# reshape data
X_train = np.array(X_train).reshape(len(X_train), -1)
X_val = np.array(X_val).reshape(len(X_val), -1)
X_test = np.array(X_test).reshape(len(X_test), -1)

In [None]:
# Print the shape of the feature arrays
print(f'X_train shape: {X_train.shape}')
print(f'X_val shape: {X_val.shape}')
print(f'X_test shape: {X_test.shape}')

# Print a sample of the feature arrays
print('Sample features from X_train:')
print(X_train[0])

print('Sample features from X_val:')
print(X_val[0])

print('Sample features from X_test:')
print(X_test[0])

### 3.1.4: Feature Scaling

In [None]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform it
X_train = scaler.fit_transform(X_train)

# Transform the validation data using the fitted scaler
X_val = scaler.transform(X_val)

# Transform the test data using the fitted scaler
X_test = scaler.transform(X_test)


## 3.2: Train and Evaluate Baseline Models

### 3.2.1: Logistic Regression

In [None]:
# Initialize model
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# Train model
lr_model.fit(X_train, y_train)

# Predict on validation set
y_val_pred_lr = lr_model.predict(X_val)

# Evaluate
val_accuracy_lr = accuracy_score(y_val, y_val_pred_lr)
val_f1_lr = f1_score(y_val, y_val_pred_lr, average='weighted')

print(f'Logistic Regression - Validation Accuracy: {val_accuracy_lr:.4f}, F1 Score: {val_f1_lr:.4f}')


### 3.4.2: Random Forest

In [None]:
# Initialize model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train model
rf_model.fit(X_train, y_train)

# Predict on validation set
y_val_pred_rf = rf_model.predict(X_val)

# Evaluate
val_accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
val_f1_rf = f1_score(y_val, y_val_pred_rf, average='weighted')

print(f'Random Forest - Validation Accuracy: {val_accuracy_rf:.4f}, F1 Score: {val_f1_rf:.4f}')


In [None]:
import random

import IPython.display as ipd

# Select random samples
num_samples = 5
random_indices = random.sample(range(len(test_df)), num_samples)
sample_df = test_df.iloc[random_indices]

# Extract features for the selected samples
sample_features = np.array([extract_features(row['path'])[0] for _, row in sample_df.iterrows()])
sample_features = scaler.transform(sample_features)

# Get true labels and predictions
sample_labels = sample_df['label'].values
sample_predictions_lr = lr_model.predict(sample_features)
sample_predictions_rf = rf_model.predict(sample_features)

# Display the samples
for i, (index, row) in enumerate(sample_df.iterrows()):
    print(f"Sample {i+1}:")
    print(f"Path: {row['path']}")
    print(f"True Label: {label_encoder.inverse_transform([sample_labels[i]])[0]}")
    print(f"Logistic Regression Prediction: {label_encoder.inverse_transform([sample_predictions_lr[i]])[0]}")
    print(f"Random Forest Prediction: {label_encoder.inverse_transform([sample_predictions_rf[i]])[0]}")
    ipd.display(ipd.Audio(row['path']))
    print("\n")

In [None]:
# Logistic Regression
y_test_pred_lr = lr_model.predict(X_test)
test_accuracy_lr = accuracy_score(y_test, y_test_pred_lr)
test_f1_lr = f1_score(y_test, y_test_pred_lr, average='weighted')
print(f'Logistic Regression - Test Accuracy: {test_accuracy_lr:.4f}, F1 Score: {test_f1_lr:.4f}')

# Random Forest
y_test_pred_rf = rf_model.predict(X_test)
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)
test_f1_rf = f1_score(y_test, y_test_pred_rf, average='weighted')
print(f'Random Forest - Test Accuracy: {test_accuracy_rf:.4f}, F1 Score: {test_f1_rf:.4f}')


# Step 4: Advanced Models

## 4.1: Prepare Data

In [7]:
import torch
from transformers import AutoConfig, Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

# Set model parameters
model_name_or_path = "facebook/wav2vec2-base"
label_list = label_encoder.classes_
num_labels = len(label_list)

# Load configuration
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id={label: i for i, label in enumerate(label_list)},
    id2label={i: label for i, label in enumerate(label_list)},
    finetuning_task="wav2vec2_clf",
)
config.pooling_mode = "mean"  # Set pooling mode to "mean"

# Load processor
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")


The target sampling rate: 16000


## 4.2: Initialize the Precessor and Model

In [8]:
from datasets import Dataset
from tqdm import tqdm
import numpy as np

# Define a function to load and resample audio
def speech_file_to_array_fn(path):
    speech_array, sampling_rate = librosa.load(path)
    speech = librosa.resample(speech_array, orig_sr=sampling_rate, target_sr=target_sampling_rate)
    return speech

# Define a function to map labels
def label_to_id(label, label_list):
    return label_list.index(label) if label in label_list else -1

# Define the preprocessing function
def preprocess_function(examples):
    speech_list = [speech_file_to_array_fn(path) for path in examples['path']]
    target_list = [label_to_id(label, label_list.tolist()) for label in examples['emotion']]
    
    result = processor(speech_list, sampling_rate=target_sampling_rate, return_tensors="pt", padding=True)
    result["labels"] = torch.tensor(target_list)

    return result

# Convert pandas dataframe to Hugging Face dataset
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Apply preprocessing with tqdm progress bar
train_dataset = train_dataset.map(preprocess_function, batch_size=100, batched=True, num_proc=4, desc='Processing train dataset')
eval_dataset = eval_dataset.map(preprocess_function, batch_size=100, batched=True, num_proc=4, desc='Processing eval dataset')
test_dataset = test_dataset.map(preprocess_function, batch_size=100, batched=True, num_proc=4, desc='Processing test dataset')


Processing train dataset (num_proc=4): 100%|██████████| 8261/8261 [00:54<00:00, 152.05 examples/s] 
Processing eval dataset (num_proc=4): 100%|██████████| 1770/1770 [00:21<00:00, 81.66 examples/s]  
Processing test dataset (num_proc=4): 100%|██████████| 1771/1771 [00:20<00:00, 84.44 examples/s]  


In [9]:
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel, Wav2Vec2Model
import torch.nn as nn

# Define classification head
class Wav2Vec2ClassificationHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features):
        x = self.dropout(features)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

# Define the Wav2Vec2 model for speech classification
class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(self, hidden_states, mode="mean"):
        if mode == "mean":
            return torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            return torch.sum(hidden_states, dim=1)
        elif mode == "max":
            return torch.max(hidden_states, dim=1)[0]
        else:
            raise ValueError("Invalid pooling mode. Choose from ['mean', 'sum', 'max']")

    def forward(self, input_values, attention_mask=None, labels=None):
        outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        pooled_output = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return (loss, logits) if loss is not None else logits


In [16]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Load the model
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path, config=config)
model.freeze_feature_extractor()

# Define training arguments
training_args = TrainingArguments(
    output_dir="./wav2vec2_clf_output",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    num_train_epochs=3,
    save_steps=10,
    eval_steps=10,
    logging_steps=10,
    learning_rate=1e-4,
    save_total_limit=2,
    gradient_accumulation_steps=2,
)

# Data collator to dynamically pad inputs
data_collator = DataCollatorWithPadding(
    tokenizer=processor.feature_extractor,  # Replace `processor` with `processor.feature_extractor`
    padding=True  # Default behavior is to pad to the longest sequence in the batch
)


Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:

# Define compute metrics function
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"accuracy": accuracy, "f1": f1}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


ValueError: fp16 mixed precision requires a GPU (not 'mps').

In [None]:
# Evaluate on the test set
predictions = trainer.predict(test_dataset)
test_metrics = predictions.metrics
print(f"Test Accuracy: {test_metrics['test_accuracy']:.4f}, F1 Score: {test_metrics['test_f1']:.4f}")

# To analyze specific predictions
from sklearn.metrics import classification_report

y_true = test_dataset["labels"]
y_pred = np.argmax(predictions.predictions, axis=-1)

# Print classification report
print(classification_report(y_true, y_pred, target_names=label_list))


## 4.3: Create DataLoaders

In [None]:
# Create Datasets
train_dataset = EmotionDataset(train_df, processor, augment=False)
val_dataset = EmotionDataset(val_df, processor, augment=False)
test_dataset = EmotionDataset(test_df, processor, augment=False)

# Create DataLoaders
def collate_fn(batch):
    input_values = torch.stack([item['input_values'].squeeze() for item in batch])
    attention_mask = torch.stack([item['attention_mask'].squeeze() for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    return {'input_values': input_values, 'attention_mask': attention_mask, 'labels': labels}


train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

for batch in train_loader:
    input_values = batch['input_values']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    print(input_values.shape, attention_mask.shape, labels.shape)
    break

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()
num_epochs = 5


In [None]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_values = batch['input_values'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_values=input_values, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}')

    # Validation
    model.eval()
    val_labels = []
    val_preds = []
    with torch.no_grad():
        for batch in val_loader:
            input_values = batch['input_values'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_values=input_values, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            val_labels.extend(labels.cpu().numpy())
            val_preds.extend(preds.cpu().numpy())

    val_accuracy = accuracy_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds, average='weighted')
    print(f'Validation Accuracy: {val_accuracy:.4f}, F1 Score: {val_f1:.4f}')
