# RAVDESS Dataset Setup

In [3]:
! pip install wget
! pip install moviepy

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=9d6b8e4c50c1de63c40e3cefc2d91602ee68cb5805c177b9dfd6cca6c0246780
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
import wget
import zipfile
import os
actors = 12 # 24 max
for x in range(1,actors+1):
  num_str = None
  if(x < 10):
    num_str = "0" + str(x)
  else:
    num_str = str(x)
  url = "https://zenodo.org/records/1188976/files/Video_Speech_Actor_" + num_str + ".zip?download=1"
  print("Downloading: ", url)
  filename = wget.download(url)
  print("Unzipping: ", filename)
  with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall('./Data/')
  print("Deleting zip: ", filename)
  os.remove(filename)

Downloading:  https://zenodo.org/records/1188976/files/Video_Speech_Actor_01.zip?download=1
Unzipping:  Video_Speech_Actor_01.zip
Deleting zip:  Video_Speech_Actor_01.zip
Downloading:  https://zenodo.org/records/1188976/files/Video_Speech_Actor_02.zip?download=1
Unzipping:  Video_Speech_Actor_02.zip
Deleting zip:  Video_Speech_Actor_02.zip
Downloading:  https://zenodo.org/records/1188976/files/Video_Speech_Actor_03.zip?download=1
Unzipping:  Video_Speech_Actor_03.zip
Deleting zip:  Video_Speech_Actor_03.zip
Downloading:  https://zenodo.org/records/1188976/files/Video_Speech_Actor_04.zip?download=1
Unzipping:  Video_Speech_Actor_04.zip
Deleting zip:  Video_Speech_Actor_04.zip
Downloading:  https://zenodo.org/records/1188976/files/Video_Speech_Actor_05.zip?download=1
Unzipping:  Video_Speech_Actor_05.zip
Deleting zip:  Video_Speech_Actor_05.zip
Downloading:  https://zenodo.org/records/1188976/files/Video_Speech_Actor_06.zip?download=1
Unzipping:  Video_Speech_Actor_06.zip
Deleting zip:  

# Dataset preprocessing

In [None]:
# extract and save audio
import moviepy

actors = os.listdir("./Data/")
training_files = []
testing_files = []
for actor in actors:
  files = os.listdir("./Data/"+ actor)
  for filename in files:
    file_split_dash = filename.split("-")
    file_split_dot = filename.split(".")
    if((file_split_dash[0] == "01") and (file_split_dot[1] == "mp4")):
      print("making audio file: ", filename)
      # make audio file
      video_clip = VideoFileClip("./Data/" + actor + "/" + filename)
      audio_part = video_clip.audio
      audio_temp_path = "./Data/" + actor + "/" + file_split_dot[0] + ".wav"
      audio_part.write_audiofile(audio_temp_path)


In [None]:
# File naming convention

# Each of the 7356 RAVDESS files has a unique filename. The filename consists of a 7-part numerical identifier (e.g., 02-01-06-01-02-01-12.mp4). These identifiers define the stimulus characteristics:

# Filename identifiers

# Modality (01 = full-AV, 02 = video-only, 03 = audio-only).
# Vocal channel (01 = speech, 02 = song).
# Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
# Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
# Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
# Repetition (01 = 1st repetition, 02 = 2nd repetition).
# Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).

# Filename example: 02-01-06-01-02-01-12.mp4

# Video-only (02)
# Speech (01)
# Fearful (06)
# Normal intensity (01)
# Statement "dogs" (02)
# 1st Repetition (01)
# 12th Actor (12)
# Female, as the actor ID number is even.

import torch
import torchvision.transforms as transforms
import os
import random
import moviepy
import numpy as np

class CreateDataset(torch.utils.data.Dataset):
  def __init__(self, filenames):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.files = filenames.copy()
    # Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
    self.emotions = ["Neutral","Calm","Happy","Sad","Angry","Fearful","Disgust","Surprised"]

  def __getitem__(self, index):
    filename = self.files[index]
    filename_split = filename.split("-")
    # print(filename)
    label_str = self.emotions[int(filename_split[2])-1]
    # video_clip = VideoFileClip(filename)
    # video_array = np.array([frame for frame in video_clip.iter_frames()])

    # audio_part = video_clip.audio
    # audio_temp_path = "./temp_audio.wav"  # Temporary path for the audio
    # audio_part.write_audiofile(audio_temp_path)

    # Step 3: Load audio as a numpy array using librosa
    audio_filename = "." + filename.split(".")[1] + ".wav"
    # print(audio_filename)
    audio_data, sample_rate = librosa.load(audio_filename, sr=None)
    # os.remove(audio_temp_path)
    # audio_array = audio_part.to_soundarray(fps=30.0) # Change fps if needed
    # if audio_array.ndim == 2: # mono channel
    #     audio_array = audio_array.mean(axis=1)
    # wav_arr, _ = librosa.load(audio_array)
    return {"video":None, "audio":audio_data,"sampling_rate":sample_rate, "label_str":label_str}
    # return {"video":video, "audio":audio, "label_classes":label_classes, "label_id":label_id, "label_str":label_str, "gender":gender}

  def __len__(self):
    return len(self.files)

def CreateDatasets(training_split_decimal):
  actors = os.listdir("./Data/")
  training_files = []
  testing_files = []
  training_split = int(training_split_decimal * 60)
  for actor in actors:
    files = os.listdir("./Data/"+actor)
    num_files = len(files)
    indexes = random.sample(range(num_files), num_files)
    counter = 0
    for i in indexes:
      file_split_dash = files[i].split("-")
      file_split_dot = files[i].split(".")
      if((file_split_dash[0] == "01") and (file_split_dot[1] == "mp4")):
        if(counter < training_split):
          training_files.append("./Data/" + actor + "/" + files[i])
        else:
          testing_files.append("./Data/" + actor + "/" + files[i])
        counter = counter + 1
  train = CreateDataset(training_files)
  test = CreateDataset(testing_files)
  return train, test

RAVDESS_training_set, RAVDESS_testing_set = CreateDatasets(0.8)


In [None]:
# Test RAVDESS datasets are working properly
import librosa
from IPython.display import Audio
print(len(RAVDESS_training_set))
file = RAVDESS_training_set[1]
print(file)

# Display audio player
Audio(data =file["audio"], rate = file["sampling_rate"])
# file["video"].ipython_display(width=280)

96
./Data/Actor_02/01-01-04-01-01-02-02.mp4
./Data/Actor_02/01-01-04-01-01-02-02.wav
{'video': None, 'audio': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 'sampling_rate': 44100, 'label_str': 'Sad'}


# Models

In [None]:
from transformers import AutoModelForAudioClassification
import librosa, torch

#load model
odyssey_model = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes", trust_remote_code=True).cuda()

Some weights of the model checkpoint at microsoft/wavlm-large were not used when initializing WavLMModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMModel were not initialized from the model checkpoint at microsoft/wavlm-large and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and i

In [None]:

#get mean/std
mean = model.config.mean
std = model.config.std
counter = 0
total = 20
for i in range(total):
  #load an audio file
  item = training_set[i]
  # print(item["label_str"])
  raw_wav = item["audio"]
  label_answer = item["label_str"]
  #normalize the audio by mean/std
  norm_wav = (raw_wav - mean) / (std+0.000001)

  #generate the mask
  mask = torch.ones(1, len(norm_wav))

  #batch it (add dim)
  wavs = torch.tensor(norm_wav).unsqueeze(0)

  #predict
  with torch.no_grad():
      pred = model(wavs, mask)

  label_strs = ["Angry", "Sad", "Happy", "Surprise", "Fear", "Disgust", "Contempt", "Neurtral"]
  #{0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise', 4: 'Fear', 5: 'Disgust', 6: 'Contempt', 7: 'Neutral'}
  #tensor([[0.0015, 0.3651, 0.0593, 0.0315, 0.0600, 0.0125, 0.0319, 0.4382]])
# dataset Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).

  #convert logits to probability
  answer = torch.argmax(torch.nn.functional.softmax(pred, dim=1))
  pred_label = label_strs[answer]
  ref_label = label_answer
  print("Prediction: ", pred_label)
  print("Reference: ", ref_label)
  if(pred_label == "Contempt"):
    pred_label = "Calm"
  if(pred_label == "Surprise"):
    pred_label == "Surprised"
  if(pred_label == "Fear"):
    pred_label == "Fearful"
  if((pred_label == ref_label)):
    counter = counter + 1
    print("correct")

print("Percent correct: ", counter/total)

./Data/Actor_02/01-01-04-01-01-01-02.mp4
./Data/Actor_02/01-01-04-01-01-01-02.wav




Prediction:  Neurtral
Reference:  Sad
./Data/Actor_02/01-01-04-01-01-02-02.mp4
./Data/Actor_02/01-01-04-01-01-02-02.wav
Prediction:  Sad
Reference:  Sad
correct
./Data/Actor_02/01-01-03-02-01-02-02.mp4
./Data/Actor_02/01-01-03-02-01-02-02.wav
Prediction:  Neurtral
Reference:  Happy
./Data/Actor_02/01-01-02-02-01-02-02.mp4
./Data/Actor_02/01-01-02-02-01-02-02.wav
Prediction:  Sad
Reference:  Calm
./Data/Actor_02/01-01-03-01-01-01-02.mp4
./Data/Actor_02/01-01-03-01-01-01-02.wav
Prediction:  Neurtral
Reference:  Happy
./Data/Actor_02/01-01-06-01-02-02-02.mp4
./Data/Actor_02/01-01-06-01-02-02-02.wav
Prediction:  Sad
Reference:  Fearful
./Data/Actor_02/01-01-04-01-02-01-02.mp4
./Data/Actor_02/01-01-04-01-02-01-02.wav
Prediction:  Sad
Reference:  Sad
correct
./Data/Actor_02/01-01-06-01-01-01-02.mp4
./Data/Actor_02/01-01-06-01-01-01-02.wav
Prediction:  Neurtral
Reference:  Fearful
./Data/Actor_02/01-01-06-02-02-01-02.mp4
./Data/Actor_02/01-01-06-02-02-01-02.wav
Prediction:  Sad
Reference:  Fe

In [None]:
# Training on WavLM

In [None]:
# Load model directly
from transformers import AutoProcessor, AutoModelForAudioClassification

processor = AutoProcessor.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
model = AutoModelForAudioClassification.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor


# CONFIG and MODEL SETUP
model_name = 'amiriparian/ExHuBERT'
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
model = AutoModelForAudioClassification.from_pretrained(model_name, trust_remote_code=True,
                                                        revision="b158d45ed8578432468f3ab8d46cbe5974380812")

# Freezing half of the encoder for further transfer learning
model.freeze_og_encoder()

sampling_rate = 16000
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)



# Example application from a local audiofile
import numpy as np
import librosa
import torch.nn.functional as F
# Sample taken from the Toronto emotional speech set (TESS) https://tspace.library.utoronto.ca/handle/1807/24487
waveform, sr_wav = librosa.load("YAF_date_angry.wav")
# Max Padding to 3 Seconds at 16k sampling rate for the best results
waveform = feature_extractor(waveform, sampling_rate=sampling_rate,padding = 'max_length',max_length = 48000)
waveform = waveform['input_values'][0]
waveform = waveform.reshape(1, -1)
waveform = torch.from_numpy(waveform).to(device)
with torch.no_grad():
    output = model(waveform)
    output = F.softmax(output.logits, dim = 1)
    output = output.detach().cpu().numpy().round(2)
    print(output)

    # [[0.      0.      0.      1.      0.      0.]]
    #          Low          |          High                 Arousal
    # Neg.     Neut.   Pos. |  Neg.    Neut.   Pos          Valence
    # Disgust, Neutral, Kind| Anger, Surprise, Joy          Example emotions


In [None]:

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import librosa
import io
from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor

# CONFIG and MODEL SETUP
model_name = 'amiriparian/ExHuBERT'
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
model = AutoModelForAudioClassification.from_pretrained(model_name, trust_remote_code=True,
                                                        revision="b158d45ed8578432468f3ab8d46cbe5974380812")

# Replacing Classifier layer
model.classifier = nn.Linear(in_features=256, out_features=7)
# Freezing the original encoder layers and feature encoder (as in the paper) for further transfer learning
model.freeze_og_encoder()
model.freeze_feature_encoder()
model.train()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Define a custom dataset class
class EmotionDataset(Dataset):
    def __init__(self, dataframe, feature_extractor, max_length):
        self.dataframe = dataframe
        self.feature_extractor = feature_extractor
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        # emotion = torch.tensor(row['label'], dtype=torch.int64)  # For the IEMOCAP example
        emotion = torch.tensor(row['emotion'], dtype=torch.int64)  # EmoDB specific

        # Decode audio bytes from the Huggingface dataset with librosa
        audio_bytes = row['audio']['bytes']
        audio_buffer = io.BytesIO(audio_bytes)
        audio_data, samplerate = librosa.load(audio_buffer, sr=16000)

        # Use the feature extractor to preprocess the audio. Padding/Truncating to 3 seconds gives better results
        audio_features = self.feature_extractor(audio_data, sampling_rate=16000, return_tensors="pt", padding="max_length",
                                                truncation=True, max_length=self.max_length)

        audio = audio_features['input_values'].squeeze(0)
        return audio, emotion

# Load your DataFrame. Samples are shown for EmoDB and IEMOCAP from the Huggingface Hub
df = pd.read_parquet("hf://datasets/renumics/emodb/data/train-00000-of-00001-cf0d4b1ae18136ff.parquet")
# splits = {'session1': 'data/session1-00000-of-00001-04e11ca668d90573.parquet', 'session2': 'data/session2-00000-of-00001-f6132100b374cb18.parquet', 'session3': 'data/session3-00000-of-00001-6e102fcb5c1126b4.parquet', 'session4': 'data/session4-00000-of-00001-e39531a7c694b50d.parquet', 'session5': 'data/session5-00000-of-00001-03769060403172ce.parquet'}
# df = pd.read_parquet("hf://datasets/Zahra99/IEMOCAP_Audio/" + splits["session1"])

# Dataset and DataLoader
dataset = EmotionDataset(df, feature_extractor, max_length=3 * 16000)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Training setup
criterion = nn.CrossEntropyLoss()
lr = 1e-5
non_frozen_parameters = [p for p in model.parameters() if p.requires_grad]
optim = torch.optim.AdamW(non_frozen_parameters, lr=lr, betas=(0.9, 0.999), eps=1e-08)

# Function to calculate accuracy
def calculate_accuracy(outputs, targets):
    _, predicted = torch.max(outputs, 1)
    correct = (predicted == targets).sum().item()
    return correct / targets.size(0)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs, targets = inputs.to(device), targets.to(device)

        optim.zero_grad()
        outputs = model(inputs).logits
        loss = criterion(outputs, targets)
        loss.backward()
        optim.step()

        total_loss += loss.item()
        total_correct += (outputs.argmax(1) == targets).sum().item()
        total_samples += targets.size(0)

    epoch_loss = total_loss / len(dataloader)
    epoch_accuracy = total_correct / total_samples
    print(f'Epoch [{epoch + 1}/{num_epochs}], Average Loss: {epoch_loss:.4f}, Average Accuracy: {epoch_accuracy:.4f}')

# Example outputs:
# Epoch [3/3], Average Loss: 0.4572, Average Accuracy: 0.8249 for IEMOCAP
# Epoch [3/3], Average Loss: 0.1511, Average Accuracy: 0.9850 for EmoDB


In [None]:
https://github.com/ilucasgoncalves/AuxFormer

# MSP podcast

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.append('/content/drive/MyDrive/Odyssey')

Mounted at /content/drive


In [2]:
! unzip  -o -q '/content/drive/MyDrive/Odyssey/Dev_MSP.zip' -d "./"

In [None]:

import torch
import torchvision.transforms as transforms
import os
import random
import moviepy
import numpy as np

class CreateMSPDataset(torch.utils.data.Dataset):
  def __init__(self, root):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.root = root
    #{0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise', 4: 'Fear', 5: 'Disgust', 6: 'Contempt', 7: 'Neutral'}
    self.emotions = ["Angry","Sad","Happy","Surprise","Fear","Disgust","Contempt","Neutral"]
    # Angry (A), Sad (S), Happy (H), Surprise (U), Fear (F), Disgust (D), Contempt (C), Neutral (N)
    self.em_abr = ["A", "S", "H", "U","F","D","C","N"]
    self.files = os.listdir(self.root)

  def __getitem__(self, index):
    filename = self.files[index]
    filename_split = filename.split("-")
    label_num = self.em_abr.index(filename_split[2])
    label_str = self.emotions[label_num]
    audio_filename = "." + filename.split(".")[1] + ".wav"
    audio_data, sample_rate = librosa.load(self.root + filename, sr=None)
    return {"audio":audio_data,"sampling_rate":sample_rate, "label_str":label_str}

  def __len__(self):
    return len(self.files)

MSP_dev_set = CreateMSPDataset("./Dev_MSP/")

In [None]:
import librosa
from IPython.display import Audio

file = MSP_dev_set[1]
print(file)

# Display audio player
Audio(data =file["audio"], rate = file["sampling_rate"])

{'audio': array([ 0.00344849,  0.00314331,  0.00167847, ..., -0.00082397,
       -0.00076294,  0.00012207], dtype=float32), 'sampling_rate': 16000, 'label_str': 'Neutral'}


In [None]:
import random

def test_baseline_accuracy(model, dataset, test_points=100):
  model.eval()

  #get mean/std
  mean = model.config.mean
  std = model.config.std
  counter = 0
  numbers = random.sample(range(0, len(dataset)), test_points)

  for i in numbers:
    #load an audio file
    item = MSP_dev_set[i]
    # print(item["label_str"])
    raw_wav = item["audio"]
    label_answer = item["label_str"]
    #normalize the audio by mean/std
    norm_wav = (raw_wav - mean) / (std+0.000001)
    #generate the mask
    mask = torch.ones(1, len(norm_wav)).cuda()
    #batch it (add dim)
    wavs = torch.tensor(norm_wav).unsqueeze(0).cuda()

    #predict
    with torch.no_grad():
        pred = model(wavs, mask)

    label_strs = ["Angry", "Sad", "Happy", "Surprise", "Fear", "Disgust", "Contempt", "Neurtral"]

    #convert logits to probability
    answer = torch.argmax(torch.nn.functional.softmax(pred, dim=1))
    pred_label = label_strs[answer]
    ref_label = label_answer
    # print("Prediction: ", pred_label)
    # print("Reference: ", ref_label)

    if((pred_label == ref_label)):
      counter = counter + 1
  return counter/test_points

print("Percent correct: ", test_baseline_accuracy(odyssey_model, MSP_dev_set,len(MSP_dev_set)))



Percent correct:  0.3018056189296656
