In [None]:
# pip3 install -r requirements_speechbrain.txt

# Pre-trained models:

| Pre-trained model | Name in the article | URL |
| :- | :- | :- |
| X-vector | x-vector | https://huggingface.co/speechbrain/spkrec-xvect-voxceleb  |   

# 1. Extract embeddings

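The same extraction procedure is applied to every database studied (EMOVOME, RAVDESS, and IEMOCAP): each audio file is loaded as 16 kHz mono, passed through the pre-trained x-vector encoder, and the resulting embedding is saved as `<audio ID>.npy`.

The only differences between the three cells below are the input/output paths and the directory traversal, which follow the file structure of each database.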

## 1.1 EMOVOME

In [None]:
# Libraries
import os  # For interacting with the file system
import time  # For measuring time intervals
import librosa  # For audio processing
import numpy as np  # For numerical operations
import torch  # For tensor computations
# import torchaudio  # For audio loading 
from speechbrain.pretrained import EncoderClassifier  # Import EncoderClassifier from SpeechBrain pretrained models

# Load the pretrained x-vector encoder
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",  # Model source identifier
    savedir="C:/Users/lugoza/Documents/AnacondaFiles/HELIOS_final/data/VOSOME/embeddings/speechbrain-x-vector/models/"  # Directory containing model checkpoints
)

# Calculate embeddings per audio file
path = 'C:/Users/lugoza/Documents/AnacondaFiles/HELIOS_final/data/VOSOME/audios'  # Path to audio files
path_save = 'data/VOSOME/embeddings/speechbrain-x-vector/audio_embeddings/'  # Path to save embeddings

# Create the output directory up front; otherwise saving the first embedding
# raises FileNotFoundError when the folder does not exist yet.
os.makedirs(path_save, exist_ok=True)

tic = time.time()  # Start timing

# Iterate through each audio file in the specified path.
# sorted() makes the processing order deterministic (os.listdir order is arbitrary).
for audio in sorted(os.listdir(path)):
    audio_path = os.path.join(path, audio)

    # Skip sub-directories and hidden files (e.g. .DS_Store): they are not audio,
    # and a name starting with '.' would produce an empty audio ID below.
    if audio.startswith('.') or not os.path.isfile(audio_path):
        continue

    id_audio = audio.split('.')[0]  # Audio file ID = file name up to the first dot

    # Load audio using librosa due to potential issues with torchaudio's ogg file support.
    # The signal is resampled to 16 kHz and down-mixed to mono; the returned rate is not needed.
    signal, _sr = librosa.load(audio_path, sr=16000, mono=True)

    # Convert the waveform to a float32 tensor and add a leading batch dimension
    waveform = torch.tensor(signal, dtype=torch.float32).unsqueeze(0)

    # Generate the embedding; no gradients are needed for feature extraction
    with torch.no_grad():
        embeddings = classifier.encode_batch(waveform)

    # detach().cpu() keeps the conversion valid even if the model runs on a GPU
    x = embeddings.detach().cpu().numpy()

    # Save the embedding using the audio file ID as the filename
    np.save(os.path.join(path_save, id_audio + '.npy'), x)

toc = time.time()  # End timing
print('Duration:', round((toc - tic) / 60, 2), 'min')  # Print the total duration of processing in minutes


## 1.2 RAVDESS

In [None]:
# Libraries
import os  # For interacting with the file system
import time  # For measuring time intervals
import librosa  # For audio processing
import numpy as np  # For numerical operations
import torch  # For tensor computations
# import torchaudio  # For audio loading 
from speechbrain.pretrained import EncoderClassifier  # Import EncoderClassifier from SpeechBrain pretrained models

# Load the pretrained x-vector encoder
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",  # Model source identifier
    savedir="C:/Users/lugoza/Documents/AnacondaFiles/HELIOS_final/data/RAVDESS/embeddings/speechbrain-x-vector/models/"  # Directory containing model checkpoints
)

# Calculate embeddings per audio file
path = 'C:/Users/lugoza/Documents/AnacondaFiles/HELIOS_final/data/RAVDESS/audios'  # Path to audio files
path_save = 'data/RAVDESS/embeddings/speechbrain-x-vector/audio_embeddings/'  # Path to save embeddings

# Create the output directory up front; otherwise saving the first embedding
# raises FileNotFoundError when the folder does not exist yet.
os.makedirs(path_save, exist_ok=True)

tic = time.time()  # Start timing

# The audio root contains one sub-folder per group of recordings; walk each of them.
# sorted() makes the processing order deterministic (os.listdir order is arbitrary).
for folder in sorted(os.listdir(path)):
    folder_path = os.path.join(path, folder)

    # Skip stray files in the root: listing them as directories would raise NotADirectoryError
    if not os.path.isdir(folder_path):
        continue

    for audio in sorted(os.listdir(folder_path)):
        audio_path = os.path.join(folder_path, audio)

        # Skip sub-directories and hidden files (e.g. .DS_Store): they are not audio,
        # and a name starting with '.' would produce an empty audio ID below.
        if audio.startswith('.') or not os.path.isfile(audio_path):
            continue

        id_audio = audio.split('.')[0]  # Audio file ID = file name up to the first dot

        # Load with librosa so every database goes through the same loader;
        # the signal is resampled to 16 kHz and down-mixed to mono.
        signal, _sr = librosa.load(audio_path, sr=16000, mono=True)

        # Convert the waveform to a float32 tensor and add a leading batch dimension
        waveform = torch.tensor(signal, dtype=torch.float32).unsqueeze(0)

        # Generate the embedding; no gradients are needed for feature extraction
        with torch.no_grad():
            embeddings = classifier.encode_batch(waveform)

        # detach().cpu() keeps the conversion valid even if the model runs on a GPU
        x = embeddings.detach().cpu().numpy()

        # Save the embedding using the audio file ID as the filename
        np.save(os.path.join(path_save, id_audio + '.npy'), x)

toc = time.time()  # End timing
print('Duration:', round((toc - tic) / 60, 2), 'min')  # Print the total duration of processing in minutes

## 1.3 IEMOCAP

In [None]:
# Libraries
import os  # For interacting with the file system
import time  # For measuring time intervals
import librosa  # For audio processing
import numpy as np  # For numerical operations
import torch  # For tensor computations
# import torchaudio  # For audio loading 
from speechbrain.pretrained import EncoderClassifier  # Import EncoderClassifier from SpeechBrain pretrained models

# Load the pretrained x-vector encoder
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",  # Model source identifier
    savedir="C:/Users/lugoza/Documents/AnacondaFiles/HELIOS_final/data/IEMOCAP/embeddings/speechbrain-x-vector/models/"  # Directory containing model checkpoints
)

# Calculate embeddings per audio file
path = 'D:/lugoza/IEMOCAP/'  # Path to audio files
path_save = 'data/IEMOCAP/embeddings/speechbrain-x-vector/audio_embeddings/'  # Path to save embeddings

# Create the output directory up front; otherwise saving the first embedding
# raises FileNotFoundError when the folder does not exist yet.
os.makedirs(path_save, exist_ok=True)

tic = time.time()  # Start timing

# Layout walked below: <path>/Session*/sentences/wav/<dialog folder>/<utterance>.wav
# sorted() makes the processing order deterministic (os.listdir order is arbitrary).
for session in sorted(os.listdir(path)):
    session_path = os.path.join(path, session)

    # Only real "Session*" directories are processed; a stray file with that prefix
    # would otherwise make the nested listdir fail.
    if not session.startswith('Session') or not os.path.isdir(session_path):
        continue

    wav_root = os.path.join(session_path, 'sentences', 'wav')

    for improv in sorted(os.listdir(wav_root)):
        improv_path = os.path.join(wav_root, improv)

        # Skip stray files next to the dialog folders (listing them would raise NotADirectoryError)
        if not os.path.isdir(improv_path):
            continue

        for file in sorted(os.listdir(improv_path)):
            # Keep .wav files only; hidden files (e.g. '._name.wav' metadata stubs) are skipped
            # because they are not audio and would produce an empty audio ID.
            if not file.endswith('.wav') or file.startswith('.'):
                continue

            path_file = os.path.join(improv_path, file)
            id_audio = file.split('.')[0]  # Audio file ID = file name up to the first dot

            # Load with librosa so every database goes through the same loader;
            # the signal is resampled to 16 kHz and down-mixed to mono.
            signal, _sr = librosa.load(path_file, sr=16000, mono=True)

            # Convert the waveform to a float32 tensor and add a leading batch dimension
            waveform = torch.tensor(signal, dtype=torch.float32).unsqueeze(0)

            # Generate the embedding; no gradients are needed for feature extraction
            with torch.no_grad():
                embeddings = classifier.encode_batch(waveform)

            # detach().cpu() keeps the conversion valid even if the model runs on a GPU
            x = embeddings.detach().cpu().numpy()

            # Save the embedding using the audio file ID as the filename
            np.save(os.path.join(path_save, id_audio + '.npy'), x)

toc = time.time()  # End timing
print('Duration:', round((toc - tic) / 60, 2), 'min')  # Print the total duration of processing in minutes