In [1]:
# pip3 install -r requirements_huggingface.txt

# Pre-trained models:

| Pre-trained model | Name in the article | URL |
| :- | :- | :- |
| facebook/wav2vec2-xls-r-300m | w2v2-xlsr-128 | https://huggingface.co/facebook/wav2vec2-xls-r-300m |
| facebook/wav2vec2-large-xlsr-53 | w2v2-xlsr-53 | https://huggingface.co/facebook/wav2vec2-large-xlsr-53 |
| facebook/wav2vec2-large-xlsr-53-spanish | w2v2-xlsr-53-spa | https://huggingface.co/facebook/wav2vec2-large-xlsr-53-spanish |
| facebook/wav2vec2-large-robust | w2v2-L-robust | https://huggingface.co/facebook/wav2vec2-large-robust |
| facebook/hubert-large-ll60k | hubert-L | https://huggingface.co/facebook/hubert-large-ll60k |
| microsoft/unispeech-sat-large | unispeech-L | https://huggingface.co/microsoft/unispeech-sat-large |

# 1. Example

## 1.1 Calculate the embedding of an audio file

In [5]:
import torch  # Import the PyTorch library for tensor computations
import numpy as np  # Import the NumPy library for numerical operations
import librosa  # Import the Librosa library for audio processing
import os  # Import the OS library for interacting with the file system
import time  # Import the Time library for timing operations
from transformers import AutoFeatureExtractor, AutoModel  # Import Hugging Face's transformers for feature extraction and model loading

# Example: compute the frame-level embedding of a single audio file with one
# pre-trained model and print the shape of the resulting array.

# Audio example (local copy of a RAVDESS file; adjust to your own data location)
path = 'D:/lugoza/Databases/RAVDESS/audios/Actor_01/03-01-01-01-01-01-01.wav'

# Model to be used
model_name = "facebook/wav2vec2-xls-r-300m"

# Set the device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the audio file; librosa resamples it to 16 kHz
input_audio, sample_rate = librosa.load(path, sr=16000)
# duration = librosa.get_duration(y=input_audio, sr=sample_rate)

# Load the pre-trained feature extractor and model
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

# Pre-process the waveform into model inputs and move them to the chosen device
# (despite its name, this variable holds the model *inputs*, not the embeddings)
audio_embeddings = feature_extractor(input_audio, return_tensors="pt", sampling_rate=sample_rate).to(device)

# Perform inference without computing gradients
with torch.no_grad():
    outputs = model(audio_embeddings.input_values)

# Convert the last hidden state to a NumPy array.
# `device` is a torch.device object, so the previous `device == 'cpu'` string
# comparison was not a reliable way to pick the CPU branch. `.cpu()` is a no-op
# for tensors that already live on the CPU, so one code path covers both cases.
x = outputs.last_hidden_state.detach().cpu().numpy()

print(x.shape)

Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2Model: ['quantizer.weight_proj.weight', 'project_q.bias', 'quantizer.weight_proj.bias', 'project_hid.weight', 'project_q.weight', 'project_hid.bias', 'quantizer.codevectors']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(1, 164, 1024)


## 1.2 Load embeddings and print their dimensions

In [7]:
import numpy as np 

# Root folder holding one sub-folder of saved embeddings per model,
# and the name of the embedding file inspected for every model
path_embeddings = 'D:/lugoza/Databases/RAVDESS/embeddings/'
example_file = '03-01-01-01-01-01-01.npy'

# Identifiers of the pre-trained models whose embeddings were saved
model_ids = [
    "facebook/wav2vec2-xls-r-300m",
    "facebook/wav2vec2-large-xlsr-53",
    "facebook/wav2vec2-large-xlsr-53-spanish",
    "facebook/wav2vec2-large-robust",
    "facebook/hubert-large-ll60k",
    "microsoft/unispeech-sat-large",
]

# For every model, load the example embedding and report its shape
for model_name in model_ids:
    # The folder name is the model id with "/" replaced by "-"
    path = f"{path_embeddings}{model_name.replace('/', '-')}/audio_embeddings/"

    # Read the stored array for the example audio
    emb = np.load(f"{path}{example_file}")

    # Show which model the array belongs to together with its dimensions
    print(model_name, ':', emb.shape)

facebook/wav2vec2-xls-r-300m : (1, 164, 1024)
facebook/wav2vec2-large-xlsr-53 : (1, 164, 1024)
facebook/wav2vec2-large-xlsr-53-spanish : (1, 164, 1024)
facebook/wav2vec2-large-robust : (1, 164, 1024)
facebook/hubert-large-ll60k : (1, 164, 1024)
microsoft/unispeech-sat-large : (1, 164, 1024)


# 2. Extract embeddings

The code used to extract embeddings is consistent across all the databases studied: EMOVOME, RAVDESS, and IEMOCAP. 

The only variations are due to the different file structures of each database.

## 2.1 EMOVOME

In [None]:
import torch  # Import the PyTorch library for tensor computations
import numpy as np  # Import the NumPy library for numerical operations
import librosa  # Import the Librosa library for audio processing
import os  # Import the OS library for interacting with the file system
import time  # Import the Time library for timing operations
from transformers import AutoFeatureExtractor, AutoModel  # Import Hugging Face's transformers for feature extraction and model loading

# Extract and save frame-level embeddings for every EMOVOME audio file,
# once per pre-trained model. One .npy file is written per audio and model.

# Set the device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# List of model IDs to be used
model_ids = [
    "facebook/wav2vec2-xls-r-300m",
    "facebook/wav2vec2-large-xlsr-53",
    "facebook/wav2vec2-large-xlsr-53-spanish",
    "facebook/wav2vec2-large-robust",
    "facebook/hubert-large-ll60k",
    "microsoft/unispeech-sat-large",
]

# Define the database and path to audio files (all audios in one flat folder)
database = 'EMOVOME'
path = 'data/' + database + '/audios'

# Loop through each model
for model_name in model_ids:
    tic = time.time()  # Start timing

    # Define the path to save embeddings ("/" in the model id is replaced by "-")
    path_save = 'data/' + database + '/embeddings/' + model_name.replace("/", "-") + '/audio_embeddings/'

    # Create the directory (and any missing parents); no error if it already exists
    os.makedirs(path_save, exist_ok=True)

    # Load pre-trained models
    print('MODEL: ', model_name)
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)

    # Calculate embeddings for each audio file
    for audio in os.listdir(path):
        audio_path = os.path.join(path, audio)

        # Skip sub-directories or other non-file entries instead of failing on them
        if not os.path.isfile(audio_path):
            continue

        # Load the audio file; librosa resamples it to 16 kHz
        input_audio, sample_rate = librosa.load(audio_path, sr=16000)
        # It is possible to cut the audio to a specific duration using the duration parameter as follows:
        # input_audio, sample_rate = librosa.load(audio_path, sr=16000, duration=30)

        # Pre-process the waveform into model inputs and move them to the chosen device
        audio_embeddings = feature_extractor(input_audio, return_tensors="pt", sampling_rate=sample_rate).to(device)

        # Perform inference without computing gradients
        with torch.no_grad():
            outputs = model(audio_embeddings.input_values)

        # Convert to NumPy. `.cpu()` is a no-op for tensors already on the CPU, so no
        # device check is needed (the old `device == 'cpu'` compared a torch.device
        # object against a string and was not a reliable test).
        x = outputs.last_hidden_state.detach().cpu().numpy()

        # Save the embeddings to a file named after the audio.
        # splitext removes only the final extension, so names that contain extra
        # dots are not truncated (which could make different audios collide).
        id_audio = os.path.splitext(audio)[0]
        with open(os.path.join(path_save, id_audio + '.npy'), 'wb') as f:
            np.save(f, x)

    toc = time.time()  # End timing
    print(model_name, ' - Duration:', round((toc - tic) / 60, 2), 'min')  # Print the duration of processing

## 2.2 RAVDESS

In [None]:
import torch  # Import the PyTorch library for tensor computations
import numpy as np  # Import the NumPy library for numerical operations
import librosa  # Import the Librosa library for audio processing
import os  # Import the OS library for interacting with the file system
import time  # Import the Time library for timing operations
from transformers import AutoFeatureExtractor, AutoModel  # Import Hugging Face's transformers for feature extraction and model loading

# Extract and save frame-level embeddings for every RAVDESS audio file,
# once per pre-trained model. RAVDESS audios are organised in one sub-folder
# per actor, so the audio folder is walked two levels deep.

# Set the device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# List of model IDs to be used
model_ids = [
    "facebook/wav2vec2-xls-r-300m",
    "facebook/wav2vec2-large-xlsr-53",
    "facebook/wav2vec2-large-xlsr-53-spanish",
    "facebook/wav2vec2-large-robust",
    "facebook/hubert-large-ll60k",
    "microsoft/unispeech-sat-large",
]

# Define the database and path to audio files
database = 'RAVDESS'
path = 'data/' + database + '/audios'

# Loop through each model
for model_name in model_ids:
    tic = time.time()  # Start timing

    # Define the path to save embeddings ("/" in the model id is replaced by "-")
    path_save = 'data/' + database + '/embeddings/' + model_name.replace("/", "-") + '/audio_embeddings/'

    # Create the directory (and any missing parents); no error if it already exists
    os.makedirs(path_save, exist_ok=True)

    # Load pre-trained models
    print('MODEL: ', model_name)
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)

    # Calculate embeddings for each audio file
    for folder in os.listdir(path):
        folder_path = os.path.join(path, folder)

        # Only descend into directories; stray files at this level are ignored
        if not os.path.isdir(folder_path):
            continue

        for audio in os.listdir(folder_path):
            audio_path = os.path.join(folder_path, audio)

            # Skip nested directories or other non-file entries
            if not os.path.isfile(audio_path):
                continue

            # Load the audio file; librosa resamples it to 16 kHz
            input_audio, sample_rate = librosa.load(audio_path, sr=16000)
            # It is possible to cut the audio to a specific duration using the duration parameter as follows:
            # input_audio, sample_rate = librosa.load(audio_path, sr=16000, duration=30)

            # Pre-process the waveform into model inputs and move them to the chosen device
            audio_embeddings = feature_extractor(input_audio, return_tensors="pt", sampling_rate=sample_rate).to(device)

            # Perform inference without computing gradients
            with torch.no_grad():
                outputs = model(audio_embeddings.input_values)

            # Convert to NumPy. `.cpu()` is a no-op for tensors already on the CPU, so
            # no device check is needed (the old `device == 'cpu'` compared a
            # torch.device object against a string and was not a reliable test).
            x = outputs.last_hidden_state.detach().cpu().numpy()

            # Save the embeddings to a file named after the audio.
            # splitext removes only the final extension, so names that contain
            # extra dots are not truncated.
            id_audio = os.path.splitext(audio)[0]
            with open(os.path.join(path_save, id_audio + '.npy'), 'wb') as f:
                np.save(f, x)

    toc = time.time()  # End timing
    print(model_name, ' - Duration:', round((toc - tic) / 60, 2), 'min')  # Print the duration of processing

## 2.3 IEMOCAP

In [None]:
import torch  # Import the PyTorch library for tensor computations
import numpy as np  # Import the NumPy library for numerical operations
import librosa  # Import the Librosa library for audio processing
import os  # Import the OS library for interacting with the file system
import time  # Import the Time library for timing operations
from transformers import AutoFeatureExtractor, AutoModel  # Import Hugging Face's transformers for feature extraction and model loading

# Extract and save frame-level embeddings for every IEMOCAP utterance,
# once per pre-trained model. The audio folder is expected to contain
# "Session*" folders, each with sentences/wav/<dialog>/<utterance>.wav files.

# Set the device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# List of model IDs to be used
model_ids = [
    "facebook/wav2vec2-xls-r-300m",
    "facebook/wav2vec2-large-xlsr-53",
    "facebook/wav2vec2-large-xlsr-53-spanish",
    "facebook/wav2vec2-large-robust",
    "facebook/hubert-large-ll60k",
    "microsoft/unispeech-sat-large",
]

# Define the database and path to audio files.
# This was previously set to 'RAVDESS' (copy-paste slip), which made this cell
# read from and write into the RAVDESS folders instead of the IEMOCAP ones.
database = 'IEMOCAP'
path = 'data/' + database + '/audios'

# Loop through each model
for model_name in model_ids:
    tic = time.time()  # Start timing

    # Define the path to save embeddings ("/" in the model id is replaced by "-")
    path_save = 'data/' + database + '/embeddings/' + model_name.replace("/", "-") + '/audio_embeddings/'

    # Create the directory (and any missing parents); no error if it already exists
    os.makedirs(path_save, exist_ok=True)

    # Load pre-trained models
    print('MODEL: ', model_name)
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)

    # Calculate embeddings for each audio file
    for session in [folder for folder in os.listdir(path) if folder.startswith('Session')]:
        # os.path.join inserts the separator that plain `path + session` was missing
        path_wav = os.path.join(path, session, 'sentences', 'wav')

        for improv in os.listdir(path_wav):
            path_improv = os.path.join(path_wav, improv)

            # Only descend into dialog folders; stray files at this level are ignored
            if not os.path.isdir(path_improv):
                continue

            for file in [f for f in os.listdir(path_improv) if f.endswith('.wav')]:

                path_file = os.path.join(path_improv, file)

                # Load the audio file; librosa resamples it to 16 kHz
                input_audio, sample_rate = librosa.load(path_file, sr=16000)
                # It is possible to cut the audio to a specific duration using the duration parameter as follows:
                # input_audio, sample_rate = librosa.load(path_file, sr=16000, duration=30)

                # Pre-process the waveform into model inputs and move them to the chosen device
                audio_embeddings = feature_extractor(input_audio, return_tensors="pt", sampling_rate=sample_rate).to(device)

                # Perform inference without computing gradients
                with torch.no_grad():
                    outputs = model(audio_embeddings.input_values)

                # Convert to NumPy. `.cpu()` is a no-op for tensors already on the CPU,
                # so no device check is needed (the old `device == 'cpu'` compared a
                # torch.device object against a string and was not a reliable test).
                x = outputs.last_hidden_state.detach().cpu().numpy()

                # Save the embeddings to a file named after the utterance.
                # splitext removes only the final ".wav" extension.
                id_audio = os.path.splitext(file)[0]
                with open(os.path.join(path_save, id_audio + '.npy'), 'wb') as f:
                    np.save(f, x)

    toc = time.time()  # End timing
    print(model_name, ' - Duration:', round((toc - tic) / 60, 2), 'min')  # Print the duration of processing