https://huggingface.co/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim

In [1]:
import os
import inspect
from math import floor
from tqdm import tqdm

import torch
import torch.nn as nn
import numpy as np
import librosa
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)

In [2]:
class RegressionHead(nn.Module):
    r"""Output head mapping a feature vector to ``config.num_labels`` values.

    The head applies, in order: dropout, a square linear layer
    (``hidden_size`` -> ``hidden_size``), ``tanh``, dropout again and a final
    linear projection (``hidden_size`` -> ``num_labels``).

    Args:
        config: object exposing ``hidden_size``, ``final_dropout`` and
            ``num_labels``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        # Attribute names are part of the state-dict keys; keep them as-is.
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features, **kwargs):
        """Return the head output for ``features``; extra kwargs are ignored."""
        projected = self.dense(self.dropout(features))
        activated = self.dropout(torch.tanh(projected))
        return self.out_proj(activated)


class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""wav2vec2 backbone followed by a ``RegressionHead``.

    ``forward`` returns a pair ``(pooled_hidden_states, head_output)``.

    NOTE: the constructor reads the module-level name ``device``; it has to
    be defined before the model is instantiated.
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        backbone = Wav2Vec2Model(config)
        self.wav2vec2 = backbone.to(device)

        head = RegressionHead(config)
        self.classifier = head.to(device)

        self.init_weights()

    def forward(self, input_values):
        """Encode ``input_values`` and run the head on the pooled encoding."""
        # First element of the backbone output, averaged over dimension 1
        # (presumably the time axis - confirm against the backbone's output).
        encoded = self.wav2vec2(input_values)[0]
        pooled = encoded.mean(dim=1)
        return pooled, self.classifier(pooled)



# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
# Move the whole model to the chosen device and make inference mode explicit
# (eval() disables dropout), independent of what the constructor does.
model = EmotionModel.from_pretrained(model_name).to(device)
model.eval()

# dummy signal
sampling_rate = 16000
signal = np.zeros((1, sampling_rate), dtype=np.float32)


def process_func(
    x: np.ndarray,
    sampling_rate: int,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Run one raw audio signal through the processor and the model.

    Relies on the module-level ``processor``, ``model`` and ``device``.

    Args:
        x: raw audio samples.
        sampling_rate: sampling rate of ``x``; forwarded to the processor.
        embeddings: if true, return element 0 of the model output,
            otherwise element 1.

    Returns:
        The selected model output as a NumPy array.
    """
    # The processor hands back a batch; take its first entry, shape it as a
    # one-row batch and move it to the target device.
    processed = processor(x, sampling_rate=sampling_rate)
    first_entry = processed['input_values'][0]
    batch = torch.from_numpy(first_entry.reshape(1, -1)).to(device)

    # Inference only: no gradients are needed.
    output_index = 0 if embeddings else 1
    with torch.no_grad():
        selected = model(batch)[output_index]

    # Back to the CPU as a plain NumPy array.
    return selected.detach().cpu().numpy()


# Smoke test: load one clip at 16 kHz, extract its embedding end-to-end and
# display the embedding's shape.
audio, sr = librosa.load('/kaggle/input/audio-abaw5/batch1/batch1/108-15-640x480.mp3', sr=16000)
l = process_func(audio, sr, embeddings=True)
l.shape

preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/661M [00:00<?, ?B/s]

Some weights of EmotionModel were not initialized from the model checkpoint at audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-02-12 13:01:40.209332: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-12 13:01:40.209450: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-12 13:01:40.333484: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory 

(1, 1024)

In [3]:
# Dataset root and the sub-folders that are scanned for .mp3 files.
data_dir = '/kaggle/input/audio-abaw5'
folders = ['batch1', 'batch2', 'new_vids']

# Accumulators for the extraction results: one key and one feature vector
# per audio window.
names = []
global_features = []

step = 24000 # window length in samples: 1.5 s at the 16 kHz load rate

In [4]:
def process_signal(local_name, audio, step=24000, sr=16000):
    """Split ``audio`` into windows of ``step`` samples and embed each one.

    The signal is cut into consecutive, non-overlapping windows. If samples
    are left over, one extra window made of the *last* ``step`` samples is
    added (it overlaps the previous window). A clip shorter than ``step`` is
    embedded once, in full.

    Args:
        local_name: prefix used to build the window keys.
        audio: 1-D sequence of samples (sliced and passed to ``process_func``).
        step: window length in samples; must be positive.
        sr: sampling rate forwarded to ``process_func``.

    Returns:
        ``(names, features)``: two parallel lists. ``names[i]`` is
        ``"<local_name>/<index>"`` with a 1-based, 5-digit zero-padded index;
        ``features[i]`` is the first row of the ``process_func`` result for
        that window.

    Raises:
        ValueError: if ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError(f'step must be positive, got {step}')

    names = []
    features = []

    def _embed(window, index):
        # process_func returns a one-row batch; keep that single row.
        # Gradient tracking is already disabled inside process_func.
        features.append(process_func(window, sr, embeddings=True)[0])
        names.append(f'{local_name}/{str(index).zfill(5)}')

    total = len(audio)
    full_windows = total // step

    for i in range(full_windows):
        _embed(audio[i * step:(i + 1) * step], i + 1)

    if total > full_windows * step:
        # Tail window = last `step` samples. Clamp the start at 0: without
        # the clamp a clip shorter than one window produced a negative start
        # index, which silently dropped the beginning of the clip.
        tail_start = max(total - step, 0)
        _embed(audio[tail_start:], full_windows + 1)

    return names, features

In [5]:
# Start from empty accumulators so that re-running this cell does not append
# the same clips a second time.
names = []
feature_rows = []

for folder in folders:
    # Each batch lives in a nested directory of the same name.
    dirpath = os.path.join(data_dir, folder, folder)
    print(f'in {folder}')

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in tqdm(sorted(os.listdir(dirpath))):
        stem, ext = os.path.splitext(os.path.basename(filename))
        if ext.lower() != '.mp3':
            # Report files that are skipped.
            print(filename)
            continue

        audio, sr = librosa.load(os.path.join(dirpath, filename), sr=16000)

        # Do not name these results `nn`: that would rebind the
        # `torch.nn as nn` import and break re-running the model cell.
        # NOTE(review): keys are built from the file stem only, so files with
        # the same name in different folders would share keys - confirm that
        # stems are unique across folders.
        clip_names, clip_features = process_signal(stem, audio, step, sr)

        names.extend(clip_names)
        feature_rows.extend(clip_features)

# Stack once at the end: concatenating inside the loop copies the whole array
# on every file and fails for a clip that yields no windows.
global_features = (
    np.stack(feature_rows, axis=0)
    if feature_rows
    else np.empty((0, 0), dtype=np.float32)
)

in batch1


100%|██████████| 475/475 [16:47<00:00,  2.12s/it]


in batch2


100%|██████████| 73/73 [02:27<00:00,  2.03s/it]


in new_vids


100%|██████████| 50/50 [03:37<00:00,  4.35s/it]


In [6]:
# Sanity check: the number of feature rows should equal the number of keys.
global_features.shape, len(names)

((73460, 1024), 73460)

In [7]:
import pickle

# Pair every window key with its feature vector; if a key occurs more than
# once, the last vector wins.
filename2featuresAll = dict(zip(names, global_features))

# Serialize the mapping with the highest available pickle protocol.
with open('wav2vec_large_robast_fea.pickle', 'wb') as out_file:
    pickle.dump(filename2featuresAll, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
from IPython.display import FileLink

# Render a link to the pickle file written by the previous cell.
FileLink('wav2vec_large_robast_fea.pickle')