In [29]:
import numpy as np
import pandas as pd
import torch
import torchaudio
import librosa
import os
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torchaudio.transforms import MelSpectrogram
import matplotlib.pyplot as plt

import tqdm
import torch.nn as nn
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)

In [30]:
def audio_to_melspectrogram(audio_path):
    # 오디오 파일 로드
    y, sr = librosa.load(audio_path, sr=None)
    
    # Mel-spectrogram 계산
    S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128)
    
    # 로그 스케일로 변환
    log_S = librosa.power_to_db(S, ref=np.max)
    
    return log_S

In [31]:
class AudioMelSpectrogramDataset(Dataset):
    def __init__(self, dataframe, root_dir, transform=None):
        """
        Args:
            dataframe (pandas.DataFrame): DataFrame containing the audio file paths and labels.
            root_dir (string): Directory with all the audio files.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.dataframe = dataframe
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        # 파일 경로 구성
        audio_path = os.path.join(self.root_dir, self.dataframe.iloc[idx]['path'])
        label = self.dataframe.iloc[idx]['label']
        
        # 오디오 파일을 Mel-spectrogram으로 변환
        mels = audio_to_melspectrogram(audio_path)
        
        sample = {'melspectrogram': mels, 'label': label}
        
        if self.transform:
            sample = self.transform(sample)
        
        return sample

In [32]:
train_df = pd.read_csv('data/train.csv')
root_dir = '/mnt/data/train'  # 오디오 파일들의 기본 디렉토리 경로
dataset = AudioMelSpectrogramDataset(dataframe=train_df, root_dir=root_dir)

In [33]:
class RegressionHead(nn.Module):
    r"""Classification head."""

    def __init__(self, config):

        super().__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):

        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)

        return x

In [34]:
class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion classifier."""

    def __init__(self, config):

        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(
            self,
            input_values,
    ):

        outputs = self.wav2vec2(input_values)
        hidden_states = outputs[0]
        hidden_states = torch.mean(hidden_states, dim=1)
        logits = self.classifier(hidden_states)

        return hidden_states, logits

In [35]:
# load model from hub
device = 'cuda'
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionModel.from_pretrained(model_name)

# dummy signal
sampling_rate = 16000
signal = np.zeros((1, sampling_rate), dtype=np.float32)


preprocessor_config.json: 100%|██████████| 214/214 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
config.json: 100%|██████████| 2.34k/2.34k [00:00<?, ?B/s]
vocab.json: 100%|██████████| 2.00/2.00 [00:00<00:00, 17.4kB/s]
model.safetensors: 100%|██████████| 661M/661M [00:58<00:00, 11.3MB/s] 
Some weights of EmotionModel were not initialized from the model checkpoint at audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
def process_func(
    x: np.ndarray,
    sampling_rate: int,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Predict emotions or extract embeddings from raw audio signal."""

    # run through processor to normalize signal
    # always returns a batch, so we just get the first entry
    # then we put it on the device
    y = processor(x, sampling_rate=sampling_rate)
    y = y['input_values'][0]
    y = y.reshape(1, -1)
    y = torch.from_numpy(y).to(device)

    # run through model
    with torch.no_grad():
        y = model(y)[0 if embeddings else 1]

    # convert to numpy
    y = y.detach().cpu().numpy()

    return y


In [37]:
print(process_func(signal, sampling_rate))

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [None]:
print(process_func(signal, sampling_rate, embeddings=True))