In [39]:
from pathlib import Path

import torch
import torchaudio
from librosa.filters import mel as librosa_mel

# Mel-spectrogram settings, mirroring the HiFiGAN configuration.
SAMPLE_RATE = 22050

# STFT framing (passed to torch.stft as n_fft / win_length / hop_length).
N_FFT = 1024
WIN_SIZE = 1024
HOP_SIZE = 256

# Mel filterbank: number of bands and the frequency range they cover.
N_MELS = 80
F_MIN = 0
F_MAX = 8000


def spectral_normalize_torch(magnitudes: torch.Tensor) -> torch.Tensor:
    """Return the natural log of ``magnitudes`` after flooring them at 1e-5.

    The floor keeps the logarithm finite for zero (or negative) entries.
    """
    return magnitudes.clamp(min=1e-5).log()


# Caches shared across calls so the mel filterbank and the Hann window are
# built once per (parameters, device) instead of once per processed file.
_MEL_BASIS_CACHE = {}
_HANN_WINDOW_CACHE = {}


def mel_spectrogram(y: torch.Tensor,
                    n_fft: int = N_FFT, num_mels: int = N_MELS,
                    sample_rate: int = SAMPLE_RATE, hop_size: int = HOP_SIZE,
                    win_size: int = WIN_SIZE, fmin: int = F_MIN, fmax: int = F_MAX,
                    center: bool = False) -> torch.Tensor:
    """Compute a log-mel spectrogram of a batch of waveforms.

    Args:
        y: Waveform tensor of shape [batch, time]; it is expanded to
            [batch, 1, time] for reflect padding and squeezed back afterwards.
        n_fft: FFT size passed to ``torch.stft``.
        num_mels: Number of mel bands in the filterbank.
        sample_rate: Sample rate used to build the mel filterbank.
        hop_size: Hop length passed to ``torch.stft``.
        win_size: Length of the Hann window.
        fmin: Lowest frequency of the mel filterbank.
        fmax: Highest frequency of the mel filterbank.
        center: Forwarded to ``torch.stft``. The signal is already padded by
            ``(n_fft - hop_size) / 2`` on each side before the STFT.

    Returns:
        Tensor of shape [batch, num_mels, frames] holding the natural log of
        the mel-projected magnitudes, floored at 1e-5.
    """
    device = y.device

    # Key on every parameter that shapes the filterbank, not just fmax, so a
    # call with different settings never picks up a stale basis.
    mel_key = (sample_rate, n_fft, num_mels, fmin, fmax, str(device))
    if mel_key not in _MEL_BASIS_CACHE:
        # Keyword arguments: librosa >= 0.10 rejects positional ones here.
        mel = librosa_mel(sr=sample_rate, n_fft=n_fft, n_mels=num_mels,
                          fmin=fmin, fmax=fmax)
        _MEL_BASIS_CACHE[mel_key] = torch.from_numpy(mel).float().to(device)

    window_key = (win_size, str(device))
    if window_key not in _HANN_WINDOW_CACHE:
        _HANN_WINDOW_CACHE[window_key] = torch.hann_window(win_size).to(device)

    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    print(y.shape)  # shape diagnostic: padded waveform

    # return_complex must be given explicitly for real inputs on current
    # PyTorch (needs torch >= 1.7); view_as_real restores the trailing
    # (real, imag) axis that the magnitude computation below sums over.
    spec = torch.view_as_real(
        torch.stft(y.squeeze(1), n_fft, hop_length=hop_size,
                   win_length=win_size, window=_HANN_WINDOW_CACHE[window_key],
                   center=center, pad_mode='reflect',
                   normalized=False, onesided=True, return_complex=True))
    print(spec.shape)  # shape diagnostic: raw STFT

    # Magnitude, with a small epsilon inside the square root.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9)
    spec = torch.matmul(_MEL_BASIS_CACHE[mel_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec

In [40]:
input_dir = "../data/processed/mfa_inputs"

path = Path(input_dir)

filepath_list = list(path.rglob('*.flac'))

# Fail here with a clear message; otherwise an empty directory would leave
# `wave_tensor` undefined and surface later as a NameError in another cell.
if not filepath_list:
    raise FileNotFoundError(f"No .flac files found under {path.resolve()}")

# Only the first file is needed to inspect the tensor shapes.
file = filepath_list[0]

# NOTE(review): the sample rate returned by torchaudio.load is discarded, while
# mel_spectrogram builds its filterbank for SAMPLE_RATE — confirm the inputs
# are actually stored at that rate.
wave_tensor, _ = torchaudio.load(file)

print(wave_tensor.shape)
mels_tensor = mel_spectrogram(wave_tensor, center=False)  # [n_channels x n_mels x time]
print(mels_tensor.shape)

torch.Size([1, 64543])
torch.Size([1, 1, 65311])
torch.Size([1, 513, 252, 2])
torch.Size([1, 80, 252])


In [21]:
# Number of samples spanned by 252 and by 253 hops of 256 samples each.
for n_frames in (252, 253):
    print(256 * n_frames)

64512
64768


In [4]:
from torchaudio.transforms import MelSpectrogram


# Settings for torchaudio's MelSpectrogram transform. F_MIN and F_MAX are not
# redefined in this cell; they must already exist in the kernel.
SAMPLE_RATE = 22050
N_FFT = 1024
WIN_SIZE = 1024
HOP_SIZE = 256
N_MELS = 80  # required by HiFi-GAN
NORMALIZED = False

mel_kwargs = dict(
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
    win_length=WIN_SIZE,
    hop_length=HOP_SIZE,
    f_min=F_MIN,
    f_max=F_MAX,
    n_mels=N_MELS,
    normalized=NORMALIZED,
    # norm = 'slaney',
)
transformer = MelSpectrogram(**mel_kwargs)

# Apply the transform to the waveform loaded earlier and report its shape.
new_tensor = transformer(wave_tensor)
print(new_tensor.shape)

torch.Size([1, 80, 253])
