# Task 1
Implement a PyTorch layer for extraction of logarithms of Mel-scale Filterbank energies

In [1]:
import torch
import torchaudio
import matplotlib.pyplot as plt
from torchaudio.transforms import MelSpectrogram
from melbanks import LogMelFilterBanks
import random

In [2]:
random_seed = 42
random.seed(random_seed)

In [3]:
# loading data
wav_path = 'C:/Users/NUC/Documents/ITMO/Speech_Gen/Digital_Signal_Processing/data/cat/00f0204f_nohash_1.wav'
signal, sr = torchaudio.load(wav_path)

print(f"Audio shape: {signal.shape}")
print(f"Sample rate: {sr} Hz")

Audio shape: torch.Size([1, 16000])
Sample rate: 16000 Hz


In [4]:
# defining melbanks
log_mel_fbanks = LogMelFilterBanks(
    n_fft=400,
    samplerate=sr,
    hop_length=160,
    n_mels=80,
    f_max_hz=sr / 2.0 
)

# mel spectrogram
mel_spec_torchaudio = MelSpectrogram(
    sample_rate=sr,
    n_fft=400,
    hop_length=160,
    n_mels=80,
    power=2.0,
)

log_mel_output = log_mel_fbanks(signal)
mel_output_torchaudio = mel_spec_torchaudio(signal)
log_mel_torchaudio = 10 * torch.log10(torch.clamp(mel_output_torchaudio, min=1e-10))

power_spectrogram torch.Size([1, 201, 101])
mel_spectrogram torch.Size([1, 80, 101])


In [48]:
delta = torch.abs(log_mel_output - log_mel_torchaudio)
print(f"Mean delta: {delta.mean().item():.3f}")
print(f"Max delta: {delta.max().item():.3f}")

Mean delta: 0.000
Max delta: 0.000


We see that the outputs of both our function and torchvision function are identical

In [44]:
melspec = torchaudio.transforms.MelSpectrogram(
    hop_length=160,
    n_mels=80
)(signal)
logmelbanks = LogMelFilterBanks()(signal)

assert torch.log(melspec + 1e-6).shape == logmelbanks.shape
assert torch.allclose(torch.log(melspec + 1e-6), logmelbanks)