In [2]:
from src.datasets import NoisySpeechDataset

In [3]:
dataset = NoisySpeechDataset(train=False, quiet=True)

In [4]:
noisy_t, clean_t = dataset[0]
noisy_arr = noisy_t.detach().numpy()
clean_arr = clean_t.detach().numpy()

In [19]:
# Original scipy base method
import numpy as np
from scipy import signal

SAMPLING_RATE = 16000
SAMPLES_PER_SEGMENT = 512


freqs, times, spectral_frames = signal.stft(
    noisy_arr,
    fs=SAMPLING_RATE,
    nperseg=SAMPLES_PER_SEGMENT
)
print(freqs[0:5], freqs[-5:-1])
print(times[0:5], times[-5:-1])
print(spectral_frames.shape)

[  0.    31.25  62.5   93.75 125.  ] [7875.   7906.25 7937.5  7968.75]
[0.    0.016 0.032 0.048 0.064] [1.984 2.    2.016 2.032]
(257, 129)


In [23]:
# Librosa method
from librosa.core import stft

SAMPLE_RATE = 16000
HOP_MS = 16
WIN_MS = 64

def ms_to_steps(ms):
    return int((1e-3 * ms) * SAMPLE_RATE)


coeffs = stft(
    noisy_arr,
    n_fft=ms_to_steps(WIN_MS),
    hop_length=ms_to_steps(HOP_MS),
    win_length=ms_to_steps(WIN_MS),
    window='hann',
)

In [24]:
coeffs.shape

(513, 129)

In [25]:
coeffs

array([[-3.8170463e-01+0.0000000e+00j,  5.8743566e-01+0.0000000e+00j,
         1.0969898e+00+0.0000000e+00j, ...,
        -4.4178982e+00+0.0000000e+00j, -6.0868096e-01+0.0000000e+00j,
         1.6001073e+00+0.0000000e+00j],
       [ 5.4170847e-01-1.6696713e-17j, -3.2782310e-01+1.0085300e+00j,
        -1.0631933e+00-4.7694153e-01j, ...,
         2.8235607e+00+1.5835314e+00j,  8.1642896e-02+2.1850958e+00j,
        -1.7461599e+00-1.1610535e-02j],
       [ 6.6638374e-01+2.7105054e-17j, -4.1649649e-01-1.6669800e+00j,
        -3.0108503e-01+9.2678517e-01j, ...,
        -2.3111209e-01-4.8843044e-01j,  2.3051031e-01-9.1283572e-01j,
         9.3570346e-01+5.6139571e-03j],
       ...,
       [ 2.4495318e-03+7.5894152e-18j, -1.2216974e-03-8.0257587e-07j,
        -4.7389676e-06-1.2796473e-07j, ...,
        -7.9117831e-07-1.8381310e-07j, -1.4918080e-03+1.5368349e-05j,
         2.9675795e-03-3.6469399e-05j],
       [-2.4633657e-03+9.7578196e-18j,  5.5223713e-06+1.2294801e-03j,
        -4.4301228e-06

In [26]:
spectral_frames

array([[-2.19692869e-04+0.0000000e+00j,  6.25014945e-04+0.0000000e+00j,
         2.45093880e-03+0.0000000e+00j, ...,
        -9.81527939e-03+0.0000000e+00j, -5.30591700e-04+0.0000000e+00j,
         2.85857869e-03+0.0000000e+00j],
       [-4.21637640e-04+2.9261959e-05j,  1.25417742e-03+4.1377000e-03j,
         8.77997663e-04-3.3612500e-03j, ...,
         3.89209180e-03+7.0631143e-04j, -1.35651277e-03+2.5718592e-03j,
        -2.55861203e-03-6.4563740e-04j],
       [ 1.47187657e-05-1.1308646e-03j, -1.77633180e-03-2.7862319e-03j,
        -3.03893141e-03+3.8255930e-03j, ...,
         1.70364883e-03+1.1462708e-03j,  2.37039221e-03-9.9327776e-04j,
         2.52100895e-03+9.9459756e-04j],
       ...,
       [-3.55063821e-06+1.2406130e-08j, -5.13824716e-08+1.6177458e-08j,
         1.17953896e-07+2.5054243e-08j, ...,
         1.27001165e-08-8.3479632e-09j,  6.85184220e-09-7.2261946e-10j,
        -1.11542276e-05-7.0759960e-08j],
       [ 3.54272061e-06-1.1981451e-08j, -4.57764884e-08+3.3419036e-0