In [25]:
from src.datasets import NoisySpeechDataset

In [26]:
# Build the dataset with train=False and quiet=True.
# NOTE(review): presumably this selects the held-out split and suppresses
# progress output — confirm against src.datasets.NoisySpeechDataset.
dataset = NoisySpeechDataset(
    train=False,
    quiet=True,
)

In [27]:
# Take the first (noisy, clean) pair from the dataset and convert both
# tensors to NumPy arrays; detach() drops any autograd tracking first.
noisy_t, clean_t = dataset[0]
noisy_arr, clean_arr = (
    tensor.detach().numpy() for tensor in (noisy_t, clean_t)
)

In [47]:
# Original scipy base method
import numpy as np
from scipy import signal

SAMPLING_RATE = 16000  # samples per second; passed to scipy as `fs`
HOP_MS = 16  # hop between successive STFT frames, in milliseconds
WIN_MS = 64  # STFT window length, in milliseconds

def ms_to_steps(ms, sampling_rate=SAMPLING_RATE):
    """Convert a duration in milliseconds to a whole number of samples.

    Args:
        ms: Duration in milliseconds.
        sampling_rate: Samples per second. Defaults to SAMPLING_RATE.

    Returns:
        int: Number of samples, truncated toward zero when the duration
        does not cover a whole number of samples.
    """
    # Multiply first and divide once: for integer inputs the product is
    # exact, so a duration that is a whole number of samples cannot end up
    # just below its integer and be truncated one sample short, which can
    # happen when scaling by the inexact float constant 1e-3 first.
    return int(ms * sampling_rate / 1000)


# Frame geometry in samples: the window length, and the overlap between
# consecutive windows implied by the hop size.
num_segment = ms_to_steps(WIN_MS)
num_overlap = num_segment - ms_to_steps(HOP_MS)

# Build the analysis window explicitly so that `window` exists on a fresh
# kernel. 'hann' is also scipy's default window for stft, so passing it
# explicitly does not change the transform.
window = signal.get_window('hann', num_segment)

freqs, times, spectral_frames = signal.stft(
    noisy_arr,
    fs=SAMPLING_RATE,
    window=window,
    nperseg=num_segment,
    noverlap=num_overlap,
)
print(window.shape)
# Use [-5:] rather than [-5:-1] so the final frequency bin and the final
# frame time are included in the preview.
print(freqs[:5], freqs[-5:])
print(times[:5], times[-5:])
print(spectral_frames.shape)

(1024,)
[  0.    31.25  62.5   93.75 125.  ] [7875.   7906.25 7937.5  7968.75]
[0.    0.016 0.032 0.048 0.064] [1.984 2.    2.016 2.032]
(257, 129)


In [34]:
# Librosa method
from librosa.core import stft

SAMPLE_RATE = 16000  # samples per second
HOP_MS = 16  # hop between successive STFT frames, in milliseconds
WIN_MS = 64  # STFT window length, in milliseconds

def ms_to_steps(ms, sample_rate=SAMPLE_RATE):
    """Convert a duration in milliseconds to a whole number of samples.

    Args:
        ms: Duration in milliseconds.
        sample_rate: Samples per second. Defaults to SAMPLE_RATE.

    Returns:
        int: Number of samples, truncated toward zero when the duration
        does not cover a whole number of samples.
    """
    # Multiply first and divide once: for integer inputs the product is
    # exact, so a duration that is a whole number of samples cannot end up
    # just below its integer and be truncated one sample short, which can
    # happen when scaling by the inexact float constant 1e-3 first.
    return int(ms * sample_rate / 1000)


# STFT of the noisy signal via librosa. The FFT size is set equal to the
# window length (WIN_MS), and frames advance by HOP_MS.
coeffs = stft(
    noisy_arr,
    window="hann",
    win_length=ms_to_steps(WIN_MS),
    hop_length=ms_to_steps(HOP_MS),
    n_fft=ms_to_steps(WIN_MS),
)

In [35]:
# Shape of the librosa STFT output; the recorded output below is (513, 129).
# NOTE(review): presumably (frequency bins, frames) — confirm against the librosa docs.
coeffs.shape

(513, 129)

In [36]:
# Display the complex STFT coefficients computed with librosa.
# NOTE(review): this dumps the whole array; a small slice such as
# coeffs[:3, :3] would keep the saved output short.
coeffs

array([[-3.8170463e-01+0.0000000e+00j,  5.8743566e-01+0.0000000e+00j,
         1.0969898e+00+0.0000000e+00j, ...,
        -4.4178982e+00+0.0000000e+00j, -6.0868096e-01+0.0000000e+00j,
         1.6001073e+00+0.0000000e+00j],
       [ 5.4170847e-01-1.6696713e-17j, -3.2782310e-01+1.0085300e+00j,
        -1.0631933e+00-4.7694153e-01j, ...,
         2.8235607e+00+1.5835314e+00j,  8.1642896e-02+2.1850958e+00j,
        -1.7461599e+00-1.1610535e-02j],
       [ 6.6638374e-01+2.7105054e-17j, -4.1649649e-01-1.6669800e+00j,
        -3.0108503e-01+9.2678517e-01j, ...,
        -2.3111209e-01-4.8843044e-01j,  2.3051031e-01-9.1283572e-01j,
         9.3570346e-01+5.6139571e-03j],
       ...,
       [ 2.4495318e-03+7.5894152e-18j, -1.2216974e-03-8.0257587e-07j,
        -4.7389676e-06-1.2796473e-07j, ...,
        -7.9117831e-07-1.8381310e-07j, -1.4918080e-03+1.5368349e-05j,
         2.9675795e-03-3.6469399e-05j],
       [-2.4633657e-03+9.7578196e-18j,  5.5223713e-06+1.2294801e-03j,
        -4.4301228e-06

In [37]:
# Display the complex STFT coefficients computed with scipy, for comparison
# with the librosa `coeffs` array.
# NOTE(review): in the recorded outputs the values away from the signal edges
# are about 512x smaller than `coeffs` — presumably scipy's default
# normalisation by the window sum; confirm before comparing magnitudes.
spectral_frames

array([[-3.7692592e-04+0.0000000e+00j,  1.1839274e-03+0.0000000e+00j,
         2.1425581e-03+0.0000000e+00j, ...,
        -8.6287074e-03+0.0000000e+00j, -1.8241982e-03+0.0000000e+00j,
         1.5735916e-03+0.0000000e+00j],
       [ 5.3317967e-04-4.3651296e-04j, -6.2611594e-04+1.9620659e-03j,
        -2.0765495e-03-9.3152642e-04j, ...,
         5.5147670e-03+3.0928347e-03j,  8.5023028e-05+3.6472313e-03j,
        -1.7062395e-03-4.1752681e-04j],
       [ 6.4659788e-04+8.7925268e-04j, -7.7279686e-04-3.2253144e-03j,
        -5.8805669e-04+1.8101273e-03j, ...,
        -4.5139081e-04-9.5396570e-04j,  1.0421512e-03-1.9085754e-03j,
         9.1503357e-04+8.0469763e-04j],
       ...,
       [-1.7753752e-06+5.4936078e-09j,  8.9524167e-07-3.0825638e-09j,
        -9.2581418e-09-2.5818425e-10j, ...,
        -1.5548554e-09-3.6265710e-10j,  2.7934250e-06+6.1145431e-09j,
        -5.5909668e-06-1.7837269e-08j],
       [ 1.7618679e-06-2.1840880e-09j,  9.8950768e-09-8.8133078e-07j,
        -8.6570200e-09