In [1]:
from scipy import signal
import numpy as np
import librosa

#### 1. Extract signals from files

In [8]:
import soundfile as sf

# Relative path of the example clip used throughout this notebook.
wav_file_path = 'audioset/train/airport-lisbon-1000-40000-a.wav' 
# sf.read hands back the sample array together with the sampling rate.
wav_data, wav_fs = sf.read(wav_file_path)
# Show the first samples, the array shape and the sampling rate
# (441000 samples at 44100 Hz).
wav_data[:3], wav_data.shape, wav_fs   # 10s, single channel

(array([-0.08497941, -0.09747314, -0.09184897]), (441000,), 44100)

In [17]:
import wavio

# Second decoding path for the same file, used to cross-check the soundfile
# result. The samples in `.data` are cast to float and divided by
# 2**(bits_per_sample - 1) (presumably integer PCM - confirm against wavio),
# then the length-1 channel axis is dropped to obtain a 1-D signal.
wav_info = wavio.read(wav_file_path)
wav_fs2 = wav_info.rate
wav_data2 = (
    wav_info.data.astype(float) / 2 ** (8 * wav_info.sampwidth - 1)
).squeeze(axis=1)
wav_data2[:3], wav_data2.shape, wav_fs2

(array([-0.08497941, -0.09747314, -0.09184897]), (441000,), 44100)

#### 2. Extract features from signals

In [56]:
# Framing parameters: 40 ms analysis windows advanced by 20 ms,
# converted to sample counts using the file's sampling rate.
window_duration = 0.04
window_shift = 0.02
window_length = int(window_duration*wav_fs)
window_overlap = int((window_duration-window_shift)*wav_fs)
# Magnitude spectrogram with a Hamming window and no detrending.
# Use the public `scipy.signal.spectrogram` entry point: the
# `scipy.signal.spectral` submodule path is private and deprecated in
# recent SciPy releases, while the public name works on old and new versions.
# NOTE: `fs` is not passed, so `f` and `t` are in SciPy's default units rather
# than Hz / seconds; only their shapes are inspected here.
f, t, X = signal.spectrogram(
    wav_data, window='hamming',
    nperseg=window_length, noverlap=window_overlap, nfft=window_length,
    detrend=False, return_onesided=True, mode='magnitude')
f.shape, t.shape, X.shape       # X:[f.dim, t.dim(frames)]

((883,), (499,), (883, 499))

In [34]:
# mel filter banks
# Build n_mels mel filters covering 0 Hz up to the Nyquist frequency
# (wav_fs/2); n_fft matches the spectrogram's FFT size so the filter
# columns line up with the spectrogram's frequency bins.
n_mels = 26
melW = librosa.filters.mel(
        sr=wav_fs, n_fft=window_length,
        n_mels=n_mels, fmin=0., fmax=wav_fs/2)
melW.shape                      # melW:[n_mels, f.dim]

(26, 883)

In [40]:
# Peek at the first 20 weights of the first filter as returned by librosa
# (triangular shape, peak well below 1).
melW[0,:20]

array([0.        , 0.00113935, 0.0022787 , 0.00341804, 0.00455739,
       0.00569674, 0.0066656 , 0.00552626, 0.00438691, 0.00324756,
       0.00210821, 0.00096886, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ],
      dtype=float32)

In [44]:
# Peak-normalise each filter (row) in place so that its largest weight is 1.
# keepdims=True keeps the per-row maxima as a column vector, which broadcasts
# across the frequency axis.
melW /= melW.max(axis=-1, keepdims=True)
melW[0, :20]

array([0.        , 0.17092949, 0.34185898, 0.5127885 , 0.68371797,
       0.85464746, 1.        , 0.8290705 , 0.65814096, 0.4872115 ,
       0.316282  , 0.14535251, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ],
      dtype=float32)

In [45]:
# Apply the mel filter bank to every frame of the spectrogram:
# [n_mels, f.dim] @ [f.dim, frames] -> [n_mels, frames]
melX = melW @ X
melX.shape

(26, 499)

In [47]:
# cut stft to 400: keep only the first 400 rows of X, i.e. the first 400
# frequency bins (X is laid out as [f.dim, frames]); all frames are kept.
stftX = X[0:400]

In [75]:
# Alternative route: compute the complex STFT with the same window and framing
# parameters as the spectrogram cell, then take its magnitude.
# NOTE(review): the result has 501 frames versus 499 for X - presumably
# because stft extends/pads the signal at its boundaries by default; confirm
# against the SciPy documentation.
f2,t2,zxx = signal.stft(
                wav_data, window='hamming',
                nperseg=window_length, noverlap=window_overlap, nfft=window_length,
                detrend=False,return_onesided=True)
# Original note: the dimension of the final feature is 200 or 400
# (presumably after cutting the frequency axis - TODO confirm).
stftX2 = np.abs(zxx)                     # the dimension of final feature:200 or 400

In [78]:
# Shape of the magnitude STFT: (frequency bins, frames).
stftX2.shape

(883, 501)

In [77]:
# First 10 frequency bins of the first spectrogram frame, shown for comparison
# with the stft-based values.
stftX[:10,0]

array([0.17631703, 0.39032633, 0.62441752, 0.6423795 , 0.34250796,
       0.10943047, 0.11875023, 0.45517339, 0.36079344, 0.51178501])

In [81]:
# Frame 1 of the stft magnitudes, multiplied by 36, agrees with frame 0 of the
# spectrogram magnitudes to about three significant digits.
# NOTE(review): 36 is an empirical constant; presumably it approximates the
# ratio between the two functions' window normalisations, and the one-frame
# offset comes from stft's boundary handling - confirm before relying on it.
stftX2[:10,1]*36

array([0.17642805, 0.39057211, 0.6248107 , 0.642784  , 0.34272363,
       0.10949938, 0.11882501, 0.45546001, 0.36102062, 0.51210727])

In [82]:
# Mel-filtered magnitudes: one row per mel band, one column per frame.
melX

array([[2.16861219, 2.63984489, 2.91289399, ..., 2.98063838, 3.53350432,
        3.215227  ],
       [1.92295319, 1.97248856, 1.76308336, ..., 1.69178101, 3.11441518,
        3.45216024],
       [2.40597872, 2.32051641, 2.62985598, ..., 1.89839503, 2.05477052,
        3.09736428],
       ...,
       [0.17434594, 0.15688637, 0.35344306, ..., 0.12835427, 0.09454472,
        0.10329606],
       [0.10528685, 0.08091289, 0.22138715, ..., 0.08235832, 0.06762544,
        0.06677239],
       [0.07506163, 0.05373003, 0.18861271, ..., 0.06493378, 0.06393153,
        0.05414197]])

In [83]:
# Same values viewed frame-major: one row per frame, one column per mel band.
melX.T

array([[2.16861219, 1.92295319, 2.40597872, ..., 0.17434594, 0.10528685,
        0.07506163],
       [2.63984489, 1.97248856, 2.32051641, ..., 0.15688637, 0.08091289,
        0.05373003],
       [2.91289399, 1.76308336, 2.62985598, ..., 0.35344306, 0.22138715,
        0.18861271],
       ...,
       [2.98063838, 1.69178101, 1.89839503, ..., 0.12835427, 0.08235832,
        0.06493378],
       [3.53350432, 3.11441518, 2.05477052, ..., 0.09454472, 0.06762544,
        0.06393153],
       [3.215227  , 3.45216024, 3.09736428, ..., 0.10329606, 0.06677239,
        0.05414197]])

In [90]:
from scipy.fftpack import dct

# Number of cepstral coefficients kept per frame.
n_remain = 13
# Frame-major view of the mel energies: [frames, n_mels].
fbanks = np.swapaxes(melX,0,1)
# MFCC = orthonormal DCT-II of the log mel energies along the mel axis,
# truncated to the first n_remain coefficients.
# The energies are floored at machine epsilon before the log: a mel energy of
# exactly 0 (e.g. a digitally silent frame) would otherwise give log10(0) =
# -inf, which the DCT spreads into inf/NaN across that frame's coefficients.
# Energies above the floor are left untouched, so results for non-silent
# audio are unchanged.
mfcc = dct(
    np.log10(np.maximum(fbanks, np.finfo(float).eps)),
    type=2, axis=1, norm='ortho')[:, :n_remain]

In [93]:
# The n_remain coefficients of the first frame.
mfcc[0]

array([-1.26782557,  2.0172743 , -0.06742619,  0.26595663, -0.06648767,
        0.14682215, -0.29875233,  0.22213856, -0.18849633, -0.08055461,
        0.05189451,  0.07799852,  0.00247447])

In [91]:
# Shape is (frames, n_remain).
mfcc.shape

(499, 13)

In [92]:
# extract_mfcc comes from the Extract_feature module (not shown here); it is
# called on the same clip so its output can be compared with the `mfcc`
# array computed step by step in this notebook.
from Extract_feature import extract_mfcc

wav_file_path = 'audioset/train/airport-lisbon-1000-40000-a.wav' 
result = extract_mfcc(wav_file_path)

In [96]:
# First row of the extractor's output; the displayed values are identical to
# mfcc[0] from the manual pipeline.
result[0]

array([-1.26782557,  2.0172743 , -0.06742619,  0.26595663, -0.06648767,
        0.14682215, -0.29875233,  0.22213856, -0.18849633, -0.08055461,
        0.05189451,  0.07799852,  0.00247447])