# Preprocessing for ASR

In [32]:
!pip install numpy scipy librosa matplotlib



In [33]:
import numpy as np
import librosa
from IPython.display import Audio

In [34]:
speech, sr = librosa.load("./harvard.wav")
Audio(data=speech, rate=sr)

In [35]:
speech.shape

(404754,)

## Pre-emphasizing

In [36]:
pre_emp = np.append(speech[0], speech[1:] - 0.95 * speech[:-1])

In [37]:
Audio(data=pre_emp, rate=sr)

## Framing
frame size=40ms, overlap between consecutive frame = 20ms

$$
num\_frames = ceil((signal\_length - frame\_size) / frame\_stride) + 1
$$

$$
padded\_length = (num\_frames - 1) * frame\_stride + frame\_size
$$

In [38]:
frame_size = 0.04
frame_stride = 0.02

frame_length, frame_stride = frame_size * sr, frame_stride * sr

num_frames = int(np.ceil(float(np.abs(speech.shape[0] - frame_length)) / frame_stride)) + 1

padded_length = (num_frames - 1) * frame_stride + frame_length

padded_speech = np.append(pre_emp, np.zeros((int(padded_length) - pre_emp.shape[0],)))

In [39]:
padded_speech.shape

(404838,)

In [40]:
frames = np.ascontiguousarray(
    np.lib.stride_tricks.as_strided(padded_speech, shape=(num_frames, int(frame_length)),
                                         strides=(int(frame_stride) * 8, 8))
)

In [41]:
frames

array([[ 5.53311384e-06, -1.66371428e-05,  2.95971131e-06, ...,
        -9.17000580e-06,  1.99504448e-05, -4.67651262e-06],
       [ 1.12119997e-05, -1.13348042e-05, -1.93983760e-06, ...,
        -2.23425400e-06, -8.80175867e-06,  1.01614060e-05],
       [-8.52265111e-06, -1.53098313e-06, -9.91345587e-07, ...,
         2.97862380e-06, -5.30990837e-06, -3.74315005e-06],
       ...,
       [-2.48975120e-05, -1.19332457e-04, -2.05356162e-04, ...,
        -1.10447407e-04, -1.71212247e-04, -1.84787787e-04],
       [ 2.60423345e-04, -4.65009362e-05, -4.56840498e-05, ...,
        -1.39917858e-04,  1.78541231e-04,  3.23059503e-04],
       [ 1.64254452e-04,  2.20812391e-04,  1.89378334e-04, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

## Windowing

Using Hamming window. 

$$
w[n] = 0.54 - 0.46cos(\frac{2 \pi n}{N - 1})
$$

In [42]:
frames *= np.hamming(frames.shape[1])

In [43]:
frames

array([[ 4.42649107e-07, -1.33116606e-06,  2.36915401e-07, ...,
        -7.34029563e-07,  1.59626897e-06, -3.74121009e-07],
       [ 8.96959973e-07, -9.06916936e-07, -1.55277781e-07, ...,
        -1.78844869e-07, -7.04243662e-07,  8.12912476e-07],
       [-6.81812089e-07, -1.22496561e-07, -7.93540358e-08, ...,
         2.38429285e-07, -4.24854788e-07, -2.99452004e-07],
       ...,
       [-1.99180096e-06, -9.54799258e-06, -1.64381024e-05, ...,
        -8.84096080e-06, -1.36989827e-05, -1.47830229e-05],
       [ 2.08338676e-05, -3.72061889e-06, -3.65686172e-06, ...,
        -1.11999759e-05,  1.42853872e-05,  2.58447602e-05],
       [ 1.31403562e-05,  1.76675745e-05,  1.51591284e-05, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

## Fourier Transform and Power Spectrum

In [44]:
NFFT = 512

magnitudes = np.absolute(np.fft.rfft(frames, NFFT))

In [45]:
power = magnitudes ** 2 / NFFT

## Filter Bank

Mel-scale:

$$m = 2595log_{10} (1 + \frac{f}{700})$$

$$f = 700(10 ^ {m/2595} - 1)$$

`nFilter = 40`

$$
H_m(k)=\left\{\begin{array}{cl}
0 & k<f(m-1) \\
\frac{k-f(m-1)}{f(m)-f(m-1)} & f(m-1) \leq k \leq f(m) \\
\frac{f(m+1)-k}{f(m+1)-f(m)} & f(m) \leq k \leq f(m+1) \\
0 & k>f(m+1)
\end{array}\right.
$$

In [46]:
n_filter = 40

low_freq_mel = 0
high_freq_mel = (2595 * np.log10(1 + sr / 2 / 700))

mel_point = np.linspace(low_freq_mel, high_freq_mel, n_filter+2)

hz_point = 700 * (10 ** (mel_point / 2595) - 1)

In [47]:
bin = np.floor((NFFT + 1) * hz_point / sr)

In [48]:
fbank = np.zeros((n_filter, int(np.floor(NFFT / 2 + 1))))

In [49]:
for m in range (1, n_filter+1):
    f_m_minus = int(bin[m - 1])
    f_m = int(bin[m])
    f_m_plus = int(bin[m + 1])

    for k in range(f_m_minus, f_m):
        fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1])
    for k in range(f_m, f_m_plus):
        fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m])

# filter_banks = np.dot(power, fbank.T)
filter_banks = np.einsum("ij,kj->ik", power, fbank)

filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)

In [50]:
filter_banks = np.log10(filter_banks)

## MFCC

cepstral coefficients 2-13 are retained, and the rest are discarded

In [51]:
from scipy.fftpack import dct

In [52]:
mfcc = dct(filter_banks, type=2, axis=-1, norm='ortho')

In [53]:
mfcc = mfcc[:, 1:13]

In [54]:
mfcc_energy = np.zeros((mfcc.shape[0], 13))

In [55]:
mfcc_energy[:, :12] = mfcc

In [56]:
mfcc_energy[:, 12] = np.einsum("ij->i", mfcc ** 2)

### Dynamic MFCC

add the first and the second order derivatives

In [57]:
def delta(coef):
    diff = np.diff(coef, axis=0)

    return np.vstack([np.zeros((1, coef.shape[1])), diff])

In [58]:
mfcc_delta = delta(mfcc_energy)

mfcc_delta

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.16634268,  0.41079667,  1.06011831, ..., -0.0943707 ,
        -0.30376271,  0.28167977],
       [ 0.49178833,  0.09995767, -0.66482632, ...,  0.17045016,
         0.20589399, -7.59774477],
       ...,
       [-0.70122984, -0.1031761 , -0.71688482, ..., -0.28682096,
        -0.27393121,  2.24425812],
       [ 0.04591486, -0.79757704, -0.82582848, ...,  0.21207494,
         0.40663174,  1.44886919],
       [ 0.02770716,  0.53351531,  0.74189197, ..., -0.43473919,
        -0.36958147, -2.95834389]])

In [59]:
mfcc_dd = delta(mfcc_delta)

mfcc_dd

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.16634268,  0.41079667,  1.06011831, ..., -0.0943707 ,
        -0.30376271,  0.28167977],
       [ 0.65813101, -0.310839  , -1.72494463, ...,  0.26482086,
         0.5096567 , -7.87942454],
       ...,
       [-0.70535024, -0.70473455, -1.81912387, ..., -0.27829625,
        -0.25091978,  2.90989225],
       [ 0.7471447 , -0.69440094, -0.10894366, ...,  0.4988959 ,
         0.68056296, -0.79538894],
       [-0.0182077 ,  1.33109234,  1.56772045, ..., -0.64681414,
        -0.77621321, -4.40721308]])

In [60]:
dynamicMFCC = np.concatenate([mfcc_energy, mfcc_delta, mfcc_dd], axis=1)

## Mean Normalization

In [61]:
filter_banks -= np.mean(filter_banks, axis=0) + 1e-8

In [62]:
mfcc -= np.mean(mfcc, axis=0) + 1e-8