- **required packages**
    - wave
    - numpy
    - pandas
    - os

- **Class structure**
    - The goal is to convert audio data into features that are meaningful for machine learning
        - **Mel Filter Bank** aims to increase the resolution of information around low frequencies, based on the characteristics of human speech perception
        - **Cepstrum Analysis** extracts formant information (the resonance frequencies of the vocal tract) from the low-order (low-quefrency) cepstral coefficients
        
    - Class | FeatureExtractor
        - function | __init__  
            - self, sample_frequency, frame_length, frame_shift, num_mel_bins, num_ceps, lifter_coef, low_frequency, high_frequency, dither
            - assign attributes
        - function | Herz2Mel
            - self, herz
            - convert a frequency from hertz to the mel scale, which is needed to build the mel filter bank
        - function | MakeMelFilterBank
            - self
            - create the mel filter bank covering the range between the low and high frequencies
        - function | ExtractWindow
            - self, waveform, start_index, num_samples
            - extract a window (one frame of the waveform), add noise (dithering), remove the DC component, compute the log power, pre-emphasize high frequencies, and apply a Hamming window (to smooth the edges)
        - function | ComputeFBANK
            - self, waveform
            - extract features using Mel Filter Bank
        - function | MakeDCTMatrix
            - self
            - build the matrix used for the Discrete Cosine Transform (DCT)
        - function | MakeLifter
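            - self
            - build the lifter, i.e. the weights applied to the cepstral coefficients
        - function | ComputeMFCC
            - self, waveform
            - compute MFCC features by applying the DCT and the lifter to the filter bank features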

In [49]:
class FeatureExtractor():
    """Compute log mel filter bank (FBANK) and MFCC features from a waveform.

    The waveform is cut into overlapping frames. Each frame is dithered,
    DC-removed, pre-emphasized and Hamming-windowed, transformed to a power
    spectrum, and projected onto a bank of triangular mel filters. MFCCs are
    obtained by applying a DCT and a lifter to the log filter bank outputs.
    """

    def __init__(self,
                 sample_frequency=16000,
                 frame_length=25,
                 frame_shift=10,
                 num_mel_bins=23,
                 num_ceps=13,
                 lifter_coef=22,
                 low_frequency=20,
                 high_frequency=8000,
                 dither=1.0):
        """Store the configuration and precompute the constant matrices.

        Args:
            sample_frequency: Sampling rate of the waveform in Hz.
            frame_length: Frame length in milliseconds.
            frame_shift: Frame shift (hop) in milliseconds.
            num_mel_bins: Number of triangular mel filters.
            num_ceps: Number of cepstral coefficients (MFCC dimension).
            lifter_coef: Liftering parameter Q used by ``MakeLifter``.
            low_frequency: Lower edge of the filter bank in Hz.
            high_frequency: Upper edge of the filter bank in Hz.
            dither: Amplitude of the uniform noise added to every frame;
                values <= 0 disable dithering.
        """
        self.sample_frequency = sample_frequency
        # Convert the millisecond settings to sample counts.
        self.frame_size = int(sample_frequency * frame_length * 0.001)
        self.frame_shift = int(sample_frequency * frame_shift * 0.001)
        self.num_mel_bins = num_mel_bins
        self.num_ceps = num_ceps
        self.lifter_coef = lifter_coef
        self.low_frequency = low_frequency
        self.high_frequency = high_frequency
        self.dither_coef = dither

        # FFT size: the smallest power of two that is >= frame_size.
        self.fft_size = 1
        while self.fft_size < self.frame_size:
            self.fft_size *= 2

        self.mel_filter_bank = self.MakeMelFilterBank()
        self.dct_matrix = self.MakeDCTMatrix()
        self.lifter = self.MakeLifter()

    def Herz2Mel(self, herz):
        """Convert a frequency in Hz (scalar or array) to the mel scale."""
        return (1127.0 * np.log(1.0 + herz / 700))

    def MakeMelFilterBank(self):
        """Build the triangular mel filter bank.

        Returns:
            Array of shape ``(num_mel_bins, fft_size // 2 + 1)``; row ``m``
            holds the weights of filter ``m`` for every spectrum bin.
        """
        # Frequency boundaries on the mel scale.
        mel_high_freq = self.Herz2Mel(self.high_frequency)
        mel_low_freq = self.Herz2Mel(self.low_frequency)
        # Edge/center points of the filters, equally spaced on the mel
        # scale; filter m spans points m (left), m+1 (center), m+2 (right).
        mel_points = np.linspace(mel_low_freq,
                                 mel_high_freq,
                                 self.num_mel_bins + 2)

        dim_spectrum = self.fft_size // 2 + 1
        # Mel value of every spectrum bin, computed once for all filters.
        # NOTE(review): bin n of an fft_size-point FFT lies at
        # n * sample_frequency / fft_size; the divisor below is
        # dim_spectrum (= fft_size / 2 + 1) instead of fft_size / 2, which
        # places the bins very slightly low. Kept as written because
        # changing it alters the extracted features -- confirm before fixing.
        bin_freqs = (1.0 * np.arange(dim_spectrum)
                     * self.sample_frequency / 2 / dim_spectrum)
        bin_mels = self.Herz2Mel(bin_freqs)

        mel_filter_bank = np.zeros((self.num_mel_bins, dim_spectrum))
        for m in range(self.num_mel_bins):
            left_mel = mel_points[m]
            center_mel = mel_points[m + 1]
            right_mel = mel_points[m + 2]

            rising = (bin_mels - left_mel) / (center_mel - left_mel)
            falling = (right_mel - bin_mels) / (right_mel - center_mel)
            weights = np.where(bin_mels <= center_mel, rising, falling)
            # Only bins strictly inside the triangle get a weight; all
            # other bins stay at zero.
            inside = (bin_mels > left_mel) & (bin_mels < right_mel)
            mel_filter_bank[m] = np.where(inside, weights, 0.0)
        return mel_filter_bank

    def ExtractWindow(self, waveform, start_index, num_samples):
        """Cut one frame out of the waveform and pre-process it.

        Args:
            waveform: 1-D array of samples.
            start_index: Index of the first sample of the frame.
            num_samples: Unused; kept so existing callers keep working.

        Returns:
            Tuple ``(window, log_power)``: the pre-processed frame of
            ``frame_size`` samples and the natural log of its power, measured
            after DC removal and before pre-emphasis.
        """
        # Copy so the caller's waveform is never modified.
        window = waveform[start_index:start_index + self.frame_size].copy()
        # Dithering: add uniform noise in [-dither_coef, dither_coef).
        if self.dither_coef > 0:
            window = (window
                      + np.random.rand(self.frame_size)
                      * (2 * self.dither_coef)
                      - self.dither_coef)
        # Remove the DC component.
        window = window - np.mean(window)
        # Log power, floored so that the log is always finite.
        power = np.sum(window ** 2)
        if power < 1E-10:
            power = 1E-10
        log_power = np.log(power)

        # Pre-emphasis: y[n] = x[n] - 0.97 * x[n-1]. The convolution leaves
        # y[0] = x[0], so the first sample is corrected separately as if
        # x[-1] were equal to x[0].
        window = np.convolve(window, np.array([1.0, -0.97]), mode='same')
        window[0] -= 0.97 * window[0]

        # Hamming window to smooth the frame edges.
        window *= np.hamming(self.frame_size)

        return window, log_power

    def ComputeFBANK(self, waveform):
        """Compute log mel filter bank features for every frame.

        Args:
            waveform: 1-D array of samples.

        Returns:
            Tuple ``(fbank_features, log_power)`` with shapes
            ``(num_frames, num_mel_bins)`` and ``(num_frames,)``. Both are
            empty (``num_frames == 0``) when the waveform is shorter than one
            frame.
        """
        num_samples = np.size(waveform)
        # Only complete frames are used; never let the count go negative.
        num_frames = max(
            0, (num_samples - self.frame_size) // self.frame_shift + 1)
        fbank_features = np.zeros((num_frames, self.num_mel_bins))
        log_power = np.zeros(num_frames)

        for frame in range(num_frames):
            start_index = frame * self.frame_shift
            window, log_pow = self.ExtractWindow(
                waveform, start_index, num_samples)
            # Power spectrum of the non-negative frequency bins.
            spectrum = np.fft.fft(window, n=self.fft_size)
            spectrum = spectrum[:self.fft_size // 2 + 1]
            spectrum = np.abs(spectrum) ** 2

            # Apply the mel filters and floor the result so that the log
            # below stays finite.
            fbank = np.dot(spectrum, self.mel_filter_bank.T)
            fbank[fbank < 0.1] = 0.1

            fbank_features[frame] = np.log(fbank)
            log_power[frame] = log_pow

        return fbank_features, log_power

    def MakeDCTMatrix(self):
        """Build the orthonormal DCT-II matrix, shape ``(num_ceps, num_mel_bins)``."""
        N = self.num_mel_bins
        dct_matrix = np.zeros((self.num_ceps, self.num_mel_bins))
        for k in range(self.num_ceps):
            if k == 0:
                dct_matrix[k] = np.ones(self.num_mel_bins) * 1.0 / np.sqrt(N)
            else:
                dct_matrix[k] = (np.sqrt(2 / N)
                                 * np.cos(((2.0 * np.arange(N) + 1)
                                           * k * np.pi) / (2 * N)))

        return dct_matrix

    def MakeLifter(self):
        """Build the lifter weights ``1 + 0.5 * Q * sin(pi * l / Q)``.

        ``Q`` is ``lifter_coef`` and ``l`` runs over ``0 .. num_ceps - 1``.
        """
        Q = self.lifter_coef
        l = np.arange(self.num_ceps)
        lifter = 1.0 + 0.5 * Q * np.sin(np.pi * l / Q)

        return lifter

    def ComputeMFCC(self, waveform):
        """Compute MFCC features for every frame of the waveform.

        Args:
            waveform: 1-D array of samples.

        Returns:
            Array of shape ``(num_frames, num_ceps)``. Column 0 holds the log
            power of each frame instead of the 0th cepstral coefficient.
        """
        fbank_features, log_power = self.ComputeFBANK(waveform)
        # Discrete cosine transform of the log filter bank outputs.
        mfcc = np.dot(fbank_features, self.dct_matrix.T)
        # Liftering.
        mfcc *= self.lifter
        # Replace the 0th coefficient with the frame log power.
        mfcc[:, 0] = log_power

        return mfcc