code 3.4 01_compute_fbank.py: 학습/개발/평가 데이터에 대한 로그 Mel 필터 뱅크 특성 계산하기

In [None]:
class FeatureExtractor():
    """Extract log Mel filter bank (FBANK) features from a waveform.

    Each frame is optionally dithered, mean-subtracted, pre-emphasized and
    Hamming-windowed; its power spectrum is then projected onto triangular
    filters whose edges are evenly spaced on the mel scale.

    Args:
        sample_frequency: Sampling rate of the input waveform in Hz.
        frame_length: Frame length in milliseconds.
        frame_shift: Frame shift in milliseconds.
        num_mel_bins: Number of mel filters (the FBANK feature dimension).
        num_ceps: Stored on the instance; not used by any method here.
        lifter_coef: Stored on the instance; not used by any method here.
        low_frequency: Lower edge of the filter bank in Hz.
        high_frequency: Upper edge of the filter bank in Hz.
        dither: Amplitude of the uniform noise added to every frame.
            A value <= 0 disables dithering.
    """

    def __init__(self, sample_frequency=16000,
                 frame_length=25, frame_shift=10,
                 num_mel_bins=23, num_ceps=13,
                 lifter_coef=22, low_frequency=20,
                 high_frequency=8000, dither=1.0):
        self.sample_freq = sample_frequency
        # Frame length and shift converted from milliseconds to samples.
        self.frame_size = int(sample_frequency * frame_length * 0.001)
        self.frame_shift = int(sample_frequency * frame_shift * 0.001)
        self.num_mel_bins = num_mel_bins
        self.num_ceps = num_ceps
        self.lifter_coef = lifter_coef
        self.low_frequency = low_frequency
        self.high_frequency = high_frequency
        self.dither_coef = dither

        # FFT size: smallest power of two that is >= the frame size.
        self.fft_size = 1
        while self.fft_size < self.frame_size:
            self.fft_size *= 2

        self.mel_filter_bank = self.MakeMelFilterBank()

    def Herz2Mel(self, herz):
        """Convert a frequency in Hz (scalar or array) to the mel scale."""
        return (1127.0 * np.log(1.0 + herz / 700))

    def MakeMelFilterBank(self):
        """Build the triangular mel filter bank.

        Returns:
            Array of shape (num_mel_bins, fft_size / 2 + 1) holding the
            weight of every spectrum bin for every filter.
        """
        mel_high_freq = self.Herz2Mel(self.high_frequency)
        mel_low_freq = self.Herz2Mel(self.low_frequency)
        # num_mel_bins filters need num_mel_bins + 2 edge points
        # (left edge, center, right edge; neighbours share points).
        mel_points = np.linspace(mel_low_freq,
                                 mel_high_freq,
                                 self.num_mel_bins + 2)
        dim_spectrum = int(self.fft_size / 2) + 1

        # Mel value of every spectrum bin, computed once for all filters.
        # Bin n is mapped to n * (sample_freq / 2) / dim_spectrum Hz.
        bin_freqs = 1.0 * np.arange(dim_spectrum) * self.sample_freq / 2 / dim_spectrum
        bin_mels = self.Herz2Mel(bin_freqs)

        mel_filter_bank = np.zeros((self.num_mel_bins, dim_spectrum))
        for m in range(self.num_mel_bins):
            left_mel = mel_points[m]
            center_mel = mel_points[m + 1]
            right_mel = mel_points[m + 2]

            # Rising slope up to the center, falling slope after it.
            rising = (bin_mels - left_mel) / (center_mel - left_mel)
            falling = (right_mel - bin_mels) / (right_mel - center_mel)
            weights = np.where(bin_mels <= center_mel, rising, falling)

            # Only bins strictly inside (left, right) get a weight.
            inside = (bin_mels > left_mel) & (bin_mels < right_mel)
            mel_filter_bank[m, inside] = weights[inside]

        return mel_filter_bank

    def ExtractWindow(self, waveform, start_index, num_samples):
        """Cut one frame out of the waveform and pre-process it.

        Args:
            waveform: 1-D numpy array of samples.
            start_index: Index of the first sample of the frame.
            num_samples: Total number of samples in the waveform (unused).

        Returns:
            Tuple (window, log_power): the pre-emphasized, Hamming-windowed
            frame and the log power of the mean-subtracted frame.
        """
        # astype() returns a float copy, so the caller's waveform is untouched.
        window = waveform[start_index:start_index + self.frame_size].astype(np.float64)

        # Dithering is optional; everything below runs for every frame.
        if self.dither_coef > 0:
            window = window + np.random.rand(self.frame_size) * (2 * self.dither_coef) - self.dither_coef

        # Remove the DC component.
        window = window - np.mean(window)

        # Floor the power so the logarithm never diverges.
        power = np.sum(window ** 2)
        if power < 1E-10:
            power = 1E-10
        log_power = np.log(power)

        # Pre-emphasis: y[n] = x[n] - 0.97 * x[n-1].
        window = np.convolve(window, np.array([1.0, -0.97]),
                             mode='same')
        # The first sample has no predecessor; scale it like the others.
        window[0] -= 0.97 * window[0]

        window *= np.hamming(self.frame_size)

        return window, log_power

    def ComputeFBANK(self, waveform):
        """Compute log Mel filter bank features for a whole waveform.

        Args:
            waveform: 1-D numpy array of samples.

        Returns:
            Tuple (fbank_features, log_power) with shapes
            (num_frames, num_mel_bins) and (num_frames,). Both are empty
            when the waveform is shorter than one frame.
        """
        num_samples = np.size(waveform)
        # Clamp to zero so a too-short waveform yields empty outputs
        # instead of a negative array dimension.
        num_frames = max(0, (num_samples - self.frame_size) // self.frame_shift + 1)
        fbank_features = np.zeros((num_frames, self.num_mel_bins))
        log_power = np.zeros(num_frames)

        for frame in range(num_frames):
            start_index = frame * self.frame_shift
            window, log_pow = self.ExtractWindow(waveform, start_index, num_samples)

            # Power spectrum; keep only the non-redundant half.
            spectrum = np.fft.fft(window, n=self.fft_size)
            spectrum = spectrum[:int(self.fft_size / 2) + 1]
            spectrum = np.abs(spectrum) ** 2

            fbank = np.dot(spectrum, self.mel_filter_bank.T)

            # Floor before the logarithm.
            fbank[fbank < 0.1] = 0.1
            fbank_features[frame] = np.log(fbank)
            log_power[frame] = log_pow

        return fbank_features, log_power
    
if __name__ == '__main__':
    # wav.scp list files (one "<utterance_id> <wav_path>" entry per line)
    # and the directory that receives the features of each data set.
    train_small_wav_scp = '../data/label/train_small/wav.scp'
    train_small_out_dir = './fbank/train_small'
    train_large_wav_scp = '../data/label/train_large/wav.scp'
    train_large_out_dir = './fbank/train_large'
    dev_wav_scp = '../data/label/dev/wav.scp'
    dev_out_dir = './fbank/dev'
    test_wav_scp = '../data/label/test/wav.scp'
    test_out_dir = './fbank/test'

    # Feature extraction settings.
    sample_frequency = 16000            # Hz
    frame_length = 25                   # ms
    frame_shift = 10                    # ms
    low_frequency = 20                  # Hz
    high_frequency = sample_frequency / 2
    num_mel_bins = 40
    dither = 1.0

    feat_extractor = FeatureExtractor(sample_frequency=sample_frequency,
                                      frame_length=frame_length,
                                      frame_shift=frame_shift,
                                      num_mel_bins=num_mel_bins,
                                      low_frequency=low_frequency,
                                      high_frequency=high_frequency,
                                      dither=dither)

    wav_scp_list = [train_small_wav_scp, train_large_wav_scp,
                    dev_wav_scp, test_wav_scp]
    out_dir_list = [train_small_out_dir, train_large_out_dir,
                    dev_out_dir, test_out_dir]

    for (wav_scp, out_dir) in zip(wav_scp_list, out_dir_list):
        print('Input wav_scp: %s' % (wav_scp))
        print('Output out_dir: %s' % (out_dir))

        # feats.scp and the .bin files are written into out_dir, so it has
        # to exist before anything is opened for writing.
        os.makedirs(out_dir, exist_ok=True)
        feat_scp = os.path.join(out_dir, 'feats.scp')

        with open(wav_scp, mode='r') as file_wav, open(feat_scp, mode='w') as file_feat:
            for line in file_wav:
                parts = line.split()
                # Skip blank or malformed lines instead of raising IndexError.
                if len(parts) < 2:
                    continue
                utterance_id = parts[0]
                wav_path = parts[1]

                with wave.open(wav_path) as wav:
                    num_samples = wav.getnframes()
                    waveform = wav.readframes(num_samples)

                # NOTE(review): the bytes are interpreted as int16 samples;
                # this assumes 16-bit mono audio - confirm against the data.
                waveform = np.frombuffer(waveform, dtype=np.int16)
                fbank, _ = feat_extractor.ComputeFBANK(waveform)

                (num_frames, num_dims) = np.shape(fbank)
                # Output file: <out_dir>/<wav basename>.bin, raw float32.
                out_file = os.path.splitext(os.path.basename(wav_path))[0]
                out_file = os.path.join(os.path.abspath(out_dir),
                                        out_file + '.bin')
                fbank = fbank.astype(np.float32)
                fbank.tofile(out_file)

                # feats.scp entry: id, feature path, frame count, dimension.
                file_feat.write('%s %s %d %d\n' %
                                (utterance_id, out_file,
                                 num_frames, num_dims))

In [None]:
class FeatureExtractor():
    """Extract log Mel filter bank (FBANK) features from a waveform.

    Each frame is optionally dithered, mean-subtracted, pre-emphasized and
    Hamming-windowed; its power spectrum is then projected onto triangular
    filters whose edges are evenly spaced on the mel scale.

    Args:
        sample_frequency: Sampling rate of the input waveform in Hz.
        frame_length: Frame length in milliseconds.
        frame_shift: Frame shift in milliseconds.
        num_mel_bins: Number of mel filters (the FBANK feature dimension).
        num_ceps: Stored on the instance; not used by any method here.
        lifter_coef: Stored on the instance; not used by any method here.
        low_frequency: Lower edge of the filter bank in Hz.
        high_frequency: Upper edge of the filter bank in Hz.
        dither: Amplitude of the uniform noise added to every frame.
            A value <= 0 disables dithering.
    """

    def __init__(self, sample_frequency=16000,
                 frame_length=25, frame_shift=10,
                 num_mel_bins=23, num_ceps=13,
                 lifter_coef=22, low_frequency=20, high_frequency=8000, dither=1.0):
        self.sample_freq = sample_frequency
        # Frame length and shift converted from milliseconds to samples.
        self.frame_size = int(sample_frequency * frame_length * 0.001)
        self.frame_shift = int(sample_frequency * frame_shift * 0.001)
        self.num_mel_bins = num_mel_bins
        self.num_ceps = num_ceps
        self.lifter_coef = lifter_coef
        self.low_frequency = low_frequency
        self.high_frequency = high_frequency
        self.dither_coef = dither

        # FFT size: smallest power of two that is >= the frame size.
        self.fft_size = 1
        while self.fft_size < self.frame_size:
            self.fft_size *= 2

        self.mel_filter_bank = self.MakeMelFilterBank()

    def Herz2Mel(self, herz):
        """Convert a frequency in Hz (scalar or array) to the mel scale."""
        return (1127.0 * np.log(1.0 + herz / 700))

    def MakeMelFilterBank(self):
        """Build the triangular mel filter bank.

        Returns:
            Array of shape (num_mel_bins, fft_size / 2 + 1) holding the
            weight of every spectrum bin for every filter.
        """
        mel_high_freq = self.Herz2Mel(self.high_frequency)
        mel_low_freq = self.Herz2Mel(self.low_frequency)
        # num_mel_bins filters need num_mel_bins + 2 edge points
        # (left edge, center, right edge; neighbours share points).
        mel_points = np.linspace(mel_low_freq,
                                 mel_high_freq,
                                 self.num_mel_bins + 2)
        dim_spectrum = int(self.fft_size / 2) + 1

        # Mel value of every spectrum bin, computed once for all filters.
        # Bin n is mapped to n * (sample_freq / 2) / dim_spectrum Hz.
        bin_freqs = 1.0 * np.arange(dim_spectrum) * self.sample_freq / 2 / dim_spectrum
        bin_mels = self.Herz2Mel(bin_freqs)

        mel_filter_bank = np.zeros((self.num_mel_bins, dim_spectrum))
        for m in range(self.num_mel_bins):
            left_mel = mel_points[m]
            center_mel = mel_points[m + 1]
            right_mel = mel_points[m + 2]

            # The rising slope is normalized by its own width
            # (center - left), the falling slope by (right - center).
            rising = (bin_mels - left_mel) / (center_mel - left_mel)
            falling = (right_mel - bin_mels) / (right_mel - center_mel)
            weights = np.where(bin_mels <= center_mel, rising, falling)

            # Only bins strictly inside (left, right) get a weight.
            inside = (bin_mels > left_mel) & (bin_mels < right_mel)
            mel_filter_bank[m, inside] = weights[inside]

        return mel_filter_bank

    def ExtractWindow(self, waveform, start_index, num_samples):
        """Cut one frame out of the waveform and pre-process it.

        Args:
            waveform: 1-D numpy array of samples.
            start_index: Index of the first sample of the frame.
            num_samples: Total number of samples in the waveform (unused).

        Returns:
            Tuple (window, log_power): the pre-emphasized, Hamming-windowed
            frame and the log power of the mean-subtracted frame.
        """
        # astype() returns a float copy, so the caller's waveform is untouched.
        window = waveform[start_index:start_index + self.frame_size].astype(np.float64)

        # Dithering is optional; everything below runs for every frame.
        if self.dither_coef > 0:
            window = window + np.random.rand(self.frame_size) * (2 * self.dither_coef) - self.dither_coef

        # Remove the DC component.
        window = window - np.mean(window)

        # Floor the power so the logarithm never diverges.
        power = np.sum(window ** 2)
        if power < 1E-10:
            power = 1E-10
        log_power = np.log(power)

        # Pre-emphasis: y[n] = x[n] - 0.97 * x[n-1].
        window = np.convolve(window, np.array([1.0, -0.97]), mode='same')
        # The first sample has no predecessor; scale it like the others.
        window[0] -= 0.97 * window[0]

        window *= np.hamming(self.frame_size)

        return window, log_power

    def ComputeFBANK(self, waveform):
        """Compute log Mel filter bank features for a whole waveform.

        Args:
            waveform: 1-D numpy array of samples.

        Returns:
            Tuple (fbank_features, log_power) with shapes
            (num_frames, num_mel_bins) and (num_frames,). Both are empty
            when the waveform is shorter than one frame.
        """
        num_samples = np.size(waveform)
        # Clamp to zero so a too-short waveform yields empty outputs
        # instead of a negative array dimension.
        num_frames = max(0, (num_samples - self.frame_size) // self.frame_shift + 1)
        fbank_features = np.zeros((num_frames, self.num_mel_bins))
        log_power = np.zeros(num_frames)

        for frame in range(num_frames):
            start_index = frame * self.frame_shift
            window, log_pow = self.ExtractWindow(waveform, start_index, num_samples)

            # Power spectrum; keep only the non-redundant half.
            spectrum = np.fft.fft(window, n=self.fft_size)
            spectrum = spectrum[:int(self.fft_size / 2) + 1]
            spectrum = np.abs(spectrum) ** 2

            fbank = np.dot(spectrum, self.mel_filter_bank.T)
            # Floor before the logarithm.
            fbank[fbank < 0.1] = 0.1
            fbank_features[frame] = np.log(fbank)
            log_power[frame] = log_pow

        return fbank_features, log_power
    
if __name__ == '__main__':
    # wav.scp list files (one "<utterance_id> <wav_path>" entry per line)
    # and the directory that receives the features of each data set.
    train_small_wav_scp = '../data/label/train_small/wav.scp'
    train_small_out_dir = './fbank/train_small'
    train_large_wav_scp = '../data/label/train_large/wav.scp'
    train_large_out_dir = './fbank/train_large'
    dev_wav_scp = '../data/label/dev/wav.scp'
    dev_out_dir = './fbank/dev'
    test_wav_scp = '../data/label/test/wav.scp'
    test_out_dir = './fbank/test'

    # Feature extraction settings.
    sample_frequency = 16000            # Hz
    frame_length = 25                   # ms
    frame_shift = 10                    # ms
    low_frequency = 20                  # Hz
    high_frequency = sample_frequency / 2
    num_mel_bins = 40
    dither = 1.0

    feat_extractor = FeatureExtractor(sample_frequency=sample_frequency,
                                      frame_length=frame_length,
                                      frame_shift=frame_shift,
                                      num_mel_bins=num_mel_bins,
                                      low_frequency=low_frequency,
                                      high_frequency=high_frequency,
                                      dither=dither)

    wav_scp_list = [train_small_wav_scp,
                    train_large_wav_scp,
                    dev_wav_scp,
                    test_wav_scp]
    out_dir_list = [train_small_out_dir,
                    train_large_out_dir,
                    dev_out_dir,
                    test_out_dir]

    for (wav_scp, out_dir) in zip(wav_scp_list, out_dir_list):
        print('Input wav scp: %s' % (wav_scp))
        print('Output directory: %s' % (out_dir))

        # feats.scp and the .bin files are written into out_dir, so it has
        # to exist before anything is opened for writing.
        os.makedirs(out_dir, exist_ok=True)
        feat_scp = os.path.join(out_dir, 'feats.scp')

        # The feature list is written to feats.scp inside out_dir
        # (out_dir itself is a directory and cannot be opened as a file).
        with open(wav_scp, mode='r') as file_wav, open(feat_scp, mode='w') as file_feat:
            for line in file_wav:
                parts = line.split()
                # Skip blank or malformed lines instead of raising IndexError.
                if len(parts) < 2:
                    continue
                utterance_id = parts[0]
                wav_path = parts[1]

                with wave.open(wav_path) as wav:
                    num_samples = wav.getnframes()
                    waveform = wav.readframes(num_samples)

                # NOTE(review): the bytes are interpreted as int16 samples;
                # this assumes 16-bit mono audio - confirm against the data.
                waveform = np.frombuffer(waveform, dtype=np.int16)
                fbank, _ = feat_extractor.ComputeFBANK(waveform)

                (num_frames, num_dims) = np.shape(fbank)
                # Output file: <out_dir>/<wav basename>.bin, raw float32.
                out_file = os.path.splitext(os.path.basename(wav_path))[0]
                out_file = os.path.join(os.path.abspath(out_dir), out_file + '.bin')
                fbank = fbank.astype(np.float32)
                fbank.tofile(out_file)

                # feats.scp entry: id, feature path, frame count, dimension.
                file_feat.write('%s %s %d %d\n' % (utterance_id, out_file, num_frames, num_dims))
                
                 