3-6+01.compute_mfcc.py: 학습/개발/평가 데이터에 대한 MFCC 특징 계산하기


In [1]:
class FeatureExtractor():
    """Compute log mel filter-bank (FBANK) and MFCC features from a waveform.

    The class was previously split over two ``class FeatureExtractor()``
    definitions; the second one replaced the first and dropped ``__init__``.
    Both halves are merged here so that a single class owns every method.

    Args:
        sample_frequency: Sampling rate of the input waveform in Hz.
        frame_length: Analysis frame length in milliseconds.
        frame_shift: Hop between consecutive frames in milliseconds.
        num_mel_bins: Number of triangular mel filters.
        num_ceps: Number of cepstral coefficients kept per frame.
        lifter_coef: Liftering parameter ``Q``; ``0`` disables liftering.
        low_frequency: Lower edge of the mel filter bank in Hz.
        high_frequency: Upper edge of the mel filter bank in Hz.
        dither: Amplitude of the uniform noise added to each frame;
            ``0`` disables dithering.

    Raises:
        ValueError: If the frame length or shift is shorter than one sample.
    """

    def __init__(self,
                 sample_frequency=16000,
                 frame_length=25,
                 frame_shift=10,
                 num_mel_bins=23,
                 num_ceps=13,
                 lifter_coef=22,
                 low_frequency=20,
                 high_frequency=8000,
                 dither=1.0):
        self.sample_frequency = sample_frequency
        # Convert milliseconds to a whole number of samples.
        self.frame_size = int(sample_frequency * frame_length * 0.001)
        self.frame_shift = int(sample_frequency * frame_shift * 0.001)
        if self.frame_size <= 0 or self.frame_shift <= 0:
            raise ValueError(
                'frame_length and frame_shift must span at least one sample')
        self.num_mel_bins = num_mel_bins
        self.num_ceps = num_ceps
        self.lifter_coef = lifter_coef
        self.low_frequency = low_frequency
        self.high_frequency = high_frequency
        self.dither_coef = dither

        # Smallest power of two that is not shorter than one frame.
        self.fft_size = 1
        while self.fft_size < self.frame_size:
            self.fft_size *= 2

        # Tables that depend only on the configuration are built once.
        self.mel_filter_bank = self.MakeMelFilterBank()
        self.dct_matrix = self.MakeDCTMatrix()
        self.lifter = self.MakeLifter()

    def Herz2Mel(self, herz):
        """Convert a frequency in Hz (scalar or array) to the mel scale."""
        return (1127.0 * np.log(1.0 + herz / 700))

    def MakeMelFilterBank(self):
        """Build the triangular mel filter bank.

        Returns:
            Array of shape ``(num_mel_bins, fft_size // 2 + 1)`` holding the
            weight of every spectrum bin for every filter.
        """
        mel_low_freq = self.Herz2Mel(self.low_frequency)
        mel_high_freq = self.Herz2Mel(self.high_frequency)
        # num_mel_bins triangles need num_mel_bins + 2 equally spaced edges.
        mel_points = np.linspace(mel_low_freq,
                                 mel_high_freq,
                                 self.num_mel_bins + 2)
        dim_spectrum = self.fft_size // 2 + 1

        # NOTE(review): the bin frequency divides by dim_spectrum rather than
        # fft_size / 2; kept as in the original -- confirm this is intended.
        bin_freqs = (1.0 * np.arange(dim_spectrum)
                     * self.sample_frequency / 2 / dim_spectrum)
        bin_mels = self.Herz2Mel(bin_freqs)

        mel_filter_bank = np.zeros((self.num_mel_bins, dim_spectrum))
        for m in range(self.num_mel_bins):
            left_mel, center_mel, right_mel = mel_points[m:m + 3]
            rising = (bin_mels - left_mel) / (center_mel - left_mel)
            falling = (right_mel - bin_mels) / (right_mel - center_mel)
            weights = np.where(bin_mels <= center_mel, rising, falling)
            # Only bins strictly inside the triangle get a weight.
            inside = (bin_mels > left_mel) & (bin_mels < right_mel)
            mel_filter_bank[m, inside] = weights[inside]
        return mel_filter_bank

    def ExtractWindow(self, waveform, start_index, num_samples):
        """Cut one frame out of ``waveform`` and pre-process it.

        Args:
            waveform: 1-D sample sequence.
            start_index: Index of the first sample of the frame.
            num_samples: Unused; kept so existing callers keep working.

        Returns:
            Tuple ``(window, log_power)``: the pre-processed frame as a float
            array and the log of its power, measured after mean removal and
            before pre-emphasis.
        """
        # Work on a float copy so the caller's (possibly int16) buffer is
        # neither modified nor overflowed.
        window = np.array(waveform[start_index:start_index + self.frame_size],
                          dtype=np.float64)
        if self.dither_coef > 0:
            # Uniform noise in [-dither_coef, dither_coef).
            window = (window
                      + np.random.rand(self.frame_size) * (2 * self.dither_coef)
                      - self.dither_coef)

        # Remove the DC offset.
        window = window - np.mean(window)
        # Floor the power so the logarithm stays finite on silent frames.
        power = max(np.sum(window ** 2), 1E-10)
        log_power = np.log(power)

        # Pre-emphasis y[n] = x[n] - 0.97 * x[n-1]; the first sample has no
        # predecessor and is scaled by (1 - 0.97) instead.
        window = np.convolve(window, np.array([1.0, -0.97]), mode='same')
        window[0] -= 0.97 * window[0]

        # Taper the frame edges before the FFT to limit spectral leakage.
        window *= np.hamming(self.frame_size)

        return window, log_power

    def ComputeFBANK(self, waveform):
        """Compute log mel filter-bank features for every frame.

        Args:
            waveform: 1-D sample sequence.

        Returns:
            Tuple ``(fbank_features, log_power)`` of shapes
            ``(num_frames, num_mel_bins)`` and ``(num_frames,)``. Both are
            empty when the waveform is shorter than one frame.
        """
        waveform = np.asarray(waveform)
        num_samples = np.size(waveform)
        if num_samples < self.frame_size:
            return np.zeros((0, self.num_mel_bins)), np.zeros(0)

        num_frames = (num_samples - self.frame_size) // self.frame_shift + 1
        fbank_features = np.zeros((num_frames, self.num_mel_bins))
        log_power = np.zeros(num_frames)

        for frame in range(num_frames):
            start_index = frame * self.frame_shift
            window, log_pow = self.ExtractWindow(waveform, start_index,
                                                 num_samples)
            # Power spectrum of the non-negative frequencies only.
            spectrum = np.abs(np.fft.rfft(window, n=self.fft_size)) ** 2

            fbank = np.dot(spectrum, self.mel_filter_bank.T)
            # Floor the filter outputs before taking the logarithm.
            fbank = np.maximum(fbank, 0.1)

            # Store the results; the original discarded them and returned
            # all-zero arrays.
            fbank_features[frame] = np.log(fbank)
            log_power[frame] = log_pow

        return fbank_features, log_power

    def MakeDCTMatrix(self):
        """Build the DCT matrix of shape ``(num_ceps, num_mel_bins)``."""
        N = self.num_mel_bins
        dct_matrix = np.zeros((self.num_ceps, self.num_mel_bins))
        for k in range(self.num_ceps):
            if k == 0:
                dct_matrix[k] = np.ones(self.num_mel_bins) * 1.0 / np.sqrt(N)
            else:
                dct_matrix[k] = np.sqrt(2 / N) * np.cos(
                    ((2.0 * np.arange(N) + 1) * k * np.pi) / (2 * N))

        return dct_matrix

    def MakeLifter(self):
        """Build the liftering weights, one per cepstral coefficient."""
        Q = self.lifter_coef
        if Q == 0:
            # Avoid dividing by zero: no liftering, every weight is 1.
            return np.ones(self.num_ceps)
        ceps_index = np.arange(self.num_ceps)
        lifter = 1.0 + 0.5 * Q * np.sin(np.pi * ceps_index / Q)

        return lifter

    def ComputeMFCC(self, waveform):
        """Compute MFCC features for every frame of ``waveform``.

        Returns:
            Array of shape ``(num_frames, num_ceps)``; column 0 holds the
            frame log power instead of the 0th cepstral coefficient.
        """
        fbank, log_power = self.ComputeFBANK(waveform)
        mfcc = np.dot(fbank, self.dct_matrix.T)
        mfcc *= self.lifter
        mfcc[:, 0] = log_power

        return mfcc
    
if __name__ == '__main__':
    # Compute MFCC features for every utterance listed in a wav.scp-style
    # file, write one float32 .bin file per utterance and index them in
    # feats.scp.

    # Imported here because the file has no import section ahead of this
    # script block.
    import os
    import sys
    import wave

    import numpy as np

    # Input list ("<utterance_id> <wav_path>" per line) and output directory.
    test_wav_scp = './exp/data/test/wav/scp'
    test_out_dir = './exp/data/test/mfcc'

    sample_frequency = 16000      # Hz
    frame_length = 25             # ms
    frame_shift = 10              # ms
    low_frequency = 20            # Hz
    high_frequency = sample_frequency / 2
    num_mel_bins = 23
    num_ceps = 13
    dither = 1.0

    # Dithering draws random numbers; seed so repeated runs match.
    np.random.seed(seed=0)

    feat_extractor = FeatureExtractor(sample_frequency=sample_frequency,
                                      frame_length=frame_length,
                                      frame_shift=frame_shift,
                                      num_mel_bins=num_mel_bins,
                                      num_ceps=num_ceps,
                                      low_frequency=low_frequency,
                                      high_frequency=high_frequency,
                                      dither=dither)
    wav_scp_list = [test_wav_scp]
    out_dir_list = [test_out_dir]

    for (wav_scp, out_dir) in zip(wav_scp_list, out_dir_list):
        print('Input wav_scp: %s' % (wav_scp))
        print('Output directory: %s' % (out_dir))

        feat_scp = os.path.join(out_dir, 'feats.scp')
        os.makedirs(out_dir, exist_ok=True)

        with open(wav_scp, mode='r') as file_wav, \
                open(feat_scp, mode='w') as file_feat:
            for line in file_wav:
                parts = line.split()
                if not parts:
                    # Skip blank lines instead of failing on parts[0].
                    continue
                utterance_id = parts[0]
                wav_path = parts[1]

                with wave.open(wav_path) as wav:
                    if wav.getframerate() != sample_frequency:
                        sys.stderr.write(
                            'The expected sampling rate is 16000.\n')
                        sys.exit(1)
                    if wav.getnchannels() != 1:
                        sys.stderr.write(
                            'This program supports monaural wav file only.\n')
                        sys.exit(1)

                    num_samples = wav.getnframes()
                    waveform = wav.readframes(num_samples)
                    # Samples are interpreted as 16-bit integers.
                    waveform = np.frombuffer(waveform, dtype=np.int16)
                    mfcc = feat_extractor.ComputeMFCC(waveform)

                (num_frames, num_dims) = np.shape(mfcc)

                # <out_dir>/<wav file name without extension>.bin
                out_file = os.path.splitext(os.path.basename(wav_path))[0]
                out_file = os.path.join(os.path.abspath(out_dir),
                                        out_file + '.bin')

                mfcc = mfcc.astype(np.float32)
                mfcc.tofile(out_file)
                # Index line: id, feature file, frame count, dimension.
                file_feat.write('%s %s %d %d\n' %
                                (utterance_id, out_file, num_frames, num_dims))
                    

250319
----------------------------------

In [None]:
import numpy as np

def DCTmatrix(self):
    """Build the DCT matrix that maps filter-bank outputs to cepstra.

    Reads ``self.num_ceps`` and ``self.num_mel_bins``.

    Returns:
        Array of shape ``(num_ceps, num_mel_bins)``. Row 0 is the constant
        ``1 / sqrt(N)``; row ``k > 0`` is
        ``sqrt(2 / N) * cos((2n + 1) * k * pi / (2N))`` for ``n = 0..N-1``.
    """
    N = self.num_mel_bins
    # Column vector of cepstral indices against a row vector of filter
    # indices: broadcasting fills the whole matrix in one expression.
    k = np.arange(self.num_ceps)[:, np.newaxis]
    n = np.arange(N)[np.newaxis, :]
    dct_matrix = np.sqrt(2 / N) * np.cos((2.0 * n + 1) * k * np.pi / (2 * N))

    if self.num_ceps > 0:
        # The 0th basis vector uses a different normalisation.
        dct_matrix[0] = 1.0 / np.sqrt(N)

    return dct_matrix

def MakeLifter(self):
    """Build the liftering weights applied to the cepstral coefficients.

    Reads ``self.lifter_coef`` (``Q``) and ``self.num_ceps``.

    Returns:
        Array of length ``num_ceps`` with
        ``1 + 0.5 * Q * sin(pi * i / Q)`` at index ``i``; all ones when
        ``Q`` is 0.
    """
    Q = self.lifter_coef
    if Q == 0:
        # Avoid dividing by zero: no liftering, every weight is 1.
        return np.ones(self.num_ceps)
    # The original bound this array to ``I`` but read ``l`` (NameError).
    ceps_index = np.arange(self.num_ceps)
    lifter = 1.0 + 0.5 * Q * np.sin(np.pi * ceps_index / Q)

    return lifter

def ComputeMFCC(self, waveform):
    """Compute MFCC features for ``waveform``.

    Uses ``self.ComputeFBANK``, ``self.dct_matrix`` and ``self.lifter``.

    Returns:
        Array with one row per frame and one column per row of
        ``self.dct_matrix``; column 0 is replaced by the frame log power.
    """
    # ComputeFBANK must be looked up on ``self``; the bare name used
    # before raised NameError.
    fbank, log_power = self.ComputeFBANK(waveform)
    mfcc = np.dot(fbank, self.dct_matrix.T)
    mfcc *= self.lifter
    mfcc[:, 0] = log_power

    return mfcc

if __name__ == '__main__':
    # Compute MFCC features for every utterance listed in a wav.scp-style
    # file, write one float32 .bin file per utterance and index them in
    # feats.scp.

    # Imported here because os, sys and wave are not imported anywhere
    # ahead of this script block.
    import os
    import sys
    import wave

    import numpy as np

    # Input list ("<utterance_id> <wav_path>" per line) and output directory.
    test_wav_scp = './exp/data/test/wav/scp'
    test_out_dir = './exp/data/test/mfcc'

    sample_frequency = 16000      # Hz
    frame_length = 25             # ms
    frame_shift = 10              # ms
    low_frequency = 20            # Hz
    high_frequency = sample_frequency / 2
    num_ceps = 23
    num_mel_bins = 33
    dither = 1.0

    # Dithering draws random numbers; seed so repeated runs match.
    np.random.seed(seed=0)

    feat_extractor = FeatureExtractor(sample_frequency=sample_frequency,
                                      frame_length=frame_length,
                                      frame_shift=frame_shift,
                                      num_mel_bins=num_mel_bins,
                                      num_ceps=num_ceps,
                                      low_frequency=low_frequency,
                                      high_frequency=high_frequency,
                                      dither=dither)

    wav_scp_list = [test_wav_scp]
    out_dir_list = [test_out_dir]

    for (wav_scp, out_dir) in zip(wav_scp_list, out_dir_list):
        print('Input wav_scp: %s' % (wav_scp))
        print('Output directory: %s' % (out_dir))

        feat_scp = os.path.join(out_dir, 'feats.scp')
        os.makedirs(out_dir, exist_ok=True)

        with open(wav_scp, mode='r') as file_wav, \
                open(feat_scp, mode='w') as file_feat:
            for line in file_wav:
                parts = line.split()
                if not parts:
                    # Skip blank lines instead of failing on parts[0].
                    continue
                utterance_id = parts[0]
                wav_path = parts[1]

                with wave.open(wav_path) as wav:
                    # Both getters must be *called*; comparing the bound
                    # method itself to a number is always unequal.
                    if wav.getnchannels() != 1:
                        sys.stderr.write(
                            'This program supports monaural wav file only.\n')
                        sys.exit(1)
                    if wav.getframerate() != sample_frequency:
                        sys.stderr.write(
                            'The expected sampling rate is 16000.\n')
                        sys.exit(1)

                    num_samples = wav.getnframes()
                    waveform = wav.readframes(num_samples)
                    # Samples are interpreted as 16-bit integers.
                    waveform = np.frombuffer(waveform, dtype=np.int16)
                    mfcc = feat_extractor.ComputeMFCC(waveform)

                (num_frames, num_dims) = np.shape(mfcc)

                # <out_dir>/<wav file name without extension>.bin
                out_file = os.path.splitext(os.path.basename(wav_path))[0]
                out_file = os.path.join(os.path.abspath(out_dir),
                                        out_file + '.bin')

                mfcc = mfcc.astype(np.float32)
                mfcc.tofile(out_file)

                # Index line: id, feature file, frame count, dimension.
                file_feat.write('%s %s %d %d\n' %
                                (utterance_id, out_file, num_frames, num_dims))

250320
--------------------------------------------

In [None]:
class FeatureExtract():
    """MFCC-specific steps: DCT matrix, lifter, and the final MFCC product.

    The class defines no ``__init__`` and no filter-bank code. Its methods
    read ``self.num_ceps``, ``self.num_mel_bins`` and ``self.lifter_coef``
    and call ``self.ComputeFBANK(waveform)``, so those must be supplied by
    the instance or a subclass.
    """

    def MakeDCTMatrix(self):
        """Build the DCT matrix of shape ``(num_ceps, num_mel_bins)``.

        Row 0 is the constant ``1 / sqrt(N)``; row ``k > 0`` is
        ``sqrt(2 / N) * cos((2n + 1) * k * pi / (2N))`` for ``n = 0..N-1``.
        """
        N = self.num_mel_bins
        # Cepstral index down the rows, filter index along the columns.
        k = np.arange(self.num_ceps)[:, np.newaxis]
        n = np.arange(N)[np.newaxis, :]
        dct_matrix = np.sqrt(2 / N) * np.cos(
            (2.0 * n + 1) * k * np.pi / (2 * N))
        if self.num_ceps > 0:
            # The 0th basis vector uses a different normalisation.
            dct_matrix[0] = 1.0 / np.sqrt(N)

        return dct_matrix

    def MakeLifter(self):
        """Build the liftering weights, one per cepstral coefficient.

        Returns ``1 + 0.5 * Q * sin(pi * i / Q)`` for ``i = 0..num_ceps-1``
        with ``Q = self.lifter_coef``; all ones when ``Q`` is 0.
        """
        Q = self.lifter_coef
        if Q == 0:
            # Avoid dividing by zero: no liftering, every weight is 1.
            return np.ones(self.num_ceps)
        ceps_index = np.arange(self.num_ceps)
        Lifter = 1.0 + 0.5 * Q * np.sin(np.pi * ceps_index / Q)

        return Lifter

    def ComputeMFCC(self, waveform):
        """Compute MFCC features for ``waveform``.

        Returns:
            Array of shape ``(num_frames, num_ceps)``; column 0 is replaced
            by the frame log power returned by ``ComputeFBANK``.
        """
        fbank, log_power = self.ComputeFBANK(waveform)
        dct_matrix = self.MakeDCTMatrix()
        mfcc = np.dot(fbank, dct_matrix.T)
        # The lifter is built here; no ``self.Lifter`` attribute is ever set.
        mfcc *= self.MakeLifter()
        mfcc[:, 0] = log_power

        return mfcc
    
    
if __name__ == '__main__':
    # Compute MFCC features for every utterance listed in each input list,
    # write one float32 .bin file per utterance into the matching output
    # directory and index them in feats.scp.

    # Imported here because os, sys and wave are not imported anywhere
    # ahead of this script block.
    import os
    import sys
    import wave

    import numpy as np

    sample_frequency = 16000      # Hz
    frame_length = 25             # ms
    frame_shift = 10              # ms
    low_frequency = 20            # Hz
    high_frequency = sample_frequency / 2
    num_mel_bins = 23
    num_ceps = 13
    dither = 1.0

    # Dithering draws random numbers; seed so repeated runs match.
    np.random.seed(seed=0)

    feature_extractor = FeatureExtractor(sample_frequency=sample_frequency,
                                         frame_length=frame_length,
                                         frame_shift=frame_shift,
                                         low_frequency=low_frequency,
                                         high_frequency=high_frequency,
                                         num_ceps=num_ceps,
                                         num_mel_bins=num_mel_bins,
                                         dither=dither)

    # Placeholders: replace with the real list file ("<utterance_id>
    # <wav_path>" per line) and the real output directory.
    wav_scp = '파일 정보 들어있는 경로'
    out_file = '내보낼 파일의 폴더 경로'

    wav_list = [wav_scp]
    out_list = [out_file]

    for (fin, fout) in zip(wav_list, out_list):
        # fout is a directory; the index file lives inside it.
        os.makedirs(fout, exist_ok=True)
        feat_scp = os.path.join(fout, 'feats.scp')

        with open(fin, mode='r') as f_in, \
                open(feat_scp, mode='w') as f_out:
            for line in f_in:
                parts = line.split()
                if not parts:
                    # Skip blank lines instead of failing on parts[0].
                    continue
                utterance_id = parts[0]
                # A single path string, not the list slice parts[1:].
                wav_path = parts[1]

                with wave.open(wav_path) as wav:
                    if wav.getframerate() != sample_frequency:
                        sys.stderr.write(
                            'This system supports 16kHz sampled audio\n')
                        sys.exit(1)
                    if wav.getnchannels() != 1:
                        sys.stderr.write(
                            'This system supports monaural audio\n')
                        sys.exit(1)

                    num_samples = wav.getnframes()
                    waveform = wav.readframes(num_samples)
                    # Samples are interpreted as 16-bit integers.
                    waveform = np.frombuffer(waveform, dtype=np.int16)

                mfcc = feature_extractor.ComputeMFCC(waveform)
                (num_frames, num_dims) = np.shape(mfcc)

                # Keep the fractional part: an integer dtype would truncate
                # the features.
                mfcc = mfcc.astype(np.float32)

                # <fout>/<utterance_id>.bin
                feat_file = os.path.join(os.path.abspath(fout),
                                         utterance_id + '.bin')
                mfcc.tofile(feat_file)

                # Index line: id, feature file, frame count, dimension.
                f_out.write('%s %s %d %d\n' %
                            (utterance_id, feat_file, num_frames, num_dims))
                        