<h1 style = "font-size:3rem;color:darkcyan"> Preprocessing data</h1>
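This notebook converts raw audio recordings into fixed-length log-spectrograms and saves them to a JSON file, so they can be used to train a variational autoencoder.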

In [5]:
# import libraries
import numpy as np
import librosa
import os
import json

In [32]:
class PreprocessingPipeline:
    """Convert a directory tree of audio files into a JSON dataset of log-spectrograms.

    For every file found in the sub-folders of the dataset root the pipeline:

    1. loads the audio at ``sample_rate`` (mono),
    2. skips it if it is shorter than ``min_duration``,
    3. trims / right-zero-pads it to exactly ``audio_duration``,
    4. min-max normalizes the waveform,
    5. computes a log-amplitude (dB) STFT spectrogram,
    6. stores the spectrogram, its label, filename and its (min, max) values.

    The collected entries are written to ``json_path`` by ``process_dataset``.
    """

    def __init__(self,
                 save_path,
                 json_path,
                 audio_duration = 0.74,
                 min_duration = 0.1,
                 n_fft = 512,
                 hop_size = 256,
                 sample_rate=22050):
        """Store the configuration and create the empty result container.

        Args:
            save_path: Root directory of the dataset to read. Despite the
                parameter name it is used as the input location and stored
                as ``self.dataset_path``.
            json_path: File the resulting dataset is written to.
            audio_duration: Target clip length in seconds.
            min_duration: Clips shorter than this many seconds are skipped.
            n_fft: FFT window size passed to ``librosa.stft``.
            hop_size: Hop length passed to ``librosa.stft``.
            sample_rate: Sampling rate audio is resampled to on load.
        """
        self.dataset_path = save_path
        self.json_path = json_path
        self.n_fft = n_fft
        self.hop_size = hop_size
        self.sample_rate = sample_rate
        # Durations are kept in samples, not seconds, from here on.
        self.audio_duration = int(audio_duration * sample_rate)
        self.min_duration = int(min_duration * sample_rate)
        self.min_max_values = {}

        self.data  = {
        'mappings' : [],  # corresponding digit (not populated by process_dataset)
        'labels' : [],    # corresponding number
        'log_spectrogram' : [],      # extracted spectrum
        'filenames' : [],  # original filenames
        'min_max_values' : [] # [min, max] of each log spectrogram, for denormalizing
        }

    def process_dataset(self, max_folders = 1):
        """Process the audio files below the dataset root and dump them to JSON.

        Files located directly in the root directory are ignored; only files
        inside sub-folders are processed. The label of a file is the integer
        value of the first character of its filename.

        Args:
            max_folders: Maximum number of sub-folders to process. The default
                of ``1`` keeps the previous behaviour of stopping after the
                first folder; pass ``None`` to process every folder.
        """
        # Use the configured path rather than a notebook-level global, and
        # normalise it to ``str`` so it compares equal to what os.walk yields.
        root = os.fspath(self.dataset_path)
        n_folders = len(os.listdir(root))
        n_processed = 0

        for i, (dirpath, dirnames, filenames) in enumerate(os.walk(root)):
            # Sorting in place makes the traversal order (and therefore the
            # folders selected by ``max_folders``) reproducible.
            dirnames.sort()

            # Compare by value: identity of two equal strings is not guaranteed.
            if dirpath == root:
                continue

            if max_folders is not None and n_processed >= max_folders:
                break

            print(f'processing folder {i} out of {n_folders}')

            for file in sorted(filenames):
                file_path = os.path.join(dirpath, file)

                # load audio
                signal = self._load_audio(file_path)

                # ignore unusable audio
                if len(signal) < self.min_duration:
                    continue

                # force every clip to exactly ``audio_duration`` samples
                signal = signal[:self.audio_duration]
                if len(signal) < self.audio_duration:
                    signal = self._right_pad(signal)

                # normalize
                signal = self._normalize(signal)

                # extract log spectrogram
                log_spectrogram = self._extract_log_spectrogram(signal)

                # save
                self.data['labels'].append(int(file[0]))
                self.data['log_spectrogram'].append(log_spectrogram.tolist())
                self.data['filenames'].append(file)
                # Store both extremes as plain floats so they are JSON-serialisable.
                self.data['min_max_values'].append(
                    [float(log_spectrogram.min()), float(log_spectrogram.max())])

            n_processed += 1

        print(f'Saving dataset as {self.json_path}...')
        with open(self.json_path, 'w') as fp:
            json.dump(self.data, fp, indent = 4)
        print('Done saving ')

    def _load_audio(self, file_path):
        """Load ``file_path`` as mono audio at ``self.sample_rate``.

        Returns:
            The waveform truncated to at most ``self.audio_duration`` samples.
        """
        signal, _ = librosa.load(file_path,
                                 sr = self.sample_rate,
                                 mono=True)
        # Slice the waveform itself, not the (signal, sample_rate) tuple.
        return signal[:self.audio_duration]

    def _right_pad(self, signal, mode = 'constant'):
        """Pad ``signal`` at the end up to ``self.audio_duration`` samples."""
        num_missing_samples = self.audio_duration - len(signal)
        return np.pad(signal,
                      (0, num_missing_samples),
                      mode = mode)

    def _left_pad(self, signal, mode = 'constant'):
        """Pad ``signal`` at the start up to ``self.audio_duration`` samples."""
        num_missing_samples = self.audio_duration - len(signal)
        return np.pad(signal,
                      (num_missing_samples, 0),
                      mode = mode)

    def _normalize(self, signal, min_val = 0, max_val = 1):
        """Min-max scale ``signal`` into the range ``[min_val, max_val]``.

        A constant signal (max == min) has no range to scale; it is mapped to
        ``min_val`` everywhere instead of dividing by zero.
        """
        signal = np.asarray(signal)
        lowest = signal.min()
        span = signal.max() - lowest
        if span == 0:
            return np.full_like(signal, min_val,
                                dtype = np.result_type(signal.dtype, np.float32))
        norm_signal = (signal - lowest) / span
        norm_signal = norm_signal * (max_val - min_val) + min_val
        return norm_signal

    def _denormalize(self, signal, signal_min, original_max, min_val = 0, max_val = 1):
        """Invert ``_normalize``.

        Args:
            signal: Values previously scaled into ``[min_val, max_val]``.
            signal_min: Minimum of the data before normalization.
            original_max: Maximum of the data before normalization.
            min_val: Lower bound that was used for normalization.
            max_val: Upper bound that was used for normalization.

        Returns:
            The values mapped back into ``[signal_min, original_max]``.
        """
        signal = (signal - min_val) / (max_val - min_val)
        signal = signal * (original_max - signal_min) + signal_min
        return signal

    def _extract_log_spectrogram(self, signal):
        """Return the log-amplitude (dB) STFT magnitude spectrogram of ``signal``.

        ``librosa.stft`` yields ``1 + n_fft / 2`` frequency rows; the last row
        (highest frequency bin) is dropped, leaving ``n_fft / 2`` rows.
        """
        stft = librosa.stft(signal,
                            n_fft = self.n_fft,
                            hop_length = self.hop_size)[:-1]

        spectrogram = np.abs(stft)
        log_spectrogram = librosa.amplitude_to_db(spectrogram)
        return log_spectrogram

In [33]:
# Relative path to the dataset directory that will be walked.
root = '../../../Datasets/Speech/Digits/AudioMNIST/data/'

# Build the pipeline with its default audio settings and run it;
# the resulting dataset is written to data.json.
preprocessingPipeline = PreprocessingPipeline(
    save_path = root,
    json_path = 'data.json',
)
preprocessingPipeline.process_dataset()

processing folder 1 out of 60
Saving dataset as data.json...
Done saving 
