<h1 style = "font-size:3rem;color:darkcyan"> Generate Audio </h1>
& compare original audio with the reconstructed audio

In [80]:
# import libraries
import numpy as np
import json
import librosa
import os
import soundfile as sf
import pickle
import IPython

In [3]:
from autoencoder import VAE

# Sound generator class

In [93]:
class SoundGenerator:
    """Turn normalized log-spectrograms into audio waveforms.

    The generator runs spectrograms through a VAE (via ``vae.reconstruct``),
    undoes the min/max normalization, converts from decibels to amplitude
    and inverts the result with ``librosa.istft``.

    Parameters
    ----------
    vae : object
        Model exposing ``reconstruct(spectrograms)`` which returns a
        ``(generated_spectrograms, latent_representations)`` pair.
    hop_length : int
        Hop length forwarded to ``librosa.istft``.
    min_val, max_val : float, optional
        Bounds of the normalized range the spectrograms were scaled to.
        Defaults (0 and 1) reproduce the previous hard-coded behaviour.

    Raises
    ------
    ValueError
        If ``min_val`` equals ``max_val`` (denormalization would divide
        by zero).
    """

    def __init__(self, vae, hop_length, min_val=0, max_val=1):
        if max_val == min_val:
            raise ValueError("max_val and min_val must differ")
        self.vae = vae
        self.hop_length = hop_length
        self.min_val = min_val
        self.max_val = max_val

    def generate(self, spectrograms, min_max_values):
        """Reconstruct spectrograms with the VAE and convert them to audio.

        Returns a tuple ``(signals, latent_representations)`` where
        ``signals`` is a list with one waveform per input spectrogram.
        """
        generated_spectrograms, latent_representations = self.vae.reconstruct(spectrograms)
        signals = self.convert_spectrograms_to_audio(generated_spectrograms, min_max_values)
        return signals, latent_representations

    def convert_spectrograms_to_audio(self, spectrograms, min_max_values):
        """Convert normalized log-spectrograms to a list of waveforms.

        ``spectrograms`` is indexed as ``[:, :, 0]`` per item, i.e. each
        item carries a trailing channel axis. ``min_max_values`` supplies,
        per spectrogram, a pair whose element 0 is the original minimum
        and element 1 the original maximum. The two iterables are paired
        with ``zip``, so extra items in the longer one are ignored.
        """
        signals = []
        for spectrogram, min_max_value in zip(spectrograms, min_max_values):
            # drop the trailing channel axis
            log_spectrogram = spectrogram[:, :, 0]

            # map the normalized values back to the original dB range
            denorm_log_spectrogram = self._denormalize(log_spectrogram,
                                                       min_max_value[0],
                                                       min_max_value[1])
            # decibels -> linear amplitude
            spec = librosa.db_to_amplitude(denorm_log_spectrogram)

            # inverse STFT on the magnitude spectrogram; no phase is
            # estimated here (this is a plain ISTFT, not Griffin-Lim)
            signal = librosa.istft(spec, hop_length = self.hop_length)

            signals.append(signal)

        return signals

    def _denormalize(self, norm_signal, original_min, original_max):
        """Invert min/max scaling.

        First rescales ``norm_signal`` from ``[min_val, max_val]`` to
        ``[0, 1]``, then stretches it to ``[original_min, original_max]``.
        """
        signal = (norm_signal - self.min_val) / (self.max_val - self.min_val)
        signal = signal * (original_max - original_min) + original_min
        return signal
        

# Initialize sound generator

In [94]:
# Hop length handed to the generator, which forwards it to librosa.istft.
# NOTE(review): presumably this has to match the hop length used when the
# spectrograms were extracted; confirm against the preprocessing step.
hop_length = 256

# Load the trained model from 'model' and print its architecture.
vae = VAE.load('model')
vae.summary()

sound_generator = SoundGenerator(vae=vae, hop_length=hop_length)

# sample spectrograms + min_max_values
# generate audio for sampled spectrograms
# convert spectrograms to audio (for comparison)
# save audio signal

Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 256, 64, 1)  0           []                               
                                ]                                                                 
                                                                                                  
 encoder_conv_layer_1 (Conv2D)  (None, 128, 32, 512  5120        ['encoder_input[0][0]']          
                                )                                                                 
                                                                                                  
 encoder_relu_1 (ReLU)          (None, 128, 32, 512  0           ['encoder_conv_layer_1[0][0]']   
                                )                                                           

Total params: 2,322,369
Trainable params: 2,319,937
Non-trainable params: 2,432
_________________________________________________________________


# Load spectrograms + min_max_values

In [71]:
def import_dataset(dataset_path):
    """Load spectrograms, filenames and min/max pairs from a JSON file.

    The file must hold the keys ``'log_spectrogram'``, ``'filenames'`` and
    ``'min_max_values'``. Each is converted to a NumPy array, and the
    spectrogram array gets one extra trailing axis so it can be fed to a
    convolutional network.

    Returns
    -------
    tuple
        ``(inputs, filenames, min_max_values)`` as NumPy arrays.
    """
    with open(dataset_path, 'r') as json_file:
        payload = json.load(json_file)

    raw_spectrograms = np.array(payload['log_spectrogram'])
    filenames = np.array(payload['filenames'])
    min_max_values = np.array(payload['min_max_values'])

    # append a channel axis as the last dimension
    inputs = np.expand_dims(raw_spectrograms, axis=-1)
    return inputs, filenames, min_max_values

# Read the whole dataset from data.json into memory as NumPy arrays.
spectrograms, filenames, min_max_values = import_dataset('data.json')

# Sample spectrograms

In [72]:
def select_spectrograms(spectrograms,
                        filenames,
                        min_max_values,
                        num_spectrograms=2):
    """Pick random spectrograms together with their min/max pairs.

    Indexes are drawn with ``np.random.choice`` using its default
    settings, i.e. with replacement, so the same item can be picked more
    than once. The chosen filenames and min/max pairs are printed.

    Returns
    -------
    tuple
        ``(sampled_spectrograms, sampled_min_max_values)``: the first is
        ``spectrograms`` indexed by the drawn indexes, the second a list
        with the matching entries of ``min_max_values``.
    """
    picks = np.random.choice(range(len(spectrograms)), num_spectrograms)

    chosen_names = []
    chosen_min_max = []
    for pick in picks:
        chosen_names.append(filenames[pick])
        chosen_min_max.append(min_max_values[pick])

    print(chosen_names)
    print(chosen_min_max)
    return spectrograms[picks], chosen_min_max

# Draw 5 random examples; the call also prints the chosen filenames and
# their min/max pairs (see the cell output).
sampled_specs , sampled_min_max_values = select_spectrograms(spectrograms, 
                                                            filenames,
                                                            min_max_values,
                                                            5)

['7_01_20.wav', '3_01_40.wav', '6_01_26.wav', '4_01_4.wav', '2_01_25.wav']
[array([-37.51725769,  42.48274231]), array([-37.66090012,  42.33909988]), array([-34.7809639,  45.2190361]), array([-37.01701355,  42.98298645]), array([-37.13897324,  42.86102676])]


# Generate audio for sampled spectrograms

In [95]:
# Reconstruct the sampled spectrograms with the VAE and convert them to
# waveforms; the latent representations are not needed here and are dropped.
signals, _ = sound_generator.generate(sampled_specs, sampled_min_max_values)

# Convert original spectrograms to audio for comparison

In [96]:
# Run the untouched input spectrograms through the same spectrogram-to-audio
# conversion (bypassing the VAE) so both versions can be compared.
original_audio = sound_generator.convert_spectrograms_to_audio(sampled_specs, sampled_min_max_values)

# Save signals

In [97]:
def save_signals(signals, save_dir, sample_rate=22050):
    """Write each signal to ``save_dir`` as ``<index>.wav``.

    Parameters
    ----------
    signals : iterable
        Audio signals; each is passed unchanged to ``sf.write``.
    save_dir : str
        Target directory. It is created (including parents) if missing.
    sample_rate : int, optional
        Sample rate handed to ``sf.write``. Defaults to 22050.
    """
    # Create the output folder up front so a fresh run does not fail on a
    # directory that has not been made by hand.
    os.makedirs(save_dir, exist_ok=True)
    for i, signal in enumerate(signals):
        save_path = os.path.join(save_dir, str(i) + ".wav")
        sf.write(save_path, signal, sample_rate)
        
# Write the VAE reconstructions and the reference conversions to separate
# folders; files are named 0.wav, 1.wav, ... in sampling order.
save_signals(signals, 'samples/generated')
save_signals(original_audio, 'samples/original')

In [99]:
# Play the first clip converted directly from the input spectrogram.
IPython.display.Audio("samples/original/0.wav")

In [101]:
# Play the first clip converted from the VAE-reconstructed spectrogram.
IPython.display.Audio("samples/generated/0.wav")