In [90]:
from tensorflow.keras.datasets import mnist
from tensorflow.keras.optimizers import Adam

import numpy as np
import os

In [91]:
import sys
sys.path.insert(1, '../src/modeling')

from vq_vae import VQ_VAE
from train_vq import *

In [92]:
import sys
sys.path.insert(1, '../src')

from generate import *

In [93]:
SPECTROGRAMS_PATH = "../data/fsdd/spectrograms"

LEARNING_RATE = 0.0005
BATCH_SIZE = 64
EPOCHS = 150

In [94]:
x_train, _ = load_fsdd(SPECTROGRAMS_PATH)
print(len(x_train))
print(len(x_train[0]))

3000
256


In [95]:
data_variance = np.var(x_train / 255.0)

In [96]:
VQVAE = VQ_VAE(
    input_shape=(256, 64, 1),
    conv_filters=(512, 256, 128, 64, 32),
    conv_kernels=(3, 3, 3, 3, 3),
    conv_strides=(2, 2, 2, 2, (2, 1)),
    data_variance=data_variance,
    embeddings_size=256,
    latent_space_dim=128
)
VQVAE.summary()


In [97]:
# VQVAE = VQ_VAE(
#     input_shape=(256, 64, 1),
#     conv_filters=(64, 32),
#     conv_kernels=(3, 3),
#     conv_strides=(2, 2),
#     latent_space_dim=16,
#     data_variance=data_variance,
#     embeddings_size=64
# )
# VQVAE.summary()


In [98]:
# VQVAE = VQ_VAE(
#     input_shape=(256, 64, 1),
#     conv_filters=(256, 128, 64, 32),
#     conv_kernels=(3, 3, 3, 3),
#     conv_strides=(2, 2, 2, 2),
#     latent_space_dim=16,
#     data_variance=data_variance,
#     embeddings_size=128
# )
# VQVAE.summary()


In [99]:
# VQVAE = VQ_VAE(
#     input_shape=(256, 64, 1),
#     conv_filters=(512, 256, 128, 64),
#     conv_kernels=(3, 3, 3, 3),
#     conv_strides=(2, 2, 2, 2),
#     latent_space_dim=16,
#     data_variance=data_variance,
#     embeddings_size=128
# )
# VQVAE.summary()


In [100]:
VQVAE.compile(LEARNING_RATE)

In [101]:
VQVAE.train(x_train, BATCH_SIZE, EPOCHS)

Epoch 1/150
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 1s/step - reconstruction_loss: 246025.0625 - total_loss: 247028.0938 - vq_loss: 992.9213
Epoch 2/150
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 1s/step - reconstruction_loss: 63395.6562 - total_loss: 3028421.2500 - vq_loss: 2965023.7500
Epoch 3/150
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 1s/step - reconstruction_loss: 33263.6641 - total_loss: 127057305600.0000 - vq_loss: 127057264640.0000
Epoch 4/150
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 1s/step - reconstruction_loss: 30184.0918 - total_loss: 4669095018496.0000 - vq_loss: 4669095018496.0000
Epoch 5/150
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 1s/step - reconstruction_loss: 21875.9980 - total_loss: 724785364992.0000 - vq_loss: 724785364992.0000
Epoch 6/150
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 1s/step - reconstruction_loss: 15966.8213 - t

In [102]:
VQVAE.save("../../model/vq_vae")

Model saved successfully in folder: ../../model/vq_vae


In [109]:
sound_generator = SoundGenerator(VQVAE, HOP_LENGTH)

In [110]:
MIN_MAX_VALUES_PATH = "../data/fsdd/min_max_values.pkl"
SAVE_DIR_ORIGINAL = "../samples/vq_vae/original/"
SAVE_DIR_GENERATED = "../samples/vq_vae/generated/"


In [111]:
# Load spectrograms + min max values
with open(MIN_MAX_VALUES_PATH, "rb") as f:
    min_max_values = pickle.load(f)
specs, file_paths = load_fsdd(SPECTROGRAMS_PATH)


In [112]:
# Sample spectrograms + min max values

file_paths_selected = file_paths

sampled_indexes = np.random.choice(range(len(specs)), 5)
sampled_spectrogrmas = specs[sampled_indexes]

file_paths_selected = [file_paths_selected[index] for index in sampled_indexes]
file_paths_selected =  list(map(lambda st: str.replace(st, "\\", "/"), file_paths_selected))
file_paths_selected =  list(map(lambda st: str.replace(st, "..", "."), file_paths_selected))

sampled_min_max_values = [min_max_values[file_path] for file_path in file_paths_selected]

print(file_paths_selected)
print(sampled_min_max_values)

['./data/fsdd/spectrograms/9_lucas_27.wav.npy', './data/fsdd/spectrograms/5_yweweler_44.wav.npy', './data/fsdd/spectrograms/7_nicolas_46.wav.npy', './data/fsdd/spectrograms/2_lucas_5.wav.npy', './data/fsdd/spectrograms/2_lucas_7.wav.npy']
[{'min': -58.139954, 'max': 21.860044}, {'min': -68.934555, 'max': 11.065449}, {'min': -58.532578, 'max': 21.467422}, {'min': -56.122486, 'max': 23.877514}, {'min': -58.133507, 'max': 21.866493}]


In [113]:
# Generate audio for sampled spectrograms
signals, _ = sound_generator.generate(sampled_spectrogrmas, sampled_min_max_values)

original_signals = sound_generator.convert_spectrograms_to_audio(sampled_spectrogrmas, sampled_min_max_values)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step


In [114]:
save_signals(signals, SAVE_DIR_GENERATED)
save_signals(original_signals, SAVE_DIR_ORIGINAL)