## Google Colab Setup

Repeat the same process from the last challenge to upload your challenge folder and open your notebook:

1. access your [Google Drive](https://drive.google.com/)
2. go into the Colab Notebooks folder
3. drag and drop this challenge's folder into it
4. right-click the notebook file and select `Open with` $\rightarrow$ `Google Colaboratory`

Don't forget to enable GPU acceleration!

`Runtime` $\rightarrow$ `Change runtime type` $\rightarrow$ `Hardware accelerator` $\rightarrow$ `GPU`

When this is done, run the cells below and get to work!

In [None]:
# Mount Google Drive inside the Colab runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Make the challenge folder the working directory (the equivalent of `cd` in a
# terminal), so the relative paths used further down resolve from there
import os

CHALLENGE_DIR = '/content/drive/MyDrive/Colab Notebooks/data-autoencoders'
os.chdir(CHALLENGE_DIR)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import soundfile as sf
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Conv2D, Conv2DTranspose
import soundfile as sf
import glob
from datetime import datetime

In [None]:
# Directory where your .wav files are located
directory = '../raw_data/musicnet/musicnet/test_data_split/'

# Collect every .wav file in the directory.
# - stripping the trailing slash avoids the doubled '//' that plain
#   concatenation with '/*.wav' produced
# - sorted() pins the order: glob returns entries in arbitrary,
#   filesystem-dependent order, which would make the seeded train/test split
#   pick different files on different machines
file_paths = sorted(glob.glob(directory.rstrip('/') + '/*.wav'))

# Print the file paths
# for file_path in file_paths:
#     print(file_path)

In [None]:
# Load the .wav files and generate spectrograms
def load_data(file_paths, sr=44100, n_fft=2048, hop_length=512, n_mels=128):
    """Load audio files and convert each one to a mel spectrogram in decibels.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the audio files to load.
    sr : int, default 44100
        Sampling rate requested from ``librosa.load`` for every file.
    n_fft, hop_length, n_mels : int
        STFT window size, hop between frames and number of mel bands. The
        defaults are the values librosa uses when they are omitted, so
        ``load_data(file_paths)`` produces the same spectrograms as before.
        They are spelled out because turning a spectrogram back into audio
        has to use the same settings.

    Returns
    -------
    list
        One dB-scaled mel spectrogram per input file, in input order.
        An empty input yields an empty list.
    """
    spectrograms = []
    for file_path in file_paths:
        # Keep the rate reported by librosa in its own name so the `sr`
        # argument is not overwritten between iterations
        audio, loaded_sr = librosa.load(file_path, sr=sr, mono=True)
        mel_power = librosa.feature.melspectrogram(
            y=audio, sr=loaded_sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
        )
        spectrograms.append(librosa.power_to_db(mel_power))
    return spectrograms

# Build one dB mel spectrogram per file found above, then show how many were loaded
spectrograms = load_data(file_paths)
len(spectrograms)

In [None]:
# Split the data into training and testing sets
X_train, X_test = train_test_split(spectrograms, test_size=0.2, random_state=42)
# train_test_split hands back plain lists; work with arrays from here on
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

# Min-max statistics come from the training split only and are applied to both
# splits. Scaling the test split with its own min/max (as before) put the two
# splits on different scales, so the validation loss was not comparable to the
# training loss.
TRAIN_MIN = np.min(X_train)
TRAIN_MAX = np.max(X_train)

# we will need these later on for de-normalization:
# they must be exactly the values X_test is scaled with, so that
# `x * (TEST_MAX - TEST_MIN) + TEST_MIN` undoes the scaling
TEST_MIN = TRAIN_MIN
TEST_MAX = TRAIN_MAX

# Normalize the spectrograms and reshape them
X_train = (X_train - TRAIN_MIN) / (TRAIN_MAX - TRAIN_MIN)
X_test = (X_test - TEST_MIN) / (TEST_MAX - TEST_MIN)

# Append the trailing channel axis expected by the Conv2D layers
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

### functional API to build the autoencoder: 

In [None]:
# Shape of a single training sample, including the channel axis appended above
X_train[0].shape

In [None]:
# Shape of one training sample; the encoder built below reads it as its input shape
input_shape = X_train[0].shape
input_shape

### ChatGPT version

Architecture 1

In [None]:
# from tensorflow.keras import Sequential
# from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Reshape, Conv2DTranspose
# from tensorflow.keras import Model
# from tensorflow.keras.layers import Input

# input_shape = X_train[0].shape #(128, 2584, 1)
# latent_dimension = 1000

# def build_encoder(latent_dimension):
#     encoder = Sequential()

#     encoder.add(Conv2D(8, (2, 2), input_shape=input_shape, activation='tanh'))
#     encoder.add(MaxPooling2D(2))

#     encoder.add(Conv2D(16, (2, 2), activation='tanh'))
#     encoder.add(MaxPooling2D(2))

#     encoder.add(Conv2D(32, (2, 2), activation='tanh'))
#     encoder.add(MaxPooling2D(2))

#     encoder.add(Flatten())
#     encoder.add(Dense(latent_dimension, activation='tanh'))

#     return encoder

# encoder = build_encoder(latent_dimension)

# def build_decoder(latent_dimension):
#     decoder = Sequential()

#     decoder.add(Dense(16 * 323 * 8, activation='tanh', input_shape=(latent_dimension,)))
#     decoder.add(Reshape((16, 323, 8)))

#     decoder.add(Conv2DTranspose(16, (2, 2), strides=2, padding='same', activation='tanh'))
#     decoder.add(Conv2DTranspose(8, (2, 2), strides=2, padding='same', activation='tanh'))
#     decoder.add(Conv2DTranspose(1, (2, 2), strides=2, padding='same', activation='tanh'))

#     decoder.add(Reshape(input_shape))

#     return decoder

# decoder = build_decoder(latent_dimension)

# def build_autoencoder(encoder, decoder):
#     inp = Input(input_shape)
#     encoded = encoder(inp)
#     decoded = decoder(encoded)
#     autoencoder = Model(inp, decoded)
#     return autoencoder

# autoencoder = build_autoencoder(encoder, decoder)

# def compile_autoencoder(autoencoder):
#     autoencoder.compile(loss='MeanSquaredLogarithmicError', optimizer='adam') 
#     #mse (second best), mae (not good!), logcosh (best), KLDivergence(not good!), MeanSquaredLogarithmicError(not good!)

# compile_autoencoder(autoencoder)


Architecture 2

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Reshape, Conv2DTranspose
from tensorflow.keras import Model
from tensorflow.keras.layers import Input

def build_encoder(latent_dimension):
    """Build the convolutional encoder that maps a spectrogram to a latent vector.

    The model is three 5x5 'tanh' convolutions (8, 16 and 32 filters), each
    followed by 2x2 max-pooling, then a dense 'tanh' projection to
    ``latent_dimension`` values. The input shape is read from the
    notebook-level ``input_shape``.

    The pooling layers are required. For the (128, 2584, 1) input noted in
    this notebook, leaving them out means ``Flatten`` receives
    116 * 2572 * 32 = 9,547,264 values, and a 1000-unit dense layer on top of
    that needs about 9.5 billion weights (roughly 38 GB in float32), which
    cannot be allocated. With pooling the flattened size is
    12 * 319 * 32 = 122,496.

    Parameters
    ----------
    latent_dimension : int
        Number of units of the final dense layer, i.e. the latent vector length.

    Returns
    -------
    Sequential
        The (uncompiled) encoder model.
    """
    encoder = Sequential()

    encoder.add(Conv2D(8, (5,5), input_shape=input_shape, activation='tanh'))
    encoder.add(MaxPooling2D(2))

    encoder.add(Conv2D(16, (5,5), activation='tanh'))
    encoder.add(MaxPooling2D(2))

    encoder.add(Conv2D(32, (5,5), activation='tanh'))
    encoder.add(MaxPooling2D(2))

    encoder.add(Flatten())
    encoder.add(Dense(latent_dimension, activation='tanh'))

    return encoder

# Build the encoder with a 1000-value latent vector and print its layer summary.
# NOTE(review): this 1000 has to stay equal to the `latent_dimension` the
# decoder is built with, otherwise the two models cannot be chained.
encoder = build_encoder(1000)
encoder.summary()

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Reshape, Conv2DTranspose
from tensorflow.keras import Model
from tensorflow.keras.layers import Input

# Shape of one training sample; the decoder has to output exactly this shape
input_shape = X_train[0].shape #(128, 2584, 1)
# Length of the latent vector the decoder takes as input
latent_dimension = 1000



def build_decoder(latent_dimension):
    """Build the decoder that expands a latent vector back to ``input_shape``.

    A dense 'tanh' layer produces a small 8-channel feature map, which three
    stride-2 transposed convolutions (16, 8 and 1 filters, 'tanh') enlarge by
    a factor of 8 in height and width. The starting feature map is derived
    from the notebook-level ``input_shape`` instead of being hard-coded, so
    for (128, 2584, 1) it is (16, 323, 8) as before, and other sizes
    divisible by 8 work too.

    NOTE(review): the output activation is 'tanh' (range -1..1) while the
    training targets are min-max scaled; confirm this is intended.

    Parameters
    ----------
    latent_dimension : int
        Length of the latent vector fed into the decoder.

    Returns
    -------
    Sequential
        The (uncompiled) decoder model.

    Raises
    ------
    ValueError
        If the height or width of ``input_shape`` is not divisible by 8, in
        which case three 2x up-samplings cannot reproduce it.
    """
    height, width = input_shape[0], input_shape[1]
    upsampling = 2 ** 3  # three transposed convolutions, each with strides=2
    if height % upsampling or width % upsampling:
        raise ValueError(
            f"input_shape {tuple(input_shape)} must be divisible by {upsampling} "
            "in height and width for the decoder to reproduce it"
        )
    seed_shape = (height // upsampling, width // upsampling, 8)
    seed_units = seed_shape[0] * seed_shape[1] * seed_shape[2]

    decoder = Sequential()

    decoder.add(Dense(seed_units, activation='tanh', input_shape=(latent_dimension,)))
    decoder.add(Reshape(seed_shape))

    decoder.add(Conv2DTranspose(16, (2, 2), strides=2, padding='same', activation='tanh'))
    decoder.add(Conv2DTranspose(8, (2, 2), strides=2, padding='same', activation='tanh'))
    decoder.add(Conv2DTranspose(1, (2, 2), strides=2, padding='same', activation='tanh'))

    decoder.add(Reshape(input_shape))

    return decoder

# Build the decoder for latent vectors of length `latent_dimension`
decoder = build_decoder(latent_dimension)

def build_autoencoder(encoder, decoder):
    """Chain ``encoder`` and ``decoder`` into a single end-to-end model.

    A fresh input of the notebook-level ``input_shape`` is passed through the
    encoder and then the decoder; the returned model maps that input to the
    decoder's output. The model is returned uncompiled.
    """
    source = Input(input_shape)
    reconstruction = decoder(encoder(source))
    return Model(source, reconstruction)

# Combine the encoder and decoder built above into the model that gets trained
autoencoder = build_autoencoder(encoder, decoder)

def compile_autoencoder(autoencoder):
    """Configure ``autoencoder`` for training, in place; returns nothing.

    The model is compiled with the Adam optimizer and the
    mean-squared-logarithmic-error loss.

    Author's notes on the losses tried: logcosh (best), mse (second best),
    mae (not good!), KLDivergence (not good!),
    MeanSquaredLogarithmicError (not good!).
    NOTE(review): the loss selected here is one of those marked "not good!";
    confirm this is the intended final choice.
    """
    training_config = {
        'loss': 'MeanSquaredLogarithmicError',
        'optimizer': 'adam',
    }
    autoencoder.compile(**training_config)

# Attach the optimizer and loss so the model is ready for fit()
compile_autoencoder(autoencoder)


In [None]:
# Layer-by-layer overview of the encoder (output shapes and parameter counts)
encoder.summary()

In [None]:
# Layer-by-layer overview of the decoder (output shapes and parameter counts)
decoder.summary()

In [None]:
# Number of values in one 128 x 2584 spectrogram (the sample shape noted for X_train[0])
128*2584

In [None]:
# Overview of the combined model: the encoder and decoder appear as nested models
autoencoder.summary()

In [None]:
# # Define the encoder model
# input_shape = X_train[0].shape

# input_layer = Input(shape=input_shape)
# encoder = Conv2D(16, (3, 3), activation='relu', padding='same')(input_layer)
# encoder = Conv2D(8, (3, 3), activation='relu', padding='same')(encoder)

# encoder_model = Model(input_layer, encoder)

# # Define the decoder model
# decoder_input = Input(shape=encoder_model.output_shape[1:])
# decoder = Conv2DTranspose(8, (3, 3), activation='relu', padding='same')(decoder_input)
# decoder = Conv2DTranspose(16, (3, 3), activation='relu', padding='same')(decoder)
# decoder = Conv2DTranspose(1, (3, 3), activation='sigmoid', padding='same')(decoder)

# decoder_model = Model(decoder_input, decoder)

# # Combine the encoder and decoder to create the autoencoder
# autoencoder_input = Input(shape=input_shape)
# encoded = encoder_model(autoencoder_input)
# decoded = decoder_model(encoded)

# autoencoder = Model(autoencoder_input, decoded)
# autoencoder.compile(optimizer='adam', loss='logcosh')

In [None]:
#encoder_model.summary()

In [None]:
#decoder_model.summary()

In [None]:
#autoencoder.summary()

## Using functions to build the encoder, decoder and autoencoder

In [None]:
# from tensorflow.keras import layers, models

# # Function to build the encoder
# def build_encoder(latent_space):
#     input_shape = X_train[0].shape  # Specify the input shape of your data
#     input_layer = layers.Input(shape=input_shape)
#     encoder = layers.Conv2D(16, (3, 3), activation='relu', padding='same')(input_layer)
#     encoder = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(encoder)
#     encoder = layers.Flatten()(encoder)
#     latent_output = layers.Dense(latent_space, activation='relu')(encoder)
#     encoder_model = models.Model(input_layer, latent_output)
#     return encoder_model

# # Function to build the decoder
# def build_decoder(latent_space):
#     decoder_input = layers.Input(shape=(latent_space,))
#     decoder = layers.Dense(units=np.prod(latent_shape[1:]), activation='relu')(decoder_input)
#     decoder = layers.Reshape(target_shape=latent_shape[1:])(decoder)
#     decoder = layers.Conv2DTranspose(8, (3, 3), activation='relu', padding='same')(decoder)
#     decoder = layers.Conv2DTranspose(16, (3, 3), activation='relu', padding='same')(decoder)
#     decoder_output = layers.Conv2DTranspose(1, (3, 3), activation='sigmoid', padding='same')(decoder)
#     decoder_model = models.Model(decoder_input, decoder_output)
#     return decoder_model

# # Function to build the autoencoder
# def build_autoencoder(encoder, decoder):
#     autoencoder_input = layers.Input(shape=encoder.input_shape[1:])
#     encoded = encoder(autoencoder_input)
#     decoded = decoder(encoded)
#     autoencoder = models.Model(autoencoder_input, decoded)
#     autoencoder.compile(optimizer='adam', loss='mse')
#     return autoencoder

# # Specify the latent space dimension
# latent_space = 16

# # Build the encoder, decoder, and autoencoder
# encoder = build_encoder(latent_space)
# decoder = build_decoder(latent_space)
# autoencoder = build_autoencoder(encoder, decoder)

In [None]:
# from kapre.losses import *
# spectrogram_loss = SpectrogramLoss()
# perceptual_loss = PerceptualLoss()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop training once the monitored metric has not improved for 5 epochs and
# roll the model back to the best weights seen
es = EarlyStopping(restore_best_weights=True, patience=5)

# Date plus hour of the run, used to name the saved model
date_time_str = datetime.now().strftime("%Y-%m-%d_%H")

# Checkpoint callback that keeps only the best model on disk
checkpoint = ModelCheckpoint(filepath=f"sp_autoencoder_{date_time_str}.h5", save_best_only=True)

# Train the autoencoder to reproduce its own input; the held-out split is
# used for validation in the same way
fit_options = dict(
    epochs=100,
    batch_size=16,
    validation_data=(X_test, X_test),
    callbacks=[es, checkpoint],
)
history = autoencoder.fit(X_train, X_train, **fit_options)

### To load an already trained model

In [None]:
# from tensorflow.keras.models import load_model
# # Load an existing autoencoder model
# autoencoder = load_model('autoencoder_2023-06-06_15.h5')

### Predict using the model

In [None]:
# Generate reconstructed spectrograms using the trained autoencoder.
# The outputs are on the same normalized scale as the inputs and still carry
# the trailing channel axis.
reconstructed_X_test = autoencoder.predict(X_test)
reconstructed_X_train = autoencoder.predict(X_train)

In [None]:
# Plot the training and validation loss curves recorded by fit()
loss_curves = {
    'Training Loss': history.history['loss'],
    'Validation Loss': history.history['val_loss'],
}
for curve_label, curve_values in loss_curves.items():
    plt.plot(curve_values, label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

### Visualize some original vs. autoencoded spectrograms

In [None]:
# Show one test spectrogram above its reconstruction (both still normalized)
index = 0
panels = [
    ('Original Spectrogram (normalized!)', X_test),
    ('Reconstructed Spectrogram (normalized!)', reconstructed_X_test),
]

plt.figure(figsize=(16, 4))
for row, (panel_title, batch) in enumerate(panels, start=1):
    plt.subplot(2, 1, row)
    plt.title(panel_title)
    # squeeze() drops the channel axis so imshow receives a 2-D image
    plt.imshow(batch[index].squeeze(), cmap='magma', origin='lower')

plt.tight_layout()
plt.show()

### Reverse-transform the reconstructed spectrograms

In [None]:
# Compare the value range of the normalized training inputs with the range of
# their reconstructions
print(X_train.min(), X_train.max())
print(reconstructed_X_train.min(), reconstructed_X_train.max())

In [None]:
# Same comparison for the test split
print(X_test.min(), X_test.max())
print(reconstructed_X_test.min(), reconstructed_X_test.max())

In [None]:
# spectrogram = reconstructed_X_test[0]
# spectrogram.shape

In [None]:
# #Remove the extra dimension added by np.expand_dims:
# spectrogram = np.squeeze(spectrogram, axis=-1)
#De-normalize the spectrograms back into their original range:
#spectrogram_denorm = spectrogram * (TEST_MAX - TEST_MIN) + TEST_MIN
# Reshape the spectrograms back to their original shape:
# spectrogram_denorm = np.reshape(spectrogram_denorm, X_test.shape)
# spectrogram_denorm = np.expand_dims(spectrogram_denorm, axis=-1)
#spectrogram_denorm.shape 

In [None]:
# print(TEST_MIN, TEST_MAX)
# print(spectrogram_denorm.min(), spectrogram_denorm.max())

In [None]:
# # De-normalize reconstructed_spectrograms back into X_test ranges
# reconstructed_spectrograms_denorm = reconstructed_spectrograms * (np.max(X_test) - np.min(X_test)) + np.min(X_test)

In [None]:
# # Assuming the sampling rate and hop length used to generate the spectrograms
# sampling_rate = 44100
# #duration = 30  # Desired duration in seconds
# #hop_length = int(duration * sampling_rate)
# hop_length = 520 

# # Inverse transform the spectrograms to obtain the audio signals
# reconstructed_audios = []
# for spectrogram in reconstructed_spectrograms_denorm:
#     spectrogram = np.squeeze(spectrogram)
#     spectrogram = librosa.db_to_power(spectrogram)
#     audio = librosa.feature.inverse.mel_to_audio(spectrogram, sr=sampling_rate, hop_length=hop_length) 
#     reconstructed_audios.append(audio)

# # Save the audio signals as .wav files
# for i, audio in enumerate(reconstructed_audios[0:1]):
#     output_path = f'reconstructed_audio_{i}.wav'
#     sf.write(output_path, audio, sampling_rate)


In [None]:
# file_path = file_paths[0]
# file_path
# audio, sr = librosa.load(file_path, sr=44100, mono=True)
# spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
# spectrogram = librosa.power_to_db(spectrogram)

In [None]:
#X_test[0].shape

In [None]:
# Inversion settings. They must equal the settings the spectrograms were
# created with: load_data produces them at 44100 Hz with a hop of 512 samples
# (librosa's default). The previous value of 520 did not match, which
# time-stretched the reconstructed audio.
sampling_rate = 44100
hop_length = 512

# Index of the test sample to turn back into audio
i = 5
spectrogram = reconstructed_X_test[i]

# Undo the min-max scaling to get back to decibels, then drop the channel axis
spectrogram_denorm = spectrogram * (TEST_MAX - TEST_MIN) + TEST_MIN
spectrogram_denorm = spectrogram_denorm.squeeze()

# Decibels -> power, the inverse of the power_to_db call used when loading
spectrogram_rev = librosa.db_to_power(spectrogram_denorm)

# Mel power spectrogram -> waveform
reconstructed_audio = librosa.feature.inverse.mel_to_audio(spectrogram_rev, sr=sampling_rate, hop_length=hop_length)
output_path = f'reconstructed_audio_{i}_.wav'
sf.write(output_path, reconstructed_audio, sampling_rate)