In [1]:

!python --version

Python 3.7.0


In [2]:
#import magenta and other dependencies
from scipy.io import wavfile

def save_wav(waveform, path, sample_rate=16000):
    # SciPy's `wavfile.write` saves a numpy array as a WAV file.
    # Ensure waveform is in the range of int16 for compatibility.
    wavfile.write(path, sample_rate, (waveform * 32767).astype(np.int16))


In [3]:
import tensorflow as tf
print(tf.__version__)

2.9.1


In [11]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import librosa
from magenta.models.gansynth.lib import model as lib_model
from magenta.models.gansynth.lib import flags as lib_flags
import matplotlib.pyplot as plt

# paths for our directories
real_spectrograms_dir = r'C:\Users\khafa\Downloads\UrbanSound8K\UrbanSound8K\spectrograms'
output_audio_dir = r'C:\Users\khafa\Downloads\UrbanSound8K\UrbanSound8K\generated_audio'
os.makedirs(output_audio_dir, exist_ok=True)

#parameters
latent_dim = 100
spectrogram_shape = (128, 128)  # Match this with the dimensions of your spectrograms

#load the real specs
def load_spectrograms(spectrogram_dir, target_shape):
    spectrograms = []
    for filename in os.listdir(spectrogram_dir):
        if filename.endswith('.png'):
            img = plt.imread(os.path.join(spectrogram_dir, filename))
            if img.shape[-1] == 4:  # RGBA
                img = img[..., :3].dot([0.2989, 0.5870, 0.1140])
            elif img.shape[-1] == 3:  # RGB
                img = img.dot([0.2989, 0.5870, 0.1140])

            img_resized = tf.image.resize(img[..., np.newaxis], target_shape)  # Add channel dimension
            spectrograms.append(img_resized)
    return np.array(spectrograms)


real_spectrograms = load_spectrograms(real_spectrograms_dir, spectrogram_shape)

# nromalization
real_spectrograms = real_spectrograms / 255.0

# generator and discriminator

# generator
def build_generator(latent_dim, output_shape):
    model = tf.keras.Sequential([
        layers.Dense(256, activation='relu', input_dim=latent_dim),  # Latent vector as input here
        layers.Reshape((16, 16, 1)),
        layers.Conv2DTranspose(128, kernel_size=4, strides=2, padding='same', activation='relu'),
        layers.BatchNormalization(),
        layers.Conv2DTranspose(64, kernel_size=4, strides=2, padding='same', activation='relu'),
        layers.BatchNormalization(),
        layers.Conv2DTranspose(1, kernel_size=4, strides=2, padding='same', activation='sigmoid'),
        layers.Reshape((*output_shape, 1))  # Output shape to match spectrogram dimensions with 1 channel
    ])
    return model

# discriminator
def build_discriminator(input_shape):
    model = tf.keras.Sequential([
        layers.Conv2D(64, kernel_size=4, strides=2, padding='same', input_shape=input_shape),
        layers.LeakyReLU(alpha=0.2),
        layers.Conv2D(128, kernel_size=4, strides=2, padding='same'),
        layers.LeakyReLU(alpha=0.2),
        layers.Flatten(),
        layers.Dense(1, activation='sigmoid')
    ])
    return model

generator = build_generator(latent_dim, spectrogram_shape)
discriminator = build_discriminator((spectrogram_shape[0], spectrogram_shape[1], 1))
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#combine the two models
discriminator.trainable = False
gan_input = tf.keras.Input(shape=(latent_dim,))
generated_spectrogram = generator(gan_input)
gan_output = discriminator(generated_spectrogram)
gan = tf.keras.Model(gan_input, gan_output)
gan.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002), loss='binary_crossentropy')


# training the gan
def train_gan(generator, discriminator, gan, real_spectrograms, latent_dim, epochs=1000, batch_size=32):
    half_batch = int(batch_size / 2)
    for epoch in range(epochs):
        # Discriminator Training: Real vs Fake with Label Noise and Smoothing
        idx = np.random.randint(0, real_spectrograms.shape[0], half_batch)
        real_images = real_spectrograms[idx]
        real_labels = np.ones((half_batch, 1)) * 0.9  # Label smoothing for real labels

        noise = np.random.normal(0, 1, (half_batch, latent_dim))  # Latent vector batch
        fake_images = generator.predict(noise)
        fake_labels = np.zeros((half_batch, 1)) + 0.1  # Noise for fake labels

        d_loss_real = discriminator.train_on_batch(real_images, real_labels)
        d_loss_fake = discriminator.train_on_batch(fake_images, fake_labels)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Generator Training: Train to Fool Discriminator
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        valid_labels = np.ones((batch_size, 1))  # Generator wants these to be classified as real
        g_loss = gan.train_on_batch(noise, valid_labels)

        print(f"{epoch}/{epochs} [D loss: {d_loss[0]} | D acc.: {100*d_loss[1]}] [G loss: {g_loss}]")

train_gan(generator, discriminator, gan, real_spectrograms, latent_dim)

#convert the finished fake specttrogram to audio
def convert_spectrogram_to_audio(spectrogram):
    # Ensure spectrogram is 2D by squeezing any extra dimensions
    spectrogram = np.squeeze(spectrogram)
    if spectrogram.ndim != 2:
        raise ValueError("Spectrogram must be a 2D array for audio conversion.")

    S = librosa.db_to_power(spectrogram)  # Convert to power
    audio = librosa.feature.inverse.mel_to_audio(S, sr=22050, n_fft=2048, hop_length=512)
    return audio

noise = np.random.normal(0, 1, (1, latent_dim))
generated_spectrogram = generator.predict(noise)[0]
audio = convert_spectrogram_to_audio(generated_spectrogram)
librosa.output.write_wav(os.path.join(output_audio_dir, "generated_audio.wav"), audio, sr=22050)




0/1000 [D loss: 1.5526798367500305 | D acc.: 0.0] [G loss: 0.6007356643676758]
1/1000 [D loss: 0.7052302956581116 | D acc.: 0.0] [G loss: 1.103801965713501]
2/1000 [D loss: 0.5393840819597244 | D acc.: 0.0] [G loss: 1.851409912109375]
3/1000 [D loss: 0.48406440019607544 | D acc.: 0.0] [G loss: 2.7698349952697754]
4/1000 [D loss: 0.43947792053222656 | D acc.: 0.0] [G loss: 3.124368667602539]
5/1000 [D loss: 0.35940201580524445 | D acc.: 0.0] [G loss: 2.491506576538086]
6/1000 [D loss: 0.3278929740190506 | D acc.: 0.0] [G loss: 1.6036906242370605]
7/1000 [D loss: 0.37109918892383575 | D acc.: 0.0] [G loss: 1.6461987495422363]
8/1000 [D loss: 0.36364510655403137 | D acc.: 0.0] [G loss: 2.144848346710205]
9/1000 [D loss: 0.3543195128440857 | D acc.: 0.0] [G loss: 2.231083393096924]
10/1000 [D loss: 0.3325713872909546 | D acc.: 0.0] [G loss: 1.965491771697998]
11/1000 [D loss: 0.326012060046196 | D acc.: 0.0] [G loss: 1.7192751169204712]
12/1000 [D loss: 0.3365990072488785 | D acc.: 0.0] [G