In [6]:
import os
import random
import librosa
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf  # For saving audio files
import librosa.display

# Path to the main folder
base_dir = r"C:\Users\Peter\Documents\Sync\SyncDokumente\FH\MA\3. Semester\SLP24_exercises-1\PStA\PROCESS-V1"

# Preprocessing parameters
SR = 16000       # Sample rate
DURATION = 2.0   # Duration in seconds
N_MELS = 64      # Number of mel bands
HOP_LENGTH = 512 # Hop length for STFT
FIXED_LENGTH = int(SR * DURATION)  # Number of samples for fixed duration

# Function to load all .wav files recursively
def load_audio_files(base_dir):
    audio_files = []
    for root, _, files in os.walk(base_dir):
        for file in files:
            if file.endswith(".wav"):
                audio_files.append(os.path.join(root, file))
    return audio_files

# Function to preprocess audio files
def preprocess_audio_multiple_segments(file_path, num_segments=3):
    y, sr = librosa.load(file_path, sr=SR)
    segments = []
    
    for _ in range(num_segments):
        if len(y) < FIXED_LENGTH:
            # Pad audio
            padding = FIXED_LENGTH - len(y)
            y = np.pad(y, (0, padding), 'constant')
            segment = y
        else:
            # Randomly select a start point
            max_offset = len(y) - FIXED_LENGTH
            start = random.randint(0, max_offset)
            segment = y[start:start + FIXED_LENGTH]

        # Compute mel-spectrogram
        mel_spec = librosa.feature.melspectrogram(y=segment, sr=sr, n_mels=N_MELS, hop_length=HOP_LENGTH)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        segments.append(mel_spec_db)

    return segments

# Load all .wav file paths
audio_file_paths = load_audio_files(base_dir)
print(f"Found {len(audio_file_paths)} audio files.")

# Preprocess all audio files
data = []
for idx, file_path in enumerate(audio_file_paths):
    mel_specs = preprocess_audio_multiple_segments(file_path, num_segments=3)
    for mel_spec in mel_specs:
        data.append(mel_spec)

data = np.array(data)
print(f"Data shape after converting to np.array: {data.shape}")  # Should be (num_samples, N_MELS, time_steps)

# Normalize data to [-1, 1]
data_min = data.min()
data_max = data.max()
data = (data - data_min) / (data_max - data_min) * 2 - 1

# Add a channel dimension
data = data[..., np.newaxis]
print(f"Data shape after adding channel dimension: {data.shape}")  # Should be (num_samples, N_MELS, time_steps, 1)



Found 471 audio files.
Data shape after converting to np.array: (1413, 64, 63)
Data shape after adding channel dimension: (1413, 64, 63, 1)


In [7]:
def check_sampling_rates(file_paths):
    rates = []
    for file_path in file_paths:
        sr = librosa.get_samplerate(file_path)
        rates.append(sr)
    return rates

rates = check_sampling_rates(audio_file_paths)
unique_rates = set(rates)
print(f"Unique sampling rates in dataset: {unique_rates}")
if len(unique_rates) > 1:
    print("Warning: Dataset contains multiple sampling rates.")

i = check_sampling_rates(audio_file_paths)
print (i[1])

Unique sampling rates in dataset: {16000}
16000


In [8]:

import tensorflow as tf
print(tf.__version__)
print(tf.keras.__version__)
from keras import backend as K
K.clear_session()

2.18.0
3.6.0



In [9]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Dense, Conv2D, Conv2DTranspose, Flatten, Reshape,
    LeakyReLU, BatchNormalization, ZeroPadding2D, Cropping2D
)
from tensorflow.keras.models import Model, Sequential

# Input shape parameters
num_samples, N_MELS, time_steps, channels = data.shape
input_shape = (N_MELS, time_steps, channels)
latent_dim = 100  # Size of the latent space (noise vector)

def build_generator(latent_dim):
    model = Sequential()
    
    # Starting dimensions
    n_rows, n_cols = 8, 11  # Choose values so that upsampling results in (64, 87)
    model.add(Dense(256 * n_rows * n_cols, input_dim=latent_dim))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Reshape((n_rows, n_cols, 256)))  # Shape: (8, 11, 256)
    
    # First upsampling
    model.add(Conv2DTranspose(256, kernel_size=4, strides=(2, 2), padding='same'))  # Shape: (16, 22, 256)
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))
    
    # Second upsampling
    model.add(Conv2DTranspose(128, kernel_size=4, strides=(2, 2), padding='same'))  # Shape: (32, 44, 128)
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))
    
    # Third upsampling
    model.add(Conv2DTranspose(64, kernel_size=4, strides=(2, 2), padding='same'))  # Shape: (64, 88, 64)
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))
    
    # Final adjustment to reach (64, 87, 1)
    model.add(Conv2DTranspose(1, kernel_size=4, strides=(1, 1), padding='valid', activation='tanh'))  # Shape: (67, 91, 1)
    # Adjusted cropping to achieve the desired output shape
    model.add(Cropping2D(cropping=((1, 2), (2, 2))))  # Crop to (64, 87, 1)
    
    return model


def build_discriminator(input_shape):
    model = Sequential()
    model.add(Conv2D(64, kernel_size=4, strides=2, padding='same', input_shape=input_shape))
    model.add(LeakyReLU(alpha=0.2))
    
    model.add(Conv2D(128, kernel_size=4, strides=2, padding='same'))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))
    
    model.add(Conv2D(256, kernel_size=4, strides=2, padding='same'))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))
    
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    return model

# Build the generator and discriminator
generator = build_generator(latent_dim)
discriminator = build_discriminator(input_shape)

# Compile the discriminator
discriminator.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5),
    run_eagerly=True
)

# Build and compile the GAN model
discriminator.trainable = True  # Freeze the discriminator's weights when training the generator
gan_input = Input(shape=(latent_dim,))
generated_image = generator(gan_input)
gan_output = discriminator(generated_image)
gan = Model(gan_input, gan_output)
gan.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5)
)


ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "functional_23" is incompatible with the layer: expected shape=(None, 64, 63, 1), found shape=(None, 64, 87)[0m

Arguments received by Sequential.call():
  • args=('<KerasTensor shape=(None, 64, 87, 1), dtype=float32, sparse=False, name=keras_tensor_163>',)
  • kwargs={'mask': 'None'}

In [12]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Dense, Conv2D, Conv2DTranspose, Flatten, Reshape,
    LeakyReLU, BatchNormalization, ZeroPadding2D, Cropping2D
)
from tensorflow.keras.models import Model, Sequential

# Input shape parameters
num_samples, N_MELS, time_steps, channels = data.shape
print(f"Input shape: ({num_samples}, {N_MELS}, {time_steps}, {channels})")
input_shape = (N_MELS, time_steps, channels)
latent_dim = 100  # Size of the latent space (noise vector)

def build_generator(latent_dim):
    model = Sequential()
    
    # Starting dimensions - Adjusted for (64, 63) output
    n_rows, n_cols = 8, 8  # Choose values so that upsampling results in (64, 64) 
    model.add(Dense(128 * n_rows * n_cols, input_dim=latent_dim))  
    model.add(LeakyReLU(alpha=0.2))
    model.add(Reshape((n_rows, n_cols, 128)))  # Shape: (8, 8, 128)
    
    # First upsampling
    model.add(Conv2DTranspose(128, kernel_size=4, strides=(2, 2), padding='same'))  # Shape: (16, 16, 128)
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))
    
    # Second upsampling
    model.add(Conv2DTranspose(64, kernel_size=4, strides=(2, 2), padding='same'))  # Shape: (32, 32, 64)
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))
    

    # Third upsampling
    model.add(Conv2DTranspose(32, kernel_size=4, strides=(2, 2), padding='same'))  # Shape: (64, 64, 32)
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))

    # Final layer - Adjusted for (64, 63, 1) output
    model.add(Conv2DTranspose(1, kernel_size=(3, 3), strides=(1, 1), padding='valid', activation='tanh'))  # Shape: (64, 63, 1)

    # Cropping to get the exact desired shape if necessary
    model.add(Cropping2D(((0,2), (3,0))))    
    
    return model


def build_discriminator(input_shape):
    model = Sequential()
    model.add(Conv2D(64, kernel_size=4, strides=2, padding='same', input_shape=input_shape))
    model.add(LeakyReLU(alpha=0.2))
    
    model.add(Conv2D(128, kernel_size=4, strides=2, padding='same'))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))
    
    model.add(Conv2D(256, kernel_size=4, strides=2, padding='same'))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))
    
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    return model

# Build the generator and discriminator
generator = build_generator(latent_dim)
discriminator = build_discriminator(input_shape)

# Compile the discriminator
discriminator.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5),
    run_eagerly=True
)

# Build and compile the GAN model
discriminator.trainable = True  # Freeze the discriminator's weights when training the generator
gan_input = Input(shape=(latent_dim,))
generated_image = generator(gan_input)
gan_output = discriminator(generated_image)
gan = Model(gan_input, gan_output)
gan.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5)
)


Input shape: (1413, 64, 63, 1)


In [13]:
# Training parameters
epochs = 5000
batch_size = 32
save_interval = 1000  # Save generated samples every 1000 epochs

# Labels for real and fake images (unused in custom loop but kept for completeness)
real = np.ones((batch_size, 1), dtype=np.float32)
fake = np.zeros((batch_size, 1), dtype=np.float32)

# Define loss function
binary_cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=False)

# Define optimizers
generator_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5)
discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5)

# Build the generator and discriminator (ensure these functions are defined)
generator = build_generator(latent_dim)
discriminator = build_discriminator(input_shape)

# Ensure models are trainable
generator.trainable = True
discriminator.trainable = True

# Training step function
@tf.function
def train_step(real_images):
    # Generate noise
    noise = tf.random.normal([batch_size, latent_dim])

    # Train the discriminator
    with tf.GradientTape() as disc_tape:
        # Generate fake images
        generated_images = generator(noise, training=True)
        
        # Discriminator outputs
        real_output = discriminator(real_images, training=True)
        fake_output = discriminator(generated_images, training=True)

        # Calculate discriminator loss
        disc_loss_real = binary_cross_entropy(tf.ones_like(real_output), real_output)
        disc_loss_fake = binary_cross_entropy(tf.zeros_like(fake_output), fake_output)
        disc_loss = disc_loss_real + disc_loss_fake

    # Calculate discriminator gradients
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
    
    # Apply discriminator gradients
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

    # Train the generator
    with tf.GradientTape() as gen_tape:
        # Generate fake images
        generated_images = generator(noise, training=True)
        
        # Discriminator output for generated images
        fake_output = discriminator(generated_images, training=True)

        # Calculate generator loss
        gen_loss = binary_cross_entropy(tf.ones_like(fake_output), fake_output)

    # Calculate generator gradients
    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    
    # Apply generator gradients
    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))

    return disc_loss, gen_loss

# Check trainable variables
print(f"Generator trainable variables: {len(generator.trainable_variables)}")
print(f"Discriminator trainable variables: {len(discriminator.trainable_variables)}")

# Training loop
for epoch in range(1, epochs + 1):
    # Select a random batch of real images
    idx = np.random.randint(0, data.shape[0], batch_size)
    real_imgs = data[idx]

    # Perform a training step
    d_loss, g_loss = train_step(real_imgs)

    # Print progress
    if epoch % 100 == 0 or epoch == 1:
        print(f"{epoch} [D loss: {d_loss.numpy():.4f}] [G loss: {g_loss.numpy():.4f}]")

    # If at save interval, save generated image samples
    if epoch % save_interval == 0:
        # Generate and save images
        noise = tf.random.normal([1, latent_dim])
        gen_img = generator(noise, training=False)
        gen_img = gen_img.numpy().squeeze()

        # Rescale back to original scale
        gen_img = (gen_img + 1) / 2  # Scale from [-1, 1] to [0, 1]
        gen_img = gen_img * (data_max - data_min) + data_min  # Rescale to original data range

        # Save the generated image
        plt.figure(figsize=(10, 4))
        librosa.display.specshow(gen_img, sr=SR, hop_length=HOP_LENGTH, x_axis='time', y_axis='mel')
        plt.title(f"Generated Mel Spectrogram at Epoch {epoch}")
        plt.colorbar(format='%+2.0f dB')
        plt.tight_layout()
        plt.savefig(f"generated_image_epoch_{epoch}.png")
        plt.close()


Generator trainable variables: 16
Discriminator trainable variables: 12
1 [D loss: 1.9609] [G loss: 2.0372]
100 [D loss: 0.2051] [G loss: 3.9163]
200 [D loss: 0.2145] [G loss: 4.9114]
300 [D loss: 0.0683] [G loss: 5.1567]
400 [D loss: 0.5063] [G loss: 2.6100]
500 [D loss: 0.2208] [G loss: 3.5783]
600 [D loss: 0.4821] [G loss: 2.3689]
700 [D loss: 0.2634] [G loss: 3.5275]
800 [D loss: 0.5644] [G loss: 2.8879]
900 [D loss: 0.3330] [G loss: 3.0212]
1000 [D loss: 0.5814] [G loss: 3.7845]
1100 [D loss: 1.0346] [G loss: 4.8884]
1200 [D loss: 0.3142] [G loss: 2.6645]
1300 [D loss: 0.4955] [G loss: 2.5401]
1400 [D loss: 0.3485] [G loss: 2.4766]
1500 [D loss: 0.5230] [G loss: 3.2496]
1600 [D loss: 0.9867] [G loss: 2.7117]
1700 [D loss: 0.8517] [G loss: 2.1016]
1800 [D loss: 0.2692] [G loss: 2.8978]
1900 [D loss: 0.7775] [G loss: 0.8287]
2000 [D loss: 0.4149] [G loss: 3.3972]
2100 [D loss: 0.5642] [G loss: 1.8329]
2200 [D loss: 0.3051] [G loss: 2.6673]
2300 [D loss: 0.6707] [G loss: 2.7022]
2400

In [14]:
def generate_audio_from_mel(generator, latent_dim, filename):
    # Generate mel-spectrogram
    noise = np.random.normal(0, 1, (1, latent_dim))
    gen_mel = generator.predict(noise)
    gen_mel = gen_mel.squeeze()
    
    # Rescale the generated mel-spectrogram
    gen_mel = (gen_mel + 1) / 2  # Scale from [-1, 1] to [0, 1]
    gen_mel = gen_mel * (data_max - data_min) + data_min  # Rescale to original data range
    
    # Convert mel-spectrogram (in dB) to power
    gen_mel = librosa.db_to_power(gen_mel)
    
    # Invert the mel-spectrogram to a waveform
    y = librosa.feature.inverse.mel_to_audio(
        gen_mel,
        sr=SR,
        n_fft=2048,
        hop_length=HOP_LENGTH,
        win_length=2048,
        n_iter=60,
        power=1.0
    )
    
    # Save the audio using soundfilehttps://www.tensorflow.org/api_docs/python/tf/function
    sf.write(filename, y, SR)
    print(f"Generated audio saved to {filename}")
    
# Example usage
generate_audio_from_mel(generator, latent_dim, 'generated_audio_I.wav')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 185ms/step
Generated audio saved to generated_audio_I.wav


In [15]:
# Save final models
generator.save("final_generator_I.h5")
discriminator.save("final_discriminator_I.h5")



In [16]:
# Save final models
generator.save("final_generator_I.keras")
discriminator.save("final_discriminator_I.keras")

In [17]:
def generate_audio_from_mel_fixed(generator, latent_dim, filename, max_duration=20.0):
    """
    Generate audio from a trained GAN model and save it with fixed settings.
    """
    import numpy as np
    import librosa
    import soundfile as sf

    # Generate mel-spectrogram
    noise = np.random.normal(0, 1, (1, latent_dim))
    gen_mel = generator.predict(noise)
    gen_mel = gen_mel.squeeze()

    # Rescale the generated mel-spectrogram
    gen_mel = (gen_mel + 1) / 2  # Scale from [-1, 1] to [0, 1]
    gen_mel = gen_mel * (data_max - data_min) + data_min  # Rescale to original data range

    # Convert mel-spectrogram (in dB) to power
    gen_mel = librosa.db_to_power(gen_mel)

    # Invert the mel-spectrogram to a waveform
    y = librosa.feature.inverse.mel_to_audio(
        gen_mel,
        sr=SR,
        n_fft=2048,
        hop_length=HOP_LENGTH,
        win_length=2048,
        n_iter=60,
        power=1.0
    )

    # Adjust length to be slightly variable but within 20 seconds
    max_samples = int(SR * max_duration)
    y = librosa.util.fix_length(y, size=np.random.randint(max_samples - 1000, max_samples + 1000))

    # Save the audio using soundfile
    sf.write(filename, y, SR)
    print(f"Generated audio saved to {filename}")

def generate_batch_audio_fixed(generator, latent_dim, num_samples, folder_path, max_duration=4.0):
    """
    Generate a batch of audio files and save them to a folder.
    """
    import os

    os.makedirs(folder_path, exist_ok=True)
    for i in range(num_samples):
        filename = os.path.join(folder_path, f"synthetic_sample_{i}.wav")
        generate_audio_from_mel_fixed(generator, latent_dim, filename, max_duration=max_duration)

generate_batch_audio_fixed(generator, latent_dim, num_samples=100, folder_path="augmented_data_fixed_Is")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Generated audio saved to augmented_data_fixed_Is\synthetic_sample_0.wav
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Generated audio saved to augmented_data_fixed_Is\synthetic_sample_1.wav
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Generated audio saved to augmented_data_fixed_Is\synthetic_sample_2.wav
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Generated audio saved to augmented_data_fixed_Is\synthetic_sample_3.wav
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Generated audio saved to augmented_data_fixed_Is\synthetic_sample_4.wav
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Generated audio saved to augmented_data_fixed_Is\synthetic_sample_5.wav
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Generated audio saved to augmented_data_fixed_Is\synthetic_sampl