In [1]:
from datasets import load_dataset


dataset = load_dataset("google/MusicCaps")

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ytid', 'start_s', 'end_s', 'audioset_positive_labels', 'aspect_list', 'caption', 'author_id', 'is_balanced_subset', 'is_audioset_eval'],
        num_rows: 5521
    })
})

In [3]:
train_df = dataset['train'].to_pandas()


In [4]:
train_df

Unnamed: 0,ytid,start_s,end_s,audioset_positive_labels,aspect_list,caption,author_id,is_balanced_subset,is_audioset_eval
0,-0Gj8-vB1q4,30,40,"/m/0140xf,/m/02cjck,/m/04rlf","['low quality', 'sustained strings melody', 's...",The low quality recording features a ballad so...,4,False,True
1,-0SdAVK79lg,30,40,"/m/0155w,/m/01lyv,/m/0342h,/m/042v_gx,/m/04rlf...","['guitar song', 'piano backing', 'simple percu...",This song features an electric guitar as the m...,0,False,False
2,-0vPFx-wRRI,30,40,"/m/025_jnm,/m/04rlf","['amateur recording', 'finger snipping', 'male...",a male voice is singing a melody with changing...,6,False,True
3,-0xzrMun0Rs,30,40,"/m/01g90h,/m/04rlf","['backing track', 'jazzy', 'digital drums', 'p...",This song contains digital drums playing a sim...,6,False,True
4,-1LrH01Ei1w,30,40,"/m/02p0sh1,/m/04rlf","['rubab instrument', 'repetitive melody on dif...",This song features a rubber instrument being p...,0,False,False
...,...,...,...,...,...,...,...,...,...
5516,zw5dkiklbhE,15,25,"/m/01sm1g,/m/0l14md","['amateur recording', 'percussion', 'wooden bo...",This audio contains someone playing a wooden b...,6,False,False
5517,zwfo7wnXdjs,30,40,"/m/02p0sh1,/m/04rlf,/m/06j64v","['instrumental music', 'arabic music', 'genera...",The song is an instrumental. The song is mediu...,1,True,True
5518,zx_vcwOsDO4,50,60,"/m/01glhc,/m/02sgy,/m/0342h,/m/03lty,/m/04rlf,...","['instrumental', 'no voice', 'electric guitar'...",The rock music is purely instrumental and feat...,2,True,True
5519,zyXa2tdBTGc,30,40,"/m/04rlf,/t/dd00034","['instrumental music', 'gospel music', 'strong...",The song is an instrumental. The song is slow ...,1,False,False


In [5]:
train_df['caption']

0       The low quality recording features a ballad so...
1       This song features an electric guitar as the m...
2       a male voice is singing a melody with changing...
3       This song contains digital drums playing a sim...
4       This song features a rubber instrument being p...
                              ...                        
5516    This audio contains someone playing a wooden b...
5517    The song is an instrumental. The song is mediu...
5518    The rock music is purely instrumental and feat...
5519    The song is an instrumental. The song is slow ...
5520    This is a glitch music piece. There is a synth...
Name: caption, Length: 5521, dtype: object

In [6]:
import os
import pandas as pd

# Path to the folder containing the WAV files
wav_folder = 'music_data'  # Update this with your actual folder path

# Function to get the WAV file path based on YouTube ID
def get_wav_path(ytid):
    return os.path.join(wav_folder, f"{ytid}.wav")

# Create a list to store tuples of captions and WAV file paths
caption_wav_list = []

# Iterate through the rows of the original DataFrame
for index, row in train_df.iterrows():
    # Get the YouTube ID and caption
    ytid = row['ytid']
    caption = row['caption']
    
    # Get the WAV file path
    wav_path = get_wav_path(ytid)
    
    # Append the caption and WAV file path to the list
    caption_wav_list.append((caption, wav_path))

# Create a new DataFrame from the list
caption_wav_df = pd.DataFrame(caption_wav_list, columns=['caption', 'wav_path'])



In [7]:
caption_wav_df

Unnamed: 0,caption,wav_path
0,The low quality recording features a ballad so...,music_data\-0Gj8-vB1q4.wav
1,This song features an electric guitar as the m...,music_data\-0SdAVK79lg.wav
2,a male voice is singing a melody with changing...,music_data\-0vPFx-wRRI.wav
3,This song contains digital drums playing a sim...,music_data\-0xzrMun0Rs.wav
4,This song features a rubber instrument being p...,music_data\-1LrH01Ei1w.wav
...,...,...
5516,This audio contains someone playing a wooden b...,music_data\zw5dkiklbhE.wav
5517,The song is an instrumental. The song is mediu...,music_data\zwfo7wnXdjs.wav
5518,The rock music is purely instrumental and feat...,music_data\zx_vcwOsDO4.wav
5519,The song is an instrumental. The song is slow ...,music_data\zyXa2tdBTGc.wav


In [8]:
caption_wav_df = caption_wav_df[:100]

In [9]:
caption_wav_df

Unnamed: 0,caption,wav_path
0,The low quality recording features a ballad so...,music_data\-0Gj8-vB1q4.wav
1,This song features an electric guitar as the m...,music_data\-0SdAVK79lg.wav
2,a male voice is singing a melody with changing...,music_data\-0vPFx-wRRI.wav
3,This song contains digital drums playing a sim...,music_data\-0xzrMun0Rs.wav
4,This song features a rubber instrument being p...,music_data\-1LrH01Ei1w.wav
...,...,...
95,This is a rock music piece playing in the back...,music_data\-taO6N-rxv4.wav
96,The low quality recording features a pop song ...,music_data\-tmY1GEH3_Y.wav
97,This composition contains an upright bass play...,music_data\-tpq_bzSKes.wav
98,An acoustic drum set is playing a 16th note rh...,music_data\-uaTK8sa5Ms.wav


In [10]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Get list of stopwords
stop_words = set(stopwords.words('english'))

# Function for text preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    
    return tokens

# Apply text preprocessing to captions in the DataFrame
caption_wav_df['processed_caption'] = caption_wav_df['caption'].apply(preprocess_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Medhansh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Medhansh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  caption_wav_df['processed_caption'] = caption_wav_df['caption'].apply(preprocess_text)


In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Create a tokenizer
tokenizer = Tokenizer()

# Fit tokenizer on the processed captions
tokenizer.fit_on_texts(caption_wav_df['processed_caption'])

# Convert tokens to sequences of word indices
sequences = tokenizer.texts_to_sequences(caption_wav_df['processed_caption'])

# Pad sequences to ensure uniform length
max_caption_length = 58 # Example maximum length of caption sequence
padded_sequences = pad_sequences(sequences, maxlen=max_caption_length, padding='post')

# Inspect vocabulary size
vocab_size = len(tokenizer.word_index) +1 # Add 1 for padding token
print("Vocabulary size:", vocab_size)




Vocabulary size: 866


In [12]:

# Inspect example sequence
print("Example sequence:", padded_sequences[0])

Example sequence: [ 15  10   4   7 160   2  96  84  64  85  22   3  76  24  18  11   6 111
 216  51 112 113  65 337 338   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0]


In [13]:

padded_sequences[0].shape

(58,)

In [14]:
max_length = max(len(caption) for caption in caption_wav_df['processed_caption'])
print("Maximum length of processed captions:", max_length)


Maximum length of processed captions: 58


In [15]:
caption_wav_df['processed_caption'][0]

['low',
 'quality',
 'recording',
 'features',
 'ballad',
 'song',
 'contains',
 'sustained',
 'strings',
 'mellow',
 'piano',
 'melody',
 'soft',
 'female',
 'vocal',
 'singing',
 'sounds',
 'sad',
 'soulful',
 'like',
 'something',
 'would',
 'hear',
 'sunday',
 'services']

In [16]:
caption_wav_df

Unnamed: 0,caption,wav_path,processed_caption
0,The low quality recording features a ballad so...,music_data\-0Gj8-vB1q4.wav,"[low, quality, recording, features, ballad, so..."
1,This song features an electric guitar as the m...,music_data\-0SdAVK79lg.wav,"[song, features, electric, guitar, main, instr..."
2,a male voice is singing a melody with changing...,music_data\-0vPFx-wRRI.wav,"[male, voice, singing, melody, changing, tempo..."
3,This song contains digital drums playing a sim...,music_data\-0xzrMun0Rs.wav,"[song, contains, digital, drums, playing, simp..."
4,This song features a rubber instrument being p...,music_data\-1LrH01Ei1w.wav,"[song, features, rubber, instrument, played, s..."
...,...,...,...
95,This is a rock music piece playing in the back...,music_data\-taO6N-rxv4.wav,"[rock, music, piece, playing, background, tuto..."
96,The low quality recording features a pop song ...,music_data\-tmY1GEH3_Y.wav,"[low, quality, recording, features, pop, song,..."
97,This composition contains an upright bass play...,music_data\-tpq_bzSKes.wav,"[composition, contains, upright, bass, playing..."
98,An acoustic drum set is playing a 16th note rh...,music_data\-uaTK8sa5Ms.wav,"[acoustic, drum, set, playing, 16th, note, rhy..."


Tejas ka code

In [17]:
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# 
# # Convert processed captions to text strings
# captions_text = caption_wav_df['processed_caption'].apply(lambda x: ' '.join(x))
# 
# # Initialize tokenizer
# tokenizer = Tokenizer()
# 
# # Fit tokenizer on captions text
# tokenizer.fit_on_texts(captions_text)
# 
# # Convert text to sequences of integers
# sequences = tokenizer.texts_to_sequences(captions_text)
# 
# # Get maximum sequence length
# max_seq_length = max(len(seq) for seq in sequences)
# 
# # Pad sequences to ensure uniform length
# padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post')
# 
# # Update processed captions in DataFrame with padded sequences
# caption_wav_df['padded_caption'] = padded_sequences.tolist()

In [18]:
# padded_sequences[0].shape

In [None]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt

# Function to convert WAV files to spectrograms and save images
def convert_to_spectrogram(wav_path, specto_folder):
    # Load the audio file
    try:
        y, sr = librosa.load(wav_path)
    except FileNotFoundError:
        print(f"File not found: {wav_path}")
        return None
    
    # Compute the spectrogram
    spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
    db_spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
    
    # Save the spectrogram image
    specto_path = os.path.join(specto_folder, '-'+os.path.basename(wav_path).replace('.wav', '.png'))
    plt.imsave(specto_path, db_spectrogram, cmap='viridis', format='png')
    
    return specto_path

# Create a folder for spectrograms if it doesn't exist
specto_folder = 'specto'
if not os.path.exists(specto_folder):
    os.makedirs(specto_folder)

# Apply the function to each WAV file and store the paths
caption_wav_df['specto_path'] = caption_wav_df['wav_path'].apply(lambda x: convert_to_spectrogram(x, specto_folder))


In [19]:
caption_wav_df

Unnamed: 0,caption,wav_path,processed_caption
0,The low quality recording features a ballad so...,music_data\-0Gj8-vB1q4.wav,"[low, quality, recording, features, ballad, so..."
1,This song features an electric guitar as the m...,music_data\-0SdAVK79lg.wav,"[song, features, electric, guitar, main, instr..."
2,a male voice is singing a melody with changing...,music_data\-0vPFx-wRRI.wav,"[male, voice, singing, melody, changing, tempo..."
3,This song contains digital drums playing a sim...,music_data\-0xzrMun0Rs.wav,"[song, contains, digital, drums, playing, simp..."
4,This song features a rubber instrument being p...,music_data\-1LrH01Ei1w.wav,"[song, features, rubber, instrument, played, s..."
...,...,...,...
95,This is a rock music piece playing in the back...,music_data\-taO6N-rxv4.wav,"[rock, music, piece, playing, background, tuto..."
96,The low quality recording features a pop song ...,music_data\-tmY1GEH3_Y.wav,"[low, quality, recording, features, pop, song,..."
97,This composition contains an upright bass play...,music_data\-tpq_bzSKes.wav,"[composition, contains, upright, bass, playing..."
98,An acoustic drum set is playing a 16th note rh...,music_data\-uaTK8sa5Ms.wav,"[acoustic, drum, set, playing, 16th, note, rhy..."


In [None]:
from PIL import Image

# Open the image
image_path = "D:\PycharmProjects\Gans Project\specto\--0Gj8-vB1q4.png"  # Replace with the path to your image
image = Image.open(image_path)

# Get the size of the image
image_size = image.size  # Returns a tuple (width, height)

# Print the size of the image
print("Image size (width x height):", image_size)


In [None]:
from keras.layers import Input, Embedding, LSTM, Dense, Conv2D, MaxPooling2D, Flatten, Concatenate, Reshape
from keras.models import Model

def build_generator(max_caption_length, spectrogram_shape):
    # Caption input
    caption_input = Input(shape=(max_caption_length,))
    
    # Reshape the input for LSTM
    caption_input_reshaped = Reshape((max_caption_length, 1))(caption_input)

    # LSTM layer
    lstm_units = 128
    lstm_output = LSTM(units=lstm_units)(caption_input_reshaped)
    lstm_output = Dense(units=128, activation='relu')(lstm_output)  # Add a dense layer to match shape with CNN output

    # Spectrogram input
    spectrogram_input = Input(shape=spectrogram_shape)

    # Reshape the spectrogram input to add batch dimension
    spectrogram_input_reshaped = Reshape((spectrogram_shape[0], spectrogram_shape[1], 1))(spectrogram_input)

    # CNN layer
    cnn_output = Conv2D(filters=64, kernel_size=(3, 3), strides=(2, 2), padding='same', activation='relu')(spectrogram_input_reshaped)
    cnn_output = MaxPooling2D(pool_size=(2, 2))(cnn_output)
    cnn_output = Flatten()(cnn_output)
    cnn_output = Dense(units=128, activation='relu')(cnn_output)

    # Concatenate LSTM and CNN outputs
    concatenated = Concatenate()([lstm_output, cnn_output])

    # Additional layers
    x = Dense(units=256, activation='relu')(concatenated)
    x = Dense(units=512, activation='relu')(x)
    x = Dense(units=1024, activation='relu')(x)

    # Output layer
    output = Dense(units=spectrogram_shape[0] * spectrogram_shape[1], activation='tanh')(x)
    output_reshaped = Reshape(spectrogram_shape)(output)

    # Model
    generator = Model(inputs=[caption_input, spectrogram_input], outputs=output_reshaped)

    return generator

# Define the maximum caption length and spectrogram shape
max_caption_length = 58  # Assuming this value based on the provided data
spectrogram_shape = (861, 128)  # Shape of your spectrogram images

# Build the generator model
generator = build_generator(max_caption_length, spectrogram_shape)

# Display the model summary
generator.summary()

In [None]:
from keras.layers import Input, Conv1D, Conv2D, BatchNormalization, ReLU, Conv2DTranspose, Reshape, Add, Activation, LSTM, Dense, Concatenate, Lambda
from keras.models import Model
import numpy as np

def build_generator(max_caption_length, spectrogram_shape):
    # Caption input
    caption_input = Input(shape=(max_caption_length,))
    
    # LSTM layer
    lstm_units = 128
    lstm_output = LSTM(units=lstm_units, return_sequences=True)(Reshape((-1, 1))(caption_input))
    lstm_output = LSTM(units=lstm_units)(lstm_output)
    
    # Spectrogram input
    spectrogram_input = Input(shape=spectrogram_shape)
    
    # Spectrogram processing layers (Convolutional layers)
    conv1d_filters = 16
    conv_kernel_size = 3
    x = Conv1D(filters=conv1d_filters, kernel_size=conv_kernel_size, padding='same')(spectrogram_input)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    
    # Reshape spectrogram to match Conv2DTranspose input shape
    x = Reshape((spectrogram_shape[0], 1, conv1d_filters))(x)
    
    # Upsampling layers
    upsampling_factor = 8
    x = Conv2DTranspose(filters=conv1d_filters, kernel_size=(1, 4), strides=(1, upsampling_factor), padding='same')(x)
    x = ReLU()(x)
    
    # Residual stack
    residual_blocks = 8
    for _ in range(residual_blocks):
        residual_output = residual_block(x, conv1d_filters)
        x = Add()([x, residual_output])
    
    # Upsampling layers
    x = Conv2DTranspose(filters=1, kernel_size=(1, 4), strides=(1, 2), padding='same')(x)
    x = ReLU()(x)
    
    # Residual stack
    for _ in range(residual_blocks):
        residual_output = residual_block(x, 1)
        x = Add()([x, residual_output])
    
    # Convolutional layer
    x = Conv2D(filters=1, kernel_size=(1, 7), padding='same')(x)
    
    # Flatten and reshape
    x = Reshape((-1,))(x)
    
    # Model output
    generator_output = Activation('tanh')(x)
    
    # Concatenate LSTM output with Convolutional output
    merged_output = Concatenate()([lstm_output, generator_output])
    
    # Final dense layer to output raw waveform
    output_waveform = Dense(units=8000, activation='tanh')(merged_output)
    
    # Build model
    generator = Model(inputs=[caption_input, spectrogram_input], outputs=output_waveform)
    
    return generator

def residual_block(input_tensor, filters):
    x = Conv2D(filters=filters, kernel_size=(1, 3), padding='same')(input_tensor)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = Conv2D(filters=filters, kernel_size=(1, 3), padding='same')(x)
    x = BatchNormalization()(x)
    return x

# Example usage
max_caption_length = 58
spectrogram_shape = (861, 128)
generator = build_generator(max_caption_length, spectrogram_shape)
generator.summary()

In [None]:
from keras.layers import Input, Conv1D, Conv2D, BatchNormalization, ReLU, Conv2DTranspose, Reshape, Add, Activation, LSTM, Dense, Concatenate, Lambda
from keras.models import Model
import numpy as np

def build_generator(max_caption_length, spectrogram_shape, output_waveform_size):
    # Caption input
    caption_input = Input(shape=(max_caption_length,))
    
    # LSTM layer
    lstm_units = 64
    lstm_output = LSTM(units=lstm_units, return_sequences=True)(Reshape((-1, 1))(caption_input))
    lstm_output = LSTM(units=lstm_units)(lstm_output)
    
    # Spectrogram input
    spectrogram_input = Input(shape=spectrogram_shape)
    
    # Spectrogram processing layers (Convolutional layers)
    conv1d_filters = 4
    conv_kernel_size = 3
    x = Conv1D(filters=conv1d_filters, kernel_size=conv_kernel_size, padding='same')(spectrogram_input)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    
    # Reshape spectrogram to match Conv2DTranspose input shape
    x = Reshape((spectrogram_shape[0], 1, conv1d_filters))(x)
    
    # Upsampling layers
    upsampling_factor = 2
    x = Conv2DTranspose(filters=conv1d_filters, kernel_size=(1, 4), strides=(1, upsampling_factor), padding='same')(x)
    x = ReLU()(x)
    
    # Residual stack
    residual_blocks = 2
    for _ in range(residual_blocks):
        residual_output = residual_block(x, conv1d_filters)
        x = Add()([x, residual_output])
    
    # Upsampling layers
    x = Conv2DTranspose(filters=1, kernel_size=(1, 4), strides=(1, 2), padding='same')(x)
    x = ReLU()(x)
    
    # Residual stack
    for _ in range(residual_blocks):
        residual_output = residual_block(x, 1)
        x = Add()([x, residual_output])
    
    # Convolutional layer
    x = Conv2D(filters=1, kernel_size=(1, 7), padding='same')(x)
    
    # Flatten and reshape
    x = Reshape((-1,))(x)
    
    # Model output
    generator_output = Activation('tanh')(x)
    
    # Concatenate LSTM output with Convolutional output
    merged_output = Concatenate()([lstm_output, generator_output])
    
    # Final dense layer to output raw waveform
    output_waveform = Dense(units=output_waveform_size, activation='tanh')(merged_output)
    
    # Build model
    generator = Model(inputs=[caption_input, spectrogram_input], outputs=output_waveform)
    
    return generator

def residual_block(input_tensor, filters):
    x = Conv2D(filters=filters, kernel_size=(1, 3), padding='same')(input_tensor)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = Conv2D(filters=filters, kernel_size=(1, 3), padding='same')(x)
    x = BatchNormalization()(x)
    return x

# Example usage
max_caption_length = 58
spectrogram_shape = (861, 128)
output_waveform_size = 441000  # For 10-second audio sampled at 44100 Hz
generator = build_generator(max_caption_length, spectrogram_shape, output_waveform_size)
generator.summary()


In [None]:
import os
import librosa
import soundfile as sf

def trim_audio(input_file, output_folder):
    # Load the audio file
    audio_data, sample_rate = librosa.load(input_file, sr=None)

    # Trim the audio to the desired segment (0 to 3 seconds)
    start_time = 0
    end_time = 3
    start_index = int(start_time * sample_rate)
    end_index = int(end_time * sample_rate)
    trimmed_audio = audio_data[start_index:end_index]

    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Save the trimmed audio to a new file
    output_file = os.path.join(output_folder, os.path.basename(input_file))
    sf.write(output_file, trimmed_audio, sample_rate)

# Directory containing audio files
input_folder = "D:\PycharmProjects\Gans Project\music_data"

# Directory to save trimmed audio files
output_folder = "D:\PycharmProjects\Gans Project\Trimed"

# Iterate over audio files in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith(".wav"):
        input_file = os.path.join(input_folder, filename)
        # Trim the audio and save it to the output folder
        trim_audio(input_file, output_folder)


In [None]:
import os
import librosa
import numpy as np
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Sequential

def extract_features(input_file):
    # Load the audio file
    audio_data, _ = librosa.load(input_file, sr=None)

    # Apply convolutional layer
    model = Sequential([
        Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(audio_data.shape[0], 1)),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(1)  # Output layer, adjust units according to your task
    ])

    # Reshape audio data for input to the model
    audio_data = audio_data.reshape(1, -1, 1)

    # Extract features using the model
    features = model.predict(audio_data)

    return features

# Single audio file
input_file = "music_data"

# Extract features from the audio file
features = extract_features(input_file)

# Directory to save the extracted features
output_folder = "D:\PycharmProjects\Gans Project\Trimed_!"

# Save the features to a file
output_file = os.path.join(output_folder, os.path.basename(input_file).replace(".wav", ".npy"))
np.save(output_file, features)


In [30]:
!pip install --upgrade --force-reinstall librosa


^C


In [20]:
from keras.layers import Input, Conv1D, Conv2D, BatchNormalization, ReLU, Conv2DTranspose, Reshape, Add, Activation, LSTM, Dense, Concatenate, Lambda, Flatten
from keras.models import Model
import numpy as np

def build_generator_segment(segment_length, max_caption_length, spectrogram_shape):
    # Caption input
    caption_input = Input(shape=(max_caption_length,))
    
    # LSTM layer
    lstm_units = 128
    lstm_output = LSTM(units=lstm_units, return_sequences=True)(Reshape((-1, 1))(caption_input))
    lstm_output = LSTM(units=lstm_units)(lstm_output)
    
    # Spectrogram input
    spectrogram_input = Input(shape=spectrogram_shape)
    
    # Spectrogram processing layers (Convolutional layers)
    conv1d_filters = 6
    conv_kernel_size = 3
    x = Conv1D(filters=conv1d_filters, kernel_size=conv_kernel_size, padding='same')(spectrogram_input)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    
    # Reshape spectrogram to match Conv2DTranspose input shape
    x = Reshape((spectrogram_shape[0], 1, conv1d_filters))(x)
    
    # Upsampling layers
    upsampling_factor = 4
    x = Conv2DTranspose(filters=conv1d_filters, kernel_size=(1, 4), strides=(1, upsampling_factor), padding='same')(x)
    x = ReLU()(x)
    
    # Residual stack
    residual_blocks = 4
    for _ in range(residual_blocks):
        residual_output = residual_block(x, conv1d_filters)
        x = Add()([x, residual_output])
    
    # Upsampling layers
    x = Conv2DTranspose(filters=1, kernel_size=(1, 4), strides=(1, 2), padding='same')(x)
    x = ReLU()(x)
    
    # Residual stack
    for _ in range(residual_blocks):
        residual_output = residual_block(x, 1)
        x = Add()([x, residual_output])
    
    # Convolutional layer
    x = Conv2D(filters=1, kernel_size=(1, 7), padding='same')(x)
    
    # Flatten and reshape
    x = Reshape((-1,))(x)
    
    # Model output
    generator_output = Activation('tanh')(x)
    
    # Reshape output into segments
    num_segments = int(generator_output.shape[1]) // segment_length
    segments = []
    for i in range(num_segments):
        segment = Lambda(lambda x: x[:, i * segment_length: (i + 1) * segment_length])(generator_output)
        segments.append(segment)
    
    if segments:
        # Concatenate segments
        concatenated_segments = Concatenate(axis=0)(segments)
    else:
        # If segments list is empty, return the generator output
        concatenated_segments = generator_output
    
    # Model
    generator_segment = Model(inputs=[caption_input, spectrogram_input], outputs=concatenated_segments)
    
    return generator_segment

def residual_block(input_tensor, filters):
    x = Conv2D(filters=filters, kernel_size=(1, 3), padding='same')(input_tensor)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = Conv2D(filters=filters, kernel_size=(1, 3), padding='same')(x)
    x = BatchNormalization()(x)
    return x

# Example usage
segment_length = 20 * 44100  
max_caption_length = 58
spectrogram_shape = (861, 128)
generator_segment = build_generator_segment(segment_length, max_caption_length, spectrogram_shape)
generator_segment.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 861, 128)]   0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 861, 6)       2310        ['input_2[0][0]']                
                                                                                                  
 batch_normalization (BatchNorm  (None, 861, 6)      24          ['conv1d[0][0]']                 
 alization)                                                                                       
                                                                                                  
 re_lu (ReLU)                   (None, 861, 6)       0           ['batch_normalization[0][0]']

In [21]:
from keras.layers import Input, Conv1D, LeakyReLU, Flatten, Dense
from keras.models import Model

def build_discriminator(segment_length):
    # Input for the generated or real audio segment
    audio_input = Input(shape=(segment_length, 1))
    
    # Convolutional layers
    x = Conv1D(filters=16, kernel_size=5, strides=2, padding='same')(audio_input)
    x = LeakyReLU(alpha=0.2)(x)
    
    # Flatten layer
    x = Flatten()(x)
    
    # Output layer
    validity = Dense(1, activation='sigmoid')(x)
    
    # Discriminator model
    discriminator = Model(inputs=audio_input, outputs=validity)
    
    return discriminator

# Example usage
segment_length = 20 * 44100  # Segment length in number of samples
discriminator = build_discriminator(segment_length)
discriminator.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 882000, 1)]       0         
                                                                 
 conv1d_1 (Conv1D)           (None, 441000, 16)        96        
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 441000, 16)        0         
                                                                 
 flatten (Flatten)           (None, 7056000)           0         
                                                                 
 dense (Dense)               (None, 1)                 7056001   
                                                                 
Total params: 7,056,097
Trainable params: 7,056,097
Non-trainable params: 0
_________________________________________________________________


In [23]:
from keras.layers import Input, Conv1D, LeakyReLU, Flatten, Dense

def build_discriminator(segment_length):
    # Input for the generated or real audio segment
    audio_input = Input(shape=(segment_length, 1))
    
    # Convolutional layers
    x = Conv1D(filters=8, kernel_size=5, strides=2, padding='same')(audio_input)
    x = LeakyReLU(alpha=0.2)(x)
    
    # Additional Convolutional layer
    x = Conv1D(filters=16, kernel_size=5, strides=2, padding='same')(x)
    x = LeakyReLU(alpha=0.2)(x)
    
    # Additional Convolutional layer
    x = Conv1D(filters=32, kernel_size=5, strides=2, padding='same')(x)
    x = LeakyReLU(alpha=0.2)(x)
    
    # Flatten layer
    x = Flatten()(x)
    
    # Output layer
    validity = Dense(1, activation='sigmoid')(x)
    
    # Discriminator model
    discriminator = Model(inputs=audio_input, outputs=validity)
    
    return discriminator

# Example usage
segment_length = 20 * 44100  # Segment length in number of samples
discriminator = build_discriminator(segment_length)
discriminator.summary()


Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 882000, 1)]       0         
                                                                 
 conv1d_5 (Conv1D)           (None, 441000, 8)         48        
                                                                 
 leaky_re_lu_4 (LeakyReLU)   (None, 441000, 8)         0         
                                                                 
 conv1d_6 (Conv1D)           (None, 220500, 16)        656       
                                                                 
 leaky_re_lu_5 (LeakyReLU)   (None, 220500, 16)        0         
                                                                 
 conv1d_7 (Conv1D)           (None, 110250, 32)        2592      
                                                                 
 leaky_re_lu_6 (LeakyReLU)   (None, 110250, 32)        0   