In [None]:
import numpy as np
import librosa

def preprocess_audio(file_name, win_size=31, sample_rate=8000, n_fft=1024, hop_length=80):
    """Turn an audio file into fixed-length windows of a normalised dB spectrogram.

    Args:
        file_name: path of the audio file, passed to ``librosa.load``.
        win_size: number of STFT frames per window.
        sample_rate: rate the audio is resampled to on load.
        n_fft: FFT size; also used as the STFT window length.
        hop_length: hop between successive STFT frames, in samples.

    Returns:
        A tuple ``(x_test, norm_db_S)``:
        * ``x_test`` has shape ``(num_windows, win_size, num_bins, 1)``; the
          trailing axis is a channel dimension for the CNN input.
        * ``norm_db_S`` has shape ``(num_bins, num_windows * win_size)`` and is
          the min-max normalised dB spectrogram, zero-padded on the time axis.
    """
    # Load the audio file, resampled and mixed down to mono
    y, _ = librosa.load(file_name, sr=sample_rate, mono=True)

    # Magnitude of the Short-Time Fourier Transform (STFT)
    S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=n_fft))

    # Convert to decibel scale, relative to the loudest bin
    db_S = librosa.amplitude_to_db(S, ref=np.max)

    # Normalize between 0 and 1. A constant spectrogram (e.g. digital silence)
    # has zero dynamic range; map it to all zeros instead of dividing by zero,
    # which would fill the features with NaN.
    db_min = np.min(db_S)
    db_span = np.max(db_S) - db_min
    if db_span > 0:
        norm_db_S = (db_S - db_min) / db_span
    else:
        norm_db_S = np.zeros_like(db_S)

    # Zero-pad the time axis up to a whole number of windows. The padding takes
    # its row count and dtype from the spectrogram itself, so it stays valid for
    # any n_fft and does not silently upcast the features.
    num_bins, num_frames = norm_db_S.shape
    remainder = num_frames % win_size
    if remainder != 0:
        padding_feature = np.zeros((num_bins, win_size - remainder), dtype=norm_db_S.dtype)
        norm_db_S = np.concatenate((norm_db_S, padding_feature), axis=1)

    # Split into consecutive, non-overlapping (win_size, num_bins) windows and
    # add a channel dimension for compatibility with CNNs
    x_test = norm_db_S.T.reshape(-1, win_size, num_bins)[..., np.newaxis]

    return x_test, norm_db_S

In [None]:
import pandas as pd
import numpy as np

def preprocess_annotations(csv_file_path, hop_length=80, sample_rate=8000, window_size=31, pitch_bins=None):
    """Convert an F0 annotation CSV into windowed one-hot pitch and voicing targets.

    The CSV is read without a header as two columns, ``timestamp`` and ``F0``.
    Each timestamp is mapped to frame ``timestamp * sample_rate // hop_length``;
    when several rows fall into the same frame only the first is kept.

    Args:
        csv_file_path: path of the annotation CSV.
        hop_length: hop between frames, in samples.
        sample_rate: sample rate used to convert timestamps to frames.
        window_size: number of frames per output window.
        pitch_bins: 1-D array of class centres on the MIDI scale; the class of
            a frame is the index of the nearest entry. Defaults to the
            notebook-level ``pitch_range``, which must exist at call time.

    Returns:
        A tuple ``(y_test, y_test_vad)``:
        * ``y_test``: ``(num_windows, window_size, len(pitch_bins))`` one-hot
          pitch classes.
        * ``y_test_vad``: ``(num_windows, window_size, 2)`` one-hot voicing,
          column 1 for frames whose class index is > 0, column 0 otherwise.
        Frames without an annotation, and the padding frames that complete the
        last window, are assigned class 0.
    """
    if pitch_bins is None:
        # Resolved lazily so the function can be defined before the grid cell runs.
        pitch_bins = pitch_range
    pitch_bins = np.asarray(pitch_bins, dtype=float)
    num_classes = len(pitch_bins)

    # Read the CSV file
    annotations = pd.read_csv(csv_file_path, header=None, names=['timestamp', 'F0'])

    # Calculate the frame number for each timestamp
    annotations['frame'] = (annotations['timestamp'] * sample_rate // hop_length).astype(int)

    # Drop duplicate frames, keeping only the first occurrence
    annotations = annotations.drop_duplicates(subset='frame', keep='first')

    frames = annotations['frame'].to_numpy()
    f0 = annotations['F0'].to_numpy(dtype=float)

    # Map F0 to MIDI; non-positive (and NaN) F0 is treated as pitch 0
    midi = np.zeros(len(f0))
    voiced = f0 > 0
    if voiced.any():
        midi[voiced] = librosa.hz_to_midi(f0[voiced])

    # Class index = nearest entry of the pitch grid
    labels = np.array([np.argmin(np.abs(pitch_bins - m)) for m in midi], dtype=int)

    # Negative frame indices would wrap around when used as array indices
    in_range = frames >= 0
    frames, labels = frames[in_range], labels[in_range]

    # Size the target by the highest frame index, not by the number of rows:
    # sparse or late-starting annotations otherwise index past the array or
    # collide with the padding rows.
    total_frames = int(frames.max()) + 1 if len(frames) else 0
    num_windows = -(-total_frames // window_size)  # ceiling division
    padded_frames = num_windows * window_size

    # Every frame starts as class 0, so unannotated and padding frames still
    # get a valid one-hot target instead of an all-zero row.
    frame_labels = np.zeros(padded_frames, dtype=int)
    frame_labels[frames] = labels

    row_index = np.arange(padded_frames)
    one_hot_labels = np.zeros((padded_frames, num_classes))
    one_hot_labels[row_index, frame_labels] = 1
    one_hot_labels_vad = np.zeros((padded_frames, 2))
    one_hot_labels_vad[row_index, (frame_labels > 0).astype(int)] = 1

    # Split into consecutive, non-overlapping windows
    y_test = one_hot_labels.reshape(num_windows, window_size, num_classes)
    y_test_vad = one_hot_labels_vad.reshape(num_windows, window_size, 2)
    return y_test, y_test_vad


In [None]:
import os
import numpy as np
from sklearn.model_selection import train_test_split

# Pitch class grid on the MIDI scale: a leading 0 entry (the class that
# non-positive F0 values map to) followed by notes 38 through 83 in steps of
# 1/16 of a semitone.
pitch_range = np.concatenate(([0.0], np.arange(38, 83 + 1.0/16, 1.0/16)))

# Locations of the audio mixes and of the melody annotations
audio_dir = '/content/drive/MyDrive/Colab_Notebooks/audio_mix/'
csv_dir = '/content/drive/MyDrive/Colab_Notebooks/annotation_melody/'

# Full paths of every .wav / .csv file, each list sorted independently.
# NOTE(review): later cells pair the two lists by position, so this relies on
# both directories sorting into the same track order — confirm.
a_files = [os.path.join(audio_dir, name) for name in os.listdir(audio_dir) if name.endswith('.wav')]
a_files.sort()
c_files = [os.path.join(csv_dir, name) for name in os.listdir(csv_dir) if name.endswith('.csv')]
c_files.sort()




In [None]:
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, Activation, MaxPooling2D, Dense, Flatten, Dropout, Bidirectional, LSTM, concatenate, TimeDistributed, LeakyReLU, Reshape, add, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l2
import math
def ResNet_Block(input,block_id,filterNum):
    '''Build one residual block on top of ``input``.

    The input goes through BatchNormalization -> LeakyReLU(0.01) ->
    MaxPooling2D((1, 4)). From the pooled tensor, a 1x1 convolution forms the
    shortcut and two 3x3 convolutions (with BatchNormalization + LeakyReLU in
    between) form the residual path; the two are summed.

    Args:
        input: input keras tensor
        block_id: value used to name the convolutions
            ('conv<block_id>_1x1', 'conv<block_id>_1', 'conv<block_id>_2')
        filterNum: number of output filters of every convolution in the block
    Returns: a keras tensor
    '''
    prefix = 'conv' + str(block_id)
    # Settings shared by all three convolutions of the block
    conv_kwargs = dict(padding='same', kernel_initializer='he_normal', use_bias=False)

    pooled = BatchNormalization()(input)
    pooled = LeakyReLU(0.01)(pooled)
    pooled = MaxPooling2D((1, 4))(pooled)

    # Shortcut: 1x1 convolution so its channel count matches the residual path
    shortcut = Conv2D(filterNum, (1, 1), name=prefix + '_1x1', **conv_kwargs)(pooled)

    residual = Conv2D(filterNum, (3, 3), name=prefix + '_1', **conv_kwargs)(pooled)
    residual = BatchNormalization()(residual)
    residual = LeakyReLU(0.01)(residual)
    residual = Conv2D(filterNum, (3, 3), name=prefix + '_2', **conv_kwargs)(residual)

    return add([shortcut, residual])

# Build the two-headed network: a per-frame pitch classifier ('output') and a
# per-frame 2-class voicing detector ('output_V') that also reuses the pitch head.

# Number of pitch classes: 45 * 16 + 2 = 722, which equals the length of the
# pitch_range grid built in the earlier cell (721 pitch bins + the leading 0).
num_output = int(45 * 2 ** (math.log(16, 2)) + 2)
# One sample is a window of 31 frames x 513 frequency bins x 1 channel, the
# shape preprocess_audio produces with its defaults.
# NOTE(review): the name `input` shadows the Python builtin for the rest of the notebook.
input = Input(shape=(31, 513, 1))

# Stem: two 3x3 convolutions with 64 filters, no pooling.
block_1 = Conv2D(64, (3, 3), name='conv1_1', padding='same', kernel_initializer='he_normal', use_bias=False,
                  kernel_regularizer=l2(1e-5))(input)
block_1 = BatchNormalization()(block_1)
block_1 = LeakyReLU(0.01)(block_1)
block_1 = Conv2D(64, (3, 3), name='conv1_2', padding='same', kernel_initializer='he_normal', use_bias=False,
                  kernel_regularizer=l2(1e-5))(block_1)

# Three residual blocks; each one pools the last spatial axis by 4 (see ResNet_Block).
block_2 = ResNet_Block(input=block_1, block_id=2, filterNum=128)
block_3 = ResNet_Block(input=block_2, block_id=3, filterNum=192)
block_4 = ResNet_Block(input=block_3, block_id=4, filterNum=256)

# Final activation, a fourth pooling by 4 and dropout before the recurrent layer.
block_4 = BatchNormalization()(block_4)
block_4 = LeakyReLU(0.01)(block_4)
block_4 = MaxPooling2D((1, 4))(block_4)
block_4 = Dropout(0.5)(block_4)

# Flatten (frequency, channels) into one feature vector per frame. The factor 2
# is the frequency size expected after the four (1, 4) poolings
# (513 -> 128 -> 32 -> 8 -> 2, assuming Keras' default non-overlapping 'valid'
# pooling) — TODO confirm against model.summary().
numOutput_P = 2 * block_4.shape[3]
output = Reshape((31, numOutput_P))(block_4)

# Pitch head: BiLSTM over the 31 frames, then a per-frame softmax over num_output classes.
output = Bidirectional(LSTM(256, return_sequences=True, recurrent_dropout=0.3, dropout=0.3))(output)
output = TimeDistributed(Dense(num_output))(output)
output = TimeDistributed(Activation("softmax"), name='output')(output)

# Pool the earlier feature maps by 4**4, 4**3 and 4**2 so that their last
# spatial axis is reduced by the same total factor as block_4 and they can be
# concatenated with it.
block_1 = MaxPooling2D((1, 4 ** 4))(block_1)
block_2 = MaxPooling2D((1, 4 ** 3))(block_2)
block_3 = MaxPooling2D((1, 4 ** 2))(block_3)

# Voicing branch: multi-scale features fused by a 1x1 convolution.
joint = concatenate([block_1, block_2, block_3, block_4])
joint = Conv2D(256, (1, 1), padding='same', kernel_initializer='he_normal', use_bias=False,
                kernel_regularizer=l2(1e-5))(joint)
joint = BatchNormalization()(joint)
joint = LeakyReLU(0.01)(joint)
joint = Dropout(0.5)(joint)

# Same flattening as for the pitch head (again assuming 2 remaining frequency bins).
num_V = joint.shape[3] * 2
output_V = Reshape((31, num_V))(joint)

output_V = Bidirectional(LSTM(32, return_sequences=True, stateful=False, recurrent_dropout=0.3, dropout=0.3))(
    output_V)
output_V = TimeDistributed(Dense(2))(output_V)
output_V = TimeDistributed(Activation("softmax"))(output_V)

# Derive a second voicing estimate from the pitch head: the probability of
# class 0 and its complement, each reshaped to (31, 1) and concatenated to (31, 2).
output_NS = Lambda(lambda x: x[:, :, 0])(output)
output_NS = Reshape((31, 1))(output_NS)
output_S = Lambda(lambda x: 1 - x[:, :, 0])(output)
output_S = Reshape((31, 1))(output_S)
output_VV = concatenate([output_NS, output_S])

# Sum both voicing estimates and renormalise with a second softmax.
output_V = add([output_V, output_VV])
output_V = TimeDistributed(Activation("softmax"), name='output_V')(output_V)

# The output layer names 'output' and 'output_V' are the keys used for the
# losses at compile time and for the targets passed to fit.
model = Model(inputs=input, outputs=[output, output_V])
print(output.shape, output_V.shape)

model.summary()

In [None]:
from tensorflow.keras.optimizers import Adam

# Both heads are trained with categorical cross-entropy; the voicing head
# contributes with half the weight of the pitch head.
# Metrics are given per output name rather than as one flat list: Keras 3
# rejects a flat list whose length differs from the number of model outputs,
# while the dict form is accepted by both Keras 2 and Keras 3.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss={'output': 'categorical_crossentropy', 'output_V': 'categorical_crossentropy'},
    loss_weights={'output': 1, 'output_V': 0.5},
    metrics={'output': ['accuracy'], 'output_V': ['accuracy']},
)


In [None]:
# Training schedule: NUM_PASSES sweeps over the first MAX_TRAIN_FILES tracks,
# loaded FILES_PER_CHUNK at a time so that only one chunk of spectrograms is
# held in memory. Each chunk is split 80/10/10 and fitted for 5 epochs.
NUM_PASSES = 5
FILES_PER_CHUNK = 5
MAX_TRAIN_FILES = 50

# Only complete (audio, annotation) pairs can be used.
num_pairs = min(len(a_files), len(c_files), MAX_TRAIN_FILES)
if num_pairs == 0:
    raise ValueError("No audio/annotation file pairs found to train on.")

for pass_idx in range(NUM_PASSES):
    # Step through the file lists chunk by chunk; the last chunk may be shorter
    # when fewer than MAX_TRAIN_FILES pairs are available.
    for chunk_start in range(0, num_pairs, FILES_PER_CHUNK):
        chunk_end = min(chunk_start + FILES_PER_CHUNK, num_pairs)
        csv_files = c_files[chunk_start:chunk_end]
        audio_files = a_files[chunk_start:chunk_end]
        combined_spectrograms = []
        combined_annotations = []
        vad_s = []
        for wav_file_path, csv_file_path in zip(audio_files, csv_files):
            # Preprocess the audio and annotation files
            spectrogram, _ = preprocess_audio(wav_file_path)
            annotation, vad = preprocess_annotations(csv_file_path)

            # The audio and the annotation may not cover the same number of
            # windows; keep only the windows both have so that inputs and
            # targets stay aligned one-to-one.
            num_windows = min(len(spectrogram), len(annotation), len(vad))

            # Store the processed data
            combined_spectrograms.append(spectrogram[:num_windows])
            combined_annotations.append(annotation[:num_windows])
            vad_s.append(vad[:num_windows])

        # Stack the per-file windows into single arrays
        combined_spectrograms = np.vstack(combined_spectrograms)
        combined_annotations = np.vstack(combined_annotations)
        vad_s = np.vstack(vad_s)

        # 80% train, then the remaining 20% split evenly into validation and test.
        # The fixed random_state gives the same split for a chunk on every pass.
        X_train, X_temp, y_train, y_temp, y_train_vad, y_temp_vad = train_test_split(
            combined_spectrograms, combined_annotations, vad_s, test_size=0.2, random_state=42)
        X_val, X_test, y_val, y_test, y_val_vad, y_test_vad = train_test_split(
            X_temp, y_temp, y_temp_vad, test_size=0.5, random_state=42)

        model.fit(
            X_train,
            {"output": y_train, "output_V": y_train_vad},
            validation_data=(X_val, {"output": y_val, "output_V": y_val_vad}),
            epochs=5,
            verbose=1,
        )
