# Custom Training Code <br>
Built by Alex Fisher and Kevin Parra-Olmedo

## Import Dependencies

In [121]:
import tensorflow as tf
# activate GPU
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
tf.debugging.set_log_device_placement(False)
tf.config.set_soft_device_placement(True)



[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Num GPUs Available:  1


In [120]:
import mir_eval
import glob
import json
import numpy as np
import pretty_midi
import librosa
import matplotlib.pyplot as plt

import sys
sys.path.append('../basic_pitch_original/')

from basic_pitch import inference
from basic_pitch import models

from basic_pitch.constants import (
    ANNOT_N_FRAMES,
    ANNOTATIONS_FPS,
    ANNOTATIONS_N_SEMITONES,
    AUDIO_N_SAMPLES,
    N_FREQ_BINS_CONTOURS,
    AUDIO_SAMPLE_RATE,
    FFT_HOP
)

BATCH_SIZE = 16
SPLIT_INTERVAL = 2
DATASET_PERCENTAGE = 1

tfkl = tf.keras.layers

## Load in sample dataset files<br>
We are using a small sample from MAESTRO dataset's 100 GB of midi/wav files

In [122]:
# get maestro sample files

midi_filenames = glob.glob('../../datasets/maestro_sample/*.midi')
audio_filenames = glob.glob('../../datasets/maestro_sample/*.wav')

audio_midi_pairs = []
for i in range(0, len(audio_filenames)):
    audio_midi_pairs.append((audio_filenames[i], midi_filenames[i]))

print(midi_filenames)
print(audio_filenames)
print(audio_midi_pairs)

print(len(midi_filenames))
print(len(audio_filenames))

['../../datasets/maestro_sample\\MIDI-Unprocessed_Chamber2_MID--AUDIO_09_R3_2018_wav--1.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--2.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Chamber4_MID--AUDIO_11_R3_2018_wav--1.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Chamber5_MID--AUDIO_18_R3_2018_wav--1.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Chamber6_MID--AUDIO_20_R3_2018_wav--1.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Chamber6_MID--AUDIO_20_R3_2018_wav--2.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Recital1-3_MID--AUDIO_01_R1_2018_wav--1.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Recital1-3_MID--AUDIO_01_R1_2018_wav--2.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Recital1-3_MID--AUDIO_01_R1_2018_wav--3.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Reci

In [123]:
import json

# Load data from JSON file
with open('../../datasets/maestro-v3.0.0/maestro-v3.0.0.json', 'r') as f:
    data = json.load(f)

print("Number of samples:", len(data['midi_filename']))

audio_midi_pairs = []
for i in range(0, len(data['midi_filename'])):
    audio_filename = '../../datasets/maestro-v3.0.0/' + data['audio_filename'][f"{i}"]
    midi_filename = '../../datasets/maestro-v3.0.0/' + data['midi_filename'][f"{i}"]
    audio_midi_pairs.append((audio_filename, midi_filename))

audio_midi_pairs = audio_midi_pairs[:int(len(audio_midi_pairs) * 0.1)]
print("Number of samples used: " + str(len(audio_midi_pairs)))
print(audio_midi_pairs)

Number of samples: 1276
Number of samples used: 127
[('../../datasets/maestro-v3.0.0/2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.wav', '../../datasets/maestro-v3.0.0/2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi'), ('../../datasets/maestro-v3.0.0/2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MID--AUDIO_03_R2_2008_wav--2.wav', '../../datasets/maestro-v3.0.0/2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MID--AUDIO_03_R2_2008_wav--2.midi'), ('../../datasets/maestro-v3.0.0/2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-split_07-07-17_Piano-e_3-02_wav--3.wav', '../../datasets/maestro-v3.0.0/2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-split_07-07-17_Piano-e_3-02_wav--3.midi'), ('../../datasets/maestro-v3.0.0/2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MID--AUDIO_21_R1_2004_01_Track01_wav.wav', '../../datasets/maestro-v3.0.0/2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MID--AUDIO_21_R1_2004_01_Track01_wav.midi'), ('../../datasets/maestro-v3.0.0/2006/MIDI-Unproce

# Preprocess audio and MIDI pair files
Audio needs to fit what model takes as input (windowed audio, uses Basic Pitch's inference get_audio_input function)<br>
MIDI needs to match what model outputs (binary matrix)

In [124]:
def midi_to_piano_onset_matrix(midi_path, frames_per_second=ANNOTATIONS_FPS):
    """
    Convert MIDI file to a binary matrix representing onset of piano keys using a set FPS.

    Parameters:
    - midi_path (str): Path to the MIDI file.
    - frames_per_second (int): Number of frames per second for the binary representation.

    Returns:
    - numpy.ndarray: Binary matrix where rows represent the 88 piano keys and columns are time frames.
    """

    # Load the MIDI file
    midi_data = pretty_midi.PrettyMIDI(midi_path)

    # Duration of the MIDI file in seconds
    duration = midi_data.get_end_time()

    # 88 keys for standard piano
    num_piano_keys = 88

    # Calculate the total number of frames based on the FPS
    total_frames = int(duration * frames_per_second)

    # Initialize binary matrix with zeros
    binary_matrix = np.zeros((total_frames, num_piano_keys))

    for instrument in midi_data.instruments:
        for note in instrument.notes:
            # Only consider valid piano notes (from 21 to 108)
            if 21 <= note.pitch <= 108:
                # Find the frame for this onset time
                onset_frame = int(note.start * frames_per_second)

                # Prevent indexing beyond the matrix size
                if onset_frame < total_frames:
                    # Adjust the pitch value to fit within our matrix's row indices (0-87)
                    adjusted_pitch = note.pitch - 21

                    # Mark the onset in the binary matrix
                    binary_matrix[onset_frame, adjusted_pitch] = 1

    return binary_matrix


In [140]:
def preprocess_data(audio_filename, midi_filename, AUDIO_SAMPLE_RATE, SPLIT_INTERVAL, ANNOTATIONS_FPS, AUDIO_N_SAMPLES, FFT_HOP):
    # Initialize lists to store the preprocessed data

    n_overlapping_frames = 30
    overlap_len = n_overlapping_frames * FFT_HOP
    hop_size = AUDIO_N_SAMPLES - overlap_len
    offset = 0
    onsets = midi_to_piano_onset_matrix(midi_filename, frames_per_second=ANNOTATIONS_FPS)

    while offset < librosa.get_duration(path=audio_filename) - SPLIT_INTERVAL:
        audio_original, _ = librosa.load(audio_filename, sr=AUDIO_SAMPLE_RATE, offset=offset, duration=SPLIT_INTERVAL, mono=True)

        audio_original = np.concatenate([np.zeros((int(overlap_len / 2),), dtype=np.float32), audio_original])
        audio_windowed, window_times = inference.window_audio_file(audio_original, hop_size)

        split_onsets = onsets[int(offset*ANNOTATIONS_FPS):int((offset+SPLIT_INTERVAL)*ANNOTATIONS_FPS), :]
        if (split_onsets.shape[0] < ANNOTATIONS_FPS * SPLIT_INTERVAL):
            padding = ANNOTATIONS_FPS * SPLIT_INTERVAL - split_onsets.shape[0]
            split_onsets = np.pad(split_onsets, [(0, padding), (0, 0)], 'constant')

        yield np.array(audio_windowed), np.array(split_onsets)
        offset += SPLIT_INTERVAL


In [168]:
# NEED TO INITIALIZE DATA GENERATOR TO SAVE MEMORY (DATA WILL BE LOADED AS THE MODEL NEEDS IT, NOT ALL BEFORE TRAINING)

# create data generator
def data_generator(dataset, AUDIO_SAMPLE_RATE, SPLIT_INTERVAL, ANNOTATIONS_FPS, AUDIO_N_SAMPLES, FFT_HOP):
    print("CALLED DATA GENERATOR")
    song_count = 0
    for audio_filename, midi_filename in dataset:
        song_count += 1
        sample_count = 0
        print("\nsong_count = ", song_count)

        n_overlapping_frames = 30
        overlap_len = n_overlapping_frames * FFT_HOP
        hop_size = AUDIO_N_SAMPLES - overlap_len
        offset = 0
        onsets = midi_to_piano_onset_matrix(midi_filename, frames_per_second=ANNOTATIONS_FPS)

        while offset < librosa.get_duration(path=audio_filename) - SPLIT_INTERVAL:
            sample_count += 1
            print("\nsample_count = ", sample_count)
            audio_original, _ = librosa.load(audio_filename, sr=AUDIO_SAMPLE_RATE, offset=offset, duration=SPLIT_INTERVAL, mono=True)

            audio_original = np.concatenate([np.zeros((int(overlap_len / 2),), dtype=np.float32), audio_original])
            audio_windowed, window_times = inference.window_audio_file(audio_original, hop_size)

            split_onsets = onsets[int(offset*ANNOTATIONS_FPS):int((offset+SPLIT_INTERVAL)*ANNOTATIONS_FPS), :]
            if (split_onsets.shape[0] < ANNOTATIONS_FPS * SPLIT_INTERVAL):
                padding = ANNOTATIONS_FPS * SPLIT_INTERVAL - split_onsets.shape[0]
                split_onsets = np.pad(split_onsets, [(0, padding), (0, 0)], 'constant')

            yield np.array(audio_windowed), np.array(split_onsets)
            offset += SPLIT_INTERVAL

            


output_types = (tf.float32, tf.float64)  # Modify as per your data types
output_shapes = (tf.TensorShape([SPLIT_INTERVAL, 43844, 1]), tf.TensorShape([172, 88]))  # Modify as per your data shapes

# Example of splitting the dataset
train_size = int(0.8 * len(audio_midi_pairs))
train_audio_midi_pairs = audio_midi_pairs[:train_size]
val_audio_midi_pairs = audio_midi_pairs[train_size:]

train_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(train_audio_midi_pairs, AUDIO_SAMPLE_RATE, SPLIT_INTERVAL, ANNOTATIONS_FPS, AUDIO_N_SAMPLES, FFT_HOP),
    output_types=output_types,
    output_shapes=output_shapes
)

val_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(val_audio_midi_pairs, AUDIO_SAMPLE_RATE, SPLIT_INTERVAL, ANNOTATIONS_FPS, AUDIO_N_SAMPLES, FFT_HOP),
    output_types=output_types,
    output_shapes=output_shapes
)

batch_size = BATCH_SIZE  # Adjust according to your needs


for audio, onset in train_dataset.take(1):  # Adjust the number taken as needed
    print("Audio shape:", audio.shape)
    print("Onset shape:", onset.shape)
    # Optionally, visually inspect the actual data
    print("Audio data sample:", audio[0])  # Inspect first sample of the batch
    print("Onset data sample:", onset[0])  # Inspect first sample of the batch

CALLED DATA GENERATOR

song_count =  1

sample_count =  1
Audio shape: (2, 43844, 1)
Onset shape: (172, 88)
Audio data sample: tf.Tensor(
[[ 0.        ]
 [ 0.        ]
 [ 0.        ]
 ...
 [-0.00886715]
 [-0.00748847]
 [-0.00510733]], shape=(43844, 1), dtype=float32)
Onset data sample: tf.Tensor(
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(88,), dtype=float64)


In [None]:
# attempt at logging data generator
import logging
import cProfile

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def preprocess_data_debug(audio_filename, midi_filename, AUDIO_SAMPLE_RATE, SPLIT_INTERVAL, ANNOTATIONS_FPS, AUDIO_N_SAMPLES, FFT_HOP):
    n_overlapping_frames = 30
    overlap_len = n_overlapping_frames * FFT_HOP
    hop_size = AUDIO_N_SAMPLES - overlap_len
    offset = 0
    onsets = midi_to_piano_onset_matrix(midi_filename, frames_per_second=ANNOTATIONS_FPS)

    try:
        while offset < librosa.get_duration(path=audio_filename) - SPLIT_INTERVAL:
            audio_original, _ = librosa.load(audio_filename, sr=AUDIO_SAMPLE_RATE, offset=offset, duration=SPLIT_INTERVAL, mono=True)

            audio_original = np.concatenate([np.zeros((int(overlap_len / 2),), dtype=np.float32), audio_original])
            audio_windowed, window_times = inference.window_audio_file(audio_original, hop_size)

            split_onsets = onsets[int(offset*ANNOTATIONS_FPS):int((offset+SPLIT_INTERVAL)*ANNOTATIONS_FPS), :]
            if (split_onsets.shape[0] < ANNOTATIONS_FPS * SPLIT_INTERVAL):
                padding = ANNOTATIONS_FPS * SPLIT_INTERVAL - split_onsets.shape[0]
                split_onsets = np.pad(split_onsets, [(0, padding), (0, 0)], 'constant')
            # Your existing preprocessing code here
            # Add logging to monitor data shapes and types
            logging.info(f"Processed audio_windowed shape: {audio_windowed.shape}")
            logging.info(f"Processed split_onsets shape: {split_onsets.shape}")
            
            yield np.array(audio_windowed), np.array(split_onsets)
            offset += SPLIT_INTERVAL
    except Exception as e:
        logging.error(f"Error in preprocessing: {e}")
        raise

def test_generator(dataset, AUDIO_SAMPLE_RATE, SPLIT_INTERVAL, ANNOTATIONS_FPS, AUDIO_N_SAMPLES, FFT_HOP):
    for audio_filename, midi_filename in dataset[:5]:  # Test with a few samples
        for audio_windowed, split_onsets in preprocess_data_debug(audio_filename, midi_filename, AUDIO_SAMPLE_RATE, SPLIT_INTERVAL, ANNOTATIONS_FPS, AUDIO_N_SAMPLES, FFT_HOP):
            # Print or log for debugging
            print(f"Sample audio_windowed shape: {audio_windowed.shape}")
            print(f"Sample split_onsets shape: {split_onsets.shape}")

# Call the test function
test_generator(audio_midi_pairs, AUDIO_SAMPLE_RATE, SPLIT_INTERVAL, ANNOTATIONS_FPS, AUDIO_N_SAMPLES, FFT_HOP)

# Profile the generator function for performance
cProfile.run('test_generator(audio_midi_pairs, AUDIO_SAMPLE_RATE, SPLIT_INTERVAL, ANNOTATIONS_FPS, AUDIO_N_SAMPLES, FFT_HOP)')


## Train the model using the preprocessed data<br>
Something unique about this training process is that the y_batch data must be further preprocessed by adding padding to match the shape of the matrix produced by the model's output so that the shapes match and can be directly compared in the loss function. <br>


Not sure if there is a workaround for this. The only thing I can think of that might allow us to preprocess the data completely before the model predicts anything is by figuring out what matrix shape the model will produce before it produces it, then adding the padding to the MIDI matrix to match the shape. This might be able to be done by using some sort of equation with the input audio data. Model output always seems to produce more note onset information than is stored in the midi file. This could mean that the audio files need to be trimmed, but I'm not sure in what way or how to determine that.

In [8]:
# PRINT OUT ALL TRAINABLE LAYERS OF THE MODEL
# Iterate through the layers and print the layer name and its trainable status
for layer in models.model().layers:
    print(f"Layer: {layer.name}")
    print(f"Trainable: {layer.trainable}")
    for weight in layer.trainable_weights:
        print(f"\tWeight: {weight.name}, Shape: {weight.shape}")

# If you only want to see layers with trainable weights:
print("\nOnly layers with trainable weights:")
for layer in models.model().layers:
    if layer.trainable_weights:
        print(f"Layer: {layer.name}")
        for weight in layer.trainable_weights:
            print(f"\tWeight: {weight.name}, Shape: {weight.shape}")

Layer: input_1
Trainable: True
Layer: flatten_audio_ch
Trainable: True
Layer: cqt2010v2
Trainable: False
Layer: normalized_log
Trainable: True
Layer: tf.expand_dims
Trainable: True
Layer: batch_normalization
Trainable: True
	Weight: batch_normalization/gamma:0, Shape: (1,)
	Weight: batch_normalization/beta:0, Shape: (1,)
Layer: harmonic_stacking
Trainable: False
Layer: conv2d_1
Trainable: True
	Weight: conv2d_1/kernel:0, Shape: (3, 39, 8, 8)
	Weight: conv2d_1/bias:0, Shape: (8,)
Layer: batch_normalization_2
Trainable: True
	Weight: batch_normalization_2/gamma:0, Shape: (8,)
	Weight: batch_normalization_2/beta:0, Shape: (8,)
Layer: re_lu_1
Trainable: True
Layer: contours-reduced
Trainable: True
	Weight: contours-reduced/kernel:0, Shape: (5, 5, 8, 1)
	Weight: contours-reduced/bias:0, Shape: (1,)
Layer: contour
Trainable: True
Layer: tf.expand_dims_1
Trainable: True
Layer: conv2d_2
Trainable: True
	Weight: conv2d_2/kernel:0, Shape: (7, 7, 1, 32)
	Weight: conv2d_2/bias:0, Shape: (32,)
Laye

In [143]:
# CREATE CUSTOM LOSS FUNCTION FOR WEIGHTED BINARY CROSS ENTROPY
class WeightedBinaryCrossEntropy(tf.keras.losses.Loss):
    def __init__(self, pos_weight, neg_weight, from_logits=False, name='weighted_binary_crossentropy'):
        super().__init__(name=name)
        self.pos_weight = pos_weight
        self.neg_weight = neg_weight
        self.from_logits = from_logits

    def call(self, y_true, y_pred):
        if not self.from_logits:
            #print("\ny_pred: ", y_pred)
            original_length = 22050 * SPLIT_INTERVAL
            n_overlapping_frames = 30
            unwrapped_y_pred = self.unwrap_output_custom(y_pred, original_length, n_overlapping_frames)

            # Manually calculate the weighted binary cross-entropy for predictions that aren't logits
            epsilon = tf.keras.backend.epsilon()
            unwrapped_y_pred = tf.clip_by_value(unwrapped_y_pred, epsilon, 1.0 - epsilon)

            y_true = tf.cast(y_true, tf.float32)
            unwrapped_y_pred = tf.cast(unwrapped_y_pred, tf.float32)
            pos_weight = tf.cast(self.pos_weight, tf.float32)
            neg_weight = tf.cast(self.neg_weight, tf.float32)

            loss = -y_true * tf.math.log(unwrapped_y_pred) * pos_weight - (1.0 - y_true) * tf.math.log(1.0 - unwrapped_y_pred) * neg_weight
        else:
            # Use TensorFlow's built-in function for logits
            loss = tf.nn.weighted_cross_entropy_with_logits(labels=y_true, logits=y_pred, pos_weight=self.pos_weight)

        return tf.reduce_mean(loss)
    
    # custom unwrap output function that remains compatible with TensorFlow's graph execution
    def unwrap_output_custom(self, output: tf.Tensor, audio_original_length: int, n_overlapping_frames: int) -> tf.Tensor:
        """Unwrap batched model predictions to a single matrix.

        Args:
            output: tensor (n_batches, n_times_short, n_freqs)
            audio_original_length: length of original audio signal (in samples)
            n_overlapping_frames: number of overlapping frames in the output

        Returns:
            tensor (n_times, n_freqs)
        """
        output_rank = tf.rank(output)
        #print("output_rank: ", output_rank)
        
        def process_output():
            n_olap = int(0.5 * n_overlapping_frames)
            if n_olap > 0:
                output_processed = output[:, n_olap:-n_olap, :]
            else:
                output_processed = output
                
            output_shape = tf.shape(output_processed)
            n_output_frames_original = tf.cast(tf.floor(audio_original_length * (ANNOTATIONS_FPS / AUDIO_SAMPLE_RATE)), tf.int32)
            unwrapped_output = tf.reshape(output_processed, [output_shape[0] * output_shape[1], output_shape[2]])
            return unwrapped_output[:n_output_frames_original, :]  # trim to original audio length
        
        def handle_invalid_rank():
            # Print a warning message and return a dummy tensor
            tf.print(f"Warning: Expected output rank to be 3, got {output_rank}")
            return tf.zeros((0, 0), dtype=output.dtype)

        return tf.cond(tf.equal(output_rank, 3), process_output, handle_invalid_rank)

## Try built-in tensorflow train method

This seems to work, the note and onset loss continuously get reduced throughout each epoch! The model is successfully being trained.

In [174]:
# Initialize the model

model_train = models.model()
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
onset_loss_function = WeightedBinaryCrossEntropy(pos_weight=0.95, neg_weight=0.05)
contour_loss_function = WeightedBinaryCrossEntropy(pos_weight=0.95, neg_weight=0.05)
note_loss_function = WeightedBinaryCrossEntropy(pos_weight=0.95, neg_weight=0.05)
model_train.compile(optimizer=adam_optimizer, loss={"onset": onset_loss_function, "note": note_loss_function})


In [175]:
# train model
num_epochs = 1

model_train.fit(train_dataset, validation_data=val_dataset, epochs=num_epochs)

CALLED DATA GENERATOR

song_count =  1

sample_count =  1
      1/Unknown - 3s 3s/step - loss: 0.0941 - note_loss: 0.0284 - onset_loss: 0.0657
sample_count =  2
      2/Unknown - 3s 110ms/step - loss: 0.0782 - note_loss: 0.0253 - onset_loss: 0.0529
sample_count =  3

sample_count =  4

sample_count =  5
      5/Unknown - 3s 44ms/step - loss: 0.0579 - note_loss: 0.0194 - onset_loss: 0.0385 
sample_count =  6

sample_count =  7

sample_count =  8
      8/Unknown - 3s 34ms/step - loss: 0.0472 - note_loss: 0.0153 - onset_loss: 0.0319
sample_count =  9

sample_count =  10

sample_count =  11
     11/Unknown - 3s 30ms/step - loss: 0.0401 - note_loss: 0.0127 - onset_loss: 0.0273
sample_count =  12

sample_count =  13

sample_count =  14
     14/Unknown - 3s 28ms/step - loss: 0.0350 - note_loss: 0.0111 - onset_loss: 0.0239
sample_count =  15

sample_count =  16

sample_count =  17
     17/Unknown - 3s 26ms/step - loss: 0.0311 - note_loss: 0.0096 - onset_loss: 0.0215
sample_count =  18

sample_

<keras.callbacks.History at 0x160a5269160>

### WHEN USING DATA GENERATOR TO PRODUCE DATASET (DATA IS PROCESSED ON THE FLY, NOT PREPROCESSED)
y_pred:  Tensor("model_1/note/reshape_3/Reshape:0", shape=(None, 172, 88), dtype=float32)
output_rank:  Tensor("weighted_binary_crossentropy/Rank:0", shape=(), dtype=int32)

y_pred:  Tensor("model_1/onset/reshape_2/Reshape:0", shape=(None, 172, 88), dtype=float32)
output_rank:  Tensor("weighted_binary_crossentropy_1/Rank:0", shape=(), dtype=int32)

y_pred:  Tensor("model_1/note/reshape_3/Reshape:0", shape=(None, 172, 88), dtype=float32)
output_rank:  Tensor("weighted_binary_crossentropy/Rank:0", shape=(), dtype=int32)

y_pred:  Tensor("model_1/onset/reshape_2/Reshape:0", shape=(None, 172, 88), dtype=float32)
output_rank:  Tensor("weighted_binary_crossentropy_1/Rank:0", shape=(), dtype=int32)


<br><br><br><br>

### WHEN USING PREPROCESSED DATA (GPU RUNS OUT OF MEMORY WHEN DATASET IS TOO LARGE, FORCED TO USE SMALL SUBSET OF DATA FOR THIS METHOD)

y_pred:  Tensor("model/note/reshape_3/Reshape:0", shape=(4, 172, 88), dtype=float32)
output_rank:  Tensor("weighted_binary_crossentropy/Rank:0", shape=(), dtype=int32)

y_pred:  Tensor("model/onset/reshape_2/Reshape:0", shape=(4, 172, 88), dtype=float32)
output_rank:  Tensor("weighted_binary_crossentropy_1/Rank:0", shape=(), dtype=int32)

y_pred:  Tensor("model/note/reshape_3/Reshape:0", shape=(4, 172, 88), dtype=float32)
output_rank:  Tensor("weighted_binary_crossentropy/Rank:0", shape=(), dtype=int32)

y_pred:  Tensor("model/onset/reshape_2/Reshape:0", shape=(4, 172, 88), dtype=float32)
output_rank:  Tensor("weighted_binary_crossentropy_1/Rank:0", shape=(), dtype=int32)

2765/2768 [============================>.] - ETA: 0s - loss: 0.0056 - note_loss: 0.0029 - onset_loss: 0.0027

y_pred:  Tensor("model/note/reshape_3/Reshape:0", shape=(4, 172, 88), dtype=float32)
output_rank:  Tensor("weighted_binary_crossentropy/Rank:0", shape=(), dtype=int32)

y_pred:  Tensor("model/onset/reshape_2/Reshape:0", shape=(4, 172, 88), dtype=float32)
output_rank:  Tensor("weighted_binary_crossentropy_1/Rank:0", shape=(), dtype=int32)

2768/2768 [==============================] - 53s 18ms/step - loss: 0.0056 - note_loss: 0.0029 - onset_loss: 0.0027 - val_loss: 0.0035 - val_note_loss: 0.0018 - val_onset_loss: 0.0017

<keras.callbacks.History at 0x15e27bc6430>

In [176]:
# Save our trained version of the model

model_train.save('saved_models/nov26_06')



INFO:tensorflow:Assets written to: saved_models/nov26_06\assets


INFO:tensorflow:Assets written to: saved_models/nov26_06\assets


## Test and Evaluate Trained Models

In [202]:

# command line execution to output resulting midi from trained model
!python ../basic_pitch_original/basic_pitch/predict.py --model_path "saved_models/nov26_02" "model_predictions/our_model_nov26_02/" "model_predictions/_test_audio/beethoven_fur_elise.mp3"
!python ../basic_pitch_original/basic_pitch/predict.py --model_path "saved_models/nov26_02" "model_predictions/our_model_nov26_02/" "model_predictions/_test_audio/wii_music.mp3"
!python ../basic_pitch_original/basic_pitch/predict.py --model_path "saved_models/nov26_02" "model_predictions/our_model_nov26_02/" "model_predictions/_test_audio/Gwyn, Lord of Cinder.mp3"
!python ../basic_pitch_original/basic_pitch/predict.py --model_path "saved_models/nov26_02" "model_predictions/our_model_nov26_02/" "model_predictions/_test_audio/MIDI-Unprocessed_Recital1-3_MID--AUDIO_03_R1_2018_wav--5.wav"


✨✨✨✨✨✨✨✨✨
✨ Basic Pitch  ✨
✨✨✨✨✨✨✨✨✨

Importing Tensorflow (this may take a few seconds)...

Predicting MIDI for model_predictions\_test_audio\beethoven_fur_elise.mp3...


  Creating midi...
  🚨 model_predictions\our_model_nov26_02\beethoven_fur_elise_basic_pitch.mid already exists and would be overwritten. Skipping output files for model_predictions\_test_audio\beethoven_fur_elise.mp3.

✨✨✨✨✨✨✨✨✨
✨ Basic Pitch  ✨
✨✨✨✨✨✨✨✨✨

Importing Tensorflow (this may take a few seconds)...

Predicting MIDI for model_predictions\_test_audio\wii_music.mp3...


  Creating midi...
  🚨 model_predictions\our_model_nov26_02\wii_music_basic_pitch.mid already exists and would be overwritten. Skipping output files for model_predictions\_test_audio\wii_music.mp3.

✨✨✨✨✨✨✨✨✨
✨ Basic Pitch  ✨
✨✨✨✨✨✨✨✨✨

Importing Tensorflow (this may take a few seconds)...

Predicting MIDI for model_predictions\_test_audio\Gwyn, Lord of Cinder.mp3...


  Creating midi...
  💅 Saved to model_predictions\our_model_nov26_02\Gwyn,

In [203]:
!python ../basic_pitch_original/basic_pitch/predict.py "model_predictions/spotify_model/" "model_predictions/_test_audio/beethoven_fur_elise.mp3"
!python ../basic_pitch_original/basic_pitch/predict.py "model_predictions/spotify_model/" "model_predictions/_test_audio/wii_music.mp3"
!python ../basic_pitch_original/basic_pitch/predict.py "model_predictions/spotify_model/" "model_predictions/_test_audio/Gwyn, Lord of Cinder.mp3"
!python ../basic_pitch_original/basic_pitch/predict.py "model_predictions/spotify_model/" "model_predictions/_test_audio/MIDI-Unprocessed_Recital1-3_MID--AUDIO_03_R1_2018_wav--5.wav"


✨✨✨✨✨✨✨✨✨
✨ Basic Pitch  ✨
✨✨✨✨✨✨✨✨✨

Importing Tensorflow (this may take a few seconds)...

Predicting MIDI for model_predictions\_test_audio\beethoven_fur_elise.mp3...


  Creating midi...
  🚨 model_predictions\spotify_model\beethoven_fur_elise_basic_pitch.mid already exists and would be overwritten. Skipping output files for model_predictions\_test_audio\beethoven_fur_elise.mp3.

✨✨✨✨✨✨✨✨✨
✨ Basic Pitch  ✨
✨✨✨✨✨✨✨✨✨

Importing Tensorflow (this may take a few seconds)...

Predicting MIDI for model_predictions\_test_audio\wii_music.mp3...


  Creating midi...
  🚨 model_predictions\spotify_model\wii_music_basic_pitch.mid already exists and would be overwritten. Skipping output files for model_predictions\_test_audio\wii_music.mp3.

✨✨✨✨✨✨✨✨✨
✨ Basic Pitch  ✨
✨✨✨✨✨✨✨✨✨

Importing Tensorflow (this may take a few seconds)...

Predicting MIDI for model_predictions\_test_audio\Gwyn, Lord of Cinder.mp3...


  Creating midi...
  💅 Saved to model_predictions\spotify_model\Gwyn, Lord of Cinder

In [179]:
from pathlib import Path
from tensorflow import Tensor, signal, keras, saved_model

current_directory = Path.cwd()
print(current_directory.parent)
icassp_2022_model_path = str(current_directory.parent) + "/basic_pitch_original/basic_pitch/saved_models/icassp_2022/nmp"
print("OG Model path: ", icassp_2022_model_path)
model_oct10 = saved_model.load('saved_models/oct10')
model_nov06 = saved_model.load('saved_models/nov06')
model_nov26_02 = saved_model.load('saved_models/nov26_02')
model_bp = saved_model.load(icassp_2022_model_path)

c:\_AlexFiles\Coding\Python\FALL2023_IndependentStudy\audio_to_midi_vst
OG Model path:  c:\_AlexFiles\Coding\Python\FALL2023_IndependentStudy\audio_to_midi_vst/basic_pitch_original/basic_pitch/saved_models/icassp_2022/nmp


In [183]:
# create sample for 1 song (Fur Elise by Beethoven)
beethoven_x = []

offset = 10 #seconds

beethoven_file = current_directory / "model_predictions/_test_audio/beethoven_fur_elise.mp3"
print(beethoven_file)
while offset < librosa.get_duration(path=beethoven_file) - SPLIT_INTERVAL:
    # preprocess audio

    n_overlapping_frames = 30
    overlap_len = n_overlapping_frames * FFT_HOP
    hop_size = AUDIO_N_SAMPLES - overlap_len

    # modified get_input_audio function to get audio from offset
    assert overlap_len % 2 == 0, "overlap_length must be even, got {}".format(overlap_len)
    audio_original, _ = librosa.load(beethoven_file, sr=AUDIO_SAMPLE_RATE, offset=offset, duration=SPLIT_INTERVAL, mono=True)

    original_length = audio_original.shape[0]
    audio_original = np.concatenate([np.zeros((int(overlap_len / 2),), dtype=np.float32), audio_original])
    audio_windowed, window_times = inference.window_audio_file(audio_original, hop_size)

    beethoven_x.append(audio_windowed)
    offset += SPLIT_INTERVAL

c:\_AlexFiles\Coding\Python\FALL2023_IndependentStudy\audio_to_midi_vst\independent_study\model_predictions\_test_audio\beethoven_fur_elise.mp3


In [None]:
print("Basic Pitch\n", model_bp(beethoven_x[0]), "\n\n")
print("Our model OCT10\n", model_oct10(beethoven_x[0]), "\n\n")
print("Our model NOV06\n", model_nov06(beethoven_x[0]), "\n\n")

{'onset': <tf.Tensor: shape=(4, 172, 88), dtype=float32, numpy=
 array([[[0.2563653 , 0.14136553, 0.11154418, ..., 0.10882651,
          0.136962  , 0.16868067],
         [0.18752408, 0.1122677 , 0.09407169, ..., 0.07430211,
          0.08307945, 0.10704739],
         [0.17874199, 0.12020129, 0.11011477, ..., 0.08455317,
          0.0864543 , 0.10679121],
         ...,
         [0.09266827, 0.11102663, 0.10919577, ..., 0.09788323,
          0.10385687, 0.08499473],
         [0.0693486 , 0.07680301, 0.09822215, ..., 0.08400798,
          0.11336812, 0.08087544],
         [0.08666889, 0.08257812, 0.10162653, ..., 0.08861089,
          0.11391509, 0.10966828]],
 
        [[0.21780737, 0.16268659, 0.12284816, ..., 0.11596433,
          0.13623056, 0.14814657],
         [0.22107677, 0.14205264, 0.11213985, ..., 0.10483827,
          0.1084298 , 0.11917666],
         [0.17761321, 0.11661748, 0.13201249, ..., 0.10609926,
          0.09241855, 0.12110808],
         ...,
         [0.08610087, 0

In [200]:
# FUNCTION TO EVALUATE MODEL
import mir_eval

# Define the evaluation function
def evaluate_model(data, model, threshold=0.5):
    # Lists to hold ground truth and predictions for evaluation
    reference_notes = []
    estimated_notes = []

    # Iterate over your dataset
    for audio_file, midi_file in data:
        # Predict the piano roll with your model
        y_pred = inference.predict(audio, model)
        print("MODEL OUTPUT:\n", "\n\nData:\n", y_pred)

    # Compute metrics using mir_eval
    scores = {
        'F-measure': [],
        'F-measure-no-offset': [],
        'Frame-level Accuracy': []
    }
    for ref, est in zip(reference_notes, estimated_notes):
        # mir_eval requires specific formats for reference and estimated notes
        ref_intervals, ref_pitches = mir_eval.util.piano_roll_to_intervals(ref)
        est_intervals, est_pitches = mir_eval.util.piano_roll_to_intervals(est)

        # Evaluate
        p, r, f_measure, _ = mir_eval.transcription.precision_recall_f1_overlap(ref_intervals, ref_pitches, est_intervals, est_pitches)
        scores['F-measure'].append(f_measure)

        # Compute F_no
        f_no = mir_eval.transcription.f_measure_without_offset(ref_intervals, ref_pitches, est_intervals, est_pitches)
        scores['F-measure-no-offset'].append(f_no)

        # Compute frame-level accuracy
        acc = mir_eval.transcription_accuracy(ref_intervals, ref_pitches, est_intervals, est_pitches)
        scores['Frame-level Accuracy'].append(acc)

    # Average the scores
    for key in scores:
        scores[key] = np.mean(scores[key])

    return scores

In [186]:
# PREPARE EVALUATION DATA
audio_filename = "model_predictions/_test_audio/MIDI-Unprocessed_Recital1-3_MID--AUDIO_03_R1_2018_wav--5.wav"
midi_filename = "model_predictions/_test_audio/MIDI-Unprocessed_Recital1-3_MID--AUDIO_03_R1_2018_wav--5.midi"

eval_audio_midi_pairs = [(audio_filename, midi_filename)]


In [201]:
# Evaluate the model
# You need to prepare 'validation_data' in the format that your model expects

scores = evaluate_model(eval_audio_midi_pairs, model_nov26_02)
print(scores)


Predicting MIDI for model_predictions/_test_audio/MIDI-Unprocessed_Recital1-3_MID--AUDIO_03_R1_2018_wav--5.wav...
MODEL OUTPUT:
 

Data:
 ({'note': array([[0.0029864 , 0.00470835, 0.01070962, ..., 0.00027813, 0.00013995,
        0.00013994],
       [0.00344211, 0.00358758, 0.01067878, ..., 0.00031013, 0.00015533,
        0.00014709],
       [0.00383667, 0.00290961, 0.00915301, ..., 0.0003195 , 0.00016033,
        0.00014896],
       ...,
       [0.00711869, 0.00244828, 0.00316688, ..., 0.00028081, 0.00014825,
        0.00016137],
       [0.00763034, 0.00270094, 0.00305608, ..., 0.0002851 , 0.00014807,
        0.00016215],
       [0.00821401, 0.00278966, 0.0031475 , ..., 0.00029076, 0.00014854,
        0.00016333]], dtype=float32), 'onset': array([[2.91753057e-02, 2.33364701e-01, 9.89132747e-02, ...,
        2.59385881e-04, 6.24145250e-05, 5.37400738e-05],
       [2.99986526e-02, 2.02528089e-01, 9.63626355e-02, ...,
        1.04513485e-04, 3.49795264e-05, 4.18237832e-05],
       [3.0580

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


# OUTDATED CODE

In [None]:
# OUTDATED: This code doesn't split songs into intervals, it tries to use the whole song as one sample in the dataset


x = []
y = []

for idx in range(0, int(len(audio_filenames))):

    # preprocess audio
    n_overlapping_frames = 30
    overlap_len = n_overlapping_frames * FFT_HOP
    hop_size = AUDIO_N_SAMPLES - overlap_len
    audio_windowed, _, audio_original_length = inference.get_audio_input(audio_filenames[idx], overlap_len, hop_size)

    x.append(audio_windowed)

    # preprocess midi
    pm_midi = pretty_midi.PrettyMIDI(midi_filenames[idx])
    onsets = midi_to_piano_onset_matrix(midi_filenames[idx], frames_per_second=ANNOTATIONS_FPS)
    y.append(onsets)
    

Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Shape in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op FloorDiv in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Neg in device /job:localhost/replica

In [134]:
# OUTDATED: DATA SHOULD BE GENERATED AS IT IS NEEDED TO SAVE MEMORY
os_x = []
os_y = []


for idx, (audio_filename, midi_filename) in enumerate(audio_midi_pairs[:int(len(audio_midi_pairs)*0.2)]):
    offset = 0
    # preprocess midi
    onsets = midi_to_piano_onset_matrix(midi_filename, frames_per_second=ANNOTATIONS_FPS)
    while offset < librosa.get_duration(path=audio_filename) - SPLIT_INTERVAL:
        # preprocess audio

        n_overlapping_frames = 30
        overlap_len = n_overlapping_frames * FFT_HOP
        hop_size = AUDIO_N_SAMPLES - overlap_len

        # modified get_input_audio function to get audio from offset
        assert overlap_len % 2 == 0, "overlap_length must be even, got {}".format(overlap_len)
        audio_original, _ = librosa.load(audio_filename, sr=AUDIO_SAMPLE_RATE, offset=offset, duration=SPLIT_INTERVAL, mono=True)

        original_length = audio_original.shape[0]
        audio_original = np.concatenate([np.zeros((int(overlap_len / 2),), dtype=np.float32), audio_original])
        audio_windowed, window_times = inference.window_audio_file(audio_original, hop_size)
    
        os_x.append(audio_windowed)

        split_onsets = onsets[int(offset*ANNOTATIONS_FPS):int((offset+SPLIT_INTERVAL)*ANNOTATIONS_FPS), :]
        if (split_onsets.shape[0] < ANNOTATIONS_FPS * SPLIT_INTERVAL):
            padding = ANNOTATIONS_FPS * SPLIT_INTERVAL - split_onsets.shape[0]
            split_onsets = np.pad(split_onsets, [(0, padding), (0, 0)], 'constant')
        os_y.append(split_onsets)

        offset += SPLIT_INTERVAL

# DO NOT BATCH DATASET 

tensor_dataset = tf.data.Dataset.from_tensor_slices((os_x, os_y))
#tensor_dataset = tensor_dataset.take(300) # take 300 batches for now, comment this out later
train_dataset = tensor_dataset.take(int(len(tensor_dataset)*0.8))
val_dataset = tensor_dataset.skip(int(len(tensor_dataset)*0.8))

print("train_dataset: ", train_dataset)
print("val_dataset: ", val_dataset)

take_count = sum(1 for _ in train_dataset)
print(f"Size of take_dataset: {take_count}")

skip_count = sum(1 for _ in val_dataset)
print(f"Size of skip_dataset: {skip_count}")

ds_count = sum(1 for _ in tensor_dataset)
print(f"Size of batched_dataset: {ds_count}")

print("\n\n\n\n")
for audio, onset in train_dataset.take(1):  # Adjust the number taken as needed
    print("Audio shape:", audio.shape)
    print("Onset shape:", onset.shape)
    # Optionally, visually inspect the actual data
    print("Audio data sample:", audio[0])  # Inspect first sample of the batch
    print("Onset data sample:", onset[0])  # Inspect first sample of the batch

train_dataset:  <TakeDataset element_spec=(TensorSpec(shape=(2, 43844, 1), dtype=tf.float32, name=None), TensorSpec(shape=(172, 88), dtype=tf.float64, name=None))>
val_dataset:  <SkipDataset element_spec=(TensorSpec(shape=(2, 43844, 1), dtype=tf.float32, name=None), TensorSpec(shape=(172, 88), dtype=tf.float64, name=None))>
Size of take_dataset: 5568
Size of skip_dataset: 1393
Size of batched_dataset: 6961





Audio shape: (2, 43844, 1)
Onset shape: (172, 88)
Audio data sample: tf.Tensor(
[[ 0.        ]
 [ 0.        ]
 [ 0.        ]
 ...
 [-0.00886715]
 [-0.00748847]
 [-0.00510733]], shape=(43844, 1), dtype=float32)
Onset data sample: tf.Tensor(
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(88,), dtype=float64)


In [None]:
# OUTDATED: This code splits data into custom made batches instead of using tensorflow


# put data into batches
batched_dataset = []
batch_size = 1 # change this to BATCH_SIZE later

i = 0
while (i < len(x)):
    if (i < len(x) - batch_size):
        x_batch = x[i:i+batch_size]
        y_batch = y[i:i+batch_size]
        batched_dataset.append((x_batch, y_batch))
    else:
        x_batch = x[i:]
        y_batch = y[i:]
        batched_dataset.append((x_batch, y_batch))
    i += batch_size

# Split batched dataset into training and validation sets
# 80% training, 20% validation
train_dataset = batched_dataset[:int(len(batched_dataset)*0.8)]
val_dataset = batched_dataset[int(len(batched_dataset)*0.8):]

In [None]:
# OUTDATED: THIS CODE DOES NOT WORK, IT IS JUST FOR REFERENCE


num_epochs = 5
epoch_train_loss = []
epoch_val_loss = []
n_overlapping_frames = 30
audio_original_length = 110250

# initialize spotify basic pitch model
model = models.model()

# Initialize the model
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_function = WeightedBinaryCrossEntropy(pos_weight=0.95, neg_weight=0.05)
model.compile(optimizer=adam_optimizer, loss=loss_function)
print("INITIALIZED NEW MODEL (training from scratch)")

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}:")
    
    train_loss = []
    # Training
    # Loop through training set batches (batch size 1 for now)
    for idx, (x, y) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
        
            # Forward pass
            logits = model(x, training=True)

            print("logits['onset'].shape: ", logits['onset'].shape)
            # Process output
            unwrapped_output = {k: inference.unwrap_output(logits[k], audio_original_length, n_overlapping_frames) for k in logits}
            
            print("unwrapped_output['onset'].shape: ", unwrapped_output['onset'].shape)
            # Compute loss for onsets
            loss_value_onsets = loss_function(y, unwrapped_output['onset'])

            # Compute loss for notes
            loss_value_notes = loss_function(y, unwrapped_output['note'])

            # average loss
            loss_value = (loss_value_onsets + loss_value_notes) / 2

            gradients = tape.gradient(loss_value, model.trainable_weights)

            grads_and_vars = [(grad, var) for grad, var in zip(gradients, model.trainable_weights) if grad is not None]

            if idx == 1 and epoch == 0:
                for grad, var in zip(gradients, model.trainable_weights):
                    if grad is None:
                        print(f"None gradient for {var.name}")
                print("Gradients: ", gradients)
                print("Gradients and vars: ", grads_and_vars)
                
            # Update weights
            adam_optimizer.apply_gradients(grads_and_vars)
            
            # record loss
            train_loss.append(loss_value)
        

            # Print progress
            if idx % 10 == 0:
                print(f"Step {idx}: loss = {loss_value:.4f}, accuracy = TBD")
            
    # Reset metric at the end of epoch
    avgLoss = np.mean(train_loss)
    epoch_train_loss.append(avgLoss)
    print("Epoch {}/{} training loss: {}".format(epoch+1, num_epochs, avgLoss))

    # Validation
    val_loss = []
     # Loop through validation set batches (batch size 1 for now)
    for idx, (x, y) in enumerate(val_dataset):
        # Forward pass
        logits = model(x, training=False)

        # Process output
        unwrapped_output = {k: inference.unwrap_output(logits[k], audio_original_length, n_overlapping_frames) for k in logits}

        # Compute loss for onsets
        loss_value_onsets = loss_function(y, unwrapped_output['onset'])

        # Compute loss for notes
        loss_value_notes = loss_function(y, unwrapped_output['note'])

        # average loss
        loss_value = (loss_value_onsets + loss_value_notes) / 2
    
        # record loss
        val_loss.append(loss_value)
        
    avgLoss = np.mean(val_loss)
    epoch_val_loss.append(avgLoss)
    print("Epoch {}/{} validation loss: {}".format(epoch+1, num_epochs, avgLoss))

plt.plot(epoch_train_loss)
plt.plot(epoch_val_loss)

INITIALIZED NEW MODEL (training from scratch)

Epoch 1/5:
logits['onset'].shape:  (4, 172, 88)
unwrapped_output['onset'].shape:  (430, 88)
Step 0: loss = 0.0616, accuracy = TBD
logits['onset'].shape:  (4, 172, 88)
unwrapped_output['onset'].shape:  (430, 88)
None gradient for batch_normalization_52/gamma:0
None gradient for batch_normalization_52/beta:0
None gradient for conv2d_79/kernel:0
None gradient for conv2d_79/bias:0
None gradient for batch_normalization_54/gamma:0
None gradient for batch_normalization_54/beta:0
None gradient for contours-reduced/kernel:0
None gradient for contours-reduced/bias:0
None gradient for conv2d_80/kernel:0
None gradient for conv2d_80/bias:0
None gradient for conv2d_82/kernel:0
None gradient for conv2d_82/bias:0
None gradient for batch_normalization_55/gamma:0
None gradient for batch_normalization_55/beta:0
None gradient for conv2d_81/kernel:0
None gradient for conv2d_81/bias:0
None gradient for conv2d_83/kernel:0
None gradient for conv2d_83/bias:0
Gradi

KeyboardInterrupt: 