# Custom Training Code <br>
Built by Alex Fisher and Kevin Parra-Olmedo

## Import Dependencies

In [2]:
import mir_eval
import glob
import json
import numpy as np
import tensorflow as tf
import pretty_midi
import librosa
import matplotlib.pyplot as plt

import sys
sys.path.append('../basic_pitch_original/')

from basic_pitch import inference
from basic_pitch import models

from basic_pitch.constants import (
    ANNOT_N_FRAMES,
    ANNOTATIONS_FPS,
    ANNOTATIONS_N_SEMITONES,
    AUDIO_N_SAMPLES,
    N_FREQ_BINS_CONTOURS,
    AUDIO_SAMPLE_RATE,
    FFT_HOP
)

BATCH_SIZE = 3
SPLIT_INTERVAL = 5

tfkl = tf.keras.layers

In [3]:
# Load in the ground truth MIDI files
# glob is a pattern matching utility for files

#use maestro-v3.0.0.json to get needed files

# outdated code because I took a sample of the dataset and put in a different folder

with open('datasets/maestro/maestro-v3.0.0/maestro-v3.0.0.json', 'r') as f:
    data = json.load(f)
    midi_filenames = data['midi_filename']
    audio_filenames = data['audio_filename']

FileNotFoundError: [Errno 2] No such file or directory: 'datasets/maestro/maestro-v3.0.0/maestro-v3.0.0.json'

## Load in sample dataset files<br>
We are using a small sample from MAESTRO dataset's 100 GB of midi/wav files

In [4]:
# get maestro sample files

midi_filenames = glob.glob('../../datasets/maestro_sample/*.midi')
audio_filenames = glob.glob('../../datasets/maestro_sample/*.wav')

print(midi_filenames)
print(audio_filenames)

print(len(midi_filenames))
print(len(audio_filenames))

['../../datasets/maestro_sample\\MIDI-Unprocessed_Chamber2_MID--AUDIO_09_R3_2018_wav--1.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--2.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Chamber4_MID--AUDIO_11_R3_2018_wav--1.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Chamber5_MID--AUDIO_18_R3_2018_wav--1.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Chamber6_MID--AUDIO_20_R3_2018_wav--1.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Chamber6_MID--AUDIO_20_R3_2018_wav--2.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Recital1-3_MID--AUDIO_01_R1_2018_wav--1.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Recital1-3_MID--AUDIO_01_R1_2018_wav--2.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Recital1-3_MID--AUDIO_01_R1_2018_wav--3.midi', '../../datasets/maestro_sample\\MIDI-Unprocessed_Reci

# Preprocess audio and MIDI pair files
Audio needs to fit what model takes as input (windowed audio, uses Basic Pitch's inference get_audio_input function)<br>
MIDI needs to match what model outputs (binary matrix)

In [5]:
def midi_to_piano_onset_matrix(midi_path, frames_per_second=ANNOTATIONS_FPS):
    """
    Convert MIDI file to a binary matrix representing onset of piano keys using a set FPS.

    Parameters:
    - midi_path (str): Path to the MIDI file.
    - frames_per_second (int): Number of frames per second for the binary representation.

    Returns:
    - numpy.ndarray: Binary matrix where rows represent the 88 piano keys and columns are time frames.
    """

    # Load the MIDI file
    midi_data = pretty_midi.PrettyMIDI(midi_path)

    # Duration of the MIDI file in seconds
    duration = midi_data.get_end_time()

    # 88 keys for standard piano
    num_piano_keys = 88

    # Calculate the total number of frames based on the FPS
    total_frames = int(duration * frames_per_second)

    # Initialize binary matrix with zeros
    binary_matrix = np.zeros((total_frames, num_piano_keys))

    for instrument in midi_data.instruments:
        for note in instrument.notes:
            # Only consider valid piano notes (from 21 to 108)
            if 21 <= note.pitch <= 108:
                # Find the frame for this onset time
                onset_frame = int(note.start * frames_per_second)

                # Prevent indexing beyond the matrix size
                if onset_frame < total_frames:
                    # Adjust the pitch value to fit within our matrix's row indices (0-87)
                    adjusted_pitch = note.pitch - 21

                    # Mark the onset in the binary matrix
                    binary_matrix[onset_frame, adjusted_pitch] = 1

    return binary_matrix


In [6]:
x = []
y = []

for idx in range(0, int(len(audio_filenames))):

    # preprocess audio
    n_overlapping_frames = 30
    overlap_len = n_overlapping_frames * FFT_HOP
    hop_size = AUDIO_N_SAMPLES - overlap_len
    audio_windowed, _, audio_original_length = inference.get_audio_input(audio_filenames[idx], overlap_len, hop_size)

    x.append(audio_windowed)

    # preprocess midi
    pm_midi = pretty_midi.PrettyMIDI(midi_filenames[idx])
    onsets = midi_to_piano_onset_matrix(midi_filenames[idx], frames_per_second=ANNOTATIONS_FPS)
    y.append(onsets)
    
    

KeyboardInterrupt: 

In [8]:
# one song dataset

os_x = []
orig_audio_len = []
os_y = []

for idx in range(0, int(len(audio_filenames))):
    offset = 0
    # preprocess midi
    onsets = midi_to_piano_onset_matrix(midi_filenames[idx], frames_per_second=ANNOTATIONS_FPS)
    while offset < librosa.get_duration(path=audio_filenames[idx]) - SPLIT_INTERVAL:
        # preprocess audio

        n_overlapping_frames = 30
        overlap_len = n_overlapping_frames * FFT_HOP
        hop_size = AUDIO_N_SAMPLES - overlap_len

        # modified get_input_audio function to get audio from offset
        assert overlap_len % 2 == 0, "overlap_length must be even, got {}".format(overlap_len)
        audio_original, _ = librosa.load(audio_filenames[idx], sr=AUDIO_SAMPLE_RATE, offset=offset, duration=SPLIT_INTERVAL, mono=True)

        original_length = audio_original.shape[0]
        audio_original = np.concatenate([np.zeros((int(overlap_len / 2),), dtype=np.float32), audio_original])
        audio_windowed, window_times = inference.window_audio_file(audio_original, hop_size)
    
        os_x.append(audio_windowed)
        orig_audio_len.append(original_length)

        split_onsets = onsets[int(offset*ANNOTATIONS_FPS):int((offset+SPLIT_INTERVAL)*ANNOTATIONS_FPS), :]
        if (split_onsets.shape[0] < ANNOTATIONS_FPS * SPLIT_INTERVAL):
            padding = ANNOTATIONS_FPS * SPLIT_INTERVAL - split_onsets.shape[0]
            split_onsets = np.pad(split_onsets, [(0, padding), (0, 0)], 'constant')
        os_y.append(split_onsets)

        offset += SPLIT_INTERVAL

In [9]:
print(os_x[0].shape)
print(os_y[0].shape)

print(len(os_x))
print(len(os_y))
print(len(orig_audio_len))

print(os_x[0])
print(orig_audio_len[2])

(4, 43844, 1)
(430, 88)
7257
7257
7257
tf.Tensor(
[[[ 0.        ]
  [ 0.        ]
  [ 0.        ]
  ...
  [-0.0117997 ]
  [-0.01149738]
  [-0.01084852]]

 [[ 0.00116504]
  [ 0.00040939]
  [-0.00011012]
  ...
  [-0.03573342]
  [-0.03855322]
  [-0.03870913]]

 [[ 0.00629463]
  [ 0.0067053 ]
  [ 0.00703699]
  ...
  [ 0.        ]
  [ 0.        ]
  [ 0.        ]]

 [[-0.01862951]
  [-0.01724252]
  [-0.01553928]
  ...
  [ 0.        ]
  [ 0.        ]
  [ 0.        ]]], shape=(4, 43844, 1), dtype=float32)
110250


In [130]:
# DO NOT BATCH DATASET 

tensor_dataset = tf.data.Dataset.from_tensor_slices((os_x, os_y))
#tensor_dataset = tensor_dataset.take(300) # take 300 batches for now, comment this out later
train_dataset = tensor_dataset.take(int(len(tensor_dataset)*0.8))
val_dataset = tensor_dataset.skip(int(len(tensor_dataset)*0.8))


In [11]:
print("train_dataset: ", train_dataset)
print("val_dataset: ", val_dataset)

take_count = sum(1 for _ in train_dataset)
print(f"Size of take_dataset: {take_count}")

skip_count = sum(1 for _ in val_dataset)
print(f"Size of skip_dataset: {skip_count}")

ds_count = sum(1 for _ in tensor_dataset)
print(f"Size of batched_dataset: {ds_count}")



train_dataset:  <_TakeDataset element_spec=(TensorSpec(shape=(4, 43844, 1), dtype=tf.float32, name=None), TensorSpec(shape=(430, 88), dtype=tf.float64, name=None))>
val_dataset:  <_SkipDataset element_spec=(TensorSpec(shape=(4, 43844, 1), dtype=tf.float32, name=None), TensorSpec(shape=(430, 88), dtype=tf.float64, name=None))>
Size of take_dataset: 240
Size of skip_dataset: 60
Size of batched_dataset: 300


In [None]:
# put data into batches
batched_dataset = []
batch_size = 1 # change this to BATCH_SIZE later

i = 0
while (i < len(x)):
    if (i < len(x) - batch_size):
        x_batch = x[i:i+batch_size]
        y_batch = y[i:i+batch_size]
        batched_dataset.append((x_batch, y_batch))
    else:
        x_batch = x[i:]
        y_batch = y[i:]
        batched_dataset.append((x_batch, y_batch))
    i += batch_size

In [None]:
# Split batched dataset into training and validation sets
# 80% training, 20% validation
train_dataset = batched_dataset[:int(len(batched_dataset)*0.8)]
val_dataset = batched_dataset[int(len(batched_dataset)*0.8):]

In [None]:
# Split dataset into train/test split
# 80% training, 20% validation
x_train = x[:int(len(x)*0.8)]
y_train = y[:int(len(y)*0.8)]
x_test = x[int(len(x)*0.8):]
y_test = y[int(len(y)*0.8):]

In [None]:
output = model.predict(x_test[1])





In [None]:
# initialize spotify basic pitch model
model = models.model()
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])

# Initialize the model
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_function = tf.keras.losses.BinaryCrossentropy()



In [None]:
logits = model(train_dataset[0][0], training=True)
print(logits)

{'onset': <tf.Tensor: shape=(356, 172, 88), dtype=float32, numpy=
array([[[0.04727691, 0.08470501, 0.0437418 , ..., 0.02655039,
         0.02690475, 0.03086778],
        [0.05278471, 0.03867819, 0.00993544, ..., 0.04313786,
         0.01514172, 0.0203534 ],
        [0.26405764, 0.5626659 , 0.17359261, ..., 0.03620816,
         0.01413448, 0.0142984 ],
        ...,
        [0.1899556 , 0.43211785, 0.5669065 , ..., 0.10389652,
         0.05054878, 0.05620126],
        [0.24860631, 0.3420169 , 0.44899505, ..., 0.13852572,
         0.08319806, 0.09333236],
        [0.3292999 , 0.53262085, 0.60701287, ..., 0.2194675 ,
         0.14628221, 0.21946846]],

       [[0.3681499 , 0.32686046, 0.44428867, ..., 0.05431946,
         0.05742459, 0.07718755],
        [0.27146247, 0.54407746, 0.6061005 , ..., 0.11466714,
         0.04711415, 0.07696318],
        [0.38571095, 0.42635372, 0.5341912 , ..., 0.09636305,
         0.04582022, 0.05597818],
        ...,
        [0.34633774, 0.22532123, 0.4428990

## Train the model using the preprocessed data<br>
Something unique about this training process is that the y_batch data must be further preprocessed by adding padding to match the shape of the matrix produced by the model's output so that the shapes match and can be directly compared in the loss function. <br>


Not sure if there is a workaround for this. The only thing I can think of that might allow us to preprocess the data completely before the model predicts anything is by figuring out what matrix shape the model will produce before it produces it, then adding the padding to the MIDI matrix to match the shape. This might be able to be done by using some sort of equation with the input audio data. Model output always seems to produce more note onset information than is stored in the midi file. This could mean that the audio files need to be trimmed, but I'm not sure in what way or how to determine that.

In [None]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [None]:
# PRINT OUT ALL TRAINABLE LAYERS OF THE MODEL
# Iterate through the layers and print the layer name and its trainable status
for layer in models.model().layers:
    print(f"Layer: {layer.name}")
    print(f"Trainable: {layer.trainable}")
    for weight in layer.trainable_weights:
        print(f"\tWeight: {weight.name}, Shape: {weight.shape}")

# If you only want to see layers with trainable weights:
print("\nOnly layers with trainable weights:")
for layer in models.model().layers:
    if layer.trainable_weights:
        print(f"Layer: {layer.name}")
        for weight in layer.trainable_weights:
            print(f"\tWeight: {weight.name}, Shape: {weight.shape}")

Layer: input_24
Trainable: True
Layer: flatten_audio_ch_23
Trainable: True
Layer: cqt2010v2_23
Trainable: False
Layer: normalized_log_23
Trainable: True
Layer: tf.expand_dims_46
Trainable: True
Layer: batch_normalization_92
Trainable: True
	Weight: batch_normalization_92/gamma:0, Shape: (1,)
	Weight: batch_normalization_92/beta:0, Shape: (1,)
Layer: harmonic_stacking
Trainable: False
Layer: conv2d_139
Trainable: True
	Weight: conv2d_139/kernel:0, Shape: (3, 39, 8, 8)
	Weight: conv2d_139/bias:0, Shape: (8,)
Layer: batch_normalization_94
Trainable: True
	Weight: batch_normalization_94/gamma:0, Shape: (8,)
	Weight: batch_normalization_94/beta:0, Shape: (8,)
Layer: re_lu_93
Trainable: True
Layer: contours-reduced
Trainable: True
	Weight: contours-reduced/kernel:0, Shape: (5, 5, 8, 1)
	Weight: contours-reduced/bias:0, Shape: (1,)
Layer: contour
Trainable: True
Layer: tf.expand_dims_47
Trainable: True
Layer: conv2d_140
Trainable: True
	Weight: conv2d_140/kernel:0, Shape: (7, 7, 1, 32)
	Weigh

In [100]:
# CREATE CUSTOM LOSS FUNCTION FOR WEIGHTED BINARY CROSS ENTROPY
class WeightedBinaryCrossEntropy(tf.keras.losses.Loss):
    def __init__(self, pos_weight, neg_weight, from_logits=False, name='weighted_binary_crossentropy'):
        super().__init__(name=name)
        self.pos_weight = pos_weight
        self.neg_weight = neg_weight
        self.from_logits = from_logits

    def call(self, y_true, y_pred):
        if not self.from_logits:
            print("y_pred: ", y_pred)
            unwrapped_y_pred = self.unwrap_output_custom(y_pred, audio_original_length, n_overlapping_frames)

            # Manually calculate the weighted binary cross-entropy for predictions that aren't logits
            epsilon = tf.keras.backend.epsilon()
            unwrapped_y_pred = tf.clip_by_value(unwrapped_y_pred, epsilon, 1.0 - epsilon)

            y_true = tf.cast(y_true, tf.float32)
            unwrapped_y_pred = tf.cast(unwrapped_y_pred, tf.float32)
            pos_weight = tf.cast(self.pos_weight, tf.float32)
            neg_weight = tf.cast(self.neg_weight, tf.float32)

            loss = -y_true * tf.math.log(unwrapped_y_pred) * pos_weight - (1.0 - y_true) * tf.math.log(1.0 - unwrapped_y_pred) * neg_weight
        else:
            # Use TensorFlow's built-in function for logits
            loss = tf.nn.weighted_cross_entropy_with_logits(labels=y_true, logits=y_pred, pos_weight=self.pos_weight)

        return tf.reduce_mean(loss)
    
    # custom unwrap output function that remains compatible with TensorFlow's graph execution
    def unwrap_output_custom(self, output: tf.Tensor, audio_original_length: int, n_overlapping_frames: int) -> tf.Tensor:
        """Unwrap batched model predictions to a single matrix.

        Args:
            output: tensor (n_batches, n_times_short, n_freqs)
            audio_original_length: length of original audio signal (in samples)
            n_overlapping_frames: number of overlapping frames in the output

        Returns:
            tensor (n_times, n_freqs)
        """
        print("output: ", output)
        output_rank = tf.rank(output)
        print("output_rank: ", output_rank)
        
        def process_output():
            n_olap = int(0.5 * n_overlapping_frames)
            if n_olap > 0:
                output_processed = output[:, n_olap:-n_olap, :]
            else:
                output_processed = output
                
            output_shape = tf.shape(output_processed)
            n_output_frames_original = tf.cast(tf.floor(audio_original_length * (ANNOTATIONS_FPS / AUDIO_SAMPLE_RATE)), tf.int32)
            unwrapped_output = tf.reshape(output_processed, [output_shape[0] * output_shape[1], output_shape[2]])
            return unwrapped_output[:n_output_frames_original, :]  # trim to original audio length
        
        def handle_invalid_rank():
            # Print a warning message and return a dummy tensor
            tf.print(f"Warning: Expected output rank to be 3, got {output_rank}")
            return tf.zeros((0, 0), dtype=output.dtype)

        return tf.cond(tf.equal(output_rank, 3), process_output, handle_invalid_rank)

## Try built-in tensorflow train method

This seems to work, the note and onset loss continuously get reduced throughout each epoch! The model is successfully being trained.

In [131]:
# Initialize the model

model_train = models.model()
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_function = WeightedBinaryCrossEntropy(pos_weight=0.95, neg_weight=0.05)
onset_loss_function = WeightedBinaryCrossEntropy(pos_weight=0.95, neg_weight=0.05)
contour_loss_function = WeightedBinaryCrossEntropy(pos_weight=0.95, neg_weight=0.05)
note_loss_function = WeightedBinaryCrossEntropy(pos_weight=0.95, neg_weight=0.05)
model_train.compile(optimizer=adam_optimizer, loss={"onset": onset_loss_function, "note": note_loss_function})


In [133]:
# train model
num_epochs = 1
model_train.fit(train_dataset, batch_size=16, epochs=num_epochs, validation_data=val_dataset)

output:  Tensor("model_27/note/reshape_3/Reshape:0", shape=(4, 172, 88), dtype=float32)
output_rank:  Tensor("weighted_binary_crossentropy/Rank:0", shape=(), dtype=int32)
y_pred:  Tensor("model_27/onset/reshape_2/Reshape:0", shape=(4, 172, 88), dtype=float32)
output:  Tensor("model_27/onset/reshape_2/Reshape:0", shape=(4, 172, 88), dtype=float32)
output_rank:  Tensor("weighted_binary_crossentropy_1/Rank:0", shape=(), dtype=int32)


<keras.src.callbacks.History at 0x1e07aab9670>

In [134]:
# Save our trained version of the model

model_train.save('saved_models/nov06')

INFO:tensorflow:Assets written to: saved_models/nov06\assets


INFO:tensorflow:Assets written to: saved_models/nov06\assets


## Try Custom Training Loop Method

In [53]:
num_epochs = 5
epoch_train_loss = []
epoch_val_loss = []
n_overlapping_frames = 30
audio_original_length = 110250

# initialize spotify basic pitch model
model = models.model()

# Initialize the model
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_function = WeightedBinaryCrossEntropy(pos_weight=0.95, neg_weight=0.05)
model.compile(optimizer=adam_optimizer, loss=loss_function)
print("INITIALIZED NEW MODEL (training from scratch)")

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}:")
    
    train_loss = []
    # Training
    # Loop through training set batches (batch size 1 for now)
    for idx, (x, y) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
        
            # Forward pass
            logits = model(x, training=True)

            print("logits['onset'].shape: ", logits['onset'].shape)
            # Process output
            unwrapped_output = {k: inference.unwrap_output(logits[k], audio_original_length, n_overlapping_frames) for k in logits}
            
            print("unwrapped_output['onset'].shape: ", unwrapped_output['onset'].shape)
            # Compute loss for onsets
            loss_value_onsets = loss_function(y, unwrapped_output['onset'])

            # Compute loss for notes
            loss_value_notes = loss_function(y, unwrapped_output['note'])

            # average loss
            loss_value = (loss_value_onsets + loss_value_notes) / 2

            gradients = tape.gradient(loss_value, model.trainable_weights)

            grads_and_vars = [(grad, var) for grad, var in zip(gradients, model.trainable_weights) if grad is not None]

            if idx == 1 and epoch == 0:
                for grad, var in zip(gradients, model.trainable_weights):
                    if grad is None:
                        print(f"None gradient for {var.name}")
                print("Gradients: ", gradients)
                print("Gradients and vars: ", grads_and_vars)
                
            # Update weights
            adam_optimizer.apply_gradients(grads_and_vars)
            
            # record loss
            train_loss.append(loss_value)
        

            # Print progress
            if idx % 10 == 0:
                print(f"Step {idx}: loss = {loss_value:.4f}, accuracy = TBD")
            
    # Reset metric at the end of epoch
    avgLoss = np.mean(train_loss)
    epoch_train_loss.append(avgLoss)
    print("Epoch {}/{} training loss: {}".format(epoch+1, num_epochs, avgLoss))

    # Validation
    val_loss = []
     # Loop through validation set batches (batch size 1 for now)
    for idx, (x, y) in enumerate(val_dataset):
        # Forward pass
        logits = model(x, training=False)

        # Process output
        unwrapped_output = {k: inference.unwrap_output(logits[k], audio_original_length, n_overlapping_frames) for k in logits}

        # Compute loss for onsets
        loss_value_onsets = loss_function(y, unwrapped_output['onset'])

        # Compute loss for notes
        loss_value_notes = loss_function(y, unwrapped_output['note'])

        # average loss
        loss_value = (loss_value_onsets + loss_value_notes) / 2
    
        # record loss
        val_loss.append(loss_value)
        
    avgLoss = np.mean(val_loss)
    epoch_val_loss.append(avgLoss)
    print("Epoch {}/{} validation loss: {}".format(epoch+1, num_epochs, avgLoss))

plt.plot(epoch_train_loss)
plt.plot(epoch_val_loss)

INITIALIZED NEW MODEL (training from scratch)

Epoch 1/5:
logits['onset'].shape:  (4, 172, 88)
unwrapped_output['onset'].shape:  (430, 88)
Step 0: loss = 0.0616, accuracy = TBD
logits['onset'].shape:  (4, 172, 88)
unwrapped_output['onset'].shape:  (430, 88)
None gradient for batch_normalization_52/gamma:0
None gradient for batch_normalization_52/beta:0
None gradient for conv2d_79/kernel:0
None gradient for conv2d_79/bias:0
None gradient for batch_normalization_54/gamma:0
None gradient for batch_normalization_54/beta:0
None gradient for contours-reduced/kernel:0
None gradient for contours-reduced/bias:0
None gradient for conv2d_80/kernel:0
None gradient for conv2d_80/bias:0
None gradient for conv2d_82/kernel:0
None gradient for conv2d_82/bias:0
None gradient for batch_normalization_55/gamma:0
None gradient for batch_normalization_55/beta:0
None gradient for conv2d_81/kernel:0
None gradient for conv2d_81/bias:0
None gradient for conv2d_83/kernel:0
None gradient for conv2d_83/bias:0
Gradi

KeyboardInterrupt: 

In [None]:
model.save('saved_models/oct10')

INFO:tensorflow:Assets written to: saved_models/oct10\assets


INFO:tensorflow:Assets written to: saved_models/oct10\assets


## Test Trained Models

In [135]:
# command line execution to output resulting midi from trained model
!python ../basic_pitch_original/basic_pitch/predict.py --model_path "saved_models/nov06" "model_predictions/our_model/" "model_predictions/_test_audio/beethoven_fur_elise.mp3"
!python ../basic_pitch_original/basic_pitch/predict.py "model_predictions/spotify_model/" "model_predictions/_test_audio/beethoven_fur_elise.mp3"
!python ../basic_pitch_original/basic_pitch/predict.py --model_path "saved_models/nov06" "model_predictions/our_model/" "model_predictions/_test_audio/wii_music.mp3"
!python ../basic_pitch_original/basic_pitch/predict.py "model_predictions/spotify_model/" "model_predictions/_test_audio/wii_music.mp3"


✨✨✨✨✨✨✨✨✨
✨ Basic Pitch  ✨
✨✨✨✨✨✨✨✨✨

Importing Tensorflow (this may take a few seconds)...

Predicting MIDI for model_predictions\_test_audio\beethoven_fur_elise.mp3...


  Creating midi...
  💅 Saved to model_predictions\our_model\beethoven_fur_elise_basic_pitch.mid

✨ Done ✨


✨✨✨✨✨✨✨✨✨
✨ Basic Pitch  ✨
✨✨✨✨✨✨✨✨✨

Importing Tensorflow (this may take a few seconds)...

Predicting MIDI for model_predictions\_test_audio\beethoven_fur_elise.mp3...


  Creating midi...
  🚨 model_predictions\spotify_model\beethoven_fur_elise_basic_pitch.mid already exists and would be overwritten. Skipping output files for model_predictions\_test_audio\beethoven_fur_elise.mp3.

✨✨✨✨✨✨✨✨✨
✨ Basic Pitch  ✨
✨✨✨✨✨✨✨✨✨

Importing Tensorflow (this may take a few seconds)...

Predicting MIDI for model_predictions\_test_audio\wii_music.mp3...


  Creating midi...
  💅 Saved to model_predictions\our_model\wii_music_basic_pitch.mid

✨ Done ✨


✨✨✨✨✨✨✨✨✨
✨ Basic Pitch  ✨
✨✨✨✨✨✨✨✨✨

Importing Tensorflow (this may take

In [123]:
from pathlib import Path
from tensorflow import Tensor, signal, keras, saved_model

current_directory = Path.cwd()
print(current_directory.parent)
icassp_2022_model_path = str(current_directory.parent) + "/basic_pitch_original/basic_pitch/saved_models/icassp_2022/nmp"
print("OG Model path: ", icassp_2022_model_path)
model_oct10 = saved_model.load('saved_models/oct10')
model_nov06 = saved_model.load('saved_models/nov06')
model_bp = saved_model.load(icassp_2022_model_path)


c:\_AlexFiles\Coding\Python\FALL2023_IndependentStudy\audio_to_midi_vst
OG Model path:  c:\_AlexFiles\Coding\Python\FALL2023_IndependentStudy\audio_to_midi_vst/basic_pitch_original/basic_pitch/saved_models/icassp_2022/nmp


In [125]:
# create sample for 1 song (Fur Elise by Beethoven)
beethoven_x = []

offset = 10 #seconds

beethoven_file = current_directory / "model_predictions/_test_audio/beethoven_fur_elise.mp3"
print(beethoven_file)
while offset < librosa.get_duration(path=beethoven_file) - SPLIT_INTERVAL:
    # preprocess audio

    n_overlapping_frames = 30
    overlap_len = n_overlapping_frames * FFT_HOP
    hop_size = AUDIO_N_SAMPLES - overlap_len

    # modified get_input_audio function to get audio from offset
    assert overlap_len % 2 == 0, "overlap_length must be even, got {}".format(overlap_len)
    audio_original, _ = librosa.load(beethoven_file, sr=AUDIO_SAMPLE_RATE, offset=offset, duration=SPLIT_INTERVAL, mono=True)

    original_length = audio_original.shape[0]
    audio_original = np.concatenate([np.zeros((int(overlap_len / 2),), dtype=np.float32), audio_original])
    audio_windowed, window_times = inference.window_audio_file(audio_original, hop_size)

    beethoven_x.append(audio_windowed)
    offset += SPLIT_INTERVAL

c:\_AlexFiles\Coding\Python\FALL2023_IndependentStudy\audio_to_midi_vst\independent_study\model_predictions\_test_audio\beethoven_fur_elise.mp3


In [126]:
model_bp(beethoven_x[0])

{'onset': <tf.Tensor: shape=(4, 172, 88), dtype=float32, numpy=
 array([[[0.2563653 , 0.14136553, 0.11154418, ..., 0.10882651,
          0.136962  , 0.16868067],
         [0.18752408, 0.1122677 , 0.09407169, ..., 0.07430211,
          0.08307945, 0.10704739],
         [0.17874199, 0.12020129, 0.11011477, ..., 0.08455317,
          0.0864543 , 0.10679121],
         ...,
         [0.09266827, 0.11102663, 0.10919577, ..., 0.09788323,
          0.10385687, 0.08499473],
         [0.0693486 , 0.07680301, 0.09822215, ..., 0.08400798,
          0.11336812, 0.08087544],
         [0.08666889, 0.08257812, 0.10162653, ..., 0.08861089,
          0.11391509, 0.10966828]],
 
        [[0.21780737, 0.16268659, 0.12284816, ..., 0.11596433,
          0.13623056, 0.14814657],
         [0.22107677, 0.14205264, 0.11213985, ..., 0.10483827,
          0.1084298 , 0.11917666],
         [0.17761321, 0.11661748, 0.13201249, ..., 0.10609926,
          0.09241855, 0.12110808],
         ...,
         [0.08610087, 0

In [127]:
model_oct10(beethoven_x[0])

{'onset': <tf.Tensor: shape=(4, 172, 88), dtype=float32, numpy=
 array([[[0.444482  , 0.31237635, 0.20683138, ..., 0.61643285,
          0.7267155 , 0.5396218 ],
         [0.30873603, 0.12919618, 0.10692903, ..., 0.4448164 ,
          0.51427305, 0.48544478],
         [0.5374681 , 0.18853362, 0.13876264, ..., 0.458224  ,
          0.5304953 , 0.4897275 ],
         ...,
         [0.36935937, 0.5306894 , 0.5449164 , ..., 0.320996  ,
          0.23089427, 0.26329532],
         [0.36367205, 0.60169804, 0.6845523 , ..., 0.32479608,
          0.3278274 , 0.2771007 ],
         [0.35844558, 0.51667976, 0.43400443, ..., 0.33894673,
          0.34691682, 0.33619347]],
 
        [[0.3036792 , 0.31951433, 0.3827361 , ..., 0.58958495,
          0.67314124, 0.56244427],
         [0.3735905 , 0.6404076 , 0.60093606, ..., 0.54255474,
          0.64743936, 0.46937564],
         [0.3643065 , 0.5471824 , 0.45531607, ..., 0.525807  ,
          0.53859895, 0.45449227],
         ...,
         [0.38378203, 0

In [129]:
model_nov06(beethoven_x[0])

{'onset': <tf.Tensor: shape=(4, 172, 88), dtype=float32, numpy=
 array([[[1.00330047e-01, 2.60852836e-02, 4.11974033e-03, ...,
          8.04347664e-08, 6.40524007e-08, 2.37228755e-06],
         [5.50187081e-02, 2.11416800e-02, 2.51630717e-03, ...,
          2.12454339e-08, 1.33039570e-08, 6.67369648e-07],
         [6.93362579e-02, 4.17249352e-02, 4.47468646e-03, ...,
          6.84766576e-07, 2.89789057e-07, 6.46032640e-06],
         ...,
         [2.45004721e-05, 4.04264085e-07, 1.29416051e-07, ...,
          2.78809806e-04, 1.69223305e-04, 2.47982563e-03],
         [1.16946478e-06, 7.29757310e-10, 5.56043322e-10, ...,
          3.55879217e-03, 2.22265581e-03, 1.67798400e-02],
         [1.05555613e-04, 4.12707948e-07, 4.06309880e-07, ...,
          7.28279427e-02, 5.56767881e-02, 1.33258268e-01]],
 
        [[6.09828979e-02, 4.50251818e-01, 1.75188005e-01, ...,
          2.27900614e-06, 1.09542816e-06, 3.90493478e-05],
         [6.69300649e-03, 2.59133764e-02, 1.74868573e-02, ...,
  