In [1]:
import pretty_midi
from evaluate_midi import prepare_eval_data

# Load reference notes from USDX MIDI file
midi = pretty_midi.PrettyMIDI('../data/out/usdx-midi/wake-me-up.mid')
original_midi = midi.instruments[0].notes

reference_notes = prepare_eval_data(original_midi)

In [2]:
import tensorflow as tf
from basic_pitch.inference import predict, ICASSP_2022_MODEL_PATH

# Load prediction model to cache it between function calls
basic_pitch_model = tf.saved_model.load(str(ICASSP_2022_MODEL_PATH))

# Generate MIDI from vocals track
def predict_notes(**options):
    model_output, midi_data, note_events = predict(
        "../data/out/optimized_audio/vocals.wav",
        basic_pitch_model,
        minimum_frequency=80,
        maximum_frequency=1000,
        **options
    )
    if len(midi_data.instruments) == 0:
        print("No notes predicted for params %r" % options)
        return []
    return midi_data.instruments[0].notes

In [4]:
# Evaluation function
from evaluate_midi import evaluate
from skopt import gp_minimize
from skopt.space import Real, Integer

# Monkey patch numpy to avoid skopt error
# (see https://github.com/scikit-optimize/scikit-optimize/issues/1138 for details)
import numpy
numpy.int = int

def evaluate_params(params):
    options = {
        "onset_threshold": params[0], # default: 0.5
        "frame_threshold": params[1], # default: 0.3
        "minimum_note_length": params[2], # default: 127.7
    }
    notes = predict_notes(**options)
    if len(notes) == 0:
        # optimization is trying to minimize the result
        # return max value to indicate invalid result
        return 1
    estimated_notes = prepare_eval_data(notes)
    scores = evaluate(reference_notes, estimated_notes)
    return 1.0 - scores['F-measure']

# Parameter ranges for prediction options
param_ranges = [
    Real(0.2, 0.8, name="onset_threshold"),
    Real(0.2, 0.8, name="frame_threshold"),
    Integer(80, 250, name="minimum_note_length")
]

gp_minimize(evaluate_params, param_ranges, n_calls=100)

Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicti



Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...




Predicting MIDI for ../data/out/optimized_audio/vocals.wav...




Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicting MIDI for ../data/out/optimized_audio/vocals.wav...
Predicti

          fun: 0.8895582329317269
            x: [0.2, 0.25501208430819716, 134]
    func_vals: [ 9.403e-01  9.365e-01 ...  8.972e-01  8.972e-01]
      x_iters: [[0.5301232310927759, 0.22884326898568813, 168], [0.2925753180909904, 0.3710359815013974, 202], [0.22111362551852728, 0.3595853954208452, 177], [0.5909326836988139, 0.6120539224984891, 241], [0.40343851094321304, 0.32469445426418597, 136], [0.3379009301440803, 0.5148610796688657, 113], [0.3648067104061148, 0.7305994534591005, 224], [0.675641171886195, 0.5129450955667988, 169], [0.37280992699253146, 0.42869847725007115, 156], [0.7296526948957998, 0.7448619637242753, 112], [0.2, 0.4049107100831101, 80], [0.8, 0.5626127930645245, 80], [0.2, 0.2, 80], [0.2, 0.4400118935288232, 121], [0.34126700398011844, 0.2, 250], [0.7990052069511879, 0.39376963479695226, 121], [0.2, 0.6205138830235053, 80], [0.47395117983010865, 0.4513533038241543, 99], [0.2, 0.2990868302790715, 80], [0.2, 0.4978485430696133, 141], [0.2, 0.5077110964013956, 80], 