# Compare piano transcription methods

In [1]:
from src import utils
from src.detect.onset_utils import OnsetMaker, bandpass_filter
from src.detect.midi_utils import group_onsets
from pretty_midi import PrettyMIDI
import librosa
from joblib import Parallel, delayed
import numpy as np
import pandas as pd
import src.visualise.visualise_utils as vutils

In [62]:
corp = utils.CorpusMaker.from_excel('corpus_updated', only_annotated=True, only_30_corpus=False)

## Current approach

In [18]:
loaded = utils.load_corpus_from_files(f'{utils.get_project_root()}/data/cambridge-jazz-trio-database-v02')
fns = set(track['fname'] for track in corp.tracks)
res_cur = [{'track': t.item['fname'], 'method': 'current', **t.item['validation']['piano']} for t in loaded if t.item['validation']['piano'] is not None and t.item['fname'] in fns]

## Automatic MIDI transcription

In [19]:
def auto_midi(item):
    om = OnsetMaker(item, skip_processing=True)
    mm = PrettyMIDI(f'{utils.get_project_root()}/data/cambridge-jazz-trio-database-v02/{item["fname"]}/piano_midi.mid')
    ons = [o.start for o in mm.instruments[0].notes]
    fmt = group_onsets(ons, keep_func=np.min)
    return {
        'track': item['fname'],
        'method': 'automatic_midi', 
        **om.compare_onset_detection_accuracy(
            fname=rf'{om.references_dir}/manual_annotation/{item["fname"]}_piano.txt',
            onsets=fmt,
        )
    }

In [20]:
# Fast enough to process consecutively
res_mm = [auto_midi(i) for i in corp.tracks]

## Spectral flux

In [21]:
# Set the optimised defaults
params_sf = {
    "wait": 18,
    "delta": 0.023021937161684,
    "pre_max": 20,
    "post_max": 22,
    "pre_avg": 53,
    "post_avg": 4
}

In [22]:
def spec_flux(item):
    made = OnsetMaker(item, skip_processing=False)
    ons = librosa.onset.onset_detect(
        y=made.audio['piano'],
        sr=utils.SAMPLE_RATE,
        hop_length=utils.HOP_LENGTH,
        units='time',
        **params_sf
    )  
    return {
        'track': item['fname'],
        'method': 'spectral_flux', 
        **made.compare_onset_detection_accuracy(
            fname=rf'{made.references_dir}/manual_annotation/{item["fname"]}_piano.txt',
            onsets=ons,
        )
    }

In [23]:
with Parallel(n_jobs=-1, backend='loky', verbose=10) as par:
    res_sf = par(delayed(spec_flux)(i) for i in corp.tracks)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  34 | elapsed:   15.8s remaining:  2.7min
[Parallel(n_jobs=-1)]: Done   7 out of  34 | elapsed:   22.0s remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  11 out of  34 | elapsed:   37.3s remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  15 out of  34 | elapsed:   47.1s remaining:   59.6s
[Parallel(n_jobs=-1)]: Done  19 out of  34 | elapsed:   51.4s remaining:   40.6s
[Parallel(n_jobs=-1)]: Done  23 out of  34 | elapsed:   54.7s remaining:   26.2s
[Parallel(n_jobs=-1)]: Done  27 out of  34 | elapsed:   55.8s remaining:   14.5s
[Parallel(n_jobs=-1)]: Done  31 out of  34 | elapsed:   56.7s remaining:    5.5s
[Parallel(n_jobs=-1)]: Done  34 out of  34 | elapsed:  1.0min finished


## Current approach, no filtering

In [24]:
def no_filter(item):
    made = OnsetMaker(item, skip_processing=True)
    fname = made._get_channel_override_fpath('piano', made.instrs['piano'])
    made.audio = {}
    y, _ = librosa.load(
        path=fname,
        sr=utils.SAMPLE_RATE,
        mono=True,
        offset=0,
        duration=None,
        dtype=np.float64,
        res_type='soxr_vhq',
    )
    made.audio['piano'] = librosa.util.normalize(y)
    ons = made.onset_detect_cnn('piano')
    return {
        'track': item['fname'],
        'method': 'cnn_no_filter', 
        **made.compare_onset_detection_accuracy(
            fname=rf'{made.references_dir}/manual_annotation/{item["fname"]}_piano.txt',
            onsets=ons,
        )
    }

In [25]:
with Parallel(n_jobs=-1, backend='loky', verbose=10) as par:
    res_nf = par(delayed(no_filter)(i) for i in corp.tracks)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  34 | elapsed:    2.2s remaining:   22.6s
[Parallel(n_jobs=-1)]: Done   7 out of  34 | elapsed:    4.3s remaining:   16.7s
[Parallel(n_jobs=-1)]: Done  11 out of  34 | elapsed:    6.3s remaining:   13.2s
[Parallel(n_jobs=-1)]: Done  15 out of  34 | elapsed:    7.6s remaining:    9.6s
[Parallel(n_jobs=-1)]: Done  19 out of  34 | elapsed:    8.7s remaining:    6.9s
[Parallel(n_jobs=-1)]: Done  23 out of  34 | elapsed:    9.0s remaining:    4.3s
[Parallel(n_jobs=-1)]: Done  27 out of  34 | elapsed:    9.3s remaining:    2.4s
[Parallel(n_jobs=-1)]: Done  31 out of  34 | elapsed:    9.9s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  34 out of  34 | elapsed:   10.6s finished


## Current approach, more filter

In [26]:
def more_filter(item, lowcut, highcut):
    made = OnsetMaker(item, skip_processing=True)
    fname = made._get_channel_override_fpath('piano', made.instrs['piano'])
    made.audio = {}
    y, _ = librosa.load(
        path=fname,
        sr=utils.SAMPLE_RATE,
        mono=True,
        offset=0,
        duration=None,
        dtype=np.float64,
        res_type='soxr_vhq',
    )
    y = bandpass_filter(
        audio=y,
        lowcut=lowcut,
        highcut=highcut,
        order=made.order
    )
    made.audio['piano'] = librosa.util.normalize(y)
    ons = made.onset_detect_cnn('piano')
    return {
        'track': item['fname'],
        'method': f'cnn_{lowcut}_{highcut}', 
        **made.compare_onset_detection_accuracy(
            fname=rf'{made.references_dir}/manual_annotation/{item["fname"]}_piano.txt',
            onsets=ons,
        )
    }

In [27]:
with Parallel(n_jobs=-1, backend='loky', verbose=10) as par:
    res_220_1760 = par(delayed(more_filter)(i, 220, 1760) for i in corp.tracks)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  34 | elapsed:    2.4s remaining:   24.4s
[Parallel(n_jobs=-1)]: Done   7 out of  34 | elapsed:    4.2s remaining:   16.3s
[Parallel(n_jobs=-1)]: Done  11 out of  34 | elapsed:    6.7s remaining:   14.0s
[Parallel(n_jobs=-1)]: Done  15 out of  34 | elapsed:    7.9s remaining:   10.0s
[Parallel(n_jobs=-1)]: Done  19 out of  34 | elapsed:    9.1s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done  23 out of  34 | elapsed:    9.4s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done  27 out of  34 | elapsed:    9.7s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done  31 out of  34 | elapsed:   10.3s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  34 out of  34 | elapsed:   11.5s finished


## Putting it all together

In [28]:
def fmt(x):
    x = x.dropna()
    return f'{round(np.mean(x), 2)} ± {round(np.std(x), 2)}'

In [59]:
df = pd.DataFrame(res_mm + res_sf + res_nf + res_220_1760)

In [60]:
f = df.groupby('method')['f_score'].apply(fmt)
p = df.groupby('method')['precision'].apply(fmt)
r = df.groupby('method')['recall'].apply(fmt)
(
    pd.concat([f, p, r], axis=1)
    .transpose()
    .rename(
        columns={'automatic_midi': "(1)", 'spectral_flux': "(2)", 'cnn_no_filter': "(3)", 'cnn_220_1760': "(4)"},
        index={'f_score': 'F', 'precision': 'P', 'recall': 'R'}
    )
    [["(1)", "(2)", "(3)", "(4)"]]
)

method,(1),(2),(3),(4)
F,0.77 ± 0.13,0.84 ± 0.06,0.92 ± 0.03,0.92 ± 0.03
P,0.71 ± 0.16,0.79 ± 0.1,0.9 ± 0.06,0.95 ± 0.03
R,0.86 ± 0.09,0.9 ± 0.04,0.93 ± 0.03,0.89 ± 0.05


## F-score table for copying

In [63]:
loaded = utils.load_corpus_from_files(f'{utils.get_project_root()}/data/cambridge-jazz-trio-database-v02')
fns = set(track['fname'] for track in corp.tracks)
bigres = []
for t in loaded:
    if t.item['fname'] in fns:
        bigres.append(pd.DataFrame([{'track': t.item['fname'], 'instr': ins, **t.item['validation'][ins]} for ins in ['piano', 'bass', 'drums', 'mix', 'mix_downbeats']]))
fdf = pd.concat(bigres)
print(len(bigres)) # should be 34

34


In [64]:
f = fdf.groupby('instr')['f_score'].apply(fmt)
p = fdf.groupby('instr')['precision'].apply(fmt)
r = fdf.groupby('instr')['recall'].apply(fmt)
(
    pd.concat([f, p, r], axis=1)
    .transpose()
    .rename(
        columns={'bass': "Bass", 'drums': "Drums", 'piano': "Piano", 'mix': "Beats", "mix_downbeats": "Downbeats"},
        index={'f_score': 'F', 'precision': 'P', 'recall': 'R'}
    )
)

instr,Bass,Drums,Beats,Downbeats,Piano
F,0.93 ± 0.05,0.95 ± 0.03,0.97 ± 0.05,0.63 ± 0.44,0.93 ± 0.03
P,0.94 ± 0.04,0.96 ± 0.04,0.97 ± 0.05,0.63 ± 0.44,0.93 ± 0.04
R,0.93 ± 0.07,0.94 ± 0.04,0.97 ± 0.05,0.63 ± 0.44,0.93 ± 0.04


In [65]:
fdf.groupby('instr')['mean_asynchrony'].describe() * 1000

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
instr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bass,34000.0,-4.286911,4.380241,-16.01875,-6.811371,-3.599585,-2.312851,4.008071
drums,34000.0,-3.537782,2.953177,-9.301071,-4.986423,-4.274213,-2.520374,3.074587
mix,34000.0,-4.819691,4.579224,-13.228514,-7.856931,-5.772287,-1.74399,5.006074
mix_downbeats,25000.0,-5.069938,4.494798,-12.498632,-8.224669,-5.94783,-0.627362,2.778432
piano,34000.0,-5.774872,2.277146,-10.774737,-7.067589,-6.045731,-4.447851,-0.57911


In [66]:
fdf[fdf['instr'].isin(['piano', 'bass', 'drums'])]['f_score'].describe()

count    102.000000
mean       0.935296
std        0.037301
min        0.731906
25%        0.916129
50%        0.944986
75%        0.959358
max        0.997436
Name: f_score, dtype: float64

In [38]:
fdf[fdf['instr'].isin(['piano', 'bass', 'drums'])]['mean_asynchrony'].describe() * 1000

count    102000.000000
mean         -4.533189
std           3.418420
min         -16.018750
25%          -6.585747
50%          -4.498220
75%          -2.729725
max           4.008071
Name: mean_asynchrony, dtype: float64

## Get total number of MIDI events

In [40]:
def midi_len(item):
    mm = PrettyMIDI(f'{utils.get_project_root()}/data/cambridge-jazz-trio-database-v02/{item["fname"]}/piano_midi.mid')
    return len(mm.instruments[0].notes)

corp = utils.CorpusMaker.from_excel('corpus_updated', only_annotated=False, only_30_corpus=False)
# Fast enough to process consecutively
nevents = sum(midi_len(i) for i in corp.tracks)
nevents

2174833