# Evaluate

This notebook evaluates the quality of the online alignments in a given experiment directory.

In [None]:
%matplotlib inline

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import os.path
from pathlib import Path
import pandas as pd
import pickle
import re
import import_ipynb
import system_utils
import eval_tools

## Calculate Alignment Errors

First we calculate the alignment errors of a given system on all evaluated measures.

In [None]:
# Inputs: the scenarios directory and the experiment to evaluate.
scenarios_dir = 'scenarios'
exp_dir = 'experiments/PairwiseSparseDTW_0.8' # change

# The eval directory is named after the experiment directory; later cells read errs.pkl from it.
eval_dir = f'eval/{os.path.basename(exp_dir)}'
eval_tools.calcAlignErrors_batch(exp_dir, scenarios_dir, eval_dir, '')

## Plot Error vs Tolerance

We can visualize the results by plotting the error rate across a range of error tolerances.

In [None]:
def plotErrorVsTolerance(eval_dirs, maxTol, savefile = None, style='bar', bar_tols=[100, 200, 500, 1000, 2000]):
    '''
    Plots the error rate across a range of error tolerances.
    
    Inputs
    eval_dirs: list of eval directories to plot (each must contain an errs.pkl file)
    maxTol: maximum error tolerance to consider (in milliseconds)
    savefile: if specified, will save the figure to the given filepath
    style: 'bar' or 'line'
    bar_tols: list of error tolerances (in milliseconds) to plot if using 'bar'.  Each value is
      used as an index into the error rates, so it must be an integer no larger than maxTol.
      The default list is only read, never modified.
    
    Outputs
    errRates_list: list containing one array per eval directory, where element j is the fraction
      of alignment errors whose magnitude exceeds a tolerance of j milliseconds
    tols: array of the evaluated tolerances 0, 1, ..., maxTol (in milliseconds)
    '''
    
    bar_width = 0.1
    tols = np.arange(maxTol+1) # computed up front so it can be returned even if eval_dirs is empty
    errRates_list = []
    #color=['blue','orange','green','lightgreen','red'] # hard coded for main results figure
    for i, eval_dir in enumerate(eval_dirs):
    
        # load (pickle can execute arbitrary code, so only point this at eval directories you trust)
        with open(f'{eval_dir}/errs.pkl', 'rb') as f:
            d = pickle.load(f) # key: scenario_id, value: tuple whose first element holds the errors

        # flatten the errors of all scenarios into a single array of magnitudes
        chunks = [np.ravel(d[scenario_id][0]) for scenario_id in d]
        absErrs = np.abs(np.concatenate(chunks)) if chunks else np.zeros(0)

        # calculate error rates (tolerance j is in ms and is divided by 1000 before comparing)
        errRates = np.zeros(maxTol+1)
        for j in tols:
            errRates[j] = np.mean(absErrs > j/1000)
        errRates_list.append(errRates)
        
        if style == 'line':
            plt.plot(tols, errRates * 100.0)
        elif style == 'bar':
            heights = [errRates[tol] * 100.0 for tol in bar_tols]
            pos = np.arange(len(heights)) + i * bar_width
            # hard code for main results figure
            #plt.bar(pos, heights, width=bar_width, label=os.path.basename(eval_dir), color = color[i])
            plt.bar(pos, heights, width=bar_width, label=os.path.basename(eval_dir))

    if style == 'bar' and len(eval_dirs) > 0:
        # Bars of group r are centered at r, r + bar_width, ..., r + (n-1)*bar_width, so the
        # middle of the group lies at r + (n-1)*bar_width/2.
        groupCenter = bar_width * (len(eval_dirs) - 1) / 2
        plt.xticks([r + groupCenter for r in range(len(bar_tols))], [str(tol) for tol in bar_tols])
        
    plt.ylabel('Error Rate (%)')
    plt.xlabel('Error Tolerance (ms)')
    plt.legend([os.path.basename(eval_dir) for eval_dir in eval_dirs])
    plt.grid(linestyle='--')
    if savefile:
        plt.savefig(savefile)

    return errRates_list, tols

Plot the error rate vs error tolerance curve for one system of interest:

In [None]:
# Evaluate a single system at every tolerance from 0 to maxTol and plot it with the default 'bar' style.
maxTol = 5000 # in milliseconds
eval_dir = 'eval/PairwiseSparseDTW_0.8'
errRates_list, tols = plotErrorVsTolerance([eval_dir], maxTol, savefile=False) # savefile=False is falsy, so no figure is written
# Print the error rate (in percent) at the same tolerances that the bar chart uses by default.
for i in [100,200,500,1000,2000]:
    print(errRates_list[0][i]*100.0)

Overlay multiple error curves for comparison:

In [None]:
# Systems whose error rates are overlaid in a single figure (each is read from eval/<system>/errs.pkl).
systems_to_compare = ['NaivePairwiseDTW', 'ISA','SeparatedDTW_Spleeter','SeparatedDTW_HDemucs', 'PairwiseSparseDTW_0.8']
eval_dirs = ['eval/' + s for s in systems_to_compare]
errRates_list, tols = plotErrorVsTolerance(eval_dirs, maxTol, savefile='Results.png')
# error rate (in percent) of each system at a 500 ms tolerance
[rates[500]*100.0 for rates in errRates_list]

## Separate error curves by condition

Visualize the same error curve for a single system, but separated by different conditions.  For example, one can visualize the performance across:
- TSM factor
- full mix recording
- concerto
- composer
- chunk within a movement

In [None]:
def plotErrorVsTolerance_separated(eval_dir, mapping, maxTol, savefile = None):
    '''
    Plots error rate across a range of error tolerances.  Data is separated into categories
    specified in the given dictionary, and each category is plotted as a separate curve.
    
    Inputs
    eval_dir: the eval directory to process (must contain an errs.pkl file)
    mapping: a dictionary whose key is the scenario id and whose value is the category name.
      Any scenario ids that are not in the dictionary will be excluded from the plot.
    maxTol: maximum error tolerance to consider (in milliseconds)
    savefile: if specified, will save the figure to the given filepath
    
    Outputs
    errRates_list: dictionary whose key is the category name and whose value is an array where
      element i is the fraction of that category's errors whose magnitude exceeds i milliseconds
    tols: array of the evaluated tolerances 0, 1, ..., maxTol (in milliseconds)
    numPts: dictionary whose key is the category name and whose value is the number of
      alignment errors that fell into that category
    '''
    
    # initialize
    categories = list(sorted(set(mapping.values())))
    tols = np.arange(maxTol+1) # computed up front so it can be returned even if mapping is empty
    chunks_by_category = {c: [] for c in categories} # per-category list of error arrays
    
    # load (pickle can execute arbitrary code, so only point this at eval directories you trust)
    with open(f'{eval_dir}/errs.pkl', 'rb') as f:
        d = pickle.load(f)  # key: scenario_id, value: (errors, measureNums)

    # aggregate data by category
    for scenario_id in d:
        if scenario_id in mapping:
            chunks_by_category[mapping[scenario_id]].append(np.ravel(d[scenario_id][0]))

    # calculate error rates by category
    errRates_list = {}
    numPts = {}
    for c in categories:
        chunks = chunks_by_category[c]
        # flattened array of error magnitudes; empty if no evaluated scenario has this category
        absErrs = np.abs(np.concatenate(chunks)) if chunks else np.zeros(0)
        errRates = np.zeros(maxTol+1)
        for i in tols:
            # tolerance i is in ms and is divided by 1000 before comparing against the errors
            errRates[i] = np.mean(absErrs > i/1000)
        errRates_list[c] = errRates
        numPts[c] = len(absErrs) # for debugging
        plt.plot(tols, errRates * 100.0)
        
    plt.ylabel('Error Rate (%)')
    plt.xlabel('Error Tolerance (ms)')
    plt.legend(categories)
    plt.grid(linestyle='--')
    if savefile:
        plt.savefig(savefile)

    return errRates_list, tols, numPts

In [None]:
def mapByTSMFactor():
    '''
    Builds a scenario id -> TSM factor mapping from the scenarios summary.

    The category is the second-to-last '/'-separated component of each scenario's 'p' path
    (e.g. 'tsm0.80').
    '''
    info = system_utils.get_scenario_info(SCENARIOS_SUMMARY)
    return {sid: info[sid]['p'].split('/')[-2] for sid in info}

In [None]:
def mapByFullMix():
    '''
    Builds a scenario id -> full mix recording mapping from the scenarios summary.

    The category is the filename of each scenario's 'po' path with its extension removed
    (e.g. 'rach2_mov1_PO1').
    '''
    info = system_utils.get_scenario_info(SCENARIOS_SUMMARY)
    mapping = {}
    for sid in info:
        mix_filename = os.path.basename(info[sid]['po'])
        stem, _ext = os.path.splitext(mix_filename)
        mapping[sid] = stem
    return mapping

In [None]:
def mapByComposer():
    '''
    Builds a scenario id -> composer mapping from the scenarios summary.

    The composer is the run of lowercase letters that precedes the digits in the concerto id,
    where the concerto id is the first '_'-separated field of the 'po' filename
    (e.g. 'rach2_mov1_PO1' -> 'rach2' -> 'rach').
    '''
    composer_pattern = re.compile(r'([a-z]+)\d+')
    info = system_utils.get_scenario_info(SCENARIOS_SUMMARY)
    mapping = {}
    for sid in info:
        mix_stem = os.path.splitext(os.path.basename(info[sid]['po']))[0] # e.g. 'rach2_mov1_PO1'
        concerto_id = mix_stem.split('_', 1)[0] # e.g. 'rach2'
        mapping[sid] = composer_pattern.search(concerto_id).group(1)
    return mapping

In [None]:
def mapByChunk(mov_id):
    '''
    Builds a scenario id -> chunk label mapping for a single concerto movement.

    Scenarios are grouped by their (measStart, measEnd) pair; the distinct pairs are sorted and
    labelled 'Chunk1', 'Chunk2', ... in that order.
    
    Inputs
    mov_id: id specifying the concerto movement to analyze, e.g. 'rach2_mov1'.  Only scenarios
      whose 'po' path contains this string are kept.
    '''
    info = system_utils.get_scenario_info(SCENARIOS_SUMMARY)

    # measure span of every scenario that belongs to the movement of interest
    spans = {
        sid: (info[sid]['measStart'], info[sid]['measEnd'])
        for sid in info
        if mov_id in info[sid]['po']
    }

    # label the distinct spans in sorted order (e.g. 'Chunk1', 'Chunk2')
    labels = {
        span: f'Chunk{rank}'
        for rank, span in enumerate(sorted(set(spans.values())), start=1)
    }

    return {sid: labels[span] for sid, span in spans.items()}

In [None]:
# Settings shared by the per-condition plots that follow.
eval_dir = 'eval/PairwiseSparseDTW_0.8'
maxTol = 2000 # in milliseconds
SCENARIOS_SUMMARY = 'scenarios/scenarios.summary' # read as a global by the mapBy* functions

In [None]:
# Error curves for eval_dir, one curve per TSM factor.
errRates_list, tols, numPts = plotErrorVsTolerance_separated(eval_dir, mapByTSMFactor(), maxTol)

In [None]:
# Error curves for eval_dir, one curve per full mix recording.
errRates_list, tols, numPts = plotErrorVsTolerance_separated(eval_dir, mapByFullMix(), maxTol)

In [None]:
# Error curves for eval_dir restricted to the movement 'beeth1_mov1', one curve per chunk.
errRates_list, tols, numPts = plotErrorVsTolerance_separated(eval_dir, mapByChunk('beeth1_mov1'), maxTol)

In [None]:
# Error curves for eval_dir, one curve per composer.
errRates_list, tols, numPts = plotErrorVsTolerance_separated(eval_dir, mapByComposer(), maxTol)

In [None]:
# Print the error rate of the 'bach' category at selected tolerances (in ms).  This expects
# errRates_list to be the per-composer dictionary produced by the preceding cell.  The values
# are printed as fractions rather than percentages, since they are not multiplied by 100 here.
for i in [100,200,500,1000,2000]:
    print(errRates_list['bach'][i])

## Separate results by P-PO and O-PO

The accuracy of the P - O alignment depends on two separate alignments: the P - PO and O - PO alignments.  The code below characterizes the alignment accuracy of each of these two alignments separately.

In [None]:
def calcAlignErrors_batch_P_PO(exp_dir, scenarios_dir, annot_dir, out_dir):
    '''
    Calculates the P-PO alignment errors for all scenarios in an experiment directory for which PO beat annotations exist.
    This function is a modification of calcAlignErrors_batch(). 
    
    Inputs
    exp_dir: the experiment directory to evaluate; the hypothesis alignment of each scenario is
      read from {exp_dir}/{scenario_id}/p_po_align.npy
    scenarios_dir: the directory containing the scenarios information
    annot_dir: the directory containing beat annotation files for the PO recordings
    out_dir: the directory to save the results to (created if needed).  The results are written
      to {out_dir}/errs.pkl as a dictionary mapping scenario_id to (errors, measureNums).
    '''
    # only evaluate scenarios whose PO file has beat annotations
    d = {}
    for scenario_id in eval_tools.getScenarioIds(scenarios_dir):
        hypFile = f'{exp_dir}/{scenario_id}/p_po_align.npy'
        pianoAnnot = f'{scenarios_dir}/{scenario_id}/p.beats'
        scenarioInfo = f'{scenarios_dir}/{scenario_id}/scenario.info'
        mixAudiofile = system_utils.get_scenario_info(scenarioInfo)['po'] # e.g. basedir/audio/rach2_mov1_PO1.wav
        mixAnnot = f'{annot_dir}/{Path(mixAudiofile).stem}.beats'
        if not os.path.exists(mixAnnot):
            continue # skip if no PO annotation file
        # NOTE(review): frames=True presumably marks the hypothesis alignment as being in frames
        # rather than seconds -- confirm against eval_tools.calcAlignErrors_single
        errs, measNums = eval_tools.calcAlignErrors_single(hypFile, pianoAnnot, mixAnnot, scenarioInfo, frames=True)
        d[scenario_id] = (errs, measNums) # key: scenario_id, value: (errors, measureNums)
        
    # save (the with-block guarantees the file is flushed and closed)
    os.makedirs(out_dir, exist_ok=True)
    outfile = f'{out_dir}/errs.pkl'
    with open(outfile, 'wb') as f:
        pickle.dump(d, f)

In [None]:
def calcAlignErrors_batch_O_PO(exp_dir, scenarios_dir, annot_dir, out_dir, in_cache = True):
    '''
    Calculates the O-PO alignment errors for all scenarios in an experiment directory for which PO beat annotations exist.
    This function is a modification of calcAlignErrors_batch(). 
    
    Inputs
    exp_dir: the experiment directory to evaluate
    scenarios_dir: the directory containing the scenarios information
    annot_dir: the directory containing beat annotation files for the PO recordings
    out_dir: the directory to save the results to (created if needed).  The results are written
      to {out_dir}/errs.pkl as a dictionary mapping scenario_id to (errors, measureNums).
    in_cache: if True, expects the alignment file to be in the cache folder, i.e. at
      {exp_dir}/cache/{getCacheDir(basename)}/o_po_align.npy; otherwise it is read from
      {exp_dir}/{scenario_id}/o_po_align.npy
    '''
    # only evaluate scenarios whose PO file has beat annotations
    d = {}
    for scenario_id in eval_tools.getScenarioIds(scenarios_dir):
        orchAnnot = f'{scenarios_dir}/{scenario_id}/o.beats'
        scenarioInfo = f'{scenarios_dir}/{scenario_id}/scenario.info'
        mixAudiofile = system_utils.get_scenario_info(scenarioInfo)['po'] # e.g. basedir/audio/rach2_mov1_PO1.wav
        basename = Path(mixAudiofile).stem # rach2_mov1_PO1
        mixAnnot = f'{annot_dir}/{basename}.beats'
        if not os.path.exists(mixAnnot):
            continue # skip if no PO annotation file
        if in_cache:
            hypFile = f'{exp_dir}/cache/{getCacheDir(basename)}/o_po_align.npy'
        else:
            hypFile = f'{exp_dir}/{scenario_id}/o_po_align.npy'
        # NOTE(review): frames=True presumably marks the hypothesis alignment as being in frames
        # rather than seconds -- confirm against eval_tools.calcAlignErrors_single
        errs, measNums = eval_tools.calcAlignErrors_single(hypFile, orchAnnot, mixAnnot, scenarioInfo, frames=True)
        d[scenario_id] = (errs, measNums) # key: scenario_id, value: (errors, measureNums)
        
    # save (the with-block guarantees the file is flushed and closed)
    os.makedirs(out_dir, exist_ok=True)
    outfile = f'{out_dir}/errs.pkl'
    with open(outfile, 'wb') as f:
        pickle.dump(d, f)

def getCacheDir(basename):
    '''
    Derives the cache directory name for a mix audio file by inserting 'O1' just before the
    last underscore-separated field of its basename.
    '''
    *leading, last = basename.split('_') # e.g. ['rach2', 'mov1'] and 'PO1'
    return '_'.join([*leading, 'O1', last]) # e.g. 'rach2_mov1_O1_PO1'

In [None]:
# Inputs for the P-PO evaluation.
scenarios_dir = 'scenarios'
annot_dir = 'annot'
exp_dir = 'experiments/PairwiseSparseDTW_0.8' # change

# Results go to <eval_root>/<experiment name>/errs.pkl.
eval_root = 'backup/eval_p_po'
eval_dir = f'{eval_root}/{os.path.basename(exp_dir)}'
calcAlignErrors_batch_P_PO(exp_dir, scenarios_dir, annot_dir, eval_dir)

In [None]:
# Plot the P-PO error rates of the selected systems.  This expects eval_root to point at the
# P-PO results and uses whatever maxTol is currently defined in the notebook.
systems_to_compare = ['PairwiseSparseDTW_0.8']
#systems_to_compare = ['NaivePairwiseDTW', 'SeparatedDTW_SPL-PT', 'SeparatedDTW_SPL-TTA','SeparatedDTW_HDemucs', 'ISA_Chroma', 'PairwiseSparseDTW_0.8', 'TimeSparse']
eval_dirs = [f'{eval_root}/{s}' for s in systems_to_compare]
errRates_list, tols = plotErrorVsTolerance(eval_dirs, maxTol)
plt.title('P-PO Alignment')

In [None]:
#['NaivePairwiseDTW', 'SeparatedDTW_SPL-PT', 'SeparatedDTW_SPL-TTA','SeparatedDTW_HDemucs', 'ISA_Chroma', 'ISA_CQT', 'ISA_BCQT', 'PairwiseSparseDTW_0.8']
# Inputs for the O-PO evaluation.
scenarios_dir = 'scenarios'
annot_dir = 'annot'
exp_dir = 'experiments/PairwiseSparseDTW_0.8' # change

# Results go to <eval_root>/<experiment name>/errs.pkl; the alignments are read from the cache folder.
eval_root = 'backup/eval_o_po'
eval_dir = f'{eval_root}/{os.path.basename(exp_dir)}'
calcAlignErrors_batch_O_PO(exp_dir, scenarios_dir, annot_dir, eval_dir, in_cache=True)

In [None]:
# Plot the O-PO error rates of the selected systems.  This expects eval_root to point at the
# O-PO results and uses whatever maxTol is currently defined in the notebook.
systems_to_compare = ['PairwiseSparseDTW_0.8']
#systems_to_compare = ['NaivePairwiseDTW', 'SeparatedDTW_SPL-PT', 'SeparatedDTW_SPL-TTA','SeparatedDTW_HDemucs','PairwiseSparseDTW_0.8', 'TimeSparse']
eval_dirs = [f'{eval_root}/{s}' for s in systems_to_compare]
errRates_list, tols = plotErrorVsTolerance(eval_dirs, maxTol)
plt.title('O-PO Alignment')

In [None]:
# exp_dir = 'experiments/ISA_Chroma' # change
# scenarios_dir = 'scenarios'
# annot_dir = 'annot'
# eval_dir = 'eval_o_po/' + os.path.basename(exp_dir)
# scenario_id = 's21'
# in_cache = False

# orchAnnot = f'{scenarios_dir}/{scenario_id}/o.beats'
# scenarioInfo = f'{scenarios_dir}/{scenario_id}/scenario.info'
# mixAudiofile = system_utils.get_scenario_info(scenarioInfo)['po'] # e.g. basedir/audio/rach2_mov1_PO1.wav
# basename = Path(mixAudiofile).stem # rach2_mov1_PO1
# mixAnnot = f'{annot_dir}/{basename}.beats'

# if in_cache:
#     hypFile = f'{exp_dir}/cache/{getCacheDir(basename)}/o_po_align.npy'        
# else:
#     hypFile = f'{exp_dir}/{scenario_id}/o_po_align.npy'

# errs, measNums = eval_tools.calcAlignErrors_single(hypFile, orchAnnot, mixAnnot, scenarioInfo, frames=True)
# gt, measNums = eval_tools.getGroundTruthTimestamps(orchAnnot, mixAnnot, scenarioInfo) # ground truth
# hypalign = np.load(hypFile) # piano-orchestra predicted alignment in sec

# if True:
#     hypalign = hypalign / (22050/512)
# pred = np.interp(gt[:,0], hypalign[0,:], hypalign[1,:])
# err = pred - gt[:,1]