# Pairwise Sparse DTW

This notebook implements a pairwise sparse DTW system.  The only requirement in this notebook is that it implement the `offline_processing()` and `online_processing()` functions, which will be imported and run in `02_RunExperiment.ipynb`.  The rest of the notebook is for experimenting, visualizing, and analyzing the system, so it should be thought of as a sandbox for development.

Here is a summary of the pairwise sparse DTW approach:
- Offline processing: A subset of features from the orchestra recording is selected based on chroma flux.  This subset of features is then aligned against the full mix recording using a variant of subsequence DTW that appropriately handles the gaps between selected features.
- Online processing: The solo piano and full mix recordings are aligned with standard subsequence DTW using chroma features, and the predicted alignment is then used to infer the corresponding alignment between the piano and orchestra recordings.


## Offline Processing

In the offline processing stage, three things are computed and stored in the `cache/` folder:
- chroma features for the orchestra recording (O features)
- chroma features for the full mix recording (PO features)
- predicted DTW alignment between the orchestra and full mix recordings using the sparse DTW alignment method.  This approach first selects a subset of the O features that have the highest flux, and then aligns these selected features against the PO features.

In [None]:
import numpy as np
import librosa as lb
import os
import os.path
import import_ipynb
import align_tools
import system_utils
from hmc_mir.align import dtw
from numba import jit, njit, prange
from matplotlib import pyplot as plt
import time

In [None]:
def offline_processing(scenario_dir, cache_dir, hop_length, frac_keep):
    '''
    Carries out offline processing for the pairwise sparse DTW system.
    
    Inputs:
        scenario_dir: The scenario directory to process
        cache_dir: The location of the cache directory
        hop_length: The hop length in samples used when computing chroma features
        frac_keep: the fraction of orchestra features to use during alignment, scalar between 0 and 1
    
    This function will store the following files in the cache folder:
        o_chroma.npy: chroma features of the orchestra recording (o.wav)
        po_chroma.npy: chroma features of the full mix recording (po.wav)
        o_po_align.npy: 2 x L warping path, row 0 holds orchestra frame indices and row 1 holds full mix frame indices
        runtime_o_po.npy: elapsed time in seconds for feature selection and sparse alignment
    
    Processing is skipped only when all of these files already exist.  A cache directory that was
    left incomplete by an interrupted run is therefore recomputed instead of being treated as done.
    '''
    
    # setup
    system_utils.verify_scenario_dir(scenario_dir)
    cache_files = ('o_chroma.npy', 'po_chroma.npy', 'o_po_align.npy', 'runtime_o_po.npy')
    if all(os.path.exists(f'{cache_dir}/{fname}') for fname in cache_files):
        # this cache directory has already been fully processed
        return
    os.makedirs(cache_dir, exist_ok=True)

    # compute orchestra features
    o_file = f'{scenario_dir}/o.wav'
    y_o, sr = lb.core.load(o_file)
    F_o = lb.feature.chroma_cqt(y=y_o, sr=sr, hop_length=hop_length, norm=None)

    # compute full mix features
    po_file = f'{scenario_dir}/po.wav'
    y_po, sr = lb.core.load(po_file)
    F_po = lb.feature.chroma_cqt(y=y_po, sr=sr, hop_length=hop_length, norm=None)

    # select subset of O features (only frames between the orchestra start and end times are considered)
    t_start = time.time()
    orch_start_sec, orch_end_sec = system_utils.get_orchestra_start_end_times(scenario_dir)
    orch_start_frm = int(np.round(orch_start_sec * sr / hop_length))
    orch_end_frm = int(np.round(orch_end_sec * sr / hop_length)) + 1  # +1 so the slice includes the end frame
    o_feats_sel, idx_sel, gaplens, flux_thresh = selectFeatures(F_o[:,orch_start_frm:orch_end_frm], frac_keep)

    # compute sparse DTW alignment (orchestra as query), using cosine distance as the pairwise cost
    C = 1 - lb.util.normalize(o_feats_sel, norm=2, axis=0).T @ lb.util.normalize(F_po, norm=2, axis=0)
    D, B, wp = dtw_sparse_subseq(C, gaplens)
    wp[0,:] = idx_sel[wp[0,:]] # convert indices into the selected features back to O frames
    wp[0,:] = wp[0,:] + orch_start_frm # account for the offset of the slice within the full recording
    t_end = time.time()

    # save to cache
    np.save(f'{cache_dir}/o_chroma.npy', F_o)
    np.save(f'{cache_dir}/po_chroma.npy', F_po)
    np.save(f'{cache_dir}/o_po_align.npy', wp)
    np.save(f'{cache_dir}/runtime_o_po.npy', t_end - t_start)
    
    return

In [None]:
def selectFeatures(F, frac_keep):
    '''
    Selects a subset of features that have the highest flux.

    The flux of feature i is the L1 distance between feature i and feature i+1, so there are N-1 flux
    values and the last feature can never be selected.  The threshold is the k-th largest flux value,
    where k = round(frac_keep * (N-1)) clamped to the range [1, N-1].  Only features whose flux is
    strictly greater than the threshold are kept.

    Inputs:
        F: feature matrix of size DxN, where D is the feature dimension and N is the number of features
        frac_keep: the fraction of features to keep, a scalar between 0 and 1

    Returns:
        F_sel: a DxM feature matrix containing the selected subset of features
        idx_sel: an array of length M specifying the indices of the features that were selected
        gaplens: an array of length M specifying the gap lengths between selected features.  Entry i is
                 idx_sel[i+1] - idx_sel[i]; the last entry is the distance from the last selected
                 feature to index N-1
        flux_thresh: the threshold used to select features based on their flux

    Raises:
        ValueError: if F has fewer than two features, or if no feature has a flux above the threshold
                    (e.g. frac_keep is too small or all flux values are identical)
    '''
    flux_vals = np.sum(np.abs(F[:,0:-1] - F[:,1:]), axis=0)
    num_flux = len(flux_vals)
    if num_flux == 0:
        raise ValueError('selectFeatures needs at least two features to compute flux')

    # Clamp the rank so a tiny frac_keep cannot produce index -1, which would silently use the
    # smallest flux as the threshold and keep almost every feature.
    rank = int(np.round(frac_keep * num_flux))
    rank = min(max(rank, 1), num_flux)
    flux_thresh = sorted(flux_vals, reverse=True)[rank - 1]

    idx_sel = np.flatnonzero(flux_vals > flux_thresh)
    if idx_sel.size == 0:
        raise ValueError(f'No features selected: no flux value exceeds the threshold {flux_thresh} '
                         f'(frac_keep={frac_keep}, {num_flux} flux values)')

    gaplens = idx_sel[1:] - idx_sel[0:-1]
    gaplens = np.append(gaplens, num_flux - idx_sel[-1])
    F_sel = F[:,idx_sel]
    
    return F_sel, idx_sel, gaplens, flux_thresh

In [None]:
@jit(nopython=True)
def dtw_sparse_subseq(C, gaplens):
    '''
    A variant of subsequence DTW that aligns a selected subset of query features against a longer reference sequence.
    The query sequence can start and end anywhere in the reference sequence, and the alignment handles gaps between
    selected query features.
    
    Two kinds of transitions are used to reach row `row`:
      - dense matching, when rows row-2, row-1 and row are consecutive query frames (both gaps equal 1):
        the steps (1,1), (1,2), (2,1) with weights 1, 1, 2 are used
      - sparse matching otherwise: the path comes from row-1 and may advance between ceil(gap/2) and 2*gap
        reference frames (inclusive), where gap = gaplens[row-1]; the cost C[row, col] is added unweighted
    
    Inputs:
        C: an MxN matrix of pairwise costs, where M is the length of the (selected) query features and N is the length of
           the reference sequence
        gaplens: an array of length M specifying the gap lengths between selected features

    Returns:
        D: cumulative cost matrix, size MxN.  Unreachable cells hold inf
        B: backtrace matrix of size MxN, each element specifies either the step index (if dense matching)
           or the number of reference frames skipped (if sparse matching).  Unreachable cells hold -1 (dense)
           or 0 (sparse)
        path: a numpy array of (row, col) coordinates for the optimal path, of shape 2 x L with the row indices in
              path[0] and the column indices in path[1]
    '''
    D = np.ones(C.shape) * np.inf
    B = np.zeros(C.shape, dtype=np.int32)
    steps = np.array([1,1,1,2,2,1]).reshape((-1,2))
    weights = np.array([1,1,2])

    # subsequence alignment: the path may start at any reference frame at no extra cost
    D[0, :] = C[0,:]

    for row in range(1, C.shape[0]):
        for col in range(1, C.shape[1]):
            
            if row >= 2 and gaplens[row-2] == 1 and gaplens[row-1] == 1:
                
                # dense matching
                bestCost = D[row, col]
                bestCostIndex = -1
                for stepIndex in range(steps.shape[0]):
                    if row - steps[stepIndex][0] >= 0 and col - steps[stepIndex][1] >= 0:
                        costForStep = C[row, col] * weights[stepIndex] + D[row - steps[stepIndex][0], col - steps[stepIndex][1]]
                        if costForStep < bestCost:
                            bestCost = costForStep
                            bestCostIndex = stepIndex
                D[row, col] = bestCost
                B[row, col] = bestCostIndex
                
            else:
                
                # sparse matching
                cstep_lbound = int(np.ceil(gaplens[row-1]/2))
                cstep_ubound = gaplens[row-1]*2 + 1
                bestCost = D[row, col]
                # Reset for every cell: without this, a cell with no valid predecessor would store a
                # stale value from an earlier cell (or an undefined one) in B.  A real column step is
                # always >= 1, so 0 unambiguously marks "no predecessor".
                bestCostIndex = 0
                for cstep in range(cstep_lbound, cstep_ubound):
                    rprev = row - 1
                    cprev = col - cstep
                    if cprev >= 0:
                        costForStep = C[row, col] + D[rprev, cprev]
                        if costForStep < bestCost:
                            bestCost = costForStep
                            bestCostIndex = cstep
                D[row, col] = bestCost
                B[row, col] = bestCostIndex
                
    # the backtrace returns the path from end to start, so reverse it before converting to an array
    path = dtw_backtrace_sparse(D, B, gaplens, steps, subseq=True)
    path.reverse()
    path = np.array(path).T

    return D, B, path

In [None]:
@jit(nopython=True)
def dtw_backtrace_sparse(D, B, gaplens, steps, subseq):
    '''
    Follows the backtrace matrix B from the end of the alignment back to its start.
    
    Inputs:
        D: cumulative cost matrix
        B: backtrace matrix.  For dense cells it holds an index into steps, for sparse cells it holds
           the number of columns to move back
        gaplens: array specifying the gap lengths between selected features
        steps: a numpy matrix specifying the allowable transitions.  It should be of dimension (L, 2), where each row specifies (row step, col step)
        subseq: boolean indicating whether to assume a subsequence alignment
    
    Returns:
        A list of (row, col) tuples for the optimal path, ordered from the end of the path to its start.
    '''

    # starting cell: always the last row; in subsequence mode the cheapest column of that row,
    # otherwise the last column
    row_end = B.shape[0] - 1
    if subseq:
        col_end = np.argmin(D[-1])
    else:
        col_end = B.shape[1] - 1

    cur = (row_end, col_end)
    path = [cur]

    # subsequence mode: walk until row 0 is reached
    # otherwise: stop as soon as either row 0 or column 0 is reached
    while cur[0] != 0 and (cur[1] != 0 or subseq):

        r, c = cur
        back = B[r, c]
        if r >= 2 and gaplens[r-1] == 1 and gaplens[r-2] == 1:
            # dense matching: B stores the index of the transition that was taken
            dr = steps[back, 0]
            dc = steps[back, 1]
        else:
            # sparse matching: always one row back, B stores the column step
            dr = 1
            dc = back

        cur = (r - dr, c - dc)
        path.append(cur)

    return path

In [None]:
def verify_cache_dir(indir):
    '''
    Verifies that the specified cache directory has the required files.
    
    Inputs
    indir: The cache directory to verify
    
    Raises
    AssertionError: if o_chroma.npy, po_chroma.npy or o_po_align.npy is missing from indir
    '''
    for fname in ('o_chroma.npy', 'po_chroma.npy', 'o_po_align.npy'):
        if not os.path.exists(f'{indir}/{fname}'):
            # raised explicitly instead of using `assert` so the check still runs under `python -O`
            raise AssertionError(f'{fname} missing from {indir}')

## Online Processing

In the online processing stage, we do two things:
- estimate the P-PO alignment using standard subsequence DTW with chroma features, and
- infer the P-O alignment based on the estimated P-PO and O-PO alignments

Note that this baseline system is not a valid online system since it uses offline DTW.

In [None]:
def online_processing(scenario_dir, out_dir, cache_dir, hop_length, steps, weights):
    '''
    Runs the "online" stage of the pairwise sparse DTW system: aligns the piano recording to the
    full mix and combines that with the cached orchestra/full mix alignment.
    
    Inputs:
        scenario_dir: The scenario directory to process
        out_dir: The directory to put results, intermediate files, and logging info.  It must not exist yet
        cache_dir: The cache directory produced by offline processing
        hop_length: The hop length in samples used when computing chroma features
        steps: an L x 2 array specifying the allowable DTW transitions
        weights: a length L array specifying the DTW transition weights

    The predicted piano-orchestra alignment is written to hyp.npy in the output directory (frame indices
    multiplied by the hop duration in seconds), and the P-PO alignment runtime is written to runtime_p_po.npy.
    '''
    
    # all checks run before the output directory is created
    system_utils.verify_scenario_dir(scenario_dir)
    verify_cache_dir(cache_dir)
    assert not os.path.exists(out_dir), f'Output directory {out_dir} already exists.'
    os.makedirs(out_dir)
    
    # piano chroma is computed here, full mix chroma comes from the cache
    piano_audio, sr = lb.core.load(f'{scenario_dir}/p.wav')
    F_p = lb.feature.chroma_cqt(y=piano_audio, sr=sr, hop_length=hop_length, norm=2)
    F_po = np.load(f'{cache_dir}/po_chroma.npy')
        
    # cached O-PO path with its two rows swapped, then (0,0) prepended to handle edge cases properly
    sec_per_frame = hop_length / sr
    o_po_path = np.load(f'{cache_dir}/o_po_align.npy')
    origin = np.array([0,0]).reshape((2,-1))
    wp_BC = np.hstack((origin, o_po_path[::-1]))
   
    # P-PO alignment; only the cost computation and the DTW itself are timed
    tic = time.time()
    cost = align_tools.cosine_dist(F_p, F_po)
    wp_AB = dtw.dtw(cost, steps, weights, True)[2]
    toc = time.time()

    # chain P-PO with PO-O to obtain P-O, and store it in seconds
    wp_AC = align_tools.infer_alignment(wp_AB, wp_BC, frames=True)
    np.save(f'{out_dir}/hyp.npy', wp_AC*sec_per_frame)
    np.save(f'{out_dir}/runtime_p_po.npy', toc - tic)
    
    return

In [None]:
def verify_hyp_dir(indir):
    '''
    Verifies that the specified scenario hypothesis directory has the required files.
    
    Inputs
    indir: The hypothesis directory to verify
    
    Raises
    AssertionError: if hyp.npy is missing from indir
    '''
    if not os.path.exists(f'{indir}/hyp.npy'):
        # raised explicitly instead of using `assert` so the check still runs under `python -O`
        raise AssertionError(f'hyp.npy missing from {indir}')

## Example


Here is an example of how to call the offline and online processing functions on a scenario directory.

In [None]:
# scenario_dir = 'scenarios/s2'
# out_dir = 'experiments/test/s2'
# cache_dir = 'experiments/test/cache'
# hop_size = 512
# steps = np.array([1,1,1,2,2,1]).reshape((-1,2))
# weights = np.array([2,3,3])
# frac_keep = 0.2  # example value: fraction of orchestra features kept for the sparse alignment
# offline_processing(scenario_dir, cache_dir, hop_size, frac_keep)
# online_processing(scenario_dir, out_dir, cache_dir, hop_size, steps, weights)