# Frame Classification + Dense-Sparse DTW

This notebook implements an approach that first classifies individual frames as piano or orchestra based on a GMM, and then estimates the P-PO and O-PO alignments using dense-sparse DTW.

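Here is a summary of the frame classification + dense-sparse DTW approach:
- Offline processing:
    - Chroma and MFCC features are computed for the orchestra and full mix recordings.
    - A GMM is fit to the MFCCs of the orchestra recording.
- Online processing: Chroma and MFCC features are computed for the solo piano recording and a GMM is fit to the piano MFCCs. The portion of the full mix that matches the piano recording is located with subsequence DTW, and each frame in that portion is classified as piano or orchestra by comparing the two GMM scores. The orchestra frames are aligned to the orchestra recording and the piano frames are aligned to the piano recording using dense-sparse DTW, and the two predicted alignments are then used to infer the corresponding alignment between the piano and orchestra recordings.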

## Offline Processing

In the offline processing stage, the following are computed and stored in the `cache/` folder:
- chroma features for the orchestra recording (`o_chroma.npy`)
- MFCC features for the orchestra recording (`o_mfcc.npy`)
- chroma and MFCC features for the full mix recording (`po_chroma.npy`, `po_mfcc.npy`)
- a GMM fit to the orchestra MFCCs (`gmm_O.pkl`)

NOTE: because we do not have the code to do the source separation, the precomputed source separated files should be placed in a folder somewhere in the root `audio/` folder.  This must be done BEFORE the following code can be run. This is an example of what files should be present in the directory *before* running the offline processing step (shown for the HDemucs source separation model):


```
separation
├── HDemucs
    ├── bach5_mov1_PO1_O.wav
    ├── bach5_mov1_PO1_P.wav
    ├── bach5_mov1_PO2_O.wav
    ├── bach5_mov1_PO2_P.wav
    ├── beeth1_mov1_PO1_O.wav
    ├── beeth1_mov1_PO1_P.wav
    ├── beeth1_mov1_PO2_O.wav
    ├── beeth1_mov1_PO2_P.wav
    ├── mozart21_mov1_PO1_O.wav
    ├── mozart21_mov1_PO1_P.wav
    ├── mozart21_mov1_PO2_O.wav
    ├── mozart21_mov1_PO2_P.wav
    ├── rach2_mov1_PO1_O.wav
    ├── rach2_mov1_PO1_P.wav
    ├── rach2_mov1_PO2_O.wav
    └── rach2_mov1_PO2_P.wav
```

In [None]:
import numpy as np
import pandas as pd
import import_ipynb
import librosa as lb
from sklearn import mixture
import system_utils
import align_tools
import os
import os.path
import subprocess
import pickle
from pathlib import Path
from hmc_mir.align import dtw
import time
from numba import jit, njit, prange

In [None]:
def offline_processing(scenario_dir, cache_dir, hop_length, steps, weights):
    '''Computes and caches the orchestra / full mix features and the orchestra GMM.
    
    Args
        scenario_dir: The scenario directory to process (must contain o.wav and po.wav)
        cache_dir: The location of the cache directory
        hop_length: The hop length in samples used when computing chroma and MFCC features
        steps: an L x 2 array specifying the allowable DTW transitions (not used by this function)
        weights: a length L array specifying the DTW transition weights (not used by this function)
    
    This function stores the following files in the cache folder:
        o_chroma.npy, o_mfcc.npy: chroma and MFCC features of the orchestra recording
        po_chroma.npy, po_mfcc.npy: chroma and MFCC features of the full mix recording
        gmm_O.pkl: a pickled 10-component diagonal-covariance GMM fit to the orchestra MFCCs

    If the cache folder already exists, it is assumed to have been processed and nothing is recomputed.
    '''

    # setup
    system_utils.verify_scenario_dir(scenario_dir)

    if os.path.exists(cache_dir):
        # already processed, nothing to do
        return

    # compute orchestra features
    o_file = f'{scenario_dir}/o.wav'
    y_o, sr = lb.core.load(o_file)
    F_o_chroma = lb.feature.chroma_cqt(y=y_o, sr=sr, hop_length=hop_length, norm=2)
    F_o_mfcc = lb.feature.mfcc(y=y_o, sr=sr, hop_length=hop_length)

    # compute full mix features
    po_file = f'{scenario_dir}/po.wav'
    y_po, sr = lb.core.load(po_file)
    F_po_chroma = lb.feature.chroma_cqt(y=y_po, sr=sr, hop_length=hop_length, norm=2)
    F_po_mfcc = lb.feature.mfcc(y=y_po, sr=sr, hop_length=hop_length)

    # fit GMM model to orchestra MFCCs (rows 1-12 only, row 0 is dropped)
    gmm_O = mixture.GaussianMixture(n_components=10, covariance_type='diag', random_state=0)
    gmm_O.fit(F_o_mfcc[1:13,:].T)

    # Create the cache folder only after everything has been computed.  Creating it up front
    # would leave an empty folder behind if loading or feature extraction failed, and the
    # next call would then skip the scenario as if it had been processed.
    os.makedirs(cache_dir)

    # save to cache
    np.save(f'{cache_dir}/o_chroma.npy', F_o_chroma)
    np.save(f'{cache_dir}/o_mfcc.npy', F_o_mfcc)
    np.save(f'{cache_dir}/po_chroma.npy', F_po_chroma)
    np.save(f'{cache_dir}/po_mfcc.npy', F_po_mfcc)
    with open(f'{cache_dir}/gmm_O.pkl','wb') as f:
        pickle.dump(gmm_O,f)

    return

In [None]:
@jit(nopython=True)
def dtw_sparse_subseq(C, gaplens):
    '''
    A variant of subsequence DTW that aligns a selected subset of query features against a longer reference sequence.
    The query sequence can start and end anywhere in the reference sequence, and the alignment handles gaps between
    selected query features.

    Each cell (row, col) with row >= 1 and col >= 1 is filled using one of two rules:
      - dense matching, when row >= 2 and both gaplens[row-2] and gaplens[row-1] equal 1: the usual DTW
        recursion with (row step, col step) transitions (1,1), (1,2), (2,1) and weights 1, 1, 2.
      - sparse matching, otherwise: the predecessor is the cheapest cell in the previous row whose column lies
        between col - 2*gaplens[row-1] and col - ceil(gaplens[row-1]/2) (inclusive); the local cost is added
        unweighted.
    
    Inputs:
        C: an MxN matrix of pairwise costs, where M is the length of the (selected) query features and N is the length of
           the reference sequence
        gaplens: an array of length M specifying the gap lengths between selected features
           (as built by the callers in this file, gaplens[i] is the distance from selected feature i to
           selected feature i+1, so gaplens[row-1] is the gap leading into `row`)

    Returns:
        D: cumulative cost matrix, size MxN (cells that cannot be reached stay at inf)
        B: backtrace matrix of size MxN, each element specifies either the step index (if dense matching)
           or the number of reference frames skipped (if sparse matching)
        path: a 2xK numpy array for the optimal path, ordered from start to end; path[0] holds the row
           (query) indices and path[1] holds the column (reference) indices
    '''
    D = np.ones(C.shape) * np.inf
    B = np.zeros(C.shape, dtype=np.int32)
    steps = np.array([1,1,1,2,2,1]).reshape((-1,2))
    weights = np.array([1,1,2])

    # subsequence alignment: the path may start at any reference frame at no extra cost
    D[0, :] = C[0,:]

    for row in range(1, C.shape[0]):
        # col starts at 1, so column 0 of every row >= 1 keeps its initial value of inf
        for col in range(1, C.shape[1]):
            
            if row >= 2 and gaplens[row-2] == 1 and gaplens[row-1] == 1:
                
                # dense matching
                # D[row, col] is still inf here, so bestCostIndex stays -1 only if no
                # in-bounds predecessor has a finite cumulative cost
                bestCost = D[row, col]
                bestCostIndex = -1
                for stepIndex in range(steps.shape[0]):
                    if row - steps[stepIndex][0] >= 0 and col - steps[stepIndex][1] >= 0:
                        costForStep = C[row, col] * weights[stepIndex] + D[row - steps[stepIndex][0], col - steps[stepIndex][1]]
                        if costForStep < bestCost:
                            bestCost = costForStep
                            bestCostIndex = stepIndex
                D[row, col] = bestCost
                B[row, col] = bestCostIndex
                
            else:
                
                # sparse matching
                # cstep_lbound = int(np.ceil(gaplens[row-1]/2))
                # cstep_ubound = gaplens[row-1]*2 + 1
                # bestCost = D[row, col]
                # for cstep in range(cstep_lbound, cstep_ubound):
                #     rprev = row - 1
                #     cprev = col - cstep
                #     if cprev >= 0:
                #         costForStep = C[row, col] + D[rprev, cprev]
                #         if costForStep < bestCost:
                #             bestCost = costForStep
                #             bestCostIndex = cstep
                # D[row, col] = bestCost
                # B[row, col] = bestCostIndex

                # candidate predecessor columns in the previous row: [crange_lbound, crange_ubound),
                # i.e. a column step between ceil(gap/2) and 2*gap, clipped at column 0
                crange_lbound = max(col - gaplens[row-1]*2, 0)
                crange_ubound = col - int(np.ceil(gaplens[row-1]/2)) + 1
                #if crange_lbound >= crange_ubound:
                #    print(f'crange_lb = {crange_lbound}, crange_ubound = {crange_ubound}, row = {row}, col = {col}')
                if crange_ubound > crange_lbound:
                    D[row, col] = np.min(D[row-1, crange_lbound:crange_ubound]) + C[row,col]
                    # store the column step back to the chosen predecessor (not a step index)
                    B[row, col] = col - (crange_lbound + np.argmin(D[row-1, crange_lbound:crange_ubound]))
    
    # the backtrace returns the path from end to start, so reverse it and
    # transpose the list of (row, col) pairs into a 2xK array
    path = dtw_backtrace_sparse(D, B, gaplens, steps, subseq=True)
    path.reverse()
    path = np.array(path).T

    return D, B, path

In [None]:
@jit(nopython=True)
def dtw_backtrace_sparse(D, B, gaplens, steps, subseq):
    '''
    Backtraces through the cumulative cost matrix D
    
    Inputs:
        D: cumulative cost matrix
        B: backtrace matrix; holds a step index into `steps` for cells filled by dense matching and a
           column step for cells filled by sparse matching
        gaplens: array specifying the gap lengths between selected features
        steps: a numpy matrix specifying the allowable transitions.  It should be of dimension (L, 2), where each row specifies (row step, col step)
        subseq: boolean indicating whether to assume a subsequence alignment
    
    Returns:
        A list of (row, col) tuples for the optimal path, ordered from the END of the path back to its
        start (the caller is expected to reverse it).
    '''

    # start in the last row; for subsequence alignment the path may end at any column,
    # so pick the column with the lowest cumulative cost
    rstart = B.shape[0] - 1
    if subseq:
        cstart = np.argmin(D[-1])
    else:
        cstart = B.shape[1] - 1
    pos = (rstart, cstart)
    path = []
    path.append(pos)
    # subsequence: keep going until row 0 is reached; otherwise stop once row 0 or column 0 is reached
    while (pos[0] != 0 and pos[1] != 0) or (pos[0] and subseq):
        
        (row, col) = pos
        # same dense/sparse test as in the forward pass, so B[row, col] is interpreted
        # the same way it was written
        if row >= 2 and gaplens[row-1] == 1 and gaplens[row-2] == 1:
            
            # dense matching
            stepidx = B[row, col]
            (rstep, cstep) = steps[stepidx]
            pos = (row-rstep, col-cstep)
            path.append(pos)
            
        else:
            
            # sparse matching
            # always moves back exactly one row; B holds the column step directly
            rstep = 1
            cstep = B[row, col]
            pos = (row-rstep, col-cstep)
            path.append(pos)

    return path

In [None]:
def verify_cache_dir(indir):
    '''
    Checks that every file written by the offline stage is present in the cache directory.
    
    Inputs
    indir: The cache directory to verify

    An AssertionError naming the first missing file is raised if the directory is incomplete.
    '''

    # feature files first, then the pickled orchestra GMM
    required_files = ('o_chroma.npy', 'o_mfcc.npy', 'po_chroma.npy', 'po_mfcc.npy', 'gmm_O.pkl')
    for fname in required_files:
        assert os.path.exists(f'{indir}/{fname}'), f'Missing {fname} in {indir}'

## Online Processing

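In the online processing stage, we do four things:
1. locate the portion of the full mix that matches the piano recording using subsequence DTW,
2. classify each frame in that portion as piano or orchestra by comparing the scores of the piano and orchestra GMMs,
3. align the orchestra frames to the orchestra recording and the piano frames to the piano recording using dense-sparse DTW,
4. use the two predicted alignments to infer the alignment between the piano and orchestra recordings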

Note that this implementation is an offline system, but is implemented in a way that can be extended to the online case easily.

In [None]:
def _select_frames_and_gaps(mask):
    '''
    Returns the indices where the boolean array `mask` is True, together with the gap from each
    selected index to the next one.  The last gap is measured from the last selected index to len(mask).
    '''
    idx_sel = np.where(mask)[0]
    if len(idx_sel) == 0:
        raise ValueError('Frame classification selected no frames for one of the two classes.')
    gaplens = np.append(np.diff(idx_sel), len(mask) - idx_sel[-1])
    return idx_sel, gaplens


def online_processing(scenario_dir, out_dir, cache_dir, hop_length, steps, weights):
    '''
    Carries out `online' processing using frame classification and dense-sparse DTW.

    The portion of the full mix (PO) that matches the piano (P) recording is located with DTW, each
    frame in that portion is classified as piano or orchestra by comparing the scores of two GMMs,
    and the two groups of frames are aligned to the piano and orchestra (O) recordings respectively.
    
    Inputs
    scenario_dir: The scenario directory to process (must contain p.wav)
    out_dir: The directory to put results, intermediate files, and logging info.  Must not exist yet.
    cache_dir: The cache directory
    hop_length: The hop length in samples used when computing chroma and MFCC features
    steps: an L x 2 array specifying the allowable transitions for the initial P - PO DTW alignment
    weights: a length L array specifying the transition weights for the initial P - PO DTW alignment

    This function will compute and save the following files in the output directory:
        hyp.npy: the predicted piano - orchestra alignment, in seconds
        o_po_align.npy: the predicted O - PO alignment, in frames
        p_po_align.npy: the re-estimated P - PO alignment, in frames

    Raises
        AssertionError: if the scenario/cache directory is incomplete or out_dir already exists
        ValueError: if no frame is classified as piano, or no frame is classified as orchestra
    '''
    # verify & setup
    system_utils.verify_scenario_dir(scenario_dir)
    verify_cache_dir(cache_dir)
    assert not os.path.exists(out_dir), f'Output directory {out_dir} already exists.'
    os.makedirs(out_dir)
           
    # compute P features
    p_file = f'{scenario_dir}/p.wav'
    y, sr = lb.core.load(p_file)
    F_p_chroma = lb.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length, norm=2)
    F_p_mfcc = lb.feature.mfcc(y=y, sr=sr, hop_length=hop_length)    
    hop_sec = hop_length / sr

    # load O and PO features
    F_o_chroma = np.load(f'{cache_dir}/o_chroma.npy')
    F_po_chroma = np.load(f'{cache_dir}/po_chroma.npy')
    F_po_mfcc = np.load(f'{cache_dir}/po_mfcc.npy')

    # select matching portion of PO
    C = align_tools.cosine_dist(F_p_chroma, F_po_chroma)
    _, _, wp_AB = dtw.dtw(C, steps, weights, True) # P - PO subsequence alignment
    po_match_start_frm, po_match_end_frm = wp_AB[1,0], wp_AB[1,-1] + 1
    F_po_match_mfcc = F_po_mfcc[:,po_match_start_frm:po_match_end_frm]
    F_po_match_chroma = F_po_chroma[:,po_match_start_frm:po_match_end_frm]

    # load GMM model for orchestra MFCCs
    # NOTE: pickle.load must only be used on cache files produced by offline_processing
    with open(f'{cache_dir}/gmm_O.pkl', 'rb') as f:
        gmm_O = pickle.load(f)

    # fit GMM model to piano MFCCs (rows 1-12 only, matching the orchestra model)
    gmm_P = mixture.GaussianMixture(n_components=10, covariance_type='diag', random_state=0).fit(F_p_mfcc[1:13,:].T)

    # classify PO_match frames: a frame is piano if the piano GMM scores it strictly higher
    gmm_O_scores = gmm_O.score_samples(F_po_match_mfcc[1:13,:].T)
    gmm_P_scores = gmm_P.score_samples(F_po_match_mfcc[1:13,:].T)
    idx_sel_P, gaplens_P = _select_frames_and_gaps(gmm_P_scores > gmm_O_scores)
    idx_sel_O, gaplens_O = _select_frames_and_gaps(gmm_P_scores <= gmm_O_scores)

    # compute dense-sparse alignment for PO_match - O
    C = 1 - F_po_match_chroma[:,idx_sel_O].T @ F_o_chroma
    _, _, wp_BC = dtw_sparse_subseq(C, gaplens_O) # PO_match - O alignment
    wp_BC[0,:] = idx_sel_O[wp_BC[0,:]] # convert back to PO frames
    wp_BC[0,:] = wp_BC[0,:] + po_match_start_frm # account for offset, specifies PO - O alignment

    # re-estimate PO - P alignment
    C = 1 - F_po_match_chroma[:,idx_sel_P].T @ F_p_chroma
    _, _, wp_AB_reest = dtw_sparse_subseq(C, gaplens_P)
    wp_AB_reest[0,:] = idx_sel_P[wp_AB_reest[0,:]] # convert back to PO frames
    wp_AB_reest[0,:] = wp_AB_reest[0,:] + po_match_start_frm # account for offset
    wp_AB_reest = np.flipud(wp_AB_reest) # now specifies P - PO alignment

    # infer piano-orchestra alignment
    wp_AC = align_tools.infer_alignment(wp_AB_reest, wp_BC, frames=True)
    np.save(f'{out_dir}/hyp.npy', wp_AC*hop_sec)
    np.save(f'{out_dir}/o_po_align.npy', np.flipud(wp_BC))
    np.save(f'{out_dir}/p_po_align.npy', wp_AB_reest)
    
    return

In [None]:
def selectFeatures(F, frac_keep):
    '''
    Selects a subset of features that have the highest flux.

    The flux of feature i is the sum of absolute differences between columns i and i+1 of F, so there
    are N-1 flux values.  The threshold is the k-th largest flux value, where k = round(frac_keep * (N-1)),
    and a feature is selected only if its flux is strictly greater than the threshold.

    Inputs:
        F: feature matrix of size DxN, where D is the feature dimension and N is the number of features
        frac_keep: the fraction of features to keep, a scalar between 0 and 1

    Returns:
        F_sel: a DxM feature matrix containing the selected subset of features
        idx_sel: an array of length M specifying the indices of the features that were selected
        gaplens: an array of length M specifying the gap lengths between selected features; the last
                 entry is the distance from the last selected index to N-1
        flux_thresh: the threshold used to select features based on their flux (inf if k is 0)

    If no feature is selected, F_sel, idx_sel and gaplens are returned empty.
    '''
    flux_vals = np.sum(np.abs(F[:,0:-1] - F[:,1:]), axis=0)
    num_flux = len(flux_vals)

    # Clamp k to [0, num_flux].  Without the clamp, k = 0 would index position -1 of the sorted
    # values, which wraps around to the smallest flux and keeps almost every feature.
    num_keep = min(max(int(np.round(frac_keep * num_flux)), 0), num_flux)
    if num_keep == 0:
        flux_thresh = np.inf
    else:
        # k-th largest value of an ascending sort
        flux_thresh = np.sort(flux_vals)[num_flux - num_keep]

    idx_sel = np.flatnonzero(flux_vals > flux_thresh)
    if idx_sel.size > 0:
        gaplens = np.append(np.diff(idx_sel), num_flux - idx_sel[-1])
    else:
        # nothing exceeded the threshold (e.g. k = 1, or all flux values tied)
        gaplens = np.zeros(0, dtype=idx_sel.dtype)
    F_sel = F[:,idx_sel]
    
    return F_sel, idx_sel, gaplens, flux_thresh

In [None]:
def verify_hyp_dir(indir):
    '''
    Checks that a scenario hypothesis directory contains the predicted alignment file hyp.npy.
    
    Inputs
    indir: The hypothesis directory to verify

    An AssertionError is raised if hyp.npy is not found.
    '''
    hyp_file = f'{indir}/hyp.npy'
    hyp_present = os.path.exists(hyp_file)
    assert hyp_present, f'{indir} is missing the required files, please re run the online processing'

# Example

Here is an example of how to call the offline and online processing functions on a scenario directory.

In [None]:
# scenario_dir = 'scenarios/s2'
# out_dir = 'experiments/test/s2'
# cache_dir = 'experiments/test/cache'
# hop_size = 512
# steps = np.array([1,1,1,2,2,1]).reshape((-1,2))
# weights = np.array([2,3,3], dtype=np.float64)
# offline_processing(scenario_dir, cache_dir, hop_size, steps, weights)
# online_processing(scenario_dir, out_dir, cache_dir, hop_size, steps, weights, )

In [None]:
# %matplotlib inline
# import matplotlib.pyplot as plt

In [None]:
# scenario_dir = 'scenarios/s2'
# #out_dir = 'experiments/test/s2'
# #cache_dir = 'experiments/test/cache'
# hop_length = 512
# #steps = np.array([1,1,1,2,2,1]).reshape((-1,2))
# #weights = np.array([2,3,3], dtype=np.float64)
# #offline_processing(scenario_dir, cache_dir, hop_size, steps, weights)
# #online_processing(scenario_dir, out_dir, cache_dir, hop_size, steps, weights, )

In [None]:
# # compute orchestra features
# o_file = f'{scenario_dir}/o.wav'
# y_o, sr = lb.core.load(o_file)
# F_o_chroma = lb.feature.chroma_cqt(y=y_o, sr=sr, hop_length=hop_length, norm=2) 
# F_o_mfcc = lb.feature.mfcc(y=y_o, sr=sr, hop_length=hop_length)

# # compute full mix features
# po_file = f'{scenario_dir}/po.wav'
# y_po, sr = lb.core.load(po_file)
# F_po_chroma = lb.feature.chroma_cqt(y=y_po, sr=sr, hop_length=hop_length, norm=2)
# F_po_mfcc = lb.feature.mfcc(y=y_po, sr=sr, hop_length=hop_length)

# # compute piano features
# p_file = f'{scenario_dir}/p.wav'
# y_p, sr = lb.core.load(p_file)
# F_p_chroma = lb.feature.chroma_cqt(y=y_p, sr=sr, hop_length=hop_length, norm=2)
# F_p_mfcc = lb.feature.mfcc(y=y_p, sr=sr, hop_length=hop_length)

In [None]:
# # select matching portion of PO
# steps = np.array([[1,1],[1,2],[2,1]])
# weights = np.array([1,1,2])
# C = align_tools.cosine_dist(F_p_chroma, F_po_chroma)
# _, _, wp_AB = dtw.dtw(C, steps, weights, True)
# po_match_start_frm, po_match_end_frm = wp_AB[1,0], wp_AB[1,-1] + 1
# F_po_match_mfcc = F_po_mfcc[:,po_match_start_frm:po_match_end_frm]
# F_po_match_chroma = F_po_chroma[:,po_match_start_frm:po_match_end_frm]

In [None]:
# # fit GMM model to orchestra MFCCs
# #F_o_mfcc_scaled = sklearn.preprocessing.scale(F_o_mfcc, axis=1)
# #gmm_O = mixture.GaussianMixture(n_components=10, covariance_type='diag', reg_covar = 1e-2, random_state=0).fit(F_o_mfcc[1:,:].T)
# gmm_O = mixture.GaussianMixture(n_components=10, covariance_type='diag', random_state=None).fit(F_o_mfcc[1:13,:].T)
# #gmm_O2 = mixture.GaussianMixture(n_components=3, covariance_type='diag', random_state=None).fit(F_o_mfcc[1:13,:].T)

In [None]:
# # fit GMM model to piano MFCCs
# #F_p_mfcc_scaled = sklearn.preprocessing.scale(F_p_mfcc, axis=1)
# #gmm_P = mixture.GaussianMixture(n_components=10, covariance_type='diag', reg_covar = 1e-2, random_state=0).fit(F_p_mfcc.T)
# gmm_P = mixture.GaussianMixture(n_components=10, covariance_type='diag', random_state=None).fit(F_p_mfcc[1:13,:].T)
# #gmm_P2 = mixture.GaussianMixture(n_components=3, covariance_type='diag', random_state=None).fit(F_p_mfcc[1:13,:].T)

In [None]:
# gmm_O_scores = gmm_O.score_samples(F_po_match_mfcc[1:13,:].T)
# gmm_P_scores = gmm_P.score_samples(F_po_match_mfcc[1:13,:].T)

In [None]:
#gmm_O_scores2 = gmm_O2.score_samples(F_po_mfcc[1:13,:].T)
#gmm_P_scores2 = gmm_P2.score_samples(F_po_mfcc[1:13,:].T)

In [None]:
# idx_sel_P = np.where(gmm_P_scores > gmm_O_scores)[0]
# gaplens_P = idx_sel_P[1:] - idx_sel_P[0:-1]
# gaplens_P = np.append(gaplens_P, len(gmm_P_scores) - idx_sel_P[-1])

In [None]:
# idx_sel_O = np.where(gmm_P_scores <= gmm_O_scores)[0]
# gaplens_O = idx_sel_O[1:] - idx_sel_O[0:-1]
# gaplens_O = np.append(gaplens_O, len(gmm_P_scores) - idx_sel_O[-1])

# #return idx_sel_P, gaplens_P, idx_sel_O, gaplens_O

In [None]:
# # compute dense-sparse alignment for PO_match - O
# C = 1 - F_po_match_chroma[:,idx_sel_O].T @ F_o_chroma
# D, B, wp = dtw_sparse_subseq(C, gaplens_O)
# wp[0,:] = idx_sel_O[wp[0,:]] # convert back to PO frames
# wp[0,:] = wp[0,:] + po_match_start_frm # account for offset

In [None]:
# # compare regular PO-O subseq DTW alignment path
# C2 = 1 - F_po_match_chroma.T @ F_o_chroma
# _, _, wp2 = dtw.dtw(C2, steps, weights, True)
# wp2[0,:] = wp2[0,:] + po_match_start_frm # account for offset

In [None]:
# plt.plot(wp[0,:],wp[1,:], 'r.')
# #plt.plot(wp2[0,:],wp2[1,:], 'b.')

In [None]:
# # compute dense-sparse alignment for PO_match - P
# C3 = 1 - F_po_match_chroma[:,idx_sel_P].T @ F_p_chroma
# D, B, wp3 = dtw_sparse_subseq(C3, gaplens_P)
# wp3[0,:] = idx_sel_P[wp3[0,:]] # convert back to PO frames
# wp3[0,:] = wp3[0,:] + po_match_start_frm # account for offset

In [None]:
# plt.plot(wp3[0,:],wp3[1,:], 'r.') # wp3 specifies PO-P
# plt.plot(wp_AB[1,:],wp_AB[0,:], 'b') # wp_AB specifies P-PO

In [None]:
# def getMaxLength(pred):
#     maxlen = 0
#     maxEndIdx = -1
#     curlen = 0
#     prevp = -1
#     for i, p in enumerate(pred):
#         if prevp == p:
#             curlen += 1
#         else:
#             if curlen > maxlen:
#                 maxlen = curlen
#                 maxEndIdx = i
#             curlen = 1
#             prevp = p
#     if curlen > maxlen:
#         maxlen = curlen
#         maxEndIdx = -1
#     return maxlen, maxEndIdx