In [18]:
import numpy as np
import matplotlib.pyplot as plt
import os.path
from pathlib import Path
import pickle
import multiprocessing
import time
import gc
from tqdm import tqdm

In [19]:
import import_ipynb

In [20]:
import DTW

In [21]:
import NWTW

In [22]:
import FlexDTW

In [23]:
# Experiment selector: DATASET names the query split ('train' or 'test'),
# VERSION names the variant of that split. Both feed the query-list path
# and the output directory.
DATASET, VERSION = 'train', 'toy'

In [24]:
# Query list for the selected split/variant; each line names a pair of recordings.
QUERY_LIST = Path('cfg_files') / f'queries.{DATASET}.{VERSION}'

In [25]:
# Alignment systems to evaluate; each name is a key into `steps` / `weights`.
SYSTEMS = [
    'dtw1', 'dtw2', 'dtw3',
    'subseqdtw1', 'subseqdtw2', 'subseqdtw3',
    'nwtw',
    'flexdtw',
]

# Benchmark conditions; each name selects a pair of feature directories.
BENCHMARKS = [
    'matching',
    'subseq_20', 'subseq_30', 'subseq_40',
    'partialStart', 'partialEnd', 'partialOverlap',
    'pre_5', 'pre_10', 'pre_20',
    'post_5', 'post_10', 'post_20',
    'prepost_5', 'prepost_10', 'prepost_20',
]

In [26]:
features_root = Path('../ttmp/Chopin_Mazurkas_features')
FEAT_DIRS = {}

# Map every benchmark to the two feature directories its recording pair is read from:
#   partialOverlap -> partialStart vs. partialEnd
#   prepost_<sec>  -> pre_<sec>    vs. post_<sec>
#   anything else  -> <benchmark>  vs. original
for benchmark in BENCHMARKS:
    if 'prepost' in benchmark:
        sec = benchmark.rpartition('_')[2]
        subdirs = (f'pre_{sec}', f'post_{sec}')
    elif benchmark == 'partialOverlap':
        subdirs = ('partialStart', 'partialEnd')
    else:
        subdirs = (benchmark, 'original')
    FEAT_DIRS[benchmark] = [features_root / name for name in subdirs]

In [27]:
# Step sets per system: a (3, 2) integer array whose rows are the allowed transitions.
# Every entry gets its own array object.
steps = {
    'dtw1': np.array([[1, 1], [1, 2], [2, 1]]),
    'dtw2': np.array([[1, 1], [1, 2], [2, 1]]),
    'dtw3': np.array([[1, 1], [1, 2], [2, 1]]),
    'subseqdtw1': np.array([[1, 1], [1, 2], [2, 1]]),
    'subseqdtw2': np.array([[1, 1], [1, 2], [2, 1]]),
    'subseqdtw3': np.array([[1, 1], [1, 2], [2, 1]]),
    'nwtw': 0,  # transitions are specified in NWTW algorithm
    'flexdtw': np.array([[1, 1], [1, 2], [2, 1]]),
}

# One weight per row of the matching step set.
weights = {
    'dtw1': np.array([2, 3, 3]),
    'dtw2': np.array([1, 1, 1]),
    'dtw3': np.array([1, 2, 2]),
    'subseqdtw1': np.array([1, 1, 2]),
    'subseqdtw2': np.array([2, 3, 3]),
    'subseqdtw3': np.array([1, 2, 2]),
    'nwtw': 0,  # weights are specified in NWTW algorithm
    'flexdtw': np.array([1.25, 3, 3]),
}

# Extra per-system settings; only flexdtw has any (beta enters its buffer formula).
other_params = {
    'flexdtw': {'beta': 0.1},
}

# Benchmarks

In [28]:
def get_outfile(outdir, benchmark, system, queryid):
    """Return the hypothesis path ``outdir/benchmark/system/queryid.pkl``.

    The containing directory is created (with parents) if it is missing; the
    file itself is not touched. ``outdir`` must be a ``pathlib.Path``.
    Note that ``with_suffix`` replaces any suffix already present in
    ``queryid`` rather than appending to it.
    """
    target_dir = outdir.joinpath(benchmark, system)
    target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir.joinpath(queryid).with_suffix('.pkl')

In [29]:
def align_system(system, F1, F2, outfile):
    """Align two feature matrices with the named system and pickle the warping path.

    Args:
        system: Key into the module-level ``steps`` / ``weights`` /
            ``other_params`` tables. 'flexdtw' and 'nwtw' have dedicated
            branches; every other name is handled by ``DTW.alignDTW``, in
            subsequence mode when the name contains 'subseq'.
        F1, F2: 2-D feature arrays. Axis 1 is used as the sequence length
            (presumably (feature_dim, n_frames) -- confirm against the feature files).
        outfile: Destination of the pickled path. It is also forwarded to
            ``DTW.alignDTW``.

    Returns:
        None. Nothing is written when the aligner yields ``None``.
    """
    subseq = 'subseq' in system

    if system == 'flexdtw':
        L1 = F1.shape[1]
        L2 = F2.shape[1]
        beta = other_params[system]['beta']
        buffer = min(L1, L2) * (1 - (1 - beta) * min(L1, L2) / max(L1, L2))
        C = 1 - FlexDTW.L2norm(F1).T @ FlexDTW.L2norm(F2)  # cos distance metric
        _, wp, _ = FlexDTW.flexdtw(C, steps=steps[system], weights=weights[system], buffer=buffer)
    elif system == 'nwtw':
        downsample = 1
        C = 1 - NWTW.L2norm(F1)[:, 0::downsample].T @ NWTW.L2norm(F2)[:, 0::downsample]  # cos distance metric
        _, wp, _, _ = NWTW.NWTW_faster(C, gamma=0.346)
    else:
        downsample = 1
        # In subsequence mode the shorter sequence goes first; swap when F2 is the shorter one.
        swapped = subseq and (F2.shape[1] < F1.shape[1])
        first, second = (F2, F1) if swapped else (F1, F2)
        C = 1 - DTW.L2norm(first)[:, 0::downsample].T @ DTW.L2norm(second)[:, 0::downsample]  # cos distance metric
        wp = DTW.alignDTW(C, steps=steps[system], weights=weights[system],
                          downsample=downsample, outfile=outfile, subseq=subseq)
        if swapped and wp is not None:
            # Undo the swap by reversing axis 0 (presumably wp is (2, N), so this
            # restores F1-then-F2 order -- confirm against DTW.alignDTW).
            wp = wp[::-1, :]

    if wp is not None:
        # Context manager so the handle is flushed and closed even if pickling fails.
        with open(outfile, 'wb') as f:
            pickle.dump(wp, f)

In [30]:
def run_all_benchmarks(outdir):
    """Read the query list once, then run every benchmark over all query pairs.

    Each line of ``QUERY_LIST`` must hold exactly two space-separated paths.
    A pair's id is the two basenames joined by '__'. Pairs whose id contains
    'Czerny-Stefanska-1949_pid9086' are left out.

    Args:
        outdir: Root directory (``pathlib.Path``) for the hypothesis files.
    """
    pairs = []
    pair_ids = []
    with open(QUERY_LIST, 'r') as query_file:
        for row in tqdm(query_file):
            fields = row.strip().split(' ')
            assert len(fields) == 2
            pair_id = '__'.join((os.path.basename(fields[0]), os.path.basename(fields[1])))

            if 'Czerny-Stefanska-1949_pid9086' not in pair_id:
                pairs.append(fields)
                pair_ids.append(pair_id)

    # Batched (multi-process) execution; run_benchmark is the one-pair-at-a-time variant.
    for benchmark in BENCHMARKS:
        dirs = FEAT_DIRS[benchmark]
        run_benchmark_batch(benchmark, dirs[0], dirs[1], pairs, outdir, pair_ids, n_cores=4)

In [31]:
def run_benchmark_batch(benchmark, featdir1, featdir2, parts_batch, outdir, queryids, n_cores):
    """Run every system in ``SYSTEMS`` on every query pair of one benchmark, in parallel.

    Alignments whose hypothesis file already exists are skipped, so an
    interrupted run can be resumed.

    Args:
        benchmark: Benchmark name, used as a sub-directory of ``outdir``.
        featdir1, featdir2: Directories holding the ``.npy`` features of the
            first and second recording of each pair.
        parts_batch: Sequence of two-element path pairs, relative to the
            feature directories.
        outdir: Root directory for the hypothesis files.
        queryids: One id per entry of ``parts_batch``; becomes the file stem.
        n_cores: Number of worker processes. Values below 1 are raised to 1;
            ``None`` falls back to ``cpu_count() - 1``.
    """
    assert len(parts_batch) == len(queryids)

    inputs = []
    for parts, queryid in zip(parts_batch, queryids):
        # only compute alignment if this hypothesis file doesn't already exist
        pending = []
        for system in SYSTEMS:
            outfile = get_outfile(outdir, benchmark, system, queryid)
            if not os.path.isfile(outfile):
                pending.append((system, outfile))
        if not pending:
            continue  # nothing to do for this pair, so don't load its features

        F1 = np.load((featdir1 / parts[0]).with_suffix('.npy'))
        F2 = np.load((featdir2 / parts[1]).with_suffix('.npy'))
        inputs.extend((system, F1, F2, outfile) for system, outfile in pending)

    if not inputs:
        return  # everything is cached; no need to spawn workers

    # Honour the caller's n_cores; never ask for fewer than one worker, since
    # Pool rejects that (cpu_count() - 1 is 0 on a single-core host).
    if n_cores is None:
        n_cores = multiprocessing.cpu_count() - 1
    n_procs = max(1, n_cores)

    # process files in parallel; the context manager tears the pool down afterwards
    with multiprocessing.Pool(processes=n_procs) as pool:
        pool.starmap(align_system, inputs)

    return

In [32]:
def run_benchmark(benchmark, featdir1, featdir2, parts, outdir, queryid):
    """Run every system in ``SYSTEMS`` on a single query pair, one after another.

    Alignments whose hypothesis file already exists are skipped.

    Args:
        benchmark: Benchmark name, used as a sub-directory of ``outdir``.
        featdir1, featdir2: Directories holding the ``.npy`` features of the
            first and second recording.
        parts: Two-element pair of paths relative to the feature directories.
        outdir: Root directory for the hypothesis files.
        queryid: Identifier of the pair; becomes the hypothesis file stem.
    """
    # only compute alignment if this hypothesis file doesn't already exist
    pending = []
    for system in SYSTEMS:
        # Uses this function's own queryid argument (the old code read the
        # undefined names queryids and i here).
        outfile = get_outfile(outdir, benchmark, system, queryid)
        if not os.path.isfile(outfile):
            pending.append((system, outfile))
    if not pending:
        return  # everything is cached; skip loading the features

    F1 = np.load((featdir1 / parts[0]).with_suffix('.npy'))
    F2 = np.load((featdir2 / parts[1]).with_suffix('.npy'))

    # run all baselines
    for system, outfile in pending:
        align_system(system, F1, F2, outfile)

In [33]:
# Hypothesis files are written under experiments_<DATASET>/<VERSION>/<benchmark>/<system>/.
# Alignments whose output file already exists there are not recomputed.
outdir = Path(f'experiments_{DATASET}/{VERSION}')
run_all_benchmarks(outdir)

5it [00:00, 8800.47it/s]
