# Data Prep

The goal of this notebook is to prepare the data for the alignment task. This includes building the train/test pair file lists, computing audio features, and generating a query list.

In [15]:
import numpy as np
import librosa as lb
import os
import os.path
from pathlib import Path
import multiprocessing

In [16]:
# Input locations (relative paths, resolved against the current working directory).
# ANNOTATIONS_ROOT is defined here but not referenced by the cells in this notebook section.
ANNOTATIONS_ROOT = Path('Chopin_Mazurkas/annotations_beat')
# Root of the .wav files that compute_chroma_batch reads
# (presumably 22050 Hz mono, judging by the directory name -- confirm).
AUDIO_ROOT = Path('Chopin_Mazurkas/wav_22050_mono')
# Root directory under which computed feature files are written.
FEATURES_ROOT = Path('features')
# File lists for the train and test splits; generate_query_list expects
# one `piece/fileid` entry per line.
train_files = Path('cfg_files/filelist.train.txt')
test_files = Path('cfg_files/filelist.test.txt')

In [17]:
# Create the feature output directory. `exist_ok=True` makes this cell safe to
# re-run (and free of the check-then-create race), and `parents=True` keeps it
# working if FEATURES_ROOT is ever changed to a nested path.
FEATURES_ROOT.mkdir(parents=True, exist_ok=True)

### Create Train-toy, Train-small, Train-medium, Train-full, Test-full filelists

In [6]:
import random
from itertools import combinations

In [49]:
def create_internal_datasets(num_pairs, filelist, piece, outdir):
    '''Append a random sample of recording pairs for one piece to a pair list.

    Parameters
    ----------
    num_pairs : int
        Number of distinct pairs to draw (without replacement).
    filelist : path-like
        Text file with one recording entry per line.
    piece : str
        Substring selecting the entries to use, ex. 'Chopin_Op017No4'.
    outdir : path-like
        Output *file* (despite the name).  It is opened in append mode, so
        several calls can accumulate pairs in the same list.

    Each output line has the form '<entry1> <entry2>'.

    Raises
    ------
    ValueError
        If num_pairs is negative or exceeds the number of available pairs.
    '''
    # Strip every entry instead of chopping the last character: the final line
    # of a file list may lack a trailing newline, which would otherwise glue
    # two output lines together or truncate a path.
    with open(filelist, 'r') as infile:
        entries = [line.strip() for line in infile]
    total_files = [entry for entry in entries if entry and piece in entry]

    # generate all pairs
    all_pairs = list(combinations(total_files, 2))
    if num_pairs > len(all_pairs):
        raise ValueError(
            f'requested {num_pairs} pairs but only {len(all_pairs)} are '
            f'available for piece {piece!r}'
        )
    # select num_pairs random pair indices (raises ValueError if negative)
    subset = random.sample(range(len(all_pairs)), num_pairs)

    # write to outfile
    with open(outdir, 'a') as outfile:
        for index in subset:
            item1, item2 = all_pairs[index]
            outfile.write(f'{item1} {item2}\n')
    return

In [50]:
def create_allpairs_datasets(filelist, piece, outdir):
    '''Append every pair of recordings of one piece to a pair list.

    Parameters
    ----------
    filelist : path-like
        Text file with one recording entry per line.
    piece : str
        Substring selecting the entries to use, ex. 'Chopin_Op024No2'.
    outdir : path-like
        Output *file* (despite the name).  It is opened in append mode, so
        several calls can accumulate pairs in the same list.

    Pairs are written in file-list order, one '<entry1> <entry2>' per line.
    '''
    # Strip every entry instead of chopping the last character: the final line
    # of a file list may lack a trailing newline, which would otherwise glue
    # two output lines together or truncate a path.
    with open(filelist, 'r') as infile:
        entries = [line.strip() for line in infile]
    total_files = [entry for entry in entries if entry and piece in entry]

    # write all pairs to outfile
    with open(outdir, 'a') as outfile:
        for item1, item2 in combinations(total_files, 2):
            outfile.write(f'{item1} {item2}\n')
    return

In [51]:
TRAIN_TOY_LIST = 'cfg_files/filelist.train_toy.txt'
TRAIN_SMALL_LIST = 'cfg_files/filelist.train_small.txt'
TRAIN_MEDIUM_LIST = 'cfg_files/filelist.train_medium.txt'
TRAIN_FULL_LIST = 'cfg_files/filelist.train_full.txt'

# create_internal_datasets appends to its output file, so remove lists left
# over from a previous run; otherwise re-running this cell duplicates pairs.
for stale_list in (TRAIN_TOY_LIST, TRAIN_SMALL_LIST, TRAIN_MEDIUM_LIST, TRAIN_FULL_LIST):
    if os.path.exists(stale_list):
        os.remove(stale_list)

# Fix the RNG state so the sampled subsets are the same on every run.
random.seed(0)

# Train-toy: randomly select 5 pairs from Op 17 #4
create_internal_datasets(5, train_files, 'Chopin_Op017No4', TRAIN_TOY_LIST)

# Train-small: randomly select 200 pairs from Op 17 #4
create_internal_datasets(200, train_files, 'Chopin_Op017No4', TRAIN_SMALL_LIST)

# Train-medium: includes all (63 choose 2) = 1953 pairs for Op 17 #4
create_internal_datasets(1953, train_files, 'Chopin_Op017No4', TRAIN_MEDIUM_LIST)

# Train-full: includes all (63 choose 2) + (88 choose 2) = 1953 + 3828 pairs
# for Op 17 #4 and Op 63 #3
create_internal_datasets(1953, train_files, 'Chopin_Op017No4', TRAIN_FULL_LIST)
create_internal_datasets(3828, train_files, 'Chopin_Op063No3', TRAIN_FULL_LIST)

In [52]:
# Test-full: includes all pairs from the 3 test Mazurkas
TEST_FULL_LIST = 'cfg_files/filelist.test_full.txt'

# create_allpairs_datasets appends to its output file, so remove a list left
# over from a previous run; otherwise re-running this cell duplicates pairs.
if os.path.exists(TEST_FULL_LIST):
    os.remove(TEST_FULL_LIST)

for test_piece in ('Chopin_Op024No2', 'Chopin_Op030No2', 'Chopin_Op068No3'):
    create_allpairs_datasets(test_files, test_piece, TEST_FULL_LIST)

### Compute features on clean audio

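Here we compute chroma features for every recording listed in the train and test file lists and save each result as a `.npy` file under `features/clean`. Recordings whose feature file already exists are skipped.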

In [4]:
def compute_chroma_single(infile, outfile, sr = 22050, hop_length=512):
    '''Compute CQT chroma features for one audio file and save them with np.save.

    Parameters
    ----------
    infile : path-like
        Audio file to load.
    outfile : path-like
        Destination passed to np.save for the feature array.
    sr : int
        Sampling rate requested from librosa when loading the audio.
    hop_length : int
        Hop size, in samples, between consecutive chroma frames.
    '''
    y, sr = lb.load(infile, sr = sr)
    # The audio must be passed by keyword: librosa >= 0.10 made the feature
    # arguments keyword-only, while older releases accept `y=` as well.
    # norm=2 requests column-wise (per-frame) L2 normalisation.
    F = lb.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length, norm=2)
    np.save(outfile, F)
    return

In [5]:
def compute_chroma_batch(filelist, outdir, n_cores):
    '''Compute chroma features in parallel for every recording in a file list.

    Parameters
    ----------
    filelist : path-like
        Text file with one relative recording path per line.
    outdir : pathlib.Path
        Root directory for the feature files; the relative directory of each
        entry is recreated underneath it.
    n_cores : int
        Number of worker processes.

    For each entry the audio is read from AUDIO_ROOT/<entry>.wav and the
    features are written to outdir/<entry>.npy.  Entries whose feature file
    already exists are skipped, so an interrupted run can be resumed.
    '''
    # prep inputs for parallelization
    inputs = []
    with open(filelist, 'r') as f:
        for line in f:
            relpath = line.strip()
            if not relpath:
                # A blank line would otherwise queue a job for a bogus path.
                continue
            reldir, fileid = os.path.split(relpath)
            featdir = outdir / reldir
            featdir.mkdir(parents=True, exist_ok=True)
            featfile = (featdir / fileid).with_suffix('.npy')
            audiofile = (AUDIO_ROOT / relpath).with_suffix('.wav')
            if featfile.exists():
                print(f"Skipping {featfile}")
            else:
                inputs.append((audiofile, featfile))

    # nothing left to compute: do not spawn worker processes at all
    if not inputs:
        return

    # process files in parallel; the context manager shuts the workers down
    # once the (blocking) starmap call has returned, so none are leaked
    with multiprocessing.Pool(processes = n_cores) as pool:
        pool.starmap(compute_chroma_single, inputs)

    return

In [7]:
# Features of the clean audio live in their own subdirectory of the feature root.
FEATS_CLEAN_DIR = FEATURES_ROOT / 'clean'
N_FEATURE_WORKERS = 24  # worker processes used for feature extraction
for split_filelist in (train_files, test_files):
    compute_chroma_batch(split_filelist, FEATS_CLEAN_DIR, N_FEATURE_WORKERS)

### Generate query list

Here we generate a file containing each pair of files to be aligned.

In [8]:
def generate_query_list(filelist, outfile):
    '''Write every within-piece pair of recordings to a query list.

    Parameters
    ----------
    filelist : path-like
        Text file with one 'piece/fileid' entry per line.  Blank lines are
        ignored.
    outfile : path-like
        Output file (overwritten).  Each line has the form
        'piece/fileid1 piece/fileid2'.

    Pieces appear in order of first occurrence in the file list, and within a
    piece each recording is paired with every recording listed after it.
    '''
    # group files by piece
    recordings_by_piece = {}
    with open(filelist, 'r') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # tolerate blank lines, e.g. a trailing empty line
                continue
            parts = entry.split('/')
            assert len(parts) == 2, f'expected "piece/fileid", got {entry!r}'
            piece, fileid = parts
            recordings_by_piece.setdefault(piece, []).append(fileid)

    # print out all pairings
    with open(outfile, 'w') as fout:
        for piece, fileids in recordings_by_piece.items():
            for i, fileid1 in enumerate(fileids):
                for fileid2 in fileids[i+1:]:
                    fout.write(f'{piece}/{fileid1} {piece}/{fileid2}\n')

    return

In [9]:
# Query lists: one file per split, holding every within-piece recording pair.
train_queries = 'cfg_files/query.train.list'
test_queries = 'cfg_files/query.test.list'
for split_filelist, split_queries in ((train_files, train_queries),
                                      (test_files, test_queries)):
    generate_query_list(split_filelist, split_queries)