# Data Prep

The goal of this notebook is to prep the data for the alignment task.  This includes computing audio features and generating a query list.

In [None]:
import glob 
import librosa as lb
import numpy as np
import os
import os.path
import pandas as pd
from pathlib import Path
import multiprocessing

In [None]:
DATA_ROOT = Path('../../data/')
AUDIO_ROOT = DATA_ROOT / Path('Chopin_Mazurkas/wav_22050_mono')
ANNOTATIONS_ROOT = DATA_ROOT / Path('Chopin_Mazurkas/annotations_beat')
FEATURES_ROOT = Path('features') # directory which will store chroma features
TRAIN_DATASET = "toy"

%store DATA_ROOT
%store FEATURES_ROOT
%store TRAIN_DATASET

train_files = Path('cfg_files/filelist.' + TRAIN_DATASET + '.txt')
test_files = Path('cfg_files/filelist.test.txt')

In [None]:
# Create the feature directory (and any missing parents). exist_ok=True makes
# this a no-op on re-runs and avoids the race in a separate exists() check.
FEATURES_ROOT.mkdir(parents=True, exist_ok=True)

### Compute features on clean audio

First we compute features on the audio.

In [None]:
def compute_chroma_single(infile, outfile, sr = 22050, hop_length=512):
    """Compute chroma features for one audio file and save them to disk.

    Args:
        infile: Path of the audio file to load.
        outfile: Destination passed to ``np.save`` for the feature array.
        sr: Sampling rate requested from librosa when loading the audio.
        hop_length: Hop length forwarded to the chroma computation.

    Returns:
        None. The feature array is written to ``outfile`` as a side effect.
    """
    y, sr = lb.load(infile, sr=sr)
    # Audio is passed by keyword: librosa >= 0.10 makes these arguments
    # keyword-only, and the keyword form also works on older releases.
    # Alternative feature: lb.feature.chroma_cens(y=y, sr=sr, hop_length=hop_length)
    F = lb.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length, norm=2)
    np.save(outfile, F)
    return

In [None]:
def compute_chroma_batch(filelist, outdir, n_cores):
    """Compute chroma features in parallel for every file named in a file list.

    Each non-blank line of ``filelist`` is a path relative to ``AUDIO_ROOT``
    (without the ``.wav`` extension). The matching feature file is written to
    the same relative path under ``outdir`` with a ``.npy`` extension. Files
    whose feature file already exists are skipped.

    Args:
        filelist: Path of a text file with one relative audio path per line.
        outdir: Root directory for the feature files; must support the ``/``
            operator and ``mkdir`` (e.g. a ``pathlib.Path``).
        n_cores: Number of worker processes to use.

    Returns:
        None.
    """
    # prep inputs for parallelization
    inputs = []
    with open(filelist, 'r') as f:
        for line in f:
            relpath = line.strip()
            if not relpath:
                # Blank (e.g. trailing) lines would otherwise yield an empty
                # file name and make with_suffix() raise.
                continue
            reldir, fileid = os.path.split(relpath)
            featdir = outdir / reldir
            featdir.mkdir(parents=True, exist_ok=True)
            featfile = (featdir / fileid).with_suffix('.npy')
            audiofile = (AUDIO_ROOT / relpath).with_suffix('.wav')
            if os.path.exists(featfile):
                print(f"Skipping {featfile}")
            else:
                inputs.append((audiofile, featfile))

    # Nothing to compute: avoid spawning worker processes for no work.
    if not inputs:
        return

    # process files in parallel; the context manager shuts the workers down
    # once starmap (which blocks until all jobs finish) has returned
    with multiprocessing.Pool(processes = n_cores) as pool:
        pool.starmap(compute_chroma_single, inputs)

    return

In [None]:
# Features for the audio without modification are stored under
# FEATURES_ROOT/no_modification, for both the train and the test file lists.
FEATS_CLEAN_DIR = FEATURES_ROOT / 'no_modification'
for filelist in (train_files, test_files):
    compute_chroma_batch(filelist, FEATS_CLEAN_DIR, 24)

### Set up modified data directories

Set up directory structure below.
```
pre5
├annotations_beat
└wav_22050_mono
```

In [None]:
# Folder names planned for the modified datasets:
#   subseq10, subseq30, partial_overlap,
#   pre5, pre10, pre15, pre20,
#   post5, post10, post15, post20,
#   pre_post5, pre_post10, pre_post15, pre_post20
# Only 'pre5' is set up here.

new_data_folder = DATA_ROOT / 'pre5'
new_annotation_folder = new_data_folder / 'annotations_beat'
new_audio_folder = new_data_folder / 'wav_22050_mono'

# Parent first, then its two subdirectories; existing directories are left alone.
for folder in (new_data_folder, new_annotation_folder, new_audio_folder):
    folder.mkdir(exist_ok=True)

Fill in the `annotations_beat` directory. For now the annotation files do NOT contain correct annotations; they are simply copied over from the original dataset. The audio directory `wav_22050_mono` remains empty.

In [None]:
# The /*/*.beat pattern ignores files outside of the mazurka directories 
# by specifying exactly one level of recursion. To remove this restriction,
# use /**/*.beat.
for old_annotations_file in glob.glob(str(ANNOTATIONS_ROOT) + '/*/*.beat', recursive=True):
    # Split "<mazurka>/<performance>.beat" (relative to ANNOTATIONS_ROOT)
    # into its directory and file-name parts.
    mazurka, performance = os.path.split(
        os.path.relpath(old_annotations_file, ANNOTATIONS_ROOT))

    mazurka_dir = new_annotation_folder / mazurka
    performance_file = mazurka_dir / performance

    if os.path.exists(mazurka_dir):
        print(f"Skipping creating {mazurka_dir} directory")
    else:
        mazurka_dir.mkdir(exist_ok=True, parents=True)

    if os.path.exists(performance_file):
        print(f"Skipping creating {performance_file}")
        continue

    # Parse the source completely BEFORE opening the destination. Otherwise a
    # parse error would leave a preamble-only output file behind, which later
    # runs would skip because it already exists.
    with open(old_annotations_file, 'r') as f:
        # The first three lines are copied over verbatim.
        preamble = [f.readline() for _ in range(3)]
        # The remainder is parsed as whitespace-separated columns (raw string:
        # '\s' is not a valid escape in a normal string literal).
        old_annotations = pd.read_csv(f, header=None, sep=r'\s+')

    # Placeholder: the annotations are passed through unchanged for now.
    new_annotations = old_annotations

    with open(performance_file, 'w') as new_annotations_file:
        new_annotations_file.writelines(preamble)
        # Note this changes the times many decimal places beyond the annotation precision
        new_annotations.to_csv(new_annotations_file, header=False, index=False, sep='\t')

In [None]:
# Just playing around with pandas for the other modifications

# NOTE(review): this cell relies on `old_annotations` left over from the last
# file read by the annotation-copy loop in an earlier cell. If that loop read
# no file (e.g. every output already existed), the name is undefined here.

# Select all beats greater than 7 seconds (if we've deleted the first 7 seconds)
# Column 0 is presumably the beat time in seconds -- TODO confirm against the .beat format.
new_annotations = old_annotations[old_annotations[0] > 7]

### Compute features on additional datasets

In [None]:
def make_feature_dir(data_root, beg_silence, end_silence):
    """Placeholder -- not implemented yet; currently does nothing.

    Intended behavior: dynamically construct the audio file and compute
    chroma features.

    TODO: document ``data_root``, ``beg_silence`` and ``end_silence`` once
    the implementation exists; none of them is used yet.
    """
    return None

### Generate query list

Here we generate a file containing each pair of files to be aligned.

The lines of this outfile have the form
```
piece_name/recording_name_1 piece_name/recording_name_2
```

In [None]:
def generate_query_list(filelist, outfile):
    """Write every within-piece pair of recordings to a query list file.

    Each non-blank line of ``filelist`` must have the form
    ``piece_name/recording_name``. For every piece, all unordered pairs of its
    recordings are written to ``outfile``, one pair per line, as
    ``piece/recording_1 piece/recording_2``. Pieces and recordings keep the
    order in which they first appear in ``filelist``.

    Args:
        filelist: Path of the input file list.
        outfile: Path of the query list to write (overwritten if it exists).

    Returns:
        None.

    Raises:
        AssertionError: If a non-blank line does not contain exactly one '/'.
    """
    # group files by piece
    d = {}
    with open(filelist, 'r') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            parts = entry.split('/')
            assert len(parts) == 2, f'expected "piece/recording", got {entry!r}'
            piece, fileid = parts
            d.setdefault(piece, []).append(fileid)

    # print out all pairings
    with open(outfile, 'w') as fout:
        for piece, recordings in d.items():
            for i, fileid1 in enumerate(recordings):
                # Pair each recording only with those after it, so every
                # unordered pair is written exactly once.
                for fileid2 in recordings[i + 1:]:
                    fout.write(f'{piece}/{fileid1} {piece}/{fileid2}\n')

    return

In [None]:
# Query lists are written next to the file lists in cfg_files/.
train_queries = f'cfg_files/query.{TRAIN_DATASET}.list'
test_queries = 'cfg_files/query.test.list'

generate_query_list(train_files, train_queries)
generate_query_list(test_files, test_queries)