# Prep Data 🧑‍🍳

The goal of this notebook is to prepare the data for the alignment task. We start with the original Mazurka dataset. We then perform time-scale modification to normalize all performances of a given Mazurka to a set duration. We vary this duration across several factors. Beat annotations are then rescaled accordingly.

### Imports

In [1]:
import numpy as np
import librosa as lb
import os
from pathlib import Path
import multiprocessing as mp
import glob
import pandas as pd
import random
from itertools import combinations
import soundfile as sf
from tsm_tools import *
from pydub import AudioSegment
import math

### Constants

In [2]:
# Locations inside the original dataset folder.
ANNOTATIONS_ROOT = Path('Chopin_Mazurkas/annotations_beat')
AUDIO_ROOT = Path('Chopin_Mazurkas/wav_22050_mono')
# Feature folder name; joined onto each dataset directory when features are saved.
FEATURES_ROOT = Path('features')
# File lists with one performance id per line (e.g. "<piece>/<performance>").
TRAIN_FILE = Path('cfg_files/train.txt')
TEST_FILE = Path('cfg_files/test.txt')
# Duration factors applied to each piece's median duration. Kept as strings
# because they are embedded verbatim in directory names ("median_x0.500", ...).
# NOTE(review): the values look like 2**(k/3) for k = -3..3 -- confirm.
FACTORS = ['0.500','0.630','0.794','1.000','1.260','1.588','2.000']
# Pieces used for training and for testing.
TRAIN_MAZURKAS = ['Chopin_Op017No4','Chopin_Op063No3']
TEST_MAZURKAS = ['Chopin_Op024No2','Chopin_Op030No2','Chopin_Op068No3']
# Worker count for every multiprocessing pool in this notebook.
N_CORES = mp.cpu_count()
# Relative path (under a dataset directory) where chroma features are stored.
FEATS_CLEAN_DIR = FEATURES_ROOT / 'clean'
# The original dataset plus one time-scaled copy per factor.
DIRECTORIES = ['Chopin_Mazurkas'] + [f"median_x{factor}" for factor in FACTORS]

## 1. Download Mazurkas

Note that the Chopin_Mazurkas folder can be found in '/mnt/data0/Datasets/Chopin_Mazurkas'. You must copy this folder to this working directory.

In [3]:
# Copy the dataset into the working directory unless a Chopin_Mazurkas folder is already there.
! [ -d "Chopin_Mazurkas" ] && echo "already exists" || cp -r /mnt/data0/Datasets/Chopin_Mazurkas .

### Delete exceptional pieces

In [4]:
# Performances excluded from the dataset, each paired with the message that is
# printed when the file was actually deleted.
_EXCLUDED_PERFORMANCES = (
    ("./Chopin_Mazurkas/wav_22050_mono/Chopin_Op017No4/Chopin_Op017No4_Ginzburg-1957_pid9156-10.wav",
     "Removed Chopin_Op017No4/Chopin_Op017No4_Ginzburg-1957_pid9156-10"),
    ("./Chopin_Mazurkas/wav_22050_mono/Chopin_Op068No3/Chopin_Op068No3_Koczalski-1948_pid9140-05.wav",
     "Removed Chopin_Op068No3_Koczalski-1948_pid9140-05"),
)

for _wav_path, _message in _EXCLUDED_PERFORMANCES:
    try:
        os.remove(_wav_path)
        print(_message)
    except OSError:
        # Best effort: the file is already gone (e.g. the cell was re-run).
        pass

Removed Chopin_Op017No4/Chopin_Op017No4_Ginzburg-1957_pid9156-10
Removed Chopin_Op068No3_Koczalski-1948_pid9140-05


## 2. Create filelists text files

We consider all pairwise combinations between performances for a given Mazurka. We allocate 2 Mazurkas to the training set, while the remaining 3 Mazurkas are used for testing. Once we have the pairs, we write training lists of varying sizes to allow for quicker development and evaluation.

In [5]:
def get_combinations(piece, filelist, seed=42):
    """
    Build every unordered pair of performances of one piece
    (ie. Chopin_Op017No4) listed in `filelist`.

    Each performance is the whitespace-split token list of its line, so a
    pair is a tuple of two such lists. The pairs are shuffled after
    re-seeding the global `random` generator with `seed`.
    """
    with open(filelist, 'r') as handle:
        matching = [row.split() for row in handle if piece in row]
    shuffled = list(combinations(matching, 2))
    # Note: this re-seeds the module-level RNG, not a private one.
    random.seed(seed)
    random.shuffle(shuffled)
    return shuffled

def create_file_list(pairs, outdir, limit=None):
    """
    Save pairs to a *.txt file, one "first second" pair per line.

    pairs  -- sequence of (a, b) where a[0] and b[0] are the performance ids.
    outdir -- path of the text file to write (despite the name, a file path).
    limit  -- keep only the first `limit` pairs; None keeps all of them.
              A limit of 0 writes an empty file.
    """
    # Compare against None so that an explicit 0 is honoured instead of being
    # mistaken for "no limit".
    if limit is not None:
        pairs = pairs[:limit]
    with open(outdir, 'w') as o:
        for a, b in pairs:
            o.write(f"{a[0]} {b[0]}\n")

In [6]:
# Shuffled pairs, concatenated piece by piece in the order of the piece lists.
train_pairs = []
for _piece in TRAIN_MAZURKAS:
    train_pairs.extend(get_combinations(_piece, TRAIN_FILE))
test_pairs = []
for _piece in TEST_MAZURKAS:
    test_pairs.extend(get_combinations(_piece, TEST_FILE))

# Training subsets are prefixes of train_pairs, so the small ones only contain
# pairs of the first training piece (Op 17 #4).
_TRAIN_SUBSETS = (
    ('cfg_files/train_toy.txt', 5),        # train-toy: 5 random pairs from Op 17 #4
    ('cfg_files/train_small.txt', 200),    # train-small: 200 random pairs from Op 17 #4
    ('cfg_files/train_medium.txt', 1953),  # train-medium: all (63 choose 2) = 1953 pairs of Op 17 #4
    ('cfg_files/train_full.txt', None),    # train-full: (63 choose 2) + (88 choose 2) pairs, both training pieces
)
for _out_path, _limit in _TRAIN_SUBSETS:
    create_file_list(train_pairs, _out_path, _limit)

# test-full: includes all pairs from the 3 test Mazurkas
create_file_list(test_pairs, 'cfg_files/test_full.txt')

## 3. Find median duration for each Mazurka

Each Mazurka has a number of performances that represent the same piece. However, they all differ slightly in duration. We first compute the median duration for each of the 5 pieces.

In [7]:
def get_median():
    """
    Return {piece: median duration} over the .wav files of every directory
    found under Chopin_Mazurkas/wav_22050_mono.

    The "median" is the element at index len // 2 of the sorted durations,
    i.e. the upper of the two middle values when the count is even (no
    averaging). Directories that contain no .wav files are skipped rather
    than raising IndexError.
    """
    medians = {}
    # The trailing slash restricts matches to directories and leaves a trailing
    # separator on each result, which the wav pattern below relies on.
    piece_dirs = glob.glob('Chopin_Mazurkas/wav_22050_mono/*/**/', recursive=True)
    for piece_dir in piece_dirs:
        piece = os.path.split(os.path.normpath(piece_dir))[-1]
        performances = glob.glob(f'{piece_dir}*.wav')
        if not performances:
            continue  # nothing to measure in this directory
        durations = sorted(lb.get_duration(filename=path) for path in performances)
        medians[piece] = durations[len(durations)//2]
    return medians

In [8]:
# Per-piece median duration, keyed by piece directory name; read by the
# time-scaling and beat-rescaling helpers below.
MEDIANS = get_median()
MEDIANS

{'Chopin_Op030No2': 87.30149659863946,
 'Chopin_Op063No3': 128.22149659863945,
 'Chopin_Op068No3': 99.64816326530612,
 'Chopin_Op024No2': 136.6748299319728,
 'Chopin_Op017No4': 254.92816326530613}

## 4. Perform time-scale modification

We first need to create 7 different directories corresponding to the 7 chosen duration factors.

In [9]:
def create_mazurka_directories(factors: list, subseq=None):
    """
    Create the annotation and audio folder tree for every duration factor.

    factors -- iterable of factor strings (e.g. FACTORS); one root directory
               is created per factor.
    subseq  -- when given (and non-zero), the root is named
               'median_x{factor}_sub{subseq}', matching the '_sub20'
               directories that the subsequence cells read and write;
               otherwise it is 'median_x{factor}'.

    Under each root, 'annotations_beat/<piece>' and 'wav_22050_mono/<piece>'
    are created for every training and test Mazurka. Existing directories
    are left untouched.
    """
    for factor in factors:
        root = f'median_x{factor}_sub{subseq}' if subseq else f'median_x{factor}'
        for mazurka in TRAIN_MAZURKAS + TEST_MAZURKAS:
            for kind in ('annotations_beat', 'wav_22050_mono'):
                Path(f'{root}/{kind}/{mazurka}').mkdir(parents=True, exist_ok=True)
            
# Full-length dataset: one 'median_x{factor}' directory tree per factor.
create_mazurka_directories(FACTORS, subseq=None)

We now need to time scale each performance to its Mazurka's median duration times one of 7 chosen duration factors.

In [10]:
def time_scale_single(old_wav:str, new_wav:str, median:float, factor:str):
    """
    Time-scale one recording so that it lasts factor * median and write the
    result to new_wav.

    The ratio handed to tsm_hybrid is (factor * median) / current duration.
    NOTE(review): presumably `alpha` is the output/input duration ratio --
    confirm against tsm_tools.
    """
    samples, rate = lb.load(old_wav)
    stretch = float(factor) * (median / lb.get_duration(y=samples, sr=rate))
    stretched = tsm_hybrid(samples, alpha=stretch) # time scale modification
    sf.write(new_wav, stretched, rate)

In [11]:
def get_timescale_inputs(filelist, piece, factor):
    """
    Collect the time_scale_single argument tuples for one piece and factor.

    Every line of `filelist` that contains `piece` yields
    (source wav, target wav, MEDIANS[piece], factor); entries whose target
    wav already exists are left out so finished work is not repeated.
    """
    target_root = f"median_x{factor}"
    with open(filelist, 'r') as listing:
        entries = [row.strip() for row in listing if piece in row]

    jobs = []
    for entry in entries:
        source = f'Chopin_Mazurkas/wav_22050_mono/{entry}.wav'
        target = source.replace("Chopin_Mazurkas", target_root)
        if os.path.exists(Path(target)):
            continue
        jobs.append((source, target, MEDIANS[piece], factor))
    return jobs

Time scale modification should take around 20-30 minutes.

In [12]:
# Gather the pending jobs: all training pieces first, then all test pieces,
# iterating factors in the outer loop within each group.
inputs = []
for _filelist, _pieces in ((TRAIN_FILE, TRAIN_MAZURKAS), (TEST_FILE, TEST_MAZURKAS)):
    for _factor in FACTORS:
        for _mazurka in _pieces:
            inputs.extend(get_timescale_inputs(_filelist, _mazurka, _factor))

# One job per (performance, factor), spread over all cores.
with mp.Pool(processes = N_CORES) as pool:
    pool.starmap(time_scale_single, inputs)

## 5. Generate modified beat files

Now we need to create modified beat files to ensure we correctly evaluate beat alignments. We take the original timestamps and then multiply by the median and factor and then divide by the original_duration.

In [14]:
def modify_beat_single(in_path_beat, in_path_wav, out_path, num_multiply):
    """
    Modify beat annotations for one given wav file

    Every beat timestamp is rescaled to ts * num_multiply / duration, where
    duration is that of in_path_wav and num_multiply is the target duration
    (median * factor). The first 3 lines of the beat file are copied as-is.
    Each following line becomes "start<TAB>end<TAB>label", where end is the
    next beat's start (0.0 for the last beat) and label is the last
    tab-separated field of the original line.
    """
    # Raw string: '\s' is an invalid escape sequence in a normal literal.
    ts = np.array(pd.read_csv(in_path_beat, header=None, sep=r'\s+', skiprows=3)[0])
    mod_ts = (ts * num_multiply) / lb.get_duration(filename=in_path_wav)
    i, header, lines = 0, 0, []

    with open(in_path_beat, 'r') as f:
        for line in f:
            if header < 3: # copy first 3 lines
                lines.append(line)
                header += 1
            elif not line.strip():
                # read_csv drops blank lines by default, so skip them here too
                # to keep `i` aligned with mod_ts (e.g. trailing empty lines).
                continue
            else: # modify the rest of the lines
                start = mod_ts[i]
                end = 0.0 if i + 1 == len(mod_ts) else mod_ts[i+1]
                label = line.split('\t')[-1]
                lines.append(f'{start}\t{end}\t{label}')
                i += 1
    with open(out_path, 'w') as o:
        o.writelines(lines)

In [15]:
def get_beat_inputs(filelist, piece, factor): # e.g. outdir is 'median_x2.000/annotations_beat/Chopin_Op017No4'
    """
    Collect the modify_beat_single argument tuples for one piece and factor.

    Every line of `filelist` that contains `piece` yields
    (original beat file, original wav, output beat file, target duration),
    where target duration = MEDIANS[piece] * factor. Entries whose output
    beat file already exists are skipped.
    """
    inputs = []
    with open(filelist, 'r') as f:
        for line in f:
            if piece not in line:
                continue
            # strip() instead of line[:-1]: a final line without a trailing
            # newline would otherwise lose its last character.
            relpath = line.strip()
            out_path = f'median_x{factor}/annotations_beat/{relpath}.beat'
            if os.path.exists(Path(out_path)):
                continue
            path_to_beat = f'Chopin_Mazurkas/annotations_beat/{relpath}.beat'
            path_to_wav = f'Chopin_Mazurkas/wav_22050_mono/{relpath}.wav'
            num_multiply = MEDIANS[piece] * float(factor)
            inputs.append((path_to_beat, path_to_wav, out_path, num_multiply))
    return inputs

In [16]:
# Gather the pending jobs: all training pieces first, then all test pieces,
# iterating factors in the outer loop within each group.
inputs = []
for _filelist, _pieces in ((TRAIN_FILE, TRAIN_MAZURKAS), (TEST_FILE, TEST_MAZURKAS)):
    for _factor in FACTORS:
        for _mazurka in _pieces:
            inputs.extend(get_beat_inputs(_filelist, _mazurka, _factor))

# Rescale the beat files in parallel.
with mp.Pool(processes = N_CORES) as pool:
    pool.starmap(modify_beat_single, inputs)

## 6. Precompute all chroma features

We now compute all chroma features for the new wav files and save them in each median directory under `features/clean`.

In [17]:
def compute_chroma_single(infile, outfile, sr=22050, hop_length=512):
    """
    Compute the CQT chroma features of one audio file and save them with
    np.save.

    infile     -- audio file to load (resampled to `sr` by librosa if needed).
    outfile    -- destination passed to np.save.
    sr         -- sampling rate used for loading and analysis.
    hop_length -- hop size in samples between feature frames.
    """
    y, sr = lb.load(infile, sr=sr)
    # Pass the signal by keyword: librosa >= 0.10 rejects positional audio
    # arguments, and older releases accept the keyword form as well.
    F = lb.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length, norm=2)
    np.save(outfile, F)

In [18]:
def get_chroma_inputs(filelist, dir):
    """
    Collect the compute_chroma_single argument tuples for one dataset
    directory.

    For every non-blank line of `filelist` (a relative performance path),
    the feature folder <dir>/<FEATS_CLEAN_DIR>/<piece> is created if needed
    and (audio file, feature file) is queued unless the feature file already
    exists. `dir` may be a string or a Path.
    """
    root = Path(dir)
    inputs = []
    with open(filelist, 'r') as f:
        for line in f:
            relpath = line.strip()
            if not relpath:
                continue  # a blank line would otherwise queue '<dir>/features/clean.npy'
            reldir, fileid = os.path.split(relpath)
            featdir = root / FEATS_CLEAN_DIR / reldir
            featdir.mkdir(parents=True, exist_ok=True)
            # Append the extension instead of using with_suffix(), which would
            # cut off everything after a dot inside the performance id.
            featfile = featdir / f'{fileid}.npy'
            audiofile = root / 'wav_22050_mono' / f'{relpath}.wav'
            if not os.path.exists(featfile):
                inputs.append((audiofile, featfile))
    return inputs

Computing chroma features should take around 5-10 mins.

In [19]:
# Pending feature jobs: every dataset directory for the training list first,
# then every dataset directory for the test list.
inputs = []
for _filelist in (TRAIN_FILE, TEST_FILE):
    for _dataset_dir in DIRECTORIES:
        inputs.extend(get_chroma_inputs(_filelist, _dataset_dir))

# Compute and save the chroma features in parallel.
with mp.Pool(processes = N_CORES) as pool:
    pool.starmap(compute_chroma_single, inputs)

## 7. Generate subsequence dataset

In [24]:
# Create the directory trees for the subsequence dataset; the second argument
# (20) ends up in the directory suffix built by create_mazurka_directories.
# NOTE(review): the cells below read and write 'median_x{factor}_sub20' --
# verify that the directories created here use the same name.
create_mazurka_directories(FACTORS, 20)
# Subsequence base length per piece: 20% of the piece's median duration.
MEDIANS_SUB = {key:value*0.2 for key, value in MEDIANS.items()}
MEDIANS_SUB

{'Chopin_Op030No2': 17.46029931972789,
 'Chopin_Op063No3': 25.644299319727892,
 'Chopin_Op068No3': 19.929632653061226,
 'Chopin_Op024No2': 27.33496598639456,
 'Chopin_Op017No4': 50.98563265306123}

In [None]:
def get_random_time(total_dur, target_len):
    """
    Pick a random segment of length target_len inside a clip of length
    total_dur and return its (start, end). Everything is in milliseconds.

    The start is a whole number drawn uniformly from
    [0, floor(total_dur - target_len)] using the global `random` generator.
    """
    latest_start = math.floor(total_dur - target_len)
    begin = random.randint(0, latest_start)
    return begin, begin + target_len

### Create new wav and beat files

In [None]:
def shift_beat_annotation(in_path_beat, out_path_beat, start_time, end_time):
    """
    Write the beat annotations of a cut-out segment. Times are in seconds.

    The first 3 lines are copied unchanged. Data lines are
    "start<TAB>end<TAB>label". Reading stops at the first beat whose end lies
    after end_time. Beats starting strictly after start_time are kept with
    both times shifted by -start_time. The first kept beat gets "-<k>"
    appended to its label, k being its 0-based position among the data lines
    of the input file.
    """
    with open(in_path_beat, 'r') as src:
        raw = src.readlines()

    kept = list(raw[:3])  # header lines, copied verbatim
    for position, row in enumerate(raw[3:]):
        fields = row.split('\t')
        beat_start, beat_end, tag = float(fields[0]), float(fields[1]), fields[2]
        if beat_end > end_time:
            break  # past the end of the segment
        if not beat_start > start_time:
            continue  # before (or exactly at) the segment start
        if len(kept) == 3:
            # first surviving beat: record its original index in the label
            tag = f'{tag.strip()}-{position}\n'
        kept.append(f'{beat_start-start_time}\t{beat_end-start_time}\t{tag}')

    with open(out_path_beat, 'w') as dst:
        dst.writelines(kept)

In [None]:
def modify_subseq_single(path_to_wav, out_path_wav, median, factor, outdir):
    """
    Modify a single wav and beat file pair

    Cuts a random segment of length median * factor (seconds) out of
    path_to_wav, writes it to out_path_wav, and writes the matching shifted
    beat annotations under outdir.

    path_to_wav  -- source wav; must contain a 'wav_22050_mono' path component
                    with the beat file at the same place under 'annotations_beat'.
    out_path_wav -- destination wav file.
    median       -- base segment length in seconds (before applying factor).
    factor       -- duration factor, as a string or number.
    outdir       -- root directory of the subsequence dataset.

    Raises ValueError if the derived beat path has no 'annotations_beat'
    component.
    """
    # cut the .wav file into target segment (pydub works in milliseconds)
    audio = AudioSegment.from_wav(path_to_wav)
    old_duration = audio.duration_seconds*1000
    start_time, end_time = get_random_time(old_duration, median*1000*float(factor))
    segment = audio[start_time:end_time]

    # save to new .wav file, creating the target folder if it is missing
    Path(out_path_wav).parent.mkdir(parents=True, exist_ok=True)
    with open(out_path_wav, 'wb') as f:
        segment.export(f, format='wav')

    # make beat file: keep the path from 'annotations_beat' onwards and re-root
    # it under outdir. A fixed-width slice ([22:]) only works when the input
    # root is exactly 22 characters long, which 'median_x{factor}' is not.
    in_path_beat = path_to_wav.replace('wav_22050_mono', 'annotations_beat').replace('.wav', '.beat')
    rel_beat = in_path_beat[in_path_beat.index('annotations_beat'):]
    out_path_beat = f'{outdir}/{rel_beat}'
    Path(out_path_beat).parent.mkdir(parents=True, exist_ok=True)
    shift_beat_annotation(in_path_beat, out_path_beat, start_time/1000, end_time/1000)

In [None]:
def get_subseq_inputs(filelist, piece, median_subseq, factor):
    """
    Collect the modify_subseq_single argument tuples for one piece and factor.

    filelist      -- text file with one relative performance path per line.
    piece         -- only lines containing this piece name are used.
    median_subseq -- base subsequence length (seconds) for this piece,
                     forwarded to modify_subseq_single.
    factor        -- duration factor string; selects 'median_x{factor}' as the
                     input root and 'median_x{factor}_sub20' as the output root.

    Returns a list of (input wav, output wav, median_subseq, factor, outdir);
    entries whose output wav already exists are skipped.
    """
    indir, outdir = f'median_x{factor}', f'median_x{factor}_sub20'
    inputs = []
    with open(filelist, 'r') as f:
        for line in f:
            if piece not in line:
                continue
            # strip() rather than line[:-1] so a last line without a newline
            # keeps its final character.
            relpath = line.strip()
            path_to_wav = f'{indir}/wav_22050_mono/{relpath}.wav'
            # Build the output path from its parts; slicing the input path at a
            # fixed offset ([22:]) does not match the length of indir.
            out_path_wav = f'{outdir}/wav_22050_mono/{relpath}.wav'
            if not os.path.exists(out_path_wav):
                inputs.append((path_to_wav, out_path_wav, median_subseq, factor, outdir))
    return inputs

In [None]:
# get_subseq_inputs takes (filelist, piece, median_subseq, factor); the
# per-piece subsequence median must be passed explicitly, otherwise the call
# fails with a missing-argument TypeError.
inputs = []
for _filelist, _pieces in ((TRAIN_FILE, TRAIN_MAZURKAS), (TEST_FILE, TEST_MAZURKAS)):
    for _factor in FACTORS:
        for _mazurka in _pieces:
            inputs.extend(get_subseq_inputs(_filelist, _mazurka, MEDIANS_SUB[_mazurka], _factor))

# Cut the segments and write their beat files in parallel.
with mp.Pool(processes = N_CORES) as pool:
    pool.starmap(modify_subseq_single, inputs)

### Compute chroma features

In [None]:
# One subsequence directory per factor, derived from FACTORS so the list cannot
# drift from the factor configuration (same names as the previous literal list).
SUB_DIRECTORIES = [f'median_x{factor}_sub20' for factor in FACTORS]

# Pending feature jobs: every subsequence directory for the training list
# first, then every subsequence directory for the test list.
inputs = []
for _filelist in (TRAIN_FILE, TEST_FILE):
    for _dataset_dir in SUB_DIRECTORIES:
        inputs.extend(get_chroma_inputs(_filelist, _dataset_dir))

with mp.Pool(processes = N_CORES) as pool:
    pool.starmap(compute_chroma_single, inputs)