# Data Prep

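This notebook prepares the data for the alignment task. It computes chroma features for the audio, generates the query list of recording pairs to align, creates noisy copies of the dataset at several SNR levels, and saves random feature matrices for runtime profiling.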

In [None]:
import numpy as np
import librosa as lb
import pickle as pkl
import os
import os.path
from pathlib import Path
import multiprocessing
import scipy.io.wavfile as wav
from math import sqrt
import IPython.display as ipd
import glob

In [None]:
# ---- User configuration: edit this cell to match your file structure ----
# Root directory of the Mazurkas dataset.
MAZURKAS_ROOT = Path('/data/Datasets/Chopin_Mazurkas')
# Output root directory (features, paths, etc. are saved below it);
# this is the absolute path of the current working directory.
OUT_ROOT = Path.cwd()
# --------------------------------------------------------------------------

In [None]:
# Sub-directories of the dataset root.
ANNOTATIONS_ROOT = MAZURKAS_ROOT.joinpath('annotations_beat')
AUDIO_ROOT = MAZURKAS_ROOT.joinpath('wav_22050_mono')

# Directory below the output root where feature matrices are written.
FEATURES_ROOT = OUT_ROOT.joinpath('features')

# File lists for the two splits (paths are relative to the working directory).
test_files = Path('cfg_files/filelist.test.txt')
train_files = Path('cfg_files/filelist.train.txt')

In [None]:
# Create the feature output directory. exist_ok=True replaces the separate
# exists() check (no check-then-create race), and parents=True also creates
# any missing parent directories of FEATURES_ROOT.
FEATURES_ROOT.mkdir(parents=True, exist_ok=True)

### Compute features on clean audio

First we compute chroma features for the clean (unmodified) audio recordings in the train and test file lists.

In [None]:
def compute_chroma_single(infile, outfile, sr = 22050, hop_length=512):
    '''Compute and save the chroma features for a single audio file.

    infile -- Path of the audio file to load
    outfile -- Path the feature matrix is saved to with np.save
    sr -- Sampling rate requested from librosa when loading the audio
    hop_length -- Hop length (in samples) passed to the chroma extractor
    '''
    y, sr = lb.load(infile, sr=sr)
    # The audio is passed by keyword (y=...): recent librosa releases reject
    # positional audio arguments to the feature extractors, while the keyword
    # form works on old and new versions alike.
    # Alternative feature that was tried here:
    #   lb.feature.chroma_cens(y=y, sr=sr, hop_length=hop_length)
    F = lb.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length, norm=2)
    np.save(outfile, F)
    return

In [None]:
def compute_chroma_batch(filelist, outdir, n_cores):
    '''Compute and save the chroma features for all files in a filelist.

    filelist -- Text file with one path per line, relative to AUDIO_ROOT; the
                suffix is replaced by '.wav' for the audio file and by '.npy'
                for the feature file
    outdir -- Root directory for the feature files (str or Path); the relative
              directory structure of the filelist is recreated below it
    n_cores -- Number of worker processes to use

    Files whose feature file already exists are skipped.
    '''
    outdir = Path(outdir)

    # prep inputs for parallelization
    inputs = []
    with open(filelist, 'r') as f:
        for line in f:
            relpath = line.strip()
            if not relpath:
                # A blank (e.g. trailing) line would otherwise be turned into
                # a bogus audio/feature path pair.
                continue
            reldir, fileid = os.path.split(relpath)
            featdir = outdir / reldir
            featdir.mkdir(parents=True, exist_ok=True)
            featfile = (featdir / fileid).with_suffix('.npy')
            if featfile.exists():
                print(f"Skipping {featfile}")
                continue
            audiofile = (AUDIO_ROOT / relpath).with_suffix('.wav')
            inputs.append((audiofile, featfile))

    # Nothing left to compute: do not spawn worker processes at all.
    if not inputs:
        return

    # process files in parallel; the context manager shuts the workers down
    # once the (blocking) starmap call has finished, so repeated calls do not
    # accumulate idle processes.
    with multiprocessing.Pool(processes = n_cores) as pool:
        pool.starmap(compute_chroma_single, inputs)

    return

In [None]:
# Features of the clean audio for both splits are stored in the same tree.
FEATS_CLEAN_DIR = FEATURES_ROOT / 'clean'
for split_filelist in (train_files, test_files):
    compute_chroma_batch(split_filelist, FEATS_CLEAN_DIR, 24)

### Generate query list

Here we generate a file that lists every pair of recordings of the same piece; each line names one pair of files to be aligned.

In [None]:
def generate_query_list(filelist, outfile):
    '''Write all pairs of recordings of the same piece to a query list.

    filelist -- Text file with one 'piece/fileid' entry per line
    outfile -- File to write; each line has the form
               'piece/fileid1 piece/fileid2'

    Every unordered pair of entries sharing the same piece is written once,
    with the two entries in the order they appear in filelist. Blank lines in
    filelist are ignored.

    Raises ValueError if a non-blank line is not of the form 'piece/fileid'.
    '''
    from itertools import combinations

    # group files by piece (dict preserves the order of first appearance)
    recordings = {}
    with open(filelist, 'r') as f:
        for lineno, line in enumerate(f, start=1):
            entry = line.strip()
            if not entry:
                # tolerate blank / trailing empty lines
                continue
            parts = entry.split('/')
            if len(parts) != 2:
                # explicit check instead of assert, which is removed under -O
                raise ValueError(
                    f"{filelist}, line {lineno}: expected 'piece/fileid', got {entry!r}"
                )
            piece, fileid = parts
            recordings.setdefault(piece, []).append(fileid)

    # print out all pairings
    with open(outfile, 'w') as fout:
        for piece, fileids in recordings.items():
            for fileid1, fileid2 in combinations(fileids, 2):
                fout.write(f'{piece}/{fileid1} {piece}/{fileid2}\n')

    return

In [None]:
# Query list written for each split.
train_queries = 'cfg_files/query.train.list'
test_queries = 'cfg_files/query.test.list'
for split_files, split_queries in ((train_files, train_queries),
                                   (test_files, test_queries)):
    generate_query_list(split_files, split_queries)

## Generate Noisy Data

In [None]:
def generateNoisyData(clean, outdir, SNR):
    '''
    Create a noisy copy of a dataset: the directory layout below
    'wav_22050_mono' is recreated in outdir, the 'annotations_beat' directory
    is copied over, and every audio file is rewritten with noise added by
    addNoise.

    clean -- Directory of clean data (str or Path)
    outdir -- Output directory of noisy data (str or Path)
    SNR -- Desired SNR in dB
    '''
    import shutil

    clean = Path(clean)
    outdir = Path(outdir)
    noisyAudioRoot = outdir / "wav_22050_mono"

    # Set up file structure (parents=True also creates outdir and any missing
    # parent directories)
    noisyAudioRoot.mkdir(parents=True, exist_ok=True)

    cleanDirList = glob.glob(str(clean) + "/wav_22050_mono/*")
    for cleanDir in cleanDirList:
        (noisyAudioRoot / os.path.basename(cleanDir)).mkdir(exist_ok=True)

    # Fill up annotation directory. shutil.copytree replaces the former
    # shell-string 'cp -r', which broke on paths containing spaces and nested
    # a second 'annotations_beat' inside the first one when run again.
    cleanAnnotations = clean / "annotations_beat"
    noisyAnnotations = outdir / "annotations_beat"
    if not cleanAnnotations.is_dir():
        # The shell copy used to fail without stopping the run; keep going but
        # make the problem visible.
        print(f"Warning: annotation directory {cleanAnnotations} not found; nothing copied")
    elif not noisyAnnotations.exists():
        shutil.copytree(cleanAnnotations, noisyAnnotations)

    # Add noise to all files
    for cleanDir in cleanDirList:
        for cleanFile in glob.glob(cleanDir + "/*"):
            rate, cleanAudio = wav.read(cleanFile)
            # Need to make sure the dtype isn't too small to handle the squares when finding the power
            cleanAudio = np.array(cleanAudio, dtype = np.int64)

            noisyAudio = addNoise(cleanAudio, SNR)

            # NOTE(review): addNoise adds floating-point noise, so the array
            # written here is floating point at the original sample amplitude
            # scale -- confirm downstream readers expect that.
            relPath = os.path.relpath(cleanFile, start=clean)
            wav.write(outdir / relPath, rate, noisyAudio)

In [None]:
def addNoise(cleanAudio, SNR):
    """Add white Gaussian noise to clean audio at the requested SNR.

    cleanAudio -- Array of audio samples (any numeric dtype, any shape)
    SNR -- Desired signal-to-noise ratio in dB

    Returns a float64 array with the same shape as cleanAudio. The noise is
    drawn from NumPy's global random state.
    """
    # Work in float64 so that squaring narrow integer samples (e.g. int16)
    # cannot overflow while measuring the signal power.
    signal = np.asarray(cleanAudio, dtype=np.float64)
    if signal.size == 0:
        # No samples: nothing to measure and nothing to add.
        return signal.copy()

    P_signal = np.mean(signal * signal)
    P_noise = P_signal * (10 ** (-1 * SNR / 10))
    # P_noise = sigma ^ 2
    noise = np.random.normal(size = signal.shape) * np.sqrt(P_noise)
    return signal + noise

In [None]:
# SNR levels used for the noisy copies: 20, 15, 10, 5, 0, -5, -10.
noiseLevels = list(range(20, -15, -5))

In [None]:
# Parent directory of all noisy copies. It is created below OUT_ROOT (not
# relative to the current working directory) so that it matches the outdir
# paths built in the loop even when OUT_ROOT has been changed.
(OUT_ROOT / "noisyData").mkdir(parents=True, exist_ok=True)

# Add noise to Chopin Mazurka data
clean = MAZURKAS_ROOT
for SNR in noiseLevels:
    outdir = OUT_ROOT / ("noisyData/Chopin_Mazurkas_Noisy_%sdB" % str(SNR))
    generateNoisyData(clean, outdir, SNR)

In [None]:
# Compute chroma features for noisy data.
# The pool is used as a context manager so the worker processes are shut down
# when the cell finishes.
with multiprocessing.Pool(processes = 10) as pool:
    # Same SNR levels the noisy copies were generated for.
    for SNR in noiseLevels:
        FEATS_DIR = FEATURES_ROOT / ('noisy_%sdB' % str(SNR))
        FEATS_DIR.mkdir(parents=True, exist_ok=True)

        # Build the input directory with path joins: concatenating
        # str(OUT_ROOT) and "noisyData/..." dropped the path separator, so
        # the glob never matched any directory.
        noisyAudioRoot = (OUT_ROOT / 'noisyData'
                          / ('Chopin_Mazurkas_Noisy_%sdB' % str(SNR))
                          / 'wav_22050_mono')

        for inDir in sorted(noisyAudioRoot.glob('*')):
            outdir = FEATS_DIR / inDir.name
            outdir.mkdir(exist_ok=True)

            # prep inputs for parallelization
            inputs = []
            for file in sorted(inDir.glob('*')):
                # file.stem drops the extension whatever its length
                outfile = outdir / (file.stem + '.npy')
                if not outfile.exists():
                    inputs.append((file, outfile))

            # process files in parallel
            if inputs:
                pool.starmap(compute_chroma_single, inputs)

## Generate Random Feature Matrices for Runtime Profiling

In [None]:
def saveRandomFeatureMatrices(sizes, outdir):
    '''Save one random feature matrix per requested size.

    sizes -- Iterable of column counts; for each size sz a matrix of shape
             (12, sz) with uniform random values in [0, 1) is saved as
             'F_<sz>.npy'
    outdir -- Output directory (str or Path); created, including missing
              parent directories, if it does not exist

    NumPy's global random state is seeded with 0 first, so repeated calls
    with the same sizes write identical matrices.
    '''
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    np.random.seed(0)
    for sz in sizes:
        F = np.random.rand(12, sz)
        np.save(outdir / ('F_%s.npy' % sz), F)

    return

In [None]:
# Random matrices for runtime profiling: one matrix per column count below.
rand_feat_dir = FEATURES_ROOT / 'random'
sizes = [1000, 2000, 5000, 10000, 20000, 50000]
saveRandomFeatureMatrices(sizes=sizes, outdir=rand_feat_dir)