## Pre-processing MAST rhythmic data

### This auxiliary notebook prepares the data for the automatic assessment analysis. It requires the data downloaded with the _DownloadDataFromMASTDataset_ notebook.

### Throughout this notebook we extract onset times from both references and performances, rescale each performance according to its respective reference, and convert the onset times to a binary representation that may be better suited to the application of distance measures.

### Loading Essentia's necessary tools

In [1]:
from essentia.standard import *
from essentia import Pool, array

import os
import numpy
import math

# Sampling rate requested from MonoLoader when reading audio files.
SAMPLE_RATE = 44100
# Frame length handed to FrameGenerator as frameSize.
WINDOW_SIZE = 1024
# Step between consecutive frames, handed to FrameGenerator as hopSize.
HOP_SIZE = 512

# Window type handed to Essentia's Windowing algorithm.
WINDOWING_METHOD = 'hann'
# Detection-function name handed to Essentia's OnsetDetection algorithm.
ONSET_DETECTION_METHOD = 'hfc'

def _load_file_as_monophonic_waveform(file_path):
    """Load an audio file as a mono signal and peak-normalise it.

    The file is read with Essentia's MonoLoader at SAMPLE_RATE and then
    divided by its largest absolute sample.

    Args:
        file_path: path of the audio file to load.

    Returns:
        The loaded signal divided by its absolute peak. An empty or
        completely silent signal is returned unchanged, because it has no
        peak to normalise by.
    """
    x = MonoLoader(filename=file_path, sampleRate=SAMPLE_RATE)()

    # numpy.max raises on an empty array, and dividing by a zero peak would
    # fill the signal with NaNs; in both cases there is nothing to scale.
    peak = numpy.max(numpy.abs(x)) if len(x) else 0
    if peak == 0:
        return x

    return x / peak

def _extract_onset_vectors(waveform):
    """Detect the onset times of a waveform.

    The waveform is cut into frames of WINDOW_SIZE samples every HOP_SIZE
    samples; each frame is windowed, transformed with an FFT and converted
    to magnitude/phase, from which one onset-detection value per frame is
    computed. The resulting detection function is then handed to Essentia's
    Onsets algorithm.

    Args:
        waveform: the audio signal to analyse.

    Returns:
        The value returned by Essentia's Onsets algorithm for the single
        detection function computed here.
    """
    windowing = Windowing(type=WINDOWING_METHOD)
    fft = FFT()
    to_polar = CartesianToPolar()
    detection = OnsetDetection(method=ONSET_DETECTION_METHOD)
    # Onsets needs to know how many detection values there are per second.
    # Derive that from the framing constants instead of relying on the
    # algorithm's default, so changing SAMPLE_RATE or HOP_SIZE keeps the
    # reported times correct. float() avoids integer division on Python 2.
    onset_picker = Onsets(frameRate=float(SAMPLE_RATE) / HOP_SIZE)

    pool = Pool()
    for frame in FrameGenerator(waveform, frameSize=WINDOW_SIZE, hopSize=HOP_SIZE):
        magnitude, phase = to_polar(fft(windowing(frame)))
        pool.add('features', detection(magnitude, phase))

    # Onsets takes a matrix with one row per detection function and one
    # weight per row; only a single detection function is used here.
    return onset_picker(array([pool['features']]), [1])

### Extracting onset times and writing the output to an auxiliary file. After executing the cell below, the onset times can be inspected in a file inside the data folders.

In [2]:
def extract_onsets(base_dir, list_files, output_file):
    """Extract the onsets of every listed audio file into a text file.

    Args:
        base_dir: directory that contains the audio files.
        list_files: path of a text file naming one audio file per line,
            relative to base_dir. Blank lines are ignored.
        output_file: path of the file to (over)write. One line is written
            per processed audio file, holding its onset values separated
            by single spaces.
    """
    with open(list_files, 'r') as listfiles, open(output_file, 'w') as output:
        for line in listfiles:
            audio_file = line.strip()
            if not audio_file:
                # A blank line (e.g. a trailing newline in the listing)
                # would otherwise make us try to load base_dir itself.
                continue

            # os.path.join works whether or not base_dir ends with a
            # path separator.
            audio_file_full_path = os.path.join(base_dir, audio_file)

            waveform = _load_file_as_monophonic_waveform(audio_file_full_path)
            onsets = _extract_onset_vectors(waveform)

            output.write('%s\n' % " ".join(str(x) for x in onsets))

# Root folder that holds the downloaded data.
DATA_FOLDER = "../data/"

# extracting onset times for performances
base_dir_performances = os.path.join(DATA_FOLDER, 'Only Performances/')
list_files_performances = os.path.join(DATA_FOLDER, 'Only Performances/listperformances')
output_file_performances = os.path.join(DATA_FOLDER, 'Only Performances/MAST Onsets [Performances]')

extract_onsets(base_dir_performances, list_files_performances, output_file_performances)

# extracting onset times for references
base_dir_references = os.path.join(DATA_FOLDER, 'Only References/')
list_files_references = os.path.join(DATA_FOLDER, 'Only References/listreferences')
output_file_references = os.path.join(DATA_FOLDER, 'Only References/MAST Onsets [References]')

extract_onsets(base_dir_references, list_files_references, output_file_references)

# Display the performance onsets that were just written; the context manager
# makes sure the file handle is closed once its content has been printed.
with open(output_file_performances) as performance_onsets:
    print(performance_onsets.read())



0.02322 0.325079 0.743039 1.01007 1.35837 1.67184 2.11302 2.36844 2.71673 2.97215 3.39011
0.15093 0.650159 0.893968 1.14939 1.59057 2.07819 2.5658 3.04181 3.29723 3.52943 3.98222
0.0696599 0.487619 0.893968 1.07973 1.30032 1.76472 1.96209 2.18268 2.60063 2.798 2.99537 3.18113 3.3785 3.59909 3.83129
0.0464399 0.13932 0.928798 1.21905 1.61379 2.04336 2.63546 2.9141 3.20435 3.51782 4.13315
0.0464399 0.0812698 0.348299 0.952018 1.20744 1.46286 2.04336 2.44971 2.82122 3.64553
0.1161 0.15093 0.359909 0.545669 0.731429 0.963628 1.38159 1.81116 2.22912 2.65868 2.86766 3.06503 3.25079 3.48299 3.88934
0.20898 0.673379 1.12617 1.67184 1.84599 2.07819 2.28717 2.48454 2.71673 3.09986 3.33206 3.54104
0.02322 0.0812698 0.313469 0.766258 0.998458 1.19583 1.43964 2.15946 2.62385 3.16952
0.01161 0.290249 0.847528 1.06812 1.31193 1.89243 2.48454 2.73995 2.99537 3.56426 4.22603
0.12771 0.673379 0.917188 1.18422 1.71828 2.24073 2.50776 2.78639 3.35528 3.88934
0.02322 0.2322 0.719819 1.14939 1.64862 1.88082

### Rescaling performance onset times according to their reference lengths and converting them into a binary representation (a 1 in the i-th position means the presence of a beat in the i-th time window). After executing the cell below, the binary representation of the onset times can be inspected in a file inside the data folders.

In [3]:
def _parse_onset_line(line):
    """Parse one whitespace-separated line of onset times into floats."""
    return [float(token) for token in line.split()]


def _shift_to_first_onset(onsets):
    """Return the onsets re-expressed relative to the first one."""
    if not onsets:
        return []
    first = onsets[0]
    return [onset - first for onset in onsets]


def _inner_duration(onsets):
    """Return the time span between the first and the last onset."""
    return onsets[-1] - onsets[0] if onsets else 0.0


def _write_bins(output, onsets, window_size):
    """Write the binary representation of the onsets as one text line."""
    # A line without onsets still produces an (empty) output line, so that
    # line i of every output file keeps describing recording i.
    bins = make_bins(onsets, window_size) if onsets else []
    output.write('%s\n' % " ".join([str(x) for x in bins]))


def rescale_and_make_bins(performances_file, references_file, window_size=0.2):
    """Rescale performances to their references and write binary onsets.

    Line i of performances_file is paired with line i of references_file.
    Both onset lists are shifted so that their first onset is at time 0,
    the performance is stretched or compressed so that its first-to-last
    onset span equals the reference's, and both are converted to bins with
    make_bins. Results are written to '<performances_file> [BINS]' and
    '<references_file> [BINS]', one line per input line.

    Args:
        performances_file: path of the text file with one line of
            space-separated performance onset times per recording.
        references_file: path of the text file with the matching
            reference onset times, in the same line order.
        window_size: width of each bin, forwarded to make_bins.

    Raises:
        IndexError: if references_file has fewer lines than
            performances_file. Extra reference lines are ignored.
    """
    with open(performances_file, 'r') as perf_onsets_file:
        perf_lines = perf_onsets_file.readlines()
    with open(references_file, 'r') as ref_onsets_file:
        ref_lines = ref_onsets_file.readlines()

    # Fail before touching the output files rather than half-way through.
    if len(ref_lines) < len(perf_lines):
        raise IndexError(
            'references file has %d lines but performances file has %d'
            % (len(ref_lines), len(perf_lines)))

    with open('%s [BINS]' % performances_file, 'w') as performances_file_bin, \
            open('%s [BINS]' % references_file, 'w') as references_file_bin:
        for perf_line, ref_line in zip(perf_lines, ref_lines):
            # setting onset boundaries from first to last onsets
            perf_onsets = _shift_to_first_onset(_parse_onset_line(perf_line))
            ref_onsets = _shift_to_first_onset(_parse_onset_line(ref_line))

            # The time re-scaling factor is the relative growth of the
            # performance when compared to the reference.
            perf_inner_duration = _inner_duration(perf_onsets)
            ref_inner_duration = _inner_duration(ref_onsets)
            if perf_inner_duration:
                time_rescaling_factor = (
                    (perf_inner_duration - ref_inner_duration) / perf_inner_duration)
            else:
                # Zero or one onset: there is no span to rescale, and
                # dividing by it would raise ZeroDivisionError.
                time_rescaling_factor = 0.0

            rescaled_perf_onsets = [
                adjust_onset_by_rate(onset, time_rescaling_factor)
                for onset in perf_onsets
            ]

            # The performance is converted using its re-scaled form.
            _write_bins(performances_file_bin, rescaled_perf_onsets, window_size)
            _write_bins(references_file_bin, ref_onsets, window_size)


def make_bins(onset_times, windows_size):
    """Convert onset times into a binary presence-per-window list.

    Time is measured from the first onset and divided into consecutive
    windows of windows_size. Window i covers [i * w, (i + 1) * w); only the
    last window also includes its right edge, so that the final onset is
    kept while an onset lying exactly on an inner boundary marks a single
    window instead of two.

    Args:
        onset_times: onset times in ascending order.
        windows_size: width of each window, in the same unit as the onsets.

    Returns:
        A list of ceil((last - first) / windows_size) integers, 1 where the
        window contains at least one onset and 0 otherwise. An empty input
        or a single onset yields an empty list.
    """
    if len(onset_times) == 0:
        return []

    start = onset_times[0]
    total_length = onset_times[-1] - start
    n_bins = int(math.ceil(total_length / windows_size))

    # Work relative to the first onset so the windows line up with the span
    # that n_bins was computed from.
    offsets = [onset_time - start for onset_time in onset_times]

    bins = []
    for i in range(n_bins):
        lower = windows_size * i
        upper = windows_size * (i + 1)
        if i == n_bins - 1:
            occupied = any(lower <= offset <= upper for offset in offsets)
        else:
            occupied = any(lower <= offset < upper for offset in offsets)
        bins.append(1 if occupied else 0)

    return bins

def adjust_onset_by_rate(onset, time_rescaling_factor):
    """Move an onset towards time 0 by a fraction of its own value.

    Args:
        onset: the onset time to adjust.
        time_rescaling_factor: fraction of the onset time to remove; a
            negative value moves the onset later instead.

    Returns:
        The onset minus onset * time_rescaling_factor.
    """
    correction = onset * time_rescaling_factor
    return onset - correction

# running functions over data
rescale_and_make_bins(os.path.join(DATA_FOLDER, 'Only Performances/MAST Onsets [Performances]'),
                         os.path.join(DATA_FOLDER, 'Only References/MAST Onsets [References]'))

performance_binary_onsets = open(os.path.join(DATA_FOLDER, 'Only Performances/MAST Onsets [Performances] [BINS]'))
print(performance_binary_onsets.read())


1 1 0 1 0 1 0 1 0 1 0 1 1 0 1 0 1 0 1
1 0 0 1 1 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 1 0 1 0 0 1
1 0 1 0 1 1 1 0 0 1 1 1 0 1 1 1 1 0 1 1 1
1 0 0 0 1 0 1 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1
1 1 0 0 0 1 1 1 0 0 0 1 0 1 0 1 0 0 0 0 1
1 1 1 1 1 0 1 0 1 0 1 0 0 1 1 1 1 1 0 1
1 0 0 1 0 0 1 0 0 0 1 1 0 1 0 1 1 0 1 0 0 1 1 0 1
1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 0 1 0 0 1
1 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 0 1 0 0 0 1
1 0 0 1 1 0 1 0 0 1 0 0 1 0 1 1 0 0 0 1 0 0 1
1 1 0 0 1 0 1 0 0 1 0 1 1 1 1 0 0 1 0 0 1 0 1
1 0 0 0 1 0 1 0 1 0 1 0 0 0 1 1 0 1 1 0 0 1
1 0 0 1 1 0 1 0 0 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1
1 0 0 1 0 1 0 1 0 1 0 0 0 1 1 0 1 1 0 0 0 1
1 1 0 0 0 1 0 1 1 0 0 0 1 0 1 0 1 0 0 0 1
1 0 0 1 1 0 1 0 0 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1
1 0 0 1 0 0 0 0 0 1 0 1 0 1 0 1 1 0 1 1
1 1 0 1 1 0 1 0 1 0 0 0 1 0 1 1 0 1 0 1 1
1 0 0 0 1 0 1 1 1 1 0 0 0 1 0 1 1 1 1
1 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 1 1 0 0 1
1 0 0 1 0 1 1 0 0 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1
1 0 0 1 1 0 1 1 0 1 1 0 0 1 1 1 1 1
1 1 0 0 1 0 1 1 1 0 1 0 0 