# Data Preparation

This notebook creates all the bootleg score fragments and data for training, validation, and testing.

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import os
import os.path
import pickle
import glob
import PIL
import pandas as pd
import random
import sys
from pathlib import Path
from collections import defaultdict

In [4]:
# Seed Python's `random` module so the shuffle and fragment sampling below are repeatable.
random.seed(0)

### Parse valid files

First, we loop through all the labeled files and create a dictionary with the piece id (e.g. violin_1) as keys and a list of the corresponding valid pages in that piece as values.

In [5]:
def parseFile(fname):
    """
    Read an annotation file and map each piece id to its list of valid pages.

    Every line is split on single spaces: the first token is the piece id
    (e.g. "violin_1") and the remaining tokens are page ranges, which
    removeFiller expands.  Pieces without any valid page are left out, so one
    entry of the result looks like {"violin_1": [1, 2, 3]}.
    """
    pages_by_id = {}
    with open(fname, 'r') as fin:
        for raw_line in fin:
            tokens = raw_line.strip().split(" ")
            pages = removeFiller(tokens)
            if pages == []:
                continue
            pages_by_id[tokens[0]] = pages
    return pages_by_id

In [6]:
def removeFiller(data):
    """
    Expand the page-range tokens of one annotation line into a flat list of pages.

    data is the space-split line: data[0] is the piece id and every following
    token is either a single page ("7") or an inclusive range ("1-3"), with
    optional commas.  For example ["violin_1", "1-3,", "5-6"] gives
    [1, 2, 3, 5, 6].

    Returns a list of plain Python ints ([] when the line lists no pages).
    Raises ValueError if a token is not numeric.
    """
    valid_pages = []
    for token in data[1:]:
        num_range = token.replace(',', "").strip()
        if not num_range:
            # empty token, e.g. produced by two consecutive spaces in the file
            continue
        if '-' in num_range:
            bounds = num_range.split("-")
            left = int(bounds[0])
            right = int(bounds[1])
            # range() keeps the pages as built-in ints (np.arange yields np.int64)
            valid_pages.extend(range(left, right + 1))
        else:
            valid_pages.append(int(num_range))
    return valid_pages

In [8]:
# Piece id -> list of valid pages, parsed from the annotation file.
labeled_data = parseFile('cfg_files/labeled_annotations.txt')

### Split data

Then, we split our labeled data into train, valid, and test.

In [9]:
def splitTrainValidTest(IDs, train=.6, validation=.2, test=.2, savefile = None):
    """
    Shuffle a list of IDs and split it into train, validation, and test subsets.

    IDs        -- list of ids; it is shuffled IN PLACE with the module-level
                  `random` generator, so seed it beforehand for repeatability.
    train      -- fraction of the ids placed in the train split (default 60%).
    validation -- fraction placed in the validation split (default 20%).
    test       -- nominal test fraction; the test split always receives every
                  id left over after the train and validation splits.
    savefile   -- accepted for backward compatibility; currently unused.

    Returns a (train_ids, valid_ids, test_ids) tuple of lists.
    """
    random.shuffle(IDs)
    length = len(IDs)
    # Cut points come from the requested fractions rather than fixed .6/.8.
    train_end = int(train * length)
    valid_end = int((train + validation) * length)
    return IDs[:train_end], IDs[train_end:valid_end], IDs[valid_end:]

In [10]:
# Split at the piece level; splitTrainValidTest shuffles labeled_IDs in place.
labeled_IDs = list(labeled_data.keys())
train_ids, valid_ids, test_ids = splitTrainValidTest(labeled_IDs)

In [14]:
# Record the held-out piece ids, one per line.
with open("cfg_files/test_files.txt", 'w') as f:
    for i in test_ids:
        f.write(f"{i}\n")

## Generate Balanced Fragments

Then, we sample the same fixed number of fragments from each instrument so that the classes are balanced.

In [8]:
def get_global_bscore(ID, pages, root = '../bootleg_data-v1/labeled/'):
    """
    Concatenate the bootleg scores of the requested pages of one piece.

    ID    -- piece id such as "violin_1"; the text before the first underscore
             names the instrument sub-directory under root.
    pages -- 1-based page numbers to concatenate, in order.
    root  -- directory holding <instrument>/<ID>.bscore pickle files.

    Returns one list with the features of all requested pages back to back.
    Raises IndexError if a page lies outside 1..number of pages in the file,
    and ValueError if a requested page holds no features (None).
    """
    instrument = ID.split("_")[0]
    fname = os.path.join(root, instrument, ID+'.bscore')
    # NOTE: pickle.load can execute arbitrary code; only load trusted .bscore files.
    with open(fname, 'rb') as f:
        data = pickle.load(f)

    global_bscore = []
    for page in pages:
        # Explicit bounds check: page 0 or a negative page would otherwise
        # silently wrap around to the end of the list.
        if not 1 <= page <= len(data):
            raise IndexError(f"{fname}: page {page} is out of range (file has {len(data)} pages)")
        page_feats = data[page-1]
        if page_feats is None:
            raise ValueError(f"{fname}: page {page} has no bootleg score")
        global_bscore.extend(page_feats)
    return global_bscore

In [9]:
def colToString(s):
    """
    Render a sequence of numbers as integers, each followed by a single space.

    For example [1, 2.0, 3] becomes "1 2 3 " (note the trailing space); an
    empty sequence becomes "".
    """
    return "".join(f"{int(value)} " for value in s)

In [10]:
def class_balance(ID_list, labeled_data, fragment_size, samplesPerInstrument):
    """
    Sample an equal number of fixed-length fragments for every instrument.

    ID_list              -- piece ids (e.g. "violin_1"); the instrument is the
                            text before the first underscore.
    labeled_data         -- mapping from piece id to its list of valid pages.
    fragment_size        -- number of consecutive features per fragment.
    samplesPerInstrument -- fragments drawn per instrument.  They are drawn
                            with replacement (random.choices), so a fragment
                            can appear more than once.

    Returns a DataFrame with columns "Instrument" and "Fragment", where each
    fragment is the colToString rendering of its features.
    Raises ValueError if an instrument has no fragment of the requested length.
    """
    # instrument -> list of (bootleg_score, start) pairs.  Only the start
    # offset is stored; slices are taken lazily for the sampled fragments so
    # we do not keep fragment_size copies of every piece in memory.
    candidates = {}
    for ID in ID_list:
        instrument = ID.split("_")[0]
        starts = candidates.setdefault(instrument, [])
        bootleg_score = get_global_bscore(ID, labeled_data[ID])
        num_starts = len(bootleg_score) - fragment_size + 1
        # range() is empty when the piece is shorter than one fragment
        starts.extend((bootleg_score, loc) for loc in range(num_starts))

    db_labels = []
    db_fragments = []
    for instrument, starts in candidates.items():
        if not starts:
            raise ValueError(
                f"no fragment of length {fragment_size} available for instrument '{instrument}'")
        picks = sorted(random.choices(range(len(starts)), k=samplesPerInstrument))
        for idx in picks:
            bootleg_score, loc = starts[idx]
            db_labels.append(instrument)
            db_fragments.append(colToString(bootleg_score[loc:loc+fragment_size]))
    df = pd.DataFrame({"Instrument":db_labels,"Fragment":db_fragments})
    return df

In [11]:
def getFragments(train, valid, test, fragment_size, samplesPerInstrument):
    """
    Build balanced fragment tables for the three splits and save them as csv.

    train, valid and test are lists of piece ids; the valid pages of each
    piece are looked up in the notebook-level labeled_data mapping.  The train
    split gets samplesPerInstrument fragments per instrument, the other two a
    third of that.  Each table is written to "<split>_df-frag<size>.csv" in
    the working directory, and the three DataFrames are returned.
    """
    eval_samples = samplesPerInstrument//3
    train_df = class_balance(train, labeled_data, fragment_size, samplesPerInstrument)
    valid_df = class_balance(valid, labeled_data, fragment_size, eval_samples)
    test_df = class_balance(test, labeled_data, fragment_size, eval_samples)

    outputs = (
        (train_df, f"train_df-frag{fragment_size}.csv"),
        (valid_df, f"valid_df-frag{fragment_size}.csv"),
        (test_df, f"test_df-frag{fragment_size}.csv"),
    )
    for split_df, csv_name in outputs:
        split_df.to_csv(csv_name, index = False)
    return train_df, valid_df, test_df

In [None]:
# Fragment length in bootleg score features, and fragments sampled per instrument
# for the train split (valid/test each get a third of that inside getFragments).
fragment_size = 64 
samplesPerInstrument = 3600
# train, valid, test are the fragment DataFrames; the piece id lists stay in *_ids.
train, valid, test = getFragments(train_ids, valid_ids, test_ids, fragment_size, samplesPerInstrument)

### Classification data

Here we prepare the train.csv, valid.csv, and test.csv files for the proxy classification task.

In [77]:
def generateBootlegCSVFiles(infile_train, infile_valid, infile_test, outfile_train, outfile_valid, outfile_test):
    '''
    Generates the train, valid and test csv files for the classification task
    from the bootleg score fragment files.

    Each infile_* is a csv with "Instrument" and "Fragment" columns (the format
    written by getFragments).  The matching outfile_* receives the same rows
    with the columns renamed to "label" and "text" and without an index column.
    '''
    column_names = {"Instrument": "label", "Fragment": "text"}
    jobs = [(infile_train, outfile_train), (infile_valid, outfile_valid), (infile_test, outfile_test)]
    for infile, outfile in jobs:
        # read the file that was actually passed in, not a hard-coded name
        df = pd.read_csv(infile).rename(columns=column_names)
        # fragments are space-separated integers; strip any commas from the text
        df["text"] = df["text"].apply(lambda x: x.replace(',', ''))
        df.to_csv(outfile, index = False)
    return

In [78]:
# getFragments writes the fragment csv files into the current working directory.
train_infile = "train_df-frag64.csv"
test_infile = "test_df-frag64.csv"
valid_infile = "valid_df-frag64.csv"
# NOTE: `path` is only assigned in the "Prepare data for fastai" section further
# down, so that section has to be run before this cell on a fresh kernel.
csv_train_file = path/'train64.csv'
csv_valid_file = path/'valid64.csv'
csv_test_file = path/'test64.csv'

In [79]:
# Write the label/text csv files for the fragment classification task.
generateBootlegCSVFiles(train_infile, valid_infile, test_infile, csv_train_file, csv_valid_file, csv_test_file)

We also generate csv files for evaluating on the original page classification task.

In [53]:
def generateFullPageCSVFiles(train_ids, valid_ids, test_ids, outfile_train, outfile_valid, outfile_test):
    '''
    Write one full-page classification csv per split (train, valid, test) by
    handing each id list and its output file to generateFullPageCSV.
    '''
    split_jobs = (
        (train_ids, outfile_train),
        (valid_ids, outfile_valid),
        (test_ids, outfile_test),
    )
    for piece_ids, csv_path in split_jobs:
        generateFullPageCSV(piece_ids, csv_path)

In [54]:
def generateFullPageCSV(id_list, outfile, root = '/home/kji/InstrumentID/bootleg_data-v1/labeled/'):
    '''
    Write a "label,text" csv with one row per non-empty page of every piece.

    For each id (e.g. "violin_1") the pickle <root>/<instrument>/<id>.bscore is
    loaded; every page that is truthy becomes a row whose label is the
    instrument and whose text is the page's values joined by single spaces.
    '''
    with open(outfile, 'w') as fout:
        fout.write('label,text\n')
        for id_string in id_list:
            instrument = id_string.split("_")[0]
            bscore_path = os.path.join(root, instrument, id_string+'.bscore')
            with open(bscore_path, 'rb') as fin:
                pages = pickle.load(fin)
                for page in pages:
                    if not page:
                        continue  # page without any features
                    textStr = ' '.join(map(str, page))
                    fout.write(f'{instrument},{textStr}\n')

In [55]:
# NOTE: `path` is assigned in the "Prepare data for fastai" section further down.
csv_train_file = path/'train.fullpage.csv'
csv_valid_file = path/'valid.fullpage.csv'
csv_test_file = path/'test.fullpage.csv'
# Full-page csv files are built from the piece id lists of each split.
generateFullPageCSVFiles(train_ids, valid_ids, test_ids, csv_train_file, csv_valid_file, csv_test_file)

Finally, we generate CSV files for averaging the predictions on multiple fixed-length windows of samples.

In [88]:
def generateEnsembleCSV(id_list, chunkSz, outfile_test, root = '/home/kji/InstrumentID/bootleg_data-v1/labeled/', valid_pages = None):
    '''
    Generates a csv file to facilitate evaluating fixed-length classifiers on the full page classification task.
    Each line in the file corresponds to a fixed-length window of samples within a page.  The predictions from
    all windows within a single page can then be averaged and evaluated.

    id_list      -- piece ids, e.g. "violin_1".
    chunkSz      -- window length; consecutive windows hop by half of it.
    outfile_test -- csv to write, with header "id,label,text".
    root         -- directory holding <instrument>/<pieceID>.bscore pickles.
    valid_pages  -- mapping piece id -> list of 1-based valid pages; defaults
                    to the notebook-level labeled_data mapping.

    Row ids have the form <pieceID>_<pageIdx>_<chunkIdx>, where pageIdx is the
    position of the page within the piece's valid page list.
    '''
    if valid_pages is None:
        valid_pages = labeled_data
    hop = chunkSz // 2
    with open(outfile_test, 'w') as fout:
        fout.write('id,label,text\n')
        for pieceID in id_list:
            instrument = pieceID.split("_")[0]
            fname = os.path.join(root, instrument, pieceID+'.bscore')
            with open(fname, 'rb') as f:
                data = pickle.load(f)
            for i, page in enumerate(valid_pages[pieceID]):
                feats = data[page-1]
                if not feats:
                    continue  # missing or empty page
                if len(feats) < chunkSz:  # only 1 window
                    windows = [feats]
                else:  # multiple windows
                    numWindows = int(np.ceil(len(feats)/(chunkSz/2))) - 1  # hop by half the chunk size
                    windows = [feats[hop*j: hop*j + chunkSz] for j in range(numWindows - 1)]
                    # last window is aligned to the end of the page
                    windows.append(feats[-chunkSz:])
                for j, ints in enumerate(windows):
                    textStr = ' '.join([str(v) for v in ints])
                    # every window gets its own chunk index (previously all but the last were 0)
                    idString = f'{pieceID}_{i}_{j}'
                    fout.write(f'{idString},{instrument},{textStr}\n')

In [89]:
# Window length of 256 matches the output file name; `chunkSz` was never defined.
# NOTE: `path` is assigned in the "Prepare data for fastai" section further down.
generateEnsembleCSV(test_ids, 256, path/'test.ensemble256.csv')

# Prepare data for fastai

In the sections below, we will prepare the data for use with the fastai library.  This is adapted from the fast.ai [ULMFit tutorial](https://github.com/fastai/course-nlp/blob/master/nn-vietnamese.ipynb).

In [16]:
# Reload edited modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

In [51]:
from fastai import *
from fastai.text import *
import glob

In [52]:
# Batch size passed to the databunch below.
bs=48

In [53]:
# Use GPU 0.  `torch` is not imported explicitly here; presumably it comes in
# through the fastai star-imports above -- TODO confirm.
torch.cuda.set_device(0)

In [54]:
# fastai's configured data directory (the path_target output further down shows
# it resolving to ~/.fastai/data on the original machine).
data_path = Config.data_path()

In [55]:
# Working directory for every generated csv / text file in this notebook.
# Earlier cells that write to `path` depend on this cell having been run.
name = 'solo_bscore_lm'
path = data_path/name
path.mkdir(exist_ok=True, parents=True)

### Target Language Model Databunch

Convert the bootleg score features into string representations of decimal integers.  Generate one document per pdf.

In [36]:
# Output folder for the per-piece text files of the labeled (target) data.
path_target = path/'solo_target'
path_target.mkdir(exist_ok=True)

In [37]:
def generateBootlegStringFiles(bscore_feats_dir, outdir):
    '''
    Converts the bootleg score features to string decimal representation, and writes them
    to text files in the specified directory.

    bscore_feats_dir -- Path searched recursively for *.bscore pickle files,
                        e.g. .../labeled/violin/violin_1.bscore
    outdir           -- Path of an existing directory; one <piece>.txt is written per pickle.

    Each output file holds all features of the piece's non-empty pages as
    space-separated decimal integers on one line, followed by a line
    containing only '</doc>'.
    '''
    for pieceDir in bscore_feats_dir.rglob("*.bscore"):
        label = pieceDir.parts[-1].split('.')[0]
        outfile = outdir/(label + '.txt')
        with open(pieceDir, 'rb') as fin:
            data = pickle.load(fin)
        pageStrs = [' '.join([str(i) for i in ints]) for ints in data if ints]
        with open(outfile, 'w') as fout:
            # Join pages with a space: writing them back to back fused the last
            # number of one page with the first number of the next.
            fout.write(' '.join(pageStrs))
            # Keep the end-of-document marker on its own line so it is not
            # glued to the final feature and line-based readers can detect it.
            fout.write('\n</doc>')

In [38]:
# Machine-specific absolute path to the labeled .bscore files; adjust when running elsewhere.
bscore_dir = Path('/home/kji/InstrumentID/bootleg_data-v1/labeled')

In [39]:
# One text file per labeled piece, written into path_target.
generateBootlegStringFiles(bscore_dir, path_target)

In [41]:
# Display where the target text files were written.
path_target

PosixPath('/home/kji/.fastai/data/solo_bscore_lm/solo_target')

In [42]:
# Tokenizer with no pre/post rules, so the text is not rewritten by fastai's default rules.
basicTokenizer = Tokenizer(pre_rules=[], post_rules=[])
# Language-model databunch over the per-piece text files in path_target.
lm_target_data = (TextList.from_folder(path_target, processor=[OpenFileProcessor(), TokenizeProcessor(tokenizer=basicTokenizer), NumericalizeProcessor()])
            .split_by_rand_pct(0.1, seed=42)
            .label_for_lm()           
            .databunch(bs=bs, num_workers=1))

# Save the processed databunch under `path` for later reuse.
lm_target_data.save(path/'solo_lm_target_databunch')

### All Solo Music Language Model Databunch

Same as above, but using the bootleg score dataset for all solo pieces for the set of instruments.

In [43]:
# Output folder for the per-piece text files of the full solo dataset.
path_all = path/'solo_all'
path_all.mkdir(exist_ok=True)

In [44]:
# Rebinds bscore_dir (previously the labeled data) to the full solo dataset;
# machine-specific absolute path.
bscore_dir = Path('/home/kji/InstrumentID/bootleg_data-v1/all/')

In [45]:
# One text file per piece of the full solo dataset, written into path_all.
generateBootlegStringFiles(bscore_dir, path_all)

### Classification data

# Transformers

Here we prepare the train.csv, valid.csv, and test.csv files for the proxy classification task.

In [31]:
def generateBootlegCSVFiles(infile_train, infile_valid, infile_test, outfile_train, outfile_valid, outfile_test):
    '''
    Generates the train, valid and test csv files for the classification task
    from the bootleg score fragment files.

    Each infile_* is a csv with "Instrument" and "Fragment" columns (the format
    written by getFragments).  The matching outfile_* receives the same rows
    with the columns renamed to "label" and "text" and without an index column.
    The fragments already hold decimal integers separated by spaces, so no
    binary-to-integer conversion is applied.
    '''
    column_names = {"Instrument": "label", "Fragment": "text"}
    jobs = [(infile_train, outfile_train), (infile_valid, outfile_valid), (infile_test, outfile_test)]
    for infile, outfile in jobs:
        # rename() returns a new frame, so its result has to be kept
        df = pd.read_csv(infile).rename(columns=column_names)
        # strip any commas from the text
        df["text"] = df["text"].apply(lambda x: x.replace(',', ''))
        df.to_csv(outfile, index = False)
    return

In [38]:
# Fragment csv files written by getFragments into the working directory.
train_infile = "train_df-frag64.csv"
test_infile = "test_df-frag64.csv"
valid_infile = "valid_df-frag64.csv"
# Output csv files for the classification task, under `path`.
csv_train_file = path/'train64.csv'
csv_valid_file = path/'valid64.csv'
csv_test_file = path/'test64.csv'

In [None]:
# Write the label/text csv files for the fragment classification task.
generateBootlegCSVFiles(train_infile, valid_infile, test_infile, csv_train_file, csv_valid_file, csv_test_file)

We also generate csv files for evaluating on the original page classification task.

In [59]:
def generateFullPageCSVFiles(train_ids, valid_ids, test_ids, outfile_train, outfile_valid, outfile_test):
    '''
    Write one full-page classification csv per split (train, valid, test) by
    handing each id list and its output file to generateFullPageCSV.
    '''
    split_jobs = (
        (train_ids, outfile_train),
        (valid_ids, outfile_valid),
        (test_ids, outfile_test),
    )
    for piece_ids, csv_path in split_jobs:
        generateFullPageCSV(piece_ids, csv_path)

In [60]:
def generateFullPageCSV(id_list, outfile, root = '/home/kji/InstrumentID/bootleg_data-v1/labeled/'):
    '''
    Write a "label,text" csv with one row per non-empty page of every piece.

    id_list -- piece ids, e.g. "violin_1"; the instrument is the text before
               the first underscore.
    outfile -- csv file to write.
    root    -- directory holding <instrument>/<id>.bscore pickle files.

    The pages already store integer features, so they are written as they are,
    joined by single spaces.
    '''
    with open(outfile, 'w') as fout:
        fout.write('label,text\n')
        for ID in id_list: # e.g. violin_1
            instrument = ID.split("_")[0]
            fname = os.path.join(root, instrument, ID+'.bscore')
            with open(fname, 'rb') as f:
                data = pickle.load(f)
            for m in data:
                if m:
                    textStr = ' '.join([str(i) for i in m])
                    fout.write(f'{instrument},{textStr}\n')

In [61]:
csv_train_file = path/'train.fullpage.csv'
csv_valid_file = path/'valid.fullpage.csv'
csv_test_file = path/'test.fullpage.csv'
# Pass the piece id lists: `train`, `valid` and `test` are the fragment
# DataFrames returned by getFragments, and iterating them yields column names.
generateFullPageCSVFiles(train_ids, valid_ids, test_ids, csv_train_file, csv_valid_file, csv_test_file)

**Note:** we are not doing ensembling this week.
Finally, we also generate csv files to facilitate evaluating fixed-length classifiers on the full page classification task.  These classifiers will be applied to multiple windows of features, and the predictions will be averaged.

In [35]:
def generateEnsembleCSV(test_ids, chunkSz, outfile_test, root = '/home/kji/InstrumentID/bootleg_data-v1/labeled/', valid_pages = None):
    '''
    Generates a csv file to facilitate evaluating fixed-length classifiers on the full page classification task.
    Each line in the file corresponds to a fixed-length window of samples within a page.  The predictions from
    all windows within a single page can then be averaged and evaluated.

    test_ids     -- piece ids, e.g. "violin_1".
    chunkSz      -- window length; consecutive windows hop by half of it.
    outfile_test -- csv to write, with header "id,label,text".
    root         -- directory holding <instrument>/<pieceID>.bscore pickles.
    valid_pages  -- mapping piece id -> list of 1-based valid pages; defaults
                    to the notebook-level labeled_data mapping.

    Row ids have the form <pieceID>_<pageIdx>_<chunkIdx>, where pageIdx is the
    position of the page within the piece's valid page list.
    '''
    if valid_pages is None:
        valid_pages = labeled_data
    hop = chunkSz // 2
    with open(outfile_test, 'w') as fout:
        fout.write('id,label,text\n')
        for pieceID in test_ids:
            instrument = pieceID.split("_")[0]
            fname = os.path.join(root, instrument, pieceID+'.bscore')
            with open(fname, 'rb') as f:
                data = pickle.load(f)
            for i, page in enumerate(valid_pages[pieceID]):
                feats = data[page-1]  # integer features of one page
                if not feats:
                    continue  # missing or empty page
                if len(feats) <= chunkSz: # only 1 window
                    windows = [feats]
                else: # multiple windows
                    numWindows = int(np.ceil(len(feats)/(chunkSz/2))) - 1 # hop by half the chunk size
                    windows = [feats[hop*j: hop*j + chunkSz] for j in range(numWindows - 1)]
                    # last window is aligned to the end of the page
                    windows.append(feats[-chunkSz:])
                for j, ints in enumerate(windows):
                    textStr = ' '.join([str(v) for v in ints])
                    idString = f'{pieceID}_{i}_{j}' # id: pieceID_pageIdx_chunkIdx
                    fout.write(f'{idString},{instrument},{textStr}\n')

In [39]:
csv_test_file = path/'test.ensemble256.csv'
# The ensemble file is built from the held-out piece ids (`save_pages_file` was never defined).
generateEnsembleCSV(test_ids, 256, csv_test_file)

# Data Preparation for Transformer models

Here we prepare the data for training and testing the Transformer-based models.  Instead of using decimal string representations, we represent each 62-bit bootleg score feature as a sequence of 8 characters, one character per byte of the feature.  Rather than generating these from scratch, we will simply convert the existing files to the new format.

### Prep data for language modeling

In [None]:
def generateLMTrainFiles(indir, out_train, out_valid, val_frac=0.1):
    '''
    Split the .txt files of indir into train and validation sets (by file) and
    convert each set into a single byte-character text file.

    The sorted file list is shuffled after reseeding numpy's global generator
    with 0.  The first int(n * (1 - val_frac)) + 1 files go to out_train and
    the remaining files to out_valid.
    '''
    txt_files = glob.glob('{}/*.txt'.format(indir))
    txt_files.sort()
    np.random.seed(0)
    np.random.shuffle(txt_files)
    num_train = int(len(txt_files) * (1-val_frac)) + 1

    # convert both partitions to the byte-character representation
    convertToByteChars(txt_files[:num_train], out_train)
    convertToByteChars(txt_files[num_train:], out_valid)

In [None]:
def convertToByteChars(filelist, outfile):
    '''
    Split each 62-bit bootleg score feature into 8 bytes, and express each byte as a single character.
    Consecutive bootleg score feature `words' will be separated by space.

    filelist -- text files with space-separated decimal integers per line.
    outfile  -- file to write.  Every non-empty input line other than '</doc>'
                becomes one converted line, and a blank line follows each
                input file.
    '''
    # The converted characters are non-ASCII, so the encoding is set explicitly
    # instead of relying on the platform default.
    with open(outfile, 'w', encoding='utf-8') as fout:
        for infile in filelist:
            with open(infile, 'r', encoding='utf-8') as fin:
                for line in fin:
                    line = line.strip()
                    if not line or line == '</doc>':
                        continue  # skip blank lines and the document marker
                    converted = convertLineToCharSeq(line)
                    fout.write(f'{converted}\n')
            fout.write('\n')

In [None]:
def convertLineToCharSeq(line):
    '''
    Convert a line of whitespace-separated decimal integers into the matching
    character words (one int2charseq word per integer), joined by single spaces.
    '''
    return ' '.join(int2charseq(value) for value in [int(token) for token in line.split()])

In [None]:
def int2charseq(int64):
    '''
    Encode the low 64 bits of an integer as 8 characters, least significant
    byte first.  Each byte value b is mapped to chr(19968 + b).
    '''
    # 19968 ensures that all chars are chinese characters (not newline, space, etc)
    offset = 19968
    return ''.join(chr(offset + ((int64 >> (8 * byte_idx)) & 255)) for byte_idx in range(8))

In [None]:
# Output folder for the byte-character versions of the data.
bpe_path = path/'bpe_data'
bpe_path.mkdir(exist_ok=True, parents=True)

In [None]:
# Convert target data
lm_train_file = bpe_path/'bpe_lm_target_train.txt'
lm_valid_file = bpe_path/'bpe_lm_target_valid.txt'
dir_to_convert = path/'solo_target'
generateLMTrainFiles(dir_to_convert, lm_train_file, lm_valid_file)

In [None]:
# Convert IMSLP data
lm_train_file = bpe_path/'bpe_lm_all_train.txt'
lm_valid_file = bpe_path/'bpe_lm_all_valid.txt'
dir_to_convert = path/'solo_all'
generateLMTrainFiles(dir_to_convert, lm_train_file, lm_valid_file)

### Prep data for classification

In [None]:
def convertSingleCSVFile(infile, outfile):
    '''
    Convert .csv file with decimal string representation of bootleg score features to
    a .csv file with byte character representation.

    The first line is copied unchanged as the header.  On every other line the
    last comma-separated field is taken as the feature string and converted;
    the leading fields are written back as they were.  Blank lines are skipped.
    '''
    with open(infile, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # The converted characters are non-ASCII, so the encoding is set explicitly
    # instead of relying on the platform default.
    with open(outfile, 'w', encoding='utf-8') as fout:
        for i, line in enumerate(lines):
            if i==0:
                fout.write(line) # header
                continue
            stripped = line.strip()
            if not stripped:
                continue  # a blank line would otherwise produce a bogus "," row
            parts = stripped.split(',')
            feats = parts.pop()
            charseq = convertLineToCharSeq(feats)
            strToWrite = ','.join(parts) + ',' + charseq + '\n'
            fout.write(strToWrite)

In [None]:
def convertAllCSVFiles(indir, outdir):
    '''
    Convert every .csv file directly inside indir to its byte-character form,
    writing <name>.char.csv into outdir (created if missing).  indir and
    outdir must differ.
    '''
    assert indir != outdir
    os.makedirs(outdir, exist_ok = True)
    for csv_path in glob.glob(f'{indir}/*.csv'):
        csv_name = os.path.basename(csv_path)
        print(f'Converting {csv_name}')
        stem, _ = os.path.splitext(csv_name)
        convertSingleCSVFile(csv_path, f'{outdir}/{stem}.char.csv')

In [None]:
# Convert every csv directly under `path` into bpe_path (a sub-folder of `path`).
convertAllCSVFiles(str(path), str(bpe_path))