# Extracting Bootleg Score Features

This notebook extracts bootleg score features from all the sheet music images.

In [2]:
import os
import os.path
import subprocess
import multiprocessing
import glob
import extractBootlegFeatures

ModuleNotFoundError: No module named 'extractBootlegFeatures'

### Convert PDF to PNG

Use ImageMagick to convert the PDF files to PNG images. ImageMagick's resource limits must be set high enough for it to process all of the files. The recommended settings are:
- memory: 8GiB
- map: 4GiB
- disk: 8GiB




These settings can be changed in ImageMagick's policy file, e.g. `/etc/ImageMagick-6/policy.xml`.


In [2]:
def convertPDF2PNG(pdffile, pngfile):
    '''
    Convert a PDF file to PNG image(s) by invoking ImageMagick's `convert`.

    The conversion is skipped when either `pngfile` or its first-page variant
    (`<stem>-0.png`) already exists.  Otherwise the output directory is created
    if needed and `convert` is run on the PDF.

    Args:
        pdffile: path of the PDF file to convert.
        pngfile: path of the PNG file to produce.

    Returns:
        None.  A message is printed if `convert` exits with a non-zero status.
    '''
    # splitext instead of slicing off 4 characters, so the stem is right
    # regardless of the extension's length
    firstpage = os.path.splitext(pngfile)[0] + '-0.png'
    if os.path.exists(pngfile) or os.path.exists(firstpage):
        return

    outdir = os.path.dirname(pngfile)
    if outdir:
        # outdir is '' for a bare file name, and os.makedirs('') raises.
        # exist_ok avoids a check-then-create race between parallel workers.
        os.makedirs(outdir, exist_ok=True)
    print('Converting {}'.format(pdffile))
    returncode = subprocess.call(['convert', '-density', '300', '-alpha', 'remove', '-resize', '2550', pdffile, pngfile])
    if returncode != 0:
        # report the failure instead of dropping the exit status silently
        print('Failed to convert {} (exit status {})'.format(pdffile, returncode))

In [3]:
# PDF to PNG conversion
pdf_list = '../cfg_files/pdfs.list' # list of pdfs
png_dir = '../data/png' # where to save converted png files
n_cores = 28 #multiprocessing.cpu_count()

# prep inputs for parallelization: one (pdffile, pngfile) pair per listed pdf
inputs = []
with open(pdf_list, 'r') as f:
    for line in f:
        pdffile = line.strip() # data/pdf/Bach/00756.pdf
        if not pdffile:
            continue # a blank line would otherwise become a job with an empty pdf path
        basename = os.path.splitext(os.path.basename(pdffile))[0]
        composer = os.path.basename(os.path.dirname(pdffile))
        outdir = '{}/{}/{}'.format(png_dir, composer, basename)
        pngfile = '{}/{}.png'.format(outdir, basename)
        inputs.append((pdffile, pngfile))

# process queries in parallel; the with-block releases the worker processes
# once all conversions have finished, so re-running the cell does not leak them
if inputs:
    with multiprocessing.Pool(processes=n_cores) as pool:
        outputs = pool.starmap(convertPDF2PNG, inputs)
else:
    outputs = [] # nothing listed, so no workers are started

In [4]:
def renameSinglePageFiles(png_dir):
    '''
    Give single-page scores the same file name pattern as multi-page ones.

    A pdf with only one page is saved as <pieceid>.png rather than
    <pieceid>-0.png.  Every directory matching <png_dir>/*/* is visited, and
    a <pieceid>.png found inside it is renamed to <pieceid>-0.png, where
    <pieceid> is the name of that directory.
    '''
    pattern = '{}/*/*'.format(png_dir)
    for piece_dir in glob.glob(pattern):
        piece_id = os.path.basename(piece_dir)
        stem = '{}/{}'.format(piece_dir, piece_id)
        if not os.path.exists(stem + '.png'):
            continue  # no single-page file in this directory
        os.rename(stem + '.png', stem + '-0.png')

In [5]:
# Rename single-page outputs (<pieceid>.png) to <pieceid>-0.png so that every
# page image under png_dir follows the same naming convention.
renameSinglePageFiles(png_dir)

### Extract Features

In [18]:
# use multiple cores
pdflist = '../cfg_files/pdfs.list' # list of pdf scores
png_dir = '../data/png' # root directory containing image data
feat_dir = '../score_feat' # where to save bootleg scores
n_cores = 24 #multiprocessing.cpu_count()

os.makedirs(feat_dir, exist_ok=True)

# prep inputs for parallelization: one (imagefile, outfile) pair for every
# page image that does not have a .pkl output yet
inputs = []
with open(pdflist, 'r') as f:
    for line in f:
        pdffile = line.strip() # e.g. data/pdf/Bach/00735.pdf
        if not pdffile:
            continue # ignore blank lines in the list file
        pieceid = os.path.splitext(os.path.basename(pdffile))[0] # e.g. 00735
        composer = os.path.basename(os.path.dirname(pdffile)) # e.g. Bach
        indir = '{}/{}/{}'.format(png_dir, composer, pieceid) # e.g. data/png/Bach/00735
        outdir = '{}/{}/{}'.format(feat_dir, composer, pieceid) # e.g. score_feat/Bach/00735
        os.makedirs(outdir, exist_ok=True)
        for imagefile in glob.glob('{}/*.png'.format(indir)):
            basename = os.path.splitext(os.path.basename(imagefile))[0] # e.g. 00735-0
            outfile = '{}/{}.pkl'.format(outdir, basename)
            if not os.path.exists(outfile): # already-extracted pages are skipped
                inputs.append((imagefile, outfile))
print('{} remaining files to process'.format(len(inputs)))

# process queries in parallel; the with-block releases the worker processes
# once all files are done, and no workers are started when nothing remains
if inputs:
    with multiprocessing.Pool(processes=n_cores) as pool:
        outputs = pool.starmap(extractBootlegFeatures.processImageFile, inputs)
else:
    outputs = []

0 remaining files to process
