# 1. Create bash script to copy instrument PDFs

In [9]:
import glob
import os
import random
import re
import shlex
import shutil
import string
from collections import defaultdict
from pathlib import Path

import PyPDF2
from bs4 import BeautifulSoup

In [10]:
def isValidPDF(infile):
    """Return True if PyPDF2 can open and parse ``infile`` as a PDF.

    Args:
        infile: Path to the candidate PDF file.

    Returns:
        bool: False if the file cannot be opened or if constructing the
        PyPDF2 reader raises; True otherwise.
    """
    try:
        # The context manager closes the handle even when parsing fails, so
        # scanning many files does not leak file descriptors.
        with open(infile, "rb") as pdf_file:
            # NOTE(review): newer PyPDF2 releases replace PdfFileReader with
            # PdfReader -- confirm against the installed version.
            PyPDF2.PdfFileReader(pdf_file)
    except Exception:
        # Any failure (missing file, corrupt PDF, ...) means "not usable",
        # while KeyboardInterrupt/SystemExit still propagate.
        return False
    return True

In [11]:
def parsePdfInfo(string):
    """Extract the PDF id and the download count from a file-info string.

    Args:
        string: Text of a file-info element (see the example below).

    Returns:
        tuple: ``(pdf_id, num_downloads)`` where ``pdf_id`` is the digit
        string following ``#`` and ``num_downloads`` is an int (0 when the
        string has no downloads field).

    Raises:
        ValueError: if no ``#<digits>`` id is present in the string.
    """
    # an example pdf info string:
    # '*#400812 - 22.72MB, 8 pp. - 0.0/10\n\n2\n4\n6\n8\n10\n\n (-) - V/V/V - 2151×⇩ - Piupianissimo'
    text = string.replace('\n', '')

    # Raw strings keep \d and \s as regex escapes rather than (invalid)
    # string escapes.
    id_match = re.search(r'#(\d+)\s', text)
    if id_match is None:
        raise ValueError(f"no PDF id found in info string: {string!r}")

    downloads_match = re.search(r'-\s(\d+)×', text)
    # no downloads field -> 0
    num_downloads = int(downloads_match.group(1)) if downloads_match else 0
    return id_match.group(1), num_downloads

In [12]:
def parseHtml(html_file):
    """Read ``html_file`` as text and return it parsed with BeautifulSoup's 'html.parser'."""
    with open(html_file, 'r') as handle:
        return BeautifulSoup(handle.read(), 'html.parser')

In [13]:
def extractScoreInfo(node):
    """Read a score entry's metadata table and report its sharing status.

    The rows of the table nested inside ``node``'s first table are read as
    header/value pairs (stripped and lower-cased); the 'copyright' and
    'publisher. info.' rows drive the result.  A missing row counts as an
    empty string.

    Args:
        node: Parsed element for one score entry.

    Returns:
        tuple: ``(isShareable, isManuscript)``.  ``isShareable`` is True when
        the copyright text mentions 'creative commons', or mentions
        'public domain' without 'non-pd'.  ``isManuscript`` is True when the
        publisher info mentions 'manuscript'.
    """
    fields = {}
    for tr in node.find("table").find("table").findAll("tr"):
        header = tr.find("th").text.strip().lower()
        fields[header] = tr.find("td").text.strip().lower()

    publisher_info = fields.get('publisher. info.', '')
    copyright_text = fields.get('copyright', '')

    if 'creative commons' in copyright_text:
        shareable = True
    else:
        shareable = 'public domain' in copyright_text and 'non-pd' not in copyright_text

    return shareable, 'manuscript' in publisher_info

In [27]:
def is_solo_piece(soup, instruments):
    """Decide whether a piece's page lists one of the wanted instrumentations.

    Looks at the "plainlinks" spans in the table under the "wp_header"
    element and inspects every category whose text starts with "For".

    Args:
        soup: Parsed page for the piece.
        instruments: Collection of lower-case instrument names to accept.

    Returns:
        tuple: ``(True, instrument)`` for the first matching category,
        otherwise ``(False, "")``.  A category matches when it is exactly
        "For <instrument>" (case-insensitive) with the instrument in
        ``instruments``, or when it is one of the hard-coded accompanied
        categories below.
    """
    try:
        category_tags = (
            soup.find(class_="wp_header")
            .find("table")
            .findAll("span", class_="plainlinks")
        )
    except AttributeError:
        # A failed lookup yields None, and the next call on it raises
        # AttributeError: the page has no usable category header.
        return (False, "")

    for tag in category_tags:
        category = tag.text  # we want categories of the form "For (instrument)"
        if not category.startswith("For"):
            continue
        words = category.lower().split()
        if len(words) == 2 and words[1] in instruments:
            return (True, words[1])
        # augment data with trumpet + piano, trumpet + orchestra data
        if category in ("For trumpet, piano", "For trumpet, orchestra"):
            return (True, "trumpet")
        # oboe + piano is intentionally not accepted here
        if category == "For oboe, orchestra":
            return (True, "oboe")

    return (False, "")

In [32]:
# def selectSinglePDF(html_file):
#     '''
#     Selects a single PDF file from an html metadata file.  First check if the piece category is one of the desired categories, then
#     filter the list of pdfs to ensure that the piece has a suitable copyright and is not a manuscript, and finally,
#     within the remaining options select the most popular (i.e. most downloaded) score.
    
#     Returns the full path to the selected pdf file and the instrument. If no valid pdfs are found, returns None.
#     '''    # get pdf ids, number of downloads
#     soup = parseHtml(html_file)
#     is_solo, instrument = is_solo_piece(soup, instruments)
#     if not is_solo:
#         return "", ""
#     scores_list = soup.find_all(class_ = 'we')
#     if scores_list is None: # no scores
#         return "", ""
#     tuples = [] # populate with (pdf_id, num_downloads)
# #     return [score.find_all("table", recursive=False) for score in scores_list]
#     for i, child in enumerate(scores_list): # list of scores
#         try:
#             isShareable, isManuscript = extractScoreInfo(child) # copyright ok & not a manuscript
#         except: # incorrectly formatted entry -- skip
#             continue
#         for pdf_div_tag in child.find_all("div", recursive=False): # may have multiple pdfs
#             try:
#                 # only keep PDFs whose title is "complete score" or "complete book"
#                 title = pdf_div_tag.find("span", {"title": "Download this file"}).text.lower()
#             except:
#                 continue
#             # for oboe + piano and oboe + orchestra, only select pieces with a solo PDF available
#             if instrument == "oboe" and not ("oboe" in title or "solo" in title):
#                 continue
#             we_file_info2 = pdf_div_tag.find(class_='we_file_info2')
#             if we_file_info2 is not None: # sometimes there are div tags with additional information
#                 pdf_id, num_downloads = parsePdfInfo(we_file_info2.text)
#                 tuples.append((pdf_id, num_downloads, isShareable, not isManuscript))
#     # sort by downloads, verify that PDF can be read
#     valid_list = []
#     tuples.sort(key = lambda x: x[1], reverse = True)
#     piece_dir = html_file[:-9]
#     for (pdf_id, num_downloads, isPublicDomain, notManuscript) in tuples:
#         fullpath = '{}/{}.pdf'.format(piece_dir, pdf_id)
#         # make sure pieces are publicly available and are not manuscripts
#         # return the PDF with the most downloads
#         if isPublicDomain and notManuscript and isValidPDF(fullpath):
#             return fullpath, instrument
#     return "", ""

def selectSinglePDF(html_file):
    '''
    Selects a single PDF file from an html metadata file.  First checks that the piece category is one of the desired
    categories, then filters the list of pdfs to ensure that the piece has a suitable copyright and is not a manuscript,
    and finally, within the remaining options, selects the most popular (i.e. most downloaded) score that can be read.

    Uses the module-level ``instruments`` list to decide which categories are wanted.  PDFs are expected to sit next to
    the metadata file as ``<pdf_id>.pdf``.

    Returns a tuple (full path to the selected pdf file, instrument).  If no valid pdf is found, returns ("", "").
    '''
    soup = parseHtml(html_file)
    is_solo, instrument = is_solo_piece(soup, instruments)
    if not is_solo:
        return "", ""

    # collect (pdf_id, num_downloads, isShareable, notManuscript) for every listed pdf
    candidates = []
    for score in soup.find_all(class_='we'):  # empty when the page lists no scores
        try:
            isShareable, isManuscript = extractScoreInfo(score)  # copyright ok & not a manuscript
        except Exception:  # incorrectly formatted entry -- skip
            continue
        for pdf_div_tag in score.find_all("div", recursive=False):  # may have multiple pdfs
            title_span = pdf_div_tag.find("span", {"title": "Download this file"})
            if title_span is None:  # div without a download link
                continue
            title = title_span.text.lower()
            # NOTE(review): is_solo_piece in this file only returns bare instrument names (e.g. "oboe"), so this
            # filter never matches as written -- confirm whether it should apply to oboe pieces.
            if instrument == "oboe + orchestra" and not ("oboe" in title or "solo" in title or "complete score" in title):
                continue
            we_file_info2 = pdf_div_tag.find(class_='we_file_info2')
            if we_file_info2 is None:  # sometimes there are div tags with additional information
                continue
            try:
                pdf_id, num_downloads = parsePdfInfo(we_file_info2.text)
            except (AttributeError, ValueError):  # info string without a pdf id -- skip instead of aborting the scan
                continue
            candidates.append((pdf_id, num_downloads, isShareable, not isManuscript))

    # most downloaded first; the sort is stable, so ties keep page order
    candidates.sort(key=lambda entry: entry[1], reverse=True)
    piece_dir = os.path.dirname(html_file)  # same as stripping a trailing "/html.txt"
    for (pdf_id, num_downloads, isShareable, notManuscript) in candidates:
        fullpath = '{}/{}.pdf'.format(piece_dir, pdf_id)
        # make sure pieces are publicly available and are not manuscripts
        # return the readable PDF with the most downloads
        if isShareable and notManuscript and isValidPDF(fullpath):
            return fullpath, instrument.split()[0]    # make sure oboe/trumpet + others map to the correct instrument
    return "", ""

In [72]:
# Instrument names accepted as solo categories; also used as per-instrument folder names.
instruments = [
    "cello",
    "clarinet",
    "flute",
    "oboe",
    "trumpet",
    "viola",
    "violin",
    "guitar",
]

In [34]:
# Root of the scraped IMSLP tree that is searched for html.txt metadata files.
imslp_dir = Path("/data/Datasets/imslp/score_scrape/results/composer/")
# Directory prefix under which the copied PDFs are grouped by instrument.
data_dir = "/home/kji/InstrumentID/data/all/"
# Shell script that receives the generated cp commands.
outfile = "/home/kji/InstrumentID/copy_script.sh"

In [24]:
# Debugging override: narrows the scan to a single piece directory.
# NOTE(review): when the cells are run top to bottom this replaces the full
# composer directory assigned to imslp_dir earlier -- confirm this is intended
# before generating the copy script.
imslp_dir = Path("/data/Datasets/imslp/score_scrape/results/composer/Rondeau,_Michel/Trumpet_Sonata_No.3_in_E_minor_")

In [35]:
# Write one `cp` command per selected PDF into outfile, numbering the copies
# per instrument as <data_dir><instrument>/<instrument>_<n>.
# The existence guard is disabled, so the script is rewritten on every run;
# the body is therefore no longer indented under it.
# if not os.path.exists(outfile):
with open(outfile, 'w') as f:
    instrument_counts = defaultdict(int)
    for html_path in imslp_dir.rglob("html.txt"):
        orig_path, instrument = selectSinglePDF(str(html_path))
        if orig_path == "":  # no suitable pdf for this piece
            continue
        instrument_counts[instrument] += 1
        new_path = f"{data_dir}{instrument}/{instrument}_{instrument_counts[instrument]}"
        # Quote both paths so names containing spaces, parentheses or quotes
        # survive the shell; paths made only of safe characters are unchanged.
        f.write(f"cp {shlex.quote(orig_path)} {shlex.quote(new_path)}\n")

# 2. Randomly select 75 PDFs per instrument to be manually labeled

In [69]:
# Number of PDFs drawn per instrument for manual labeling.
samples = 75
# Directory prefix under which the drawn PDFs are grouped by instrument.
dest_dir = "/home/kji/InstrumentID/data/labeled/"
# Fix the RNG seed used by the sampling below.
random.seed(2)

In [70]:
def random_instrument_sample(src_dir, dest_dir, num_samples):
    """Copy a random selection of files from ``src_dir`` into ``dest_dir``.

    Args:
        src_dir: Directory holding the candidate files.
        dest_dir: Directory receiving the copies; created if it is missing.
        num_samples: Number of distinct files to copy.

    Raises:
        ValueError: if ``src_dir`` holds fewer than ``num_samples`` files
            (raised by ``random.sample``).
    """
    # Sort the listing so the draw depends only on the RNG state (os.listdir
    # returns names in arbitrary order), and keep regular files only because
    # shutil.copy cannot copy a directory.
    candidates = sorted(
        name for name in os.listdir(src_dir)
        if os.path.isfile(os.path.join(src_dir, name))
    )
    chosen = random.sample(candidates, num_samples)

    # If dest_dir did not exist, shutil.copy would treat it as a target file
    # name and every copy would overwrite the previous one.
    os.makedirs(dest_dir, exist_ok=True)
    for name in chosen:
        shutil.copy(os.path.join(src_dir, name), dest_dir)

In [74]:
# Draw the labeling sample for every instrument: files come from the
# instrument's folder under data_dir and go to the matching folder under dest_dir.
for instrument in instruments:
    src_dir = f"{data_dir}{instrument}"
    dest = f"{dest_dir}{instrument}"
    random_instrument_sample(src_dir=src_dir, dest_dir=dest, num_samples=samples)