In [1]:
import pickle
import numpy as np
import os
from glob import glob
import requests
from bs4 import BeautifulSoup
from tqdm.autonotebook import tqdm
import json
import os

  from tqdm.autonotebook import tqdm


### Go through the bootleg directory and grab the composer and piece names corresponding to each PDF ID number

In [2]:
def parse_bootleg_path(path):
    """Split the path of one bootleg pickle into its identifying parts.

    The expected layout is ``<root>/<composer>/<piece name>/<pdf id>.pkl``,
    where the piece name may itself span several directory levels.

    Args:
        path: Path of a ``.pkl`` file as returned by ``glob``.

    Returns:
        Tuple ``(composer, piece_name, pdf_id)`` of strings.
    """
    # glob yields paths with the platform's separator; normalise to "/" so the
    # split below also works where the separator is a backslash.
    path_parts = path.replace(os.sep, "/").split("/")

    composer = path_parts[1]
    # Drop only the trailing extension (str.replace would also strip ".pkl"
    # occurring elsewhere in the file name).
    num = os.path.splitext(path_parts[-1])[0]
    # Sometimes the piece name has a slash in it, which is why everything
    # between the composer and the file name is re-joined - thank you IMSLP
    piece = "/".join(path_parts[2:-1])

    return composer, piece, num


file_paths = glob("imslp_bootleg_dir-v1.1/**/*.pkl", recursive=True)

# Maps PDF ID -> {"composer", "piece_name", "path"} for every pickle found
dir_info = {}

for path in file_paths:
    composer, piece, num = parse_bootleg_path(path)
    dir_info[num] = {"composer":composer, "piece_name":piece, "path":path}

all_nums = dir_info.keys()

In [3]:
def get_url(num):
    """Return the IMSLP wiki URL of the piece that PDF ID `num` belongs to.

    The composer and piece name are looked up in `dir_info`.
    NOTE(review): both names are inserted verbatim, without URL-encoding.
    """
    entry = dir_info[num]
    composer, piece_name = entry["composer"], entry["piece_name"]
    return f"https://imslp.org/wiki/{piece_name}({composer})"

### Extract the info from the IMSLP website for each piece

In [4]:
def extract_table(table, web_info):
    """Copy the header/value pairs of `table` into `web_info` (in-place).

    Every 'tr' row that has both a "th" and a "td" contributes one entry:
    the text of the "th" becomes the key, the text of the "td" the value.
    Rows missing either cell are skipped. Nothing is returned.
    """
    for row in table.find_all('tr'):
        header = row.find("th")
        value = row.find("td")
        if not (header and value):
            continue
        web_info[header.text] = value.text


def get_web_info(url, timeout=30):
    """Download one work page and collect its metadata tables.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds `requests` waits on the server before raising a
            timeout error. Without a timeout a stalled connection would block
            the calling thread indefinitely.

    Returns:
        dict of stripped header text -> stripped cell text, gathered from the
        "wp_header" and "wi_body" divs, plus a "notice" entry when the page
        links the Ambox notice icon. Empty if none of these are present.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    web_info = {}


    # Grab webpage
    # NOTE(review): the HTTP status is not checked, so an error page is parsed
    # just like a regular page.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')


    # Find and extract both tables; entries from the second div overwrite
    # same-named entries from the first
    for div_class in ("wp_header", "wi_body"):
        table = soup.find("div", {"class":div_class})
        if table:
            extract_table(table, web_info)


    # Check if there's a notice
    notice = soup.find("a", {"href":"/wiki/File:Ambox_notice.png"})
    if notice:
        # The notice text is taken from the element two levels above the icon link
        web_info["notice"] = notice.parent.parent.text


    web_info = {k.strip():v.strip() for k, v in web_info.items()}

    return web_info

In [5]:
import concurrent.futures

def get_web_infos(urls):
    """Scrape many pages in parallel threads.

    Args:
        urls: Sized collection of page URLs (its length drives the progress bar).

    Returns:
        dict mapping each url to the dict produced by `get_web_info(url)`.
        An exception raised while scraping any url propagates to the caller.
    """
    def fetch(page_url):
        # Keep the url next to its result so the pairs can be turned into a dict
        return page_url, get_web_info(page_url)

    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Draining the map inside the `with` block drives the progress bar
        progress = tqdm(pool.map(fetch, urls), total=len(urls))
        fetched = list(progress)

    return dict(fetched)

In [6]:
# Several PDF IDs can share one url (different versions of the same piece),
# so collecting the urls in a set means every page is scraped only once
urls = set(map(get_url, all_nums))
print(len(all_nums), len(urls))

26998 17239


In [18]:
# Scrape every unique url; web_info maps each url to the dict that get_web_info
# returned for it.
# NOTE(review): nothing is cached here, so re-running this cell repeats every request.
web_info = get_web_infos(urls)

  0%|          | 0/17239 [00:00<?, ?it/s]

### Clean up the results a little bit

In [38]:
all_headers = set()

# Some scraped headers consist of the long label immediately followed by its
# abbreviation. Map those back to the long label by hand
# (can't fix this automatically since they are so inconsistent)
fixes = {
    "Average DurationAvg. Duration":"Average Duration",
    "Year/Date of CompositionY/D of Comp.":"Year/Date of Composition",
    "Movements/SectionsMov'ts/Sec's":"Movements/Sections",
    "Opus/Catalogue NumberOp./Cat. No.":"Opus/Catalogue Number",
    "Composer Time PeriodComp. Period":"Composer Time Period",
    "I-Catalogue NumberI-Cat. No.":"I-Catalogue Number",
}


for url in urls:
    page_info = web_info[url]

    # A Key of "see below" says nothing, so just get rid of it
    if page_info.get('Key') == 'see below':
        del page_info['Key']

    # Iterate over a snapshot of the keys because entries are removed/renamed on the way
    for key in list(page_info):
        if page_info[key] == '':
            # Empty values are dropped entirely
            del page_info[key]
        elif key in fixes:
            # Move the value over to the cleaned-up header name
            page_info[fixes[key]] = page_info.pop(key)

    # Remember every header that survived the clean-up
    all_headers.update(page_info)

print(all_headers)

{'Related Works', 'External Links', 'Instrumentation', 'Manuscript Sources', 'Composition Year', 'Movements/Sections', 'Alternative. Title', 'Dedication', 'InstrDetail', 'I-Catalogue Number', 'Composer Time Period', 'Discography', 'Name Translations', 'Average Duration', 'Language', 'Name Aliases', 'First Performance.', 'Incipit', 'Opus/Catalogue Number', 'notice', 'Key', 'Librettist', 'Composer', 'Copyright Information', 'First Publication', 'Text Incipit', 'Piece Style', 'Genre Categories', 'Work Title', 'Extra Information', 'Authorities', 'Year/Date of Composition', 'First Publication.', 'Extra Locations'}


### Put it all together and save it

In [39]:
# Invert the PDF ID -> url relation: every url gets the list of PDF IDs that resolve to it
url_to_pdfids = dict((page_url, list()) for page_url in urls)
for pdf_id in all_nums:
    url_to_pdfids[get_url(pdf_id)].append(pdf_id)

In [40]:
composers = set(dir_info[num]["composer"] for num in all_nums)
pieces = set((dir_info[num]["composer"], dir_info[num]["piece_name"]) for num in all_nums)

# Nested layout: all_metadata[composer][piece_name] -> scraped info + "PDF_ids" + "url".
# Composers and pieces are visited in sorted order so the dictionary (and the JSON
# written from it) comes out in the same order on every run; iterating the sets
# directly follows string-hash order, which can change between interpreter runs.
all_metadata = {composer:{} for composer in sorted(composers)}
for num in sorted(all_nums, key=lambda n: (dir_info[n]["composer"], dir_info[n]["piece_name"])):
    composer = dir_info[num]["composer"]
    piece_name = dir_info[num]["piece_name"]
    if piece_name in all_metadata[composer]:
        # Another PDF of the same piece already filled in this entry
        continue

    # Build the url through get_url so it always matches the keys of web_info
    # and url_to_pdfids, instead of repeating the format string here
    url = get_url(num)
    piece_metadata = web_info[url].copy()
    piece_metadata["PDF_ids"] = url_to_pdfids[url]
    piece_metadata["url"] = url
    all_metadata[composer][piece_name] = piece_metadata

In [10]:
# Make sure the output directory exists; open() does not create missing directories
os.makedirs("Metadata", exist_ok=True)

# Persist the combined metadata as indented JSON
with open("Metadata/all_metadata.json", "w") as f:
    json.dump(all_metadata, f, indent=4)

In [8]:
# Read the saved metadata back from disk
with open("Metadata/all_metadata.json", "r") as f:
    all_metadata = json.loads(f.read())

### Use it to get the 9-way and 100-way metadata

In [11]:
# NOTE(review): pickle.load can execute arbitrary code - only open dataset files from a trusted source
with open("9_way_dataset.pkl", "rb") as f:
    # The pickled object unpacks into nine items; only the last three (the metadata of each split) are used
    _, _, _, _, _, _, meta_train, meta_val, meta_test = pickle.load(f)

# The first field of every metadata record is used below as the dir_info key (PDF ID)
meta_9_nums = [record[0] for record in meta_train + meta_val + meta_test]

# Look up the scraped metadata of the piece each PDF ID belongs to
meta_9_way = {
    num: all_metadata[info['composer']][info['piece_name']]
    for num, info in ((pdf_id, dir_info[pdf_id]) for pdf_id in meta_9_nums)
}

with open("Metadata/9_way_metadata.json", "w+") as f:
    json.dump(meta_9_way, f, indent=4)

In [12]:
# NOTE(review): pickle.load can execute arbitrary code - only open dataset files from a trusted source
with open("100_way_dataset.pkl", "rb") as f:
    # The pickled object unpacks into nine items; only the last three (the metadata of each split) are used
    _, _, _, _, _, _, meta_train, meta_val, meta_test = pickle.load(f)

# The first field of every metadata record is used below as the dir_info key (PDF ID)
meta_100_nums = [record[0] for record in meta_train + meta_val + meta_test]

# Look up the scraped metadata of the piece each PDF ID belongs to
meta_100_way = {
    num: all_metadata[info['composer']][info['piece_name']]
    for num, info in ((pdf_id, dir_info[pdf_id]) for pdf_id in meta_100_nums)
}

with open("Metadata/100_way_metadata.json", "w+") as f:
    json.dump(meta_100_way, f, indent=4)