In [53]:
import requests
import numpy as np
import os
from bs4 import BeautifulSoup
import re
import pandas as pd
import zipfile
import pickle
import subprocess
from tqdm import tqdm
import time
import urllib.parse

In [40]:
# Root directory that find_all_folders / find_second_level_folders walk.
# The recorded listings show PDFs stored as <root>/<composer>/<work>/<id>.pdf.
# NOTE(review): hard-coded absolute path; the notebook only runs where this mount exists.
DIRECTORY_PATH = "/mnt/data0/Datasets/imslp/score_scrape/results/composer"

def find_all_folders(directory_path):
    """Recursively list every directory found beneath ``directory_path``.

    Args:
        directory_path: Directory whose tree is walked with ``os.walk``.

    Returns:
        list[str]: One path per sub-directory at any depth, each built by
        joining the directory being visited with the child name. The root
        itself is not included.
    """
    return [
        os.path.join(parent, child)
        for parent, children, _ in os.walk(directory_path)
        for child in children
    ]

def find_second_level_folders(directory_path):
    """Return the directories located exactly two levels below ``directory_path``.

    For a layout ``<directory_path>/<first>/<second>/...`` the result contains
    every ``<directory_path>/<first>/<second>`` directory and nothing deeper.

    Args:
        directory_path: Root directory to scan. A trailing path separator is
            accepted and does not change the result.

    Returns:
        list[str]: Second-level directory paths in ``os.walk`` visiting order,
        each built by joining the first-level directory with the child name.
        Empty when the root cannot be listed or has no second-level folders.
    """
    second_level_folders = []
    for root, dirs, _files in os.walk(directory_path):
        # Measure depth relative to the root rather than by slicing `root` with
        # len(directory_path): slicing miscounts by one when the caller passes
        # a trailing separator, which made the function return third-level
        # folders instead.
        relative_root = os.path.relpath(root, directory_path)
        depth = 0 if relative_root == os.curdir else relative_root.count(os.sep) + 1
        if depth == 1:  # `root` is a first-level folder, so `dirs` are second level
            second_level_folders.extend(os.path.join(root, dir_name) for dir_name in dirs)
            # Nothing below this point can be second level; clearing `dirs`
            # in place stops os.walk from descending any further.
            dirs[:] = []
    return second_level_folders

# Every directory at any depth under DIRECTORY_PATH.
folders = find_all_folders(DIRECTORY_PATH)
# Directories exactly two levels down; the recorded listings show these as
# <composer>/<work> folders.
second_level_folders = find_second_level_folders(DIRECTORY_PATH)

In [41]:
def find_files(directory_path):
    """Recursively collect the PDF files stored under ``directory_path``.

    Args:
        directory_path: Directory whose tree is walked with ``os.walk``.

    Returns:
        list[str]: Path of every file whose name ends with the lowercase
        suffix ``.pdf`` (the comparison is case-sensitive), at any depth.
    """
    pdf_paths = []
    for parent, _subdirs, names in os.walk(directory_path):
        pdf_paths.extend(
            os.path.join(parent, name) for name in names if name.endswith('.pdf')
        )
    return pdf_paths

# Spot-check: list the PDFs found under one arbitrarily chosen second-level folder.
find_files(second_level_folders[25])

['/mnt/data0/Datasets/imslp/score_scrape/results/composer/Schrammel,_Johann/Wien_bleibt_Wien!_/397957.pdf',
 '/mnt/data0/Datasets/imslp/score_scrape/results/composer/Schrammel,_Johann/Wien_bleibt_Wien!_/414872.pdf',
 '/mnt/data0/Datasets/imslp/score_scrape/results/composer/Schrammel,_Johann/Wien_bleibt_Wien!_/450459.pdf',
 '/mnt/data0/Datasets/imslp/score_scrape/results/composer/Schrammel,_Johann/Wien_bleibt_Wien!_/420122.pdf']

In [42]:
# find all files in the second level folders
# Only PDFs at or below a second-level folder are collected; a PDF sitting
# directly in the root or in a first-level folder would not appear in all_files.
all_files = []
for folder in tqdm(second_level_folders):
    all_files += find_files(folder)
    
# Total number of PDF paths gathered.
len(all_files)

100%|██████████| 129607/129607 [00:01<00:00, 76764.95it/s]


420264

In [50]:
def match_id_to_composer_and_title(path):
    """Derive ``{pdf_id: (composer, title)}`` from a ``.../<composer>/<work>/<id>.pdf`` path.

    The composer and work folder names are percent-decoded and their
    underscores are turned into spaces. If the resulting title contains a
    comma and a digit anywhere, its last comma-separated segment is dropped
    (presumably a catalogue suffix such as ``, Op.65`` -- the check does not
    verify that the digit is in that segment).

    Args:
        path: '/'-separated path whose last three components are the composer
            folder, the work folder and the PDF file name.

    Returns:
        dict: A single-entry mapping from the file name up to its first '.'
        to the ``(composer, title)`` tuple.

    Raises:
        IndexError: If ``path`` has fewer than three '/'-separated components.
    """
    split_path = path.split('/')

    composer = urllib.parse.unquote(split_path[-3]).replace('_', ' ')

    title = urllib.parse.unquote(split_path[-2]).replace('_', ' ')
    # The work folders seen so far end in '_', which has just become a trailing
    # space. Remove that one separator only when it is actually there; slicing
    # off the last character unconditionally would truncate a title whose
    # folder name has no trailing underscore.
    if title.endswith(' '):
        title = title[:-1]
    # check if title contains comma and number
    if ',' in title and any(char.isdigit() for char in title):
        title = ",".join(title.split(",")[:-1])

    pdf_id = split_path[-1].split('.')[0]
    return {pdf_id: (composer, title)}
    
# Spot-check the path parser on the first PDF of one arbitrarily chosen folder.
match_id_to_composer_and_title(find_files(second_level_folders[235])[0])

{'220875': ('Frescobaldi, Girolamo', 'Recercar sopra Sol, mi, fa, la, sol')}

In [51]:
# match all files to their composer and title
# NOTE(review): the mapping is keyed by PDF id alone, so when the same id occurs
# under more than one folder the path visited last silently wins. The recorded
# run has 420264 paths but only 364007 keys -- confirm this collapsing is intended.
file_to_composer_and_title = {}
for file in tqdm(all_files):
    file_to_composer_and_title.update(match_id_to_composer_and_title(file))
    
# Number of distinct PDF ids in the mapping.
len(file_to_composer_and_title)

100%|██████████| 420264/420264 [00:01<00:00, 250745.95it/s]


364007

In [52]:
# save the dictionary
# Pickles the {pdf_id: (composer, title)} mapping; 'wb' overwrites any file
# already at this hard-coded absolute path.
with open('/home/ctang/ttmp/MMCSR/dataset_metadata/id_composer_title.pkl', 'wb') as f:
    pickle.dump(file_to_composer_and_title, f)

In [54]:
def _read_pickle(pickle_path):
    """Deserialize and return the object stored in the pickle file at ``pickle_path``."""
    # pickle.load executes arbitrary code from the file; only point this at
    # files produced by this project.
    with open(pickle_path, 'rb') as f:
        return pickle.load(f)


# One metadata object per instrumentation category, loaded from the
# pretraining_metadata directory.
arr_piano_metadata = _read_pickle('/home/ctang/ttmp/MMCSR/pretraining_metadata/arr_piano_metadata.pkl')
harpsichord_metadata = _read_pickle('/home/ctang/ttmp/MMCSR/pretraining_metadata/harpsichord_metadata.pkl')
solo_piano_metadata = _read_pickle('/home/ctang/ttmp/MMCSR/pretraining_metadata/solo_piano_metadata.pkl')
keyboard_metadata = _read_pickle('/home/ctang/ttmp/MMCSR/pretraining_metadata/keyboard_metadata.pkl')

In [55]:
# add up all the metadata
def _merge_composer_metadata(*metadata_sources):
    """Combine several ``{composer: {work: record}}`` mappings into one.

    A flat ``{**a, **b, ...}`` merge replaces a composer's whole entry whenever
    that composer appears in a later source, discarding the works contributed
    by earlier sources. Here the per-composer mappings are merged instead, so
    works from every source are kept; for a work present in several sources the
    later source still wins, as with the flat merge.

    Args:
        *metadata_sources: Mappings keyed by composer. Values are expected to
            be dicts keyed by work title (as the ``all_metadata[c][p]`` lookup
            in this notebook implies); a non-dict value simply replaces any
            earlier entry for that composer, matching the flat-merge behaviour.

    Returns:
        dict: New mapping; the per-composer dicts are copies, so the input
        sources are not modified.
    """
    merged = {}
    for source in metadata_sources:
        for composer, works in source.items():
            existing = merged.get(composer)
            if isinstance(existing, dict) and isinstance(works, dict):
                existing.update(works)
            elif isinstance(works, dict):
                # Copy so later updates never write into the source mapping.
                merged[composer] = dict(works)
            else:
                merged[composer] = works
    return merged


all_metadata = _merge_composer_metadata(
    arr_piano_metadata, harpsichord_metadata, solo_piano_metadata, keyboard_metadata
)

In [56]:
# Spot-check a lookup: all_metadata is indexed by composer, then by the work
# title with everything from the first comma onward removed.
# NOTE(review): the recorded output for this 'Op.65' query is the 'Op.72'
# record, so the comma-stripped title does not tell the two works apart --
# verify the key scheme before relying on these lookups.
c,p = ('Sinding, Christian', '8 Intermezzi, Op.65')
p = p.split(',')[0]
all_metadata[c][p]

{'Work Title': '8 Intermezzi',
 'Composer': 'Sinding, Christian',
 'Opus/Catalogue Number': 'Op.72',
 'I-Catalogue Number': 'ICS 28',
 'Movements/Sections': '8 pieces',
 'Composer Time Period': 'Romantic',
 'Piece Style': 'Romantic',
 'Instrumentation': 'Piano',
 'url': 'https://imslp.org/wiki/8_Intermezzi,_Op.72_(Sinding,_Christian)',
 'Alternative Title': '',
 'First Publication': '.\n1905 (ca.)'}

In [57]:
# save all metadata to disk
# Pickles the combined metadata mapping; 'wb' overwrites any file already at
# this hard-coded absolute path.
with open('/home/ctang/ttmp/MMCSR/pretraining_metadata/all_metadata.pkl', 'wb') as f:
    pickle.dump(all_metadata, f)