In [45]:
import requests
import numpy as np
import os
from bs4 import BeautifulSoup
import re
import pandas as pd
import zipfile
import pickle
import subprocess
from tqdm import tqdm
import time
import urllib.parse

In [4]:
# Root directory that the folder/file discovery helpers in this notebook walk.
# The listing outputs in this notebook show paths shaped like
# <DIRECTORY_PATH>/<composer>/<work>/<id>.pdf.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
DIRECTORY_PATH = "/mnt/data0/Datasets/imslp/score_scrape/results/composer"

def find_all_folders(directory_path):
    """Return the path of every directory found anywhere below ``directory_path``.

    Args:
        directory_path: Directory at which the recursive walk starts.

    Returns:
        A list of paths (each built by joining the walked parent directory with
        the sub-directory name), in the order ``os.walk`` visits them. The
        starting directory itself is not included.
    """
    return [
        os.path.join(parent, child)
        for parent, children, _ in os.walk(directory_path)
        for child in children
    ]

def find_second_level_folders(directory_path):
    """Return the directories located exactly two levels below ``directory_path``.

    For a tree ``<directory_path>/<first>/<second>/...`` this returns every
    ``<directory_path>/<first>/<second>`` path.

    Args:
        directory_path: Directory at which the walk starts. A trailing path
            separator or an ``os.PathLike`` object is accepted.

    Returns:
        A list of paths, each built by joining the walked first-level directory
        with the name of one of its sub-directories, in ``os.walk`` order.
    """
    second_level_folders = []
    for root, dirs, files in os.walk(directory_path):
        # Measure depth relative to the start directory instead of slicing the
        # string by len(directory_path): slicing miscounts by one level when the
        # caller passes a path that ends with a separator.
        relative_root = os.path.relpath(root, directory_path)
        if relative_root == os.curdir:
            depth = 0
        else:
            depth = relative_root.count(os.sep) + 1
        if depth == 1:  # root is a first-level folder, so its children are second level
            second_level_folders.extend(
                os.path.join(root, dir_name) for dir_name in dirs
            )
            # Nothing deeper can qualify; clearing dirs in place stops os.walk
            # from descending into every second-level folder.
            dirs[:] = []
    return second_level_folders

# Walk the dataset tree once per helper: `folders` holds every directory at any
# depth, `second_level_folders` only the directories two levels below
# DIRECTORY_PATH.
folders = find_all_folders(DIRECTORY_PATH)
second_level_folders = find_second_level_folders(DIRECTORY_PATH)

In [8]:
def find_files(directory_path):
    """Recursively collect the files under ``directory_path`` named ``*.pdf``.

    The suffix comparison is case-sensitive, so only names ending in the
    lowercase ``.pdf`` are returned.

    Args:
        directory_path: Directory at which the recursive walk starts.

    Returns:
        A list of paths (containing directory joined with the file name), in
        the order ``os.walk`` yields them.
    """
    return [
        os.path.join(parent, name)
        for parent, _, names in os.walk(directory_path)
        for name in names
        if name.endswith('.pdf')
    ]

# Spot-check: list the PDFs found under a single second-level folder.
# NOTE(review): index 25 looks like an arbitrary sample — confirm nothing depends on it.
find_files(second_level_folders[25])

['/mnt/data0/Datasets/imslp/score_scrape/results/composer/Schrammel,_Johann/Wien_bleibt_Wien!_/397957.pdf',
 '/mnt/data0/Datasets/imslp/score_scrape/results/composer/Schrammel,_Johann/Wien_bleibt_Wien!_/414872.pdf',
 '/mnt/data0/Datasets/imslp/score_scrape/results/composer/Schrammel,_Johann/Wien_bleibt_Wien!_/450459.pdf',
 '/mnt/data0/Datasets/imslp/score_scrape/results/composer/Schrammel,_Johann/Wien_bleibt_Wien!_/420122.pdf']

In [54]:
# Gather the PDF paths found beneath every second-level folder into one flat list.
all_files = [
    pdf_path
    for folder in tqdm(second_level_folders)
    for pdf_path in find_files(folder)
]

len(all_files)

100%|██████████| 129607/129607 [00:01<00:00, 73742.30it/s]


420264

In [53]:
def match_id_to_composer_and_title(path):
    """Derive a ``{pdf_id: (composer, title)}`` entry from a PDF path.

    The last three ``/``-separated components of ``path`` are read as
    ``<composer>/<title>/<file name>``. Composer and title are URL-unquoted and
    their underscores are turned into spaces.

    Args:
        path: ``/``-separated path to a PDF file with at least three components.

    Returns:
        A one-item dict mapping the file name up to its first ``.`` to a
        ``(composer, title)`` tuple.

    Raises:
        IndexError: If ``path`` has fewer than three ``/``-separated components.
    """
    split_path = path.split('/')

    composer = urllib.parse.unquote(split_path[-3]).replace('_', ' ')

    title = urllib.parse.unquote(split_path[-2]).replace('_', ' ')
    # Folder names may end in '_' (e.g. 'Wien_bleibt_Wien!_'), which becomes a
    # trailing space. Drop that one space only when it is actually there;
    # slicing off the last character unconditionally truncates titles whose
    # folder name has no trailing underscore.
    if title.endswith(' '):
        title = title[:-1]

    pdf_id = split_path[-1].split('.')[0]
    return {pdf_id: (composer, title)}
    
# Spot-check the parser on the first PDF of a single second-level folder.
# NOTE(review): index 12525 looks like an arbitrary sample — confirm nothing depends on it.
match_id_to_composer_and_title(find_files(second_level_folders[12525])[0])

{'319434': ('Martini, Johannes', 'Tant que dieu vosdra')}

In [55]:
# Build a single pdf id -> (composer, title) lookup covering every collected PDF.
# Entries are keyed by pdf id alone, so when two paths share an id the later
# path's (composer, title) replaces the earlier one.
# NOTE(review): the recorded outputs show 420264 files but only 364007 entries,
# so such collisions appear to happen — confirm that last-wins is intended.
file_to_composer_and_title = {
    pdf_id: composer_and_title
    for file in tqdm(all_files)
    for pdf_id, composer_and_title in match_id_to_composer_and_title(file).items()
}

len(file_to_composer_and_title)

100%|██████████| 420264/420264 [00:01<00:00, 332964.30it/s]


364007

In [56]:
# Save the pdf id -> (composer, title) dictionary to disk with pickle.
# NOTE(review): absolute, user-specific output path; open() does not create
# missing directories, so the target directory must already exist.
with open('/home/ctang/ttmp/MMCSR/dataset_metadata/id_composer_title.pkl', 'wb') as f:
    pickle.dump(file_to_composer_and_title, f)

In [12]:
def _load_pickle(pickle_path):
    """Open ``pickle_path`` in binary mode and return the unpickled object."""
    # NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)


# One metadata object per instrumentation category, each stored as its own pickle.
arr_piano_metadata = _load_pickle('/home/ctang/ttmp/MMCSR/pretraining_metadata/arr_piano_metadata.pkl')
harpsichord_metadata = _load_pickle('/home/ctang/ttmp/MMCSR/pretraining_metadata/harpsichord_metadata.pkl')
solo_piano_metadata = _load_pickle('/home/ctang/ttmp/MMCSR/pretraining_metadata/solo_piano_metadata.pkl')
keyboard_metadata = _load_pickle('/home/ctang/ttmp/MMCSR/pretraining_metadata/keyboard_metadata.pkl')

In [None]:
# The four metadata objects are dictionaries, and dicts do not support `+`
# (it raises TypeError), so combine them key by key instead.
# The inspection output in this notebook shows the layout
# composer -> {work title -> fields}; works are therefore merged per composer so
# a composer present in several categories keeps the works from all of them.
# On a duplicate work title the later category wins.
all_metadata = {}
for category_metadata in (
    arr_piano_metadata,
    harpsichord_metadata,
    solo_piano_metadata,
    keyboard_metadata,
):
    for composer, works in category_metadata.items():
        # setdefault creates a fresh dict per composer, leaving the loaded
        # category dictionaries unmodified.
        all_metadata.setdefault(composer, {}).update(works)

In [44]:
# Inspect the merged metadata entry of a single composer.
# NOTE(review): this displays the composer's whole entry; consider showing only
# a few work titles to keep the output short.
all_metadata['Chopin, Frédéric']

{'Allegretto and Mazurka': {'Work Title': 'Allegretto and Mazurka',
  'Composer': 'Chopin, Frédéric',
  'Opus/Catalogue Number': 'KK.VIIb/7-8',
  'I-Catalogue Number': 'IFC 123',
  'Key': 'see below',
  'Movements/Sections': '2 sections:\nAllegretto (A major, 24 bars)\nMazurka (D minor, 15 bars)',
  'Year/Date of Composition': '1835 ca.\xa0?\nFirst Perf\normance\n.\n1992',
  'Average Duration': '1 minute',
  'Composer Time Period': 'Romantic',
  'Piece Style': 'Romantic',
  'Instrumentation': 'Piano',
  'url': 'https://imslp.org/wiki/Allegretto_and_Mazurka_(Chopin,_Fr%C3%A9d%C3%A9ric)',
  'Alternative Title': '',
  'First Publication': '.\n1975 - Zurich: Schweizerische Musikzeitung'},
 'Allegretto in F-sharp major': {'Work Title': 'Allegretto in F-sharp major',
  'Composer': 'Chopin, Frédéric',
  'Opus/Catalogue Number': 'WN 36',
  'I-Catalogue Number': 'IFC 1',
  'Key': 'F-sharp major',
  'Movements/Sections': "1 piece (31 bars in Ekier's reconstruction)",
  'Year/Date of Composition'

In [14]:
# merge all metadata, which are dictionaries
#
# Each dictionary maps composer -> {work title -> fields} (see the inspection
# output of all_metadata in this notebook). A shallow {**a, **b, ...} merge
# would replace a composer's whole work dict whenever that composer also appears
# in a later category, silently dropping the earlier category's works. Merge per
# composer instead; on a duplicate work title the later category still wins,
# matching the precedence of the shallow merge.
# NOTE(review): assumes every top-level value is a dict of works — confirm for all four files.
all_metadata = {}
for category_metadata in (
    arr_piano_metadata,
    harpsichord_metadata,
    solo_piano_metadata,
    keyboard_metadata,
):
    for composer, works in category_metadata.items():
        # setdefault creates a fresh dict per composer, leaving the loaded
        # category dictionaries unmodified.
        all_metadata.setdefault(composer, {}).update(works)

In [22]:
# Save the merged metadata dictionary to disk with pickle.
# NOTE(review): absolute, user-specific output path; open() does not create
# missing directories, so the target directory must already exist.
with open('/home/ctang/ttmp/MMCSR/pretraining_metadata/all_metadata.pkl', 'wb') as f:
    pickle.dump(all_metadata, f)