This notebook scrapes the metadata of each piece from its IMSLP page, given the page URL.

In [18]:
import requests
import numpy as np
import os
from bs4 import BeautifulSoup
import re
import pandas as pd
import zipfile
import pickle

In [19]:
# Load the pickled 9-way dataset directly out of its ZIP archive.
# NOTE: unpickling can run arbitrary code - only do this with archives you trust.

with zipfile.ZipFile('9_way_dataset.zip') as zip_file:  # read mode is the default
    # The archive is assumed to hold a single member, so take the first name.
    filename = zip_file.namelist()[0]
    with zip_file.open(filename) as f:
        data_9 = pickle.loads(f.read())

In [20]:
# Load the pickled 100-way dataset directly out of its ZIP archive.
# NOTE: unpickling can run arbitrary code - only do this with archives you trust.

with zipfile.ZipFile('100_way_dataset.zip') as zip_file:  # read mode is the default
    # The archive is assumed to hold a single member, so take the first name.
    filename = zip_file.namelist()[0]
    with zip_file.open(filename) as f:
        data_100 = pickle.loads(f.read())

In [21]:
# Collect the unique ids found in entries 6, 7 and 8 of the 9-way dataset.
# Each record's first element is taken as its id.
ids_9 = [data[0] for i in range(6, 9) for data in data_9[i]]

# Drop duplicates (the resulting order is whatever the set yields).
ids_9 = list(set(ids_9))
print(len(ids_9))

882


In [22]:
# Collect the unique ids found in entries 6, 7 and 8 of the 100-way dataset.
# Each record's first element is taken as its id.
ids_100 = [data[0] for i in range(6, 9) for data in data_100[i]]

# Drop duplicates (the resulting order is whatever the set yields).
ids_100 = list(set(ids_100))
print(len(ids_100))

4930


In [23]:
# Load the id -> URL lookup for the 9-way dataset and keep only the ids collected above.
# NOTE(security): pickle.load can execute arbitrary code; only load files you trust.
with open('9_way_pdf_dict.pkl', 'rb') as f:
    dict_ids_9 = pickle.load(f)

url_dict_9 = {}
not_valid_ids_9 = []  # ids that have no entry in the lookup
for i in ids_9:
    try:
        url_dict_9[i] = dict_ids_9[i]
    except KeyError:
        # A missing id is the only failure expected here; any other error
        # (e.g. the pickle not holding a mapping) should surface, not be hidden.
        not_valid_ids_9.append(i)

# Sanity check: requested ids, ids resolved to a URL, ids without a URL.
print(len(ids_9))
print(len(url_dict_9))
print(len(not_valid_ids_9))

882
882
0


In [17]:
# Load the id -> URL lookup for the 100-way dataset and keep only the ids collected above.
# NOTE(security): pickle.load can execute arbitrary code; only load files you trust.
with open('100_way_pdf_dict.pkl', 'rb') as f:
    dict_ids_100 = pickle.load(f)

url_dict_100 = {}
not_valid_ids_100 = []  # ids that have no entry in the lookup
for i in ids_100:
    try:
        url_dict_100[i] = dict_ids_100[i]
    except KeyError:
        # A missing id is the only failure expected here; any other error
        # (e.g. the pickle not holding a mapping) should surface, not be hidden.
        not_valid_ids_100.append(i)

# Sanity check: requested ids, ids resolved to a URL, ids without a URL.
print(len(ids_100))
print(len(url_dict_100))
print(len(not_valid_ids_100))

4930
4094
836


In [30]:
# functions to get page text and parse out the general information section

def get_page_text(url, timeout=30):
    '''Retrieve the text content of the web page at ``url``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive server would stall the whole scrape.

    Returns:
        The text of the page with the individual text fragments stripped and
        joined by newlines, or None if the request failed (the error is printed).
    '''
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions
    except requests.exceptions.RequestException as e:
        # Timeouts, connection errors and HTTP errors all derive from RequestException.
        print(f"Error: {e}")
        return None

    # Parse the HTML and flatten it to plain text, one fragment per line.
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text(separator='\n', strip=True)

def extract_text_between_strings_second(text, start_string, end_string):
    '''
    Extracts the substring between the second occurrence of start_string and the
    first occurrence of end_string that follows it.

    Args:
        text: Text to search; may be None or empty.
        start_string: Marker whose second occurrence opens the wanted section
            (the second occurrence of "General Information" precedes the metadata).
        end_string: Marker that closes the wanted section.

    Returns:
        The text between the two markers with surrounding whitespace stripped,
        or None if text is empty/None or either marker cannot be found.
    '''
    if not text:
        return None

    first_index = text.find(start_string)
    if first_index == -1:  # start_string does not occur at all
        return None

    # Look for the second occurrence, starting right after the first one.
    start_index = text.find(start_string, first_index + len(start_string))
    if start_index == -1:  # start_string occurs only once
        return None

    # Search for the end marker only AFTER the start marker; searching from the
    # start marker's first character could match inside the marker itself and
    # produce an inverted (empty) slice.
    content_start = start_index + len(start_string)
    end_index = text.find(end_string, content_start)
    if end_index == -1:  # end_string does not follow the start marker
        return None

    return text[content_start:end_index].strip()

def retrieve_general_information(url):
    '''Fetch a score's IMSLP page and return its general information section.

    The section is the text between the second "General Information" marker and
    the following "Instrumentation" marker; whatever the extraction helper
    returns (including None) is passed straight back to the caller.
    '''
    page_text = get_page_text(url)
    return extract_text_between_strings_second(
        page_text,
        'General Information',
        'Instrumentation',
    )

def extract_text_between_strings_new(text, start_string, end_string):
    '''
    Extracts the substring between the first occurrence of start_string and the
    first occurrence of end_string that follows it.

    Args:
        text: Text to search; may be None or empty.
        start_string: Marker that opens the wanted section.
        end_string: Marker that closes the wanted section.

    Returns:
        The text between the two markers with surrounding whitespace stripped,
        or None if text is empty/None or either marker cannot be found.
    '''
    if not text:  # same guard as extract_text_between_strings_second
        return None

    start_index = text.find(start_string)
    if start_index == -1:  # start_string is not found
        return None

    # Search for the end marker only AFTER the start marker; searching from the
    # start marker's first character could match inside the marker itself and
    # produce an inverted (empty) slice.
    content_start = start_index + len(start_string)
    end_index = text.find(end_string, content_start)
    if end_index == -1:  # end_string does not follow the start marker
        return None

    return text[content_start:end_index].strip()

In [32]:
# find general information section

# Headings looked for in the general information text, in the order their
# values are sliced out ('Piece Style' must stay last: it is handled separately).
_GEN_INFO_ATTRIBUTES = [
    'Work Title', 'Alt\nernative\n.\nTitle', 'Name Translations', 'Name Aliases', 'Authorities', 'Composer',
    'Opus/Catalogue Number', 'I-Catalogue Number', 'Key', 'Movements/Sections', 'Year/Date of Composition',
    'First Pub\nlication', 'Librettist', 'Language', 'Copyright Information', 'Dedication', 'Average Duration',
    'Composer Time Period', 'Piece Style',
]

# Headings whose extracted value starts with an extra line that is dropped
# (presumably an abbreviated label printed by the page - TODO confirm on a live page).
_GEN_INFO_DELETABLE_HEADINGS = [
    'Opus/Catalogue Number', 'I-Catalogue Number', 'Movements/Sections', 'Year/Date of Composition',
    'First Pub\nlication', 'Average Duration', 'Composer Time Period',
]

# Headings that arrive split over several lines and are stored under a readable key.
_GEN_INFO_RENAMED_HEADINGS = {
    'Alt\nernative\n.\nTitle': 'Alternative Title',
    'First Pub\nlication': 'First Publication',
}

# Matches everything up to and including the first newline of a value.
_GEN_INFO_BAD_HEADING_PATTERN = re.compile(r'^.*\n')


def _parse_general_information(parsed_text, url):
    '''Turn one general information text into an attribute -> value dictionary.'''
    # An attribute counts as present when its heading occurs anywhere in the text.
    present_attributes = [a for a in _GEN_INFO_ATTRIBUTES if a in parsed_text]

    info_dic = {}
    for position, attribute in enumerate(present_attributes):
        if attribute == 'Piece Style':
            continue  # taken from the tail of the text below
        if position + 1 < len(present_attributes):
            # The value runs up to the next heading that is present.
            value = extract_text_between_strings_new(
                parsed_text, attribute, present_attributes[position + 1])
        else:
            # No later heading (only when 'Piece Style' is absent): the value is
            # the rest of the text, rather than relying on a hard-coded end word.
            value = parsed_text[parsed_text.find(attribute) + len(attribute):].strip()
        info_dic[attribute] = value

    # The piece style is whatever follows its heading (and one separator character).
    style_index = parsed_text.find('Piece Style')
    if style_index != -1:
        info_dic['Piece Style'] = parsed_text[style_index + len('Piece Style') + 1:]
    else:
        # Without this guard find() == -1 would slice from an arbitrary offset.
        info_dic['Piece Style'] = None
    info_dic['Instrumentation'] = 'Piano'
    info_dic['url'] = url

    # Drop the leading line of the affected values. This runs BEFORE the renaming
    # so that 'First Pub\nlication' is still stored under the key listed above.
    for heading in _GEN_INFO_DELETABLE_HEADINGS:
        value = info_dic.get(heading)
        if isinstance(value, str):  # values are None when extraction failed
            info_dic[heading] = _GEN_INFO_BAD_HEADING_PATTERN.sub('', value)

    # Store the split headings under their readable names.
    for raw_heading, clean_heading in _GEN_INFO_RENAMED_HEADINGS.items():
        if raw_heading in info_dic:
            info_dic[clean_heading] = info_dic.pop(raw_heading)

    return info_dic


def get_gen_info(url_dict):
    '''Scrape the general information of every page in ``url_dict``.

    Args:
        url_dict: Mapping of pdf id -> page URL.

    Returns:
        A dictionary mapping each pdf id to a dictionary of the attributes found
        on its page, plus 'Piece Style', a fixed 'Instrumentation' of 'Piano' and
        the page 'url'. Pages whose general information section could not be
        retrieved are left out and a message is printed for each of them.
    '''
    info = {}
    for pdf_id, url in url_dict.items():
        parsed_text = retrieve_general_information(url)  # one HTTP request per URL

        if parsed_text:
            info[pdf_id] = _parse_general_information(parsed_text, url)
        else:
            print("String not found or invalid input.")

    return info

In [33]:
# Scrape the general information for every 9-way piece.
# get_gen_info fetches each URL in url_dict_9 in turn, so this cell issues one HTTP request per piece.
info_9 = get_gen_info(url_dict_9)

In [12]:
# Scrape the general information for every 100-way piece that has a URL.
# get_gen_info fetches each URL in url_dict_100 in turn, so this cell issues one HTTP request per piece.
info_100 = get_gen_info(url_dict_100)

In [35]:
# Persist the scraped 9-way metadata dictionary as a pickle file.
with open('9_way_metadata.pkl', mode='wb') as f:
    pickle.dump(info_9, f)

In [13]:
# Persist the scraped 100-way metadata dictionary as a pickle file.
with open('100_way_metadata.pkl', mode='wb') as f:
    pickle.dump(info_100, f)

In [40]:
# Tabulate the scraped metadata: one row per pdf id, one column per attribute.
df_9 = pd.DataFrame.from_dict(data=info_9, orient='index')
df_9.info()  # prints the per-column non-null counts

# Export the table as CSV.
df_9.to_csv(path_or_buf='9_way_metadata.csv')

<class 'pandas.core.frame.DataFrame'>
Index: 882 entries, 53763 to 20432
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Work Title                882 non-null    object
 1   Composer                  882 non-null    object
 2   Opus/Catalogue Number     870 non-null    object
 3   I-Catalogue Number        881 non-null    object
 4   Key                       564 non-null    object
 5   Movements/Sections        846 non-null    object
 6   Composer Time Period      882 non-null    object
 7   Piece Style               882 non-null    object
 8   Instrumentation           882 non-null    object
 9   url                       882 non-null    object
 10  Alternative Title         882 non-null    object
 11  Year/Date of Composition  762 non-null    object
 12  Name Translations         322 non-null    object
 13  Authorities               309 non-null    object
 14  Average Duration         