In [1]:
import os
import re
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Read the composer page URLs, one per line, from top50composer.txt.
# Blank lines (for example a trailing empty line at the end of the file) are
# skipped: an empty string would otherwise be passed to requests.get() by
# find_score_url, which rejects a URL without a scheme.
with open('top50composer.txt', 'r') as file:
    composers = [line.strip() for line in file if line.strip()]

In [3]:
# code the previous team uses to find all the urls for a composer

def find_score_url(url):
    """
    Collect the piece links listed on a composer page.

    This function takes in one argument:
    1) url, a composer url page to parse

    Returns a list of piece hrefs (exactly as they appear in the page markup)
    gathered from the first page and from every follow-up page reported by
    find_next_score_url.

    If the first page contains no <div class="jq-ui-tabs"> element, the string
    "badlink" is returned instead of a list. Callers must check for that
    sentinel before extending a list with the result.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    # on the first page only the links inside the first jq-ui-tabs container are taken
    result = soup.find_all('div', class_="jq-ui-tabs")
    if not result:
        return "badlink"
    scores = result[0].find_all('a', class_='categorypagelink')
    url_list = [link.get('href') for link in scores]
    # check sub-pages: every categorypagelink anchor on each of them is taken
    # NOTE(review): unlike the first page, this is not restricted to the
    # jq-ui-tabs container - confirm that this difference is intended
    for page in find_next_score_url(url):
        r = requests.get(page)
        soup = BeautifulSoup(r.text, "html.parser")
        url_list += [link.get('href') for link in soup.find_all('a', class_='categorypagelink')]
    return url_list

def find_next_score_url(url):
    """
    Helper function for find_score_url.

    Starting from url, repeatedly follows the "next 200" paging link
    (an <a class="categorypaginglink"> whose text is exactly 'next 200') and
    returns the list of follow-up page URLs in the order they were reached.
    The starting url itself is not part of the result; an empty list means
    the first page has no such link.

    The href of the paging link is resolved against the page it was found on,
    so relative and absolute hrefs both work and the scheme of the starting
    url is kept.
    """
    result = []
    visited = {url}
    while True:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "html.parser")
        url_list = [link.get('href') for link in soup.find_all('a', class_='categorypaginglink', string='next 200')]
        if not url_list:
            break
        # when several matching links exist, the last one is used
        url = urljoin(url, url_list[-1])
        # stop if the paging link leads back to a page already fetched,
        # otherwise the loop would never end
        if url in visited:
            break
        visited.add(url)
        result.append(url)
    return result

# Gather the piece links of every composer page.
urls = []
for composer in composers:
    piece_links = find_score_url(composer)
    # find_score_url returns the string "badlink" when it cannot find the
    # piece list; `urls += "badlink"` would append its seven letters as
    # seven separate bogus entries, so such pages are skipped.
    if piece_links == "badlink":
        print(f"Skipping composer page without a piece list: {composer}")
        continue
    urls += piece_links

# prefix every collected href with the site address
urls = ['https://imslp.org' + s for s in urls]
print(urls[0:10])

['https://imslp.org/wiki/Ach_bleib_bei_uns,_Herr_Jesu_Christ,_BWV_649_(Bach,_Johann_Sebastian)', 'https://imslp.org/wiki/Ach_bleib_bei_uns,_Herr_Jesu_Christ,_BWV_253_(Bach,_Johann_Sebastian)', 'https://imslp.org/wiki/Ach_Gott_und_Herr,_BWV_692_(Bach,_Johann_Sebastian)', 'https://imslp.org/wiki/Ach_Gott_und_Herr,_BWV_714_(Bach,_Johann_Sebastian)', 'https://imslp.org/wiki/Ach_Gott_und_Herr,_BWV_255_(Bach,_Johann_Sebastian)', 'https://imslp.org/wiki/Ach_Gott_und_Herr,_BWV_692a_(Bach,_Johann_Sebastian)', 'https://imslp.org/wiki/Ach_Gott_vom_Himmel_sieh_darein,_BWV_741_(Bach,_Johann_Sebastian)', 'https://imslp.org/wiki/Ach_Gott,_erh%C3%B6r_mein_Seufzen_und_Wehklagen,_BWV_254_(Bach,_Johann_Sebastian)', 'https://imslp.org/wiki/Ach_Gott,_tu_dich_erbarmen,_BWV_1109_(Bach,_Johann_Sebastian)', 'https://imslp.org/wiki/Ach_Gott,_vom_Himmel_sieh_darein,_BWV_2_(Bach,_Johann_Sebastian)']


In [4]:
# functions to get the page text and parse out the general information section

def get_page_text(url, timeout=30):
    """
    Download a web page and return its text content.

    Arguments:
    1) url, the page to download
    2) timeout, number of seconds to wait for the server before giving up
       (passed to requests.get). Without a timeout a single stalled
       connection would block the whole scraping loop indefinitely.

    Returns the page text, with the individual text fragments stripped and
    joined by newlines, or None if the request failed. Failures (connection
    errors, timeouts, HTTP error status codes) are printed, not raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Check for any request errors

        soup = BeautifulSoup(response.content, 'html.parser')
        # Extract the text of the page, one stripped fragment per line
        text = soup.get_text(separator='\n', strip=True)

        return text
    except requests.exceptions.RequestException as e:
        # timeouts are RequestException subclasses, so they end up here too
        print(f"Error: {e}")
        return None

def extract_text_between_strings_second(text, start_string, end_string):
    """
    Return the stripped text lying between the SECOND occurrence of
    start_string and the first end_string found at or after that occurrence.

    Returns None when text is empty or None, when start_string does not occur
    a second time, or when end_string is not found from that point on.
    (The second occurrence of "General Information" is the one that precedes
    the metadata.)
    """
    if not text:
        return None

    # step over the first occurrence, then search for the next one
    search_from = text.find(start_string) + len(start_string)
    second_pos = text.find(start_string, search_from)
    if second_pos < 0:
        return None

    stop_pos = text.find(end_string, second_pos)
    if stop_pos < 0:
        return None

    content_start = second_pos + len(start_string)
    return text[content_start:stop_pos].strip()

def retrieve_general_information(url):
    """
    Fetch the page at url and return the text found between the second
    'General Information' marker and the following 'Instrumentation' marker.

    Returns None when the page could not be fetched or the markers are missing.
    """
    page_text = get_page_text(url)
    return extract_text_between_strings_second(page_text, 'General Information', 'Instrumentation')

In [6]:
# parse the General Information section of every piece page into a dataframe

def extract_text_between_strings_new(text, start_string, end_string):
    """
    Return the stripped text between the FIRST occurrence of start_string and
    the first end_string found at or after that occurrence.

    Returns None if start_string is not in text, or if end_string is not
    found from the position of start_string onwards.
    """
    head = text.find(start_string)
    if head < 0:  # start_string is missing
        return None

    # the end marker is searched from where the start marker begins
    tail = text.find(end_string, head)
    if tail < 0:  # end_string is missing
        return None

    content_start = head + len(start_string)
    return text[content_start:tail].strip()
        
# Headings looked for in the general information text. The parsing loop uses
# the next heading from this list that is present in the text as the end
# marker of the current value, so the order of the entries matters.
# NOTE(review): the '\n' inside some headings presumably mirrors how the page
# text splits those headings across lines - verify against a real page.
attribute_list = [
    'Work Title',
    'Alt\nernative\n.\nTitle',
    'Name Translations',
    'Name Aliases',
    'Authorities',
    'Composer',
    'Opus/Catalogue Number',
    'I-Catalogue Number',
    'Key',
    'Movements/Sections',
    'Year/Date of Composition',
    'First Pub\nlication',
    'Copyright Information',
    'Dedication',
    'Average Duration',
    'Composer Time Period',
    "Piece Style",
]

# Headings whose extracted value gets its first line removed afterwards.
list_of_deletable_headings = [
    'Opus/Catalogue Number',
    'I-Catalogue Number',
    'Movements/Sections',
    'Year/Date of Composition',
    'First Pub\nlication',
    'Average Duration',
    'Composer Time Period',
]

# Matches everything from the start of the string up to and including the
# first newline, i.e. the first line of a multi-line value.
bad_heading_pattern = r'^.*\n'

def _parse_general_information(parsed_text, url):
    """
    Split one general information text into a {heading: value} dictionary.

    Arguments:
    1) parsed_text, the non-empty text returned by retrieve_general_information
    2) url, the page the text came from (stored under the 'url' key)

    The result has one key per entry of attribute_list (value None when the
    heading does not occur in the text) plus 'url'. A value is the text
    between its heading and the next heading from attribute_list that is
    present in the text; the value of the last present heading is the rest of
    the text. For the headings in list_of_deletable_headings the first line
    of a multi-line value is removed.

    NOTE(review): headings are detected by plain substring search, so a
    heading word that also occurs inside a value (e.g. 'Key' within a title)
    can shift the boundaries - parsing the page's HTML table would be more
    robust.
    """
    # headings that occur in the text, kept in attribute_list order
    present_attributes = [attribute for attribute in attribute_list if attribute in parsed_text]

    info_dic = dict.fromkeys(attribute_list)  # every heading starts out as None
    for position, attribute in enumerate(present_attributes):
        if position + 1 < len(present_attributes):
            # the value runs up to the next heading that is present
            info_dic[attribute] = extract_text_between_strings_new(
                parsed_text, attribute, present_attributes[position + 1])
        else:
            # the last present heading has no following heading to serve as an
            # end marker, so its value is everything up to the end of the text
            value_start = parsed_text.find(attribute) + len(attribute)
            info_dic[attribute] = parsed_text[value_start:].strip()

    info_dic['url'] = url
    for heading in list_of_deletable_headings:
        if info_dic[heading]:
            info_dic[heading] = re.sub(bad_heading_pattern, '', info_dic[heading])
    return info_dic


info = []
for url in urls:
    parsed_text = retrieve_general_information(url)

    if parsed_text:
        info.append(_parse_general_information(parsed_text, url))
    else:
        print("String not found or invalid input.")

# convert the list of dictionaries to a pandas dataframe
df_info = (
    pd.DataFrame(info)
    # drop unnecessary column; errors='ignore' keeps this working when no
    # page could be parsed and the frame therefore has no columns
    .drop(columns=['Alt\nernative\n.\nTitle'], errors='ignore')
    .rename(columns={'First Pub\nlication': 'First Publication'})  # rename column
)
print(df_info.head())

String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or invalid input.
String not found or 

In [8]:
# examine the dataframe
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report
df_info.info()

# save the dataframe to a csv file
df_info.to_csv('imslp_piano_music_new.csv', index=False)

print(df_info)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10007 entries, 0 to 10006
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Work Title                10007 non-null  object
 1   Name Translations         1584 non-null   object
 2   Name Aliases              1209 non-null   object
 3   Authorities               1595 non-null   object
 4   Composer                  10007 non-null  object
 5   Opus/Catalogue Number     7987 non-null   object
 6   I-Catalogue Number        10006 non-null  object
 7   Key                       5723 non-null   object
 8   Movements/Sections        7694 non-null   object
 9   Year/Date of Composition  6177 non-null   object
 10  First Publication         6771 non-null   object
 11  Copyright Information     124 non-null    object
 12  Dedication                2023 non-null   object
 13  Average Duration          3129 non-null   object
 14  Composer Time Period  

In [7]:
# NOTE(review): looks like a leftover scratch cell (nothing else in the
# notebook uses its result) - consider deleting it
1 + 1

2