In [75]:
import pandas as pd
import music21

# Locations of the external notation programs used by this notebook.
MSCORE_PATH = '/usr/bin/mscore'
# NOTE(review): LILYPOND_PATH is not referenced in the cells visible here —
# confirm whether a music21 setting was meant to be pointed at it.
LILYPOND_PATH = '/usr/bin/lilypond'

# music21 user-settings object; the assignments below configure it.
settings = music21.environment.UserSettings()

# Use the same binary for both the MusicXML viewer setting and the direct-PNG
# rendering setting.
settings['musicxmlPath'] = MSCORE_PATH
settings['musescoreDirectPNGPath'] = MSCORE_PATH

# Load music21's IPython extension (presumably enables inline score
# rendering in the notebook — TODO confirm).
%load_ext music21.ipython21

In [35]:
# Load the Mutopia catalogue; the 'files' column is what process_file() parses.
mutopia_data = pd.read_csv('data/mutopia.csv')
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks
# like an index that was written out with the CSV — consider index_col=0; confirm.
mutopia_data.head()

Unnamed: 0,year,genre,instruments,files,tags,details_url,file_urls,catalogue_code,name,key,composer
0,c. 1846,Song,Voice and Piano,[{'checksum': '0459962270e7f9775a3c7aaa9449c76...,,piece-info.cgi?id=439,http://www.mutopiaproject.org/ftp/AbtF/swallow...,,When the Swallows Homeward Fly (Agathe),,F. Abt
1,,Classical,Piano,[{'checksum': 'dcb1f12009c3b7e068a21c5e15395b6...,,piece-info.cgi?id=897,http://www.mutopiaproject.org/ftp/AdamA/gisell...,,Giselle - Pas de deux (1er Acte),,A. Adam
2,1887,Romantic,Piano,[{'checksum': '5172ce1a911e866807370918c869399...,,piece-info.cgi?id=898,http://www.mutopiaproject.org/ftp/AlbenizIMF/O...,71.0,Rumores de la Caleta,,I. M. F. Albéniz
3,,Hymn,Voice and Piano,[{'checksum': 'b10215f7bbb6e39ebaf8617d5948da6...,,piece-info.cgi?id=640,http://www.mutopiaproject.org/ftp/AdamA/minuit...,,Minuit Chrétiens,,A. Adam
4,,Song,Voice and Piano,[{'checksum': 'f5712e5ceb550e6a038dd6c5e6c3c12...,,piece-info.cgi?id=440,http://www.mutopiaproject.org/ftp/AdamsS/bluem...,,The Blue Alsatian Mountains,,S. Adams


In [120]:
import os
import json
from music21 import converter

OUTPUT_DIR = 'data/processed/'

def _get_output_path(path):
    filename = os.path.basename(path)
    base_filename, ext = os.path.splitext(filename)
    return os.path.join(OUTPUT_DIR, base_filename + '.xml')
        
def process_file(file_descriptor_string, base_dir):
    """Convert the first file listed in a descriptor string to MusicXML.

    Args:
        file_descriptor_string: Text form of a list of file-descriptor dicts
            (the value of a row's ``files`` column). The first descriptor
            must provide the ``path`` and ``url`` keys.
        base_dir: Directory the descriptor's ``path`` is resolved against.

    Returns:
        dict with the keys ``processed_file_path`` (the written MusicXML
        file), ``raw_file_path`` (the parsed input file), ``source_url``
        (the descriptor's ``url``) and ``key`` (``str()`` of the result of
        ``score.analyze('key')``).

    Raises:
        ValueError: If the descriptor is not a string, cannot be parsed, or
            contains no descriptors.
        Exception: Anything raised by music21 while parsing, analysing or
            writing the score is propagated unchanged.
    """
    import ast  # local import keeps this cell runnable on its own

    if not isinstance(file_descriptor_string, str):
        # Missing cells arrive from pandas as NaN (a float), not as text.
        raise ValueError(
            'Expected a file descriptor string, got {0!r}'.format(file_descriptor_string))

    try:
        # The column holds Python-literal text (single-quoted dicts), which
        # literal_eval reads directly -- including values that themselves
        # contain apostrophes or None/True/False.
        file_descriptors = ast.literal_eval(file_descriptor_string)
    except (ValueError, SyntaxError):
        # Fall back to the previous behaviour for JSON-style input
        # (e.g. null/true), which is not a valid Python literal.
        file_descriptors = json.loads(file_descriptor_string.replace("'", '"'))

    if not file_descriptors:
        raise ValueError(
            'No file descriptors found in {0!r}'.format(file_descriptor_string))
    file_descriptor = file_descriptors[0]

    # Every 'full/' occurrence is removed from the stored path (presumably a
    # download-directory prefix that is absent under base_dir -- confirm).
    file_path = os.path.join(base_dir, file_descriptor['path'].replace('full/', ''))

    score = converter.parseFile(file_path)

    output_path = _get_output_path(file_path)
    # Make sure the target directory exists so the write below cannot fail
    # merely because this is the first run.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    key_signature = score.analyze('key')
    score.write('musicxml', output_path)

    return {
        'processed_file_path': output_path,
        'raw_file_path': file_path,
        'source_url': file_descriptor['url'],
        'key': str(key_signature)
    }

In [None]:
import time

def process_files(dataframe, files_dir):
    """Run process_file() on every row and record the results in place.

    For each row, ``row['files']`` is passed to ``process_file`` together
    with ``files_dir``. On success the returned ``raw_file_path``,
    ``processed_file_path``, ``source_url`` and ``key`` values are written
    to the row and ``processed`` is set to True. On failure the error is
    printed and the row is skipped (best effort: one bad file must not stop
    the batch).

    Args:
        dataframe: pandas DataFrame with a ``files`` column. It is modified
            in place; missing result columns are added.
        files_dir: Base directory handed to ``process_file``.

    Returns:
        None. Results are stored in ``dataframe``.
    """
    text_columns = ('raw_file_path', 'processed_file_path', 'source_url', 'key')

    for column in text_columns:
        if column not in dataframe:
            dataframe[column] = None
        elif dataframe[column].dtype != object:
            # A column read from CSV with only missing values is float;
            # make it object so string results can be stored without
            # dtype-mismatch warnings or errors.
            dataframe[column] = dataframe[column].astype(object)

    if 'processed' not in dataframe:
        dataframe['processed'] = False

    time_started = time.time()
    total_rows = len(dataframe.index)

    print('Processing files in dataframe. Total items: {0}'.format(total_rows))

    for index, row in dataframe.iterrows():
        try:
            processing_result = process_file(row['files'], files_dir)
            # DataFrame.set_value was removed in pandas 1.0; .at is the
            # supported scalar setter.
            for column in text_columns:
                dataframe.at[index, column] = processing_result[column]
            dataframe.at[index, 'processed'] = True
            print('Processed item #{0}. Running time: {1}'.format(index, time.time() - time_started))
        except Exception as e:
            # Report the cause instead of hiding it, then carry on.
            print('Error processing item #{0}: {1!r}. Skipping...'.format(index, e))

In [None]:
# Convert every catalogue entry; process_files() writes its results into
# mutopia_data in place and prints one progress/error line per row.
process_files(mutopia_data, 'data/files_mutopia')

Processing files in dataframe
...processed 5 rows of 740. Running time: 46.59191298484802
...processed 10 rows of 740. Running time: 61.05108451843262
...processed 15 rows of 740. Running time: 101.8463146686554
...processed 20 rows of 740. Running time: 150.3099148273468
...processed 25 rows of 740. Running time: 158.64779353141785
...processed 30 rows of 740. Running time: 178.4195442199707
...processed 35 rows of 740. Running time: 195.325847864151
...processed 40 rows of 740. Running time: 211.00888228416443
...processed 45 rows of 740. Running time: 225.91394567489624
...processed 50 rows of 740. Running time: 244.21918749809265
...processed 55 rows of 740. Running time: 281.6947383880615
Error processing item #59. Skipping...
...processed 60 rows of 740. Running time: 300.0761272907257
Error processing item #60. Skipping...
Error processing item #61. Skipping...
Error processing item #62. Skipping...
Error processing item #63. Skipping...
Error processing item #64. Skipping...
..