In [75]:
import pandas as pd
import music21

# Absolute paths to the external notation programs.
# NOTE(review): these are machine-specific; LILYPOND_PATH is defined here but
# not referenced anywhere else in the visible notebook.
MSCORE_PATH = '/usr/bin/mscore'
LILYPOND_PATH = '/usr/bin/lilypond'

# music21 user-level settings object; the assignments below go through it.
settings = music21.environment.UserSettings()

# Use the same MuseScore binary for both the MusicXML and the direct-PNG settings.
settings['musicxmlPath'] = MSCORE_PATH
settings['musescoreDirectPNGPath'] = MSCORE_PATH

# Load music21's IPython extension into this kernel.
%load_ext music21.ipython21

In [239]:
# Load the three metadata tables (Mutopia, piano-midi, KernScores),
# in that order, from their CSV files.
mutopia_data, pianomidi_data, kern_data = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/mutopia.csv',
        'data/pianomidi.csv',
        'data/kernscores.csv',
    )
]

In [271]:
# Rewrite the '.krn&f=midi' suffix inside each 'files' entry to '.mid'.
# Rows with a missing 'files' value are only dropped in a later cell, so
# non-string entries (e.g. NaN) are passed through untouched here instead of
# raising AttributeError on `.replace`.
kern_data['files'] = kern_data['files'].map(
    lambda x: x.replace('.krn&f=midi', '.mid') if isinstance(x, str) else x
)

In [287]:
# Exact 'instruments' values that are kept from the Mutopia table.
matched_instruments = [
    'Piano',
    'Harpsichord,Piano',
    'Harpsichord,Piano,Clavichord',
    'Piano Duet',
    'Harpsichord,Clavichord,Piano',
    'Piano,Harpsichord',
]

# Keep only rows that have a 'files' entry and whose instrumentation is
# one of the values listed above.
mutopia_data = mutopia_data.loc[
    ~mutopia_data['files'].isnull()
    & mutopia_data['instruments'].isin(matched_instruments)
]

In [244]:
# Discard rows that have no 'files' entry in the remaining two tables.
pianomidi_data = pianomidi_data.loc[~pianomidi_data['files'].isnull()]
kern_data = kern_data.loc[~kern_data['files'].isnull()]

In [330]:
from music21.key import Key
from music21.interval import Interval
from music21.pitch import Pitch
from music21.converter import parseFile

def normalize(score, key):
    """Transpose ``score`` in place to C (major keys) or A (minor keys).

    The transposition interval is the one from ``key.tonic`` to the pitch
    'C' when ``key.mode`` is 'major', or to the pitch 'A' when it is 'minor'.
    A line describing the transposition is printed before it is applied.

    Args:
        score: object exposing ``transpose(interval, inPlace=True)``.
        key: object exposing ``mode``, ``tonic`` and ``transpose(interval)``.

    Raises:
        ValueError: if ``key.mode`` is neither 'major' nor 'minor'.
            (Previously this case failed with an UnboundLocalError.)
    """
    # Target tonic for each supported mode.
    target_tonics = {'major': 'C', 'minor': 'A'}

    if key.mode not in target_tonics:
        raise ValueError(
            'cannot normalize key {0}: unsupported mode {1!r}'.format(str(key), key.mode)
        )

    i = Interval(key.tonic, Pitch(target_tonics[key.mode]))

    print('transposing {0} from {1} to {2}'.format(str(score), str(key), key.transpose(i)))
    score.transpose(i, inPlace=True)

In [338]:
import os
import json

def _get_output_path(path, output_dir=None):
    """Return the '.xml' output path for the input file at ``path``.

    The result is ``output_dir`` joined with the base name of ``path`` with
    its extension replaced by '.xml'. When ``output_dir`` is omitted, the
    module-level ``OUTPUT_DIR`` is used, as before.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR

    filename = os.path.basename(path)
    base_filename, ext = os.path.splitext(filename)
    return os.path.join(output_dir, base_filename + '.xml')


def _parse_file_descriptors(file_descriptor_string):
    """Parse the textual list of file descriptors stored in a 'files' cell.

    The text is first read as a Python literal, which copes with values that
    contain apostrophes. If that fails, fall back to the original approach of
    swapping single quotes for double quotes and parsing the result as JSON.
    """
    import ast

    try:
        return ast.literal_eval(file_descriptor_string)
    except (ValueError, SyntaxError):
        return json.loads(file_descriptor_string.replace("'", '"'))


def process_file(file_descriptor_string, base_dir, output_dir):
    """Parse, key-normalize and export one score as MusicXML.

    Args:
        file_descriptor_string: text form of a list of descriptors; only the
            first descriptor is used, and its 'path' and 'url' entries are read.
        base_dir: directory holding the raw file. The descriptor's 'path',
            with every 'full/' removed, is joined onto it.
        output_dir: directory the MusicXML file is written into. (Previously
            this argument was ignored in favour of a global ``OUTPUT_DIR``.)

    Returns:
        dict with 'processed_file_path', 'raw_file_path', 'source_url' and
        'key' (the string form of the analyzed key, before transposition).
    """
    file_descriptor = _parse_file_descriptors(file_descriptor_string)[0]

    file_path = os.path.join(base_dir, file_descriptor['path'].replace('full/', ''))

    score = parseFile(file_path)

    output_path = _get_output_path(file_path, output_dir)
    key_signature = score.analyze('key')
    normalize(score, key_signature)

    score.write('musicxml', output_path)

    return {
        'processed_file_path': output_path,
        'raw_file_path': file_path,
        'source_url': file_descriptor['url'],
        'key': str(key_signature)
    }

In [314]:
import time

def process_files(dataframe, files_dir, output_dir):
    """Run ``process_file`` on every not-yet-processed row of ``dataframe``.

    The frame is modified in place. Missing bookkeeping columns
    ('raw_file_path', 'processed_file_path', 'normalized_file_path',
    'source_url', 'processed') are created first. For each row whose
    'processed' value is falsy, the row's 'files' value is passed to
    ``process_file``; on success the returned paths, URL and key are stored
    and 'processed' is set to True. A failing row is reported and skipped.

    Args:
        dataframe: pandas DataFrame with a 'files' column.
        files_dir: passed to ``process_file`` as ``base_dir``.
        output_dir: passed to ``process_file`` as ``output_dir``.
    """
    bookkeeping_defaults = (
        ('raw_file_path', None),
        ('processed_file_path', None),
        ('normalized_file_path', None),
        ('source_url', None),
        ('processed', False),
    )
    for column, default in bookkeeping_defaults:
        if column not in dataframe:
            dataframe[column] = default

    # Blank the key only for rows that still need processing, so that keys
    # stored by an earlier (partial) run are not lost when resuming.
    if 'key' in dataframe:
        dataframe['key'] = [
            existing if done else ''
            for existing, done in zip(dataframe['key'], dataframe['processed'])
        ]
    else:
        dataframe['key'] = ''

    time_started = time.time()
    total_rows = len(dataframe.index)

    print('Processing files in dataframe. Total items: {0}'.format(total_rows))

    result_columns = ('raw_file_path', 'processed_file_path', 'source_url', 'key')

    for index, row in dataframe.iterrows():
        if row['processed']:
            continue

        try:
            processing_result = process_file(row['files'], files_dir, output_dir)
            # DataFrame.set_value no longer exists in current pandas; `.at`
            # is the scalar setter that replaces it.
            for column in result_columns:
                dataframe.at[index, column] = processing_result[column]
            dataframe.at[index, 'processed'] = True
            print('Processed item #{0}. Running time: {1}'.format(index, time.time() - time_started))
        except Exception as e:
            # Best effort: report the failure and continue with the next row.
            print('Error processing item #{0} ({1}). Skipping...'.format(index, str(e)))

In [339]:
# Mark every KernScores row as unprocessed, then process the whole table.
# Raw files are looked up under 'data/files_kernscores'; 'data/normalized'
# is passed as the output directory argument.
kern_data['processed'] = False
process_files(kern_data, 'data/files_kernscores', 'data/normalized')

Processing files in dataframe. Total items: 265
transposing <music21.stream.Score 0x7fcbd44ae438> from a minor to a minor
Processed item #3. Running time: 0.5236232280731201
transposing <music21.stream.Score 0x7fcbd4183c50> from c# minor to a minor
Processed item #9. Running time: 6.575222969055176
transposing <music21.stream.Score 0x7fcbcf0dc2b0> from b minor to a minor
Processed item #10. Running time: 19.76342248916626
transposing <music21.stream.Score 0x7fcbcfecf748> from B- major to C major
Processed item #11. Running time: 67.3283269405365
transposing <music21.stream.Score 0x7fcbd60806d8> from g# minor to a minor
Processed item #12. Running time: 87.92300391197205
transposing <music21.stream.Score 0x7fcbd5453c88> from b minor to a minor
Processed item #13. Running time: 97.48641204833984
transposing <music21.stream.Score 0x7fcbbe6fe080> from A major to C major
Processed item #14. Running time: 117.55440378189087
transposing <music21.stream.Score 0x7fcbd58c95c0> from g# minor to a

In [341]:
# Mark every piano-midi row as unprocessed, then process the whole table.
# Raw files are looked up under 'data/files_pianomidi'; 'data/normalized'
# is passed as the output directory argument.
pianomidi_data['processed'] = False
process_files(pianomidi_data, 'data/files_pianomidi', 'data/normalized')

Processing files in dataframe. Total items: 333
transposing <music21.stream.Score 0x7fcbd5a079e8> from F major to C major
Processed item #0. Running time: 5.6664252281188965
transposing <music21.stream.Score 0x7fcbd62282b0> from C major to C major
Error processing item #1 (cannot place element <music21.tempo.MetronomeMark Quarter=199.69> with start/end 304.0/304.0 within any measures). Skipping...
transposing <music21.stream.Score 0x7fcbd5885f28> from G major to C major
Processed item #2. Running time: 52.506197690963745
transposing <music21.stream.Score 0x7fcbcccc1908> from C major to C major
Processed item #3. Running time: 60.36204218864441
transposing <music21.stream.Score 0x7fcbc51fb358> from C major to C major
Processed item #4. Running time: 79.69125056266785
transposing <music21.stream.Score 0x7fcbce703898> from C major to C major
Processed item #5. Running time: 82.23900580406189
transposing <music21.stream.Score 0x7fcbd6ca1d68> from C major to C major
Processed item #6. Runni

In [342]:
# Mark every Mutopia row as unprocessed, then process the whole table.
# Raw files are looked up under 'data/files_mutopia'; 'data/normalized'
# is passed as the output directory argument.
mutopia_data['processed'] = False
process_files(mutopia_data, 'data/files_mutopia', 'data/normalized')

Processing files in dataframe. Total items: 598
transposing <music21.stream.Score 0x7fcbd4de1e48> from G major to C major
Processed item #1. Running time: 5.784152269363403
transposing <music21.stream.Score 0x7fcbcf636940> from d minor to a minor
Processed item #2. Running time: 37.26753330230713
transposing <music21.stream.Score 0x7fcbce5b5da0> from C major to C major
Processed item #7. Running time: 42.24971079826355
transposing <music21.stream.Score 0x7fcbcfbe42e8> from D major to C major
Processed item #8. Running time: 43.405428647994995
transposing <music21.stream.Score 0x7fcbcf36b2e8> from G major to C major
Processed item #9. Running time: 44.52947974205017
transposing <music21.stream.Score 0x7fcbcf5934e0> from f minor to a minor
Processed item #11. Running time: 78.80513548851013
transposing <music21.stream.Score 0x7fcbc77fdf98> from F major to C major
Processed item #12. Running time: 79.8423924446106
transposing <music21.stream.Score 0x7fcbcf205e10> from G major to C major
P

In [300]:
from tqdm import tqdm, tqdm_pandas

def _get_key(key_string):
    """Build a ``Key`` from the part of ``key_string`` before its first space."""
    tonic_name, _, _ = key_string.partition(' ')
    return Key(tonic_name)

# Re-normalize already exported Mutopia scores, using the key stored in the
# 'key' column instead of analyzing each score again.
# NOTE(review): process_file() as defined in this notebook already calls
# normalize() before writing its output. If this cell runs after that version,
# each file is transposed a second time by the interval derived from the
# stored key - confirm the intended execution order before re-running.
total_files = len(mutopia_data.index)
processed_files = 0

for index, row in mutopia_data.iterrows():
    # The first 200 rows are skipped unconditionally (whether or not they were
    # processed) and still counted.
    # NOTE(review): 200 looks like a manual resume offset - confirm.
    if processed_files < 200:
        processed_files += 1
        continue
    
    # Rows that were never successfully processed are skipped and not counted.
    if not row['processed']:
        continue
        
    path = row['processed_file_path']
    key = _get_key(row['key'])
    score = parseFile(path)
    # Write the score as parsed to '<path>.bak' before transposing it.
    score.write('musicxml', path + '.bak')
    normalize(score, key)
    # Overwrite the original output file with the transposed score.
    score.write('musicxml', path)
    processed_files += 1
    
    # Report progress every 10 counted rows.
    if processed_files % 10 == 0:
        print('processed {0} of {1}'.format(processed_files, total_files))

processed 210 of 598
processed 220 of 598
processed 230 of 598
processed 240 of 598
processed 250 of 598
processed 260 of 598
processed 270 of 598
processed 280 of 598
processed 290 of 598
processed 300 of 598
processed 310 of 598
processed 320 of 598
processed 330 of 598
processed 340 of 598
processed 350 of 598
processed 360 of 598
processed 370 of 598
processed 380 of 598
processed 390 of 598
processed 400 of 598
processed 410 of 598
processed 420 of 598
processed 430 of 598
processed 440 of 598
processed 450 of 598
processed 460 of 598
processed 470 of 598
processed 480 of 598
processed 490 of 598
processed 500 of 598
processed 510 of 598
processed 520 of 598
processed 530 of 598
processed 540 of 598
processed 550 of 598
processed 560 of 598
processed 570 of 598
processed 580 of 598


In [376]:
# Columns kept in the combined metadata table.
columns = ['name',
           'composer',
           'genre',
           'key',
           'year',
           'instruments',
           'details_url',
           'tags',
           'catalogue_code',
           'file_urls',
           'raw_file_path',
           'source_url',
           'processed_file_path']

# Stack the three tables. ignore_index=True gives the result a fresh 0..n-1
# index; the previous bare `all_metadata.reindex()` call took no arguments and
# its return value was discarded, so the index labels carried over from the
# three source frames were left in place.
all_metadata = pd.concat(
    [kern_data[columns], pianomidi_data[columns], mutopia_data[columns]],
    ignore_index=True,
)

all_metadata.to_csv('all_metadata.csv')

In [414]:
# Drop the 'uid' column if it is present. The column selection that builds
# all_metadata does not include 'uid', so an unconditional `del` raises
# KeyError when the notebook is run top to bottom.
if 'uid' in all_metadata:
    del all_metadata['uid']

In [416]:
# Write the combined metadata again, this time without the index column,
# overwriting 'all_metadata.csv'.
all_metadata.to_csv('all_metadata.csv', index=False)