In [2]:
import pandas as pd
import numpy as np
import datetime
import json
import time
import os
import music21 as m21
from music21.key import Key
from music21.interval import Interval
from music21.pitch import Pitch
from music21.converter import parseFile

# Locations of the external notation tools on this machine.
MSCORE_PATH = '/usr/bin/mscore'
# NOTE(review): LILYPOND_PATH is not referenced anywhere else in the visible notebook.
LILYPOND_PATH = '/usr/bin/lilypond'

# music21 user settings object; the assignments below store the MuseScore binary
# path under the 'musicxmlPath' and 'musescoreDirectPNGPath' keys.
settings = m21.environment.UserSettings()

settings['musicxmlPath'] = MSCORE_PATH
settings['musescoreDirectPNGPath'] = MSCORE_PATH

# Load the music21 IPython extension (IPython magic, not plain Python).
%load_ext music21.ipython21

In [3]:
import importlib
import logging

# Reload the logging module so that re-running this cell starts from a clean
# state. `importlib.reload` replaces `imp.reload`: the `imp` module is
# deprecated and no longer exists in Python 3.12+.
importlib.reload(logging)
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

# Root logger; basicConfig above set its level to DEBUG. The handler that
# basicConfig installed is removed so only the two handlers below remain.
logger = logging.getLogger()
logger.handlers.clear()

# File handler: one timestamped log file per run, receiving DEBUG and above.
log_filename = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_processing.log')
# FileHandler does not create missing directories, so make sure the target exists.
os.makedirs('../logs', exist_ok=True)
fh = logging.FileHandler('../logs/' + log_filename)
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

# Console handler: only INFO and above is shown in the notebook output.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
logger.addHandler(ch)

# Smoke test: 'debug' should reach only the file, 'info' both file and console.
logger.debug('debug')
logger.info('info')

info


In [3]:
def extract_filename(row):
    """Turn the row's 'files' descriptor into a Series with 'filename' and 'url'.

    The descriptor string is read from ``row['files']`` and decoded by
    ``parse_descriptor``.
    """
    name, link = parse_descriptor(row['files'])
    fields = {'filename': name, 'url': link}
    return pd.Series(fields)

In [4]:
def parse_descriptor(files_string):
    """Extract the file name and download URL from a file-descriptor string.

    ``files_string`` is the text form of a list of dicts, each carrying at
    least a 'path' and a 'url' entry. Only the first descriptor is used; a
    warning is logged when there are more.

    Parsing is attempted first as JSON after swapping single quotes for double
    quotes. If that fails (for example because a value itself contains an
    apostrophe), the untouched string is parsed as a Python literal instead.

    Returns:
        (filename, url): the base name of the first descriptor's 'path', with a
        '.krn&f=midi' suffix rewritten to '.mid', and the first descriptor's 'url'.

    Raises:
        ValueError: if the string can be parsed by neither strategy (the
            original JSON error is re-raised after being logged).
        IndexError: if the descriptor list is empty.
    """
    import ast

    try:
        descriptors = json.loads(files_string.replace("'", '"'))
    except ValueError as json_error:
        try:
            # Fallback for repr-style strings whose values contain quotes.
            descriptors = ast.literal_eval(files_string)
        except (ValueError, SyntaxError):
            logger.error('... files bad descriptor: %s', files_string)
            # Re-raise the JSON error so callers keep seeing a ValueError.
            raise json_error

    if len(descriptors) > 1:
        # logger.warn is a deprecated alias of logger.warning.
        logger.warning('Descriptor contains several files: %s', files_string)

    first = descriptors[0]
    filename = os.path.basename(first['path'])
    filename = filename.replace('.krn&f=midi', '.mid')
    url = first['url']

    return filename, url

In [5]:
def process_file_descriptors(dataframe):
    """Return ``dataframe`` with the columns produced by ``extract_filename`` appended.

    ``extract_filename`` is applied to every row; its output is concatenated
    column-wise to the input frame.
    """
    extracted = dataframe.apply(extract_filename, axis=1)
    descriptor_columns = pd.DataFrame(extracted)
    return pd.concat([dataframe, descriptor_columns], axis=1)

In [6]:
def load_dataframe(metadata_path, files_path):
    """Read a metadata CSV and keep only the rows that point at a '.mid' file.

    Steps, in order: read every column as string, record ``files_path`` in an
    'input_dir' column, drop rows with no 'files' value, add the columns from
    ``process_file_descriptors``, replace remaining missing values with '',
    and finally keep rows whose 'filename' ends with '.mid'.
    """
    frame = pd.read_csv(metadata_path, dtype=str)
    frame['input_dir'] = files_path

    has_files = frame['files'].notnull()
    frame = process_file_descriptors(frame[has_files]).fillna('')

    is_midi = frame['filename'].str.endswith('.mid', na=False)
    return frame[is_midi]

In [None]:
# Load the rows of kernscores.csv that reference a '.mid' file; the second
# argument is stored in each row's 'input_dir' column.
kern_data = load_dataframe('../data/kernscores.csv', '../data/files_kernscores/')

In [None]:
# Exact values of the 'instruments' column that are kept from the Mutopia metadata.
matched_instruments = [
    'Piano',
    'Harpsichord,Piano',
    'Harpsichord,Piano,Clavichord',
    'Piano Duet',
    'Harpsichord,Clavichord,Piano',
    'Piano,Harpsichord',
]

# Load the Mutopia metadata and keep only rows whose instrumentation is listed above.
mutopia_data = load_dataframe('../data/mutopia.csv', '../data/files_mutopia/').loc[
    lambda frame: frame['instruments'].isin(matched_instruments)
]

In [None]:
# Load the rows of pianomidi.csv that reference a '.mid' file; the second
# argument is stored in each row's 'input_dir' column.
pianomidi_data = load_dataframe('../data/pianomidi.csv', '../data/files_pianomidi/') 

In [None]:
# Read the Yamaha metadata with every column as string and blank out missing values.
# NOTE(review): unlike the other sources this frame does not go through
# load_dataframe — confirm the CSV already carries the columns used downstream.
yamaha_data = pd.read_csv('../data/yamaha/metadata.csv', dtype='str').fillna('')

In [7]:
def call_safe(func, args, default=None):
    """Call ``func(*args)`` and fall back to ``default`` if it raises.

    Any ``Exception`` raised by the call is logged with its traceback through
    the module logger and then suppressed.
    """
    try:
        outcome = func(*args)
    except Exception as error:
        logger.exception(error)
        outcome = default
    return outcome

In [8]:
def normalize(score, key_str):
    """Transpose ``score`` in place so that its key becomes C major or A minor.

    Args:
        score: object exposing ``transpose(interval, inPlace=True)``.
        key_str: key description whose first whitespace-separated token is the
            tonic name handed to ``m21.key.Key`` (the rest is ignored).

    Raises:
        ValueError: if the resulting key's mode is neither 'major' nor
            'minor'. Previously this case failed with an UnboundLocalError
            because no interval had been assigned.
    """
    key = m21.key.Key(key_str.split()[0])

    # Target tonic per mode: major keys go to C, minor keys go to A.
    target_tonics = {'major': 'C', 'minor': 'A'}
    if key.mode not in target_tonics:
        raise ValueError('Cannot normalize key with unsupported mode: %r' % (key_str,))

    i = Interval(key.tonic, Pitch(target_tonics[key.mode]))
    score.transpose(i, inPlace=True)

In [9]:
def update_metadata(score, score_metadata):
    """Copy composer, title and (when possible) date onto ``score.metadata``.

    Args:
        score: object with a ``metadata`` attribute; when it is ``None`` a new
            music21 ``Metadata`` object is attached first.
        score_metadata: mapping with 'composer' and 'name' entries and an
            optional 'year' entry. 'name' is used for both ``title`` and
            ``movementName``.

    Setting the date is best-effort: a missing 'year' or a value the metadata
    object rejects is ignored. Only ``Exception`` subclasses are ignored, so
    KeyboardInterrupt/SystemExit still propagate (the previous bare ``except``
    swallowed them too).
    """
    if score.metadata is None:
        # Use the top-level metadata module rather than reaching it through
        # the stream package's namespace.
        score.metadata = m21.metadata.Metadata()

    title = score_metadata['name']
    score.metadata.composer = score_metadata['composer']
    score.metadata.title = title
    score.metadata.movementName = title

    try:
        score.metadata.date = score_metadata['year']
    except Exception:
        # Deliberately ignored: the date is optional.
        pass

In [10]:
def extract_time_signatures(score):
    """Return the time-signature ratio strings of the first part that has any.

    Parts are visited in order; for each one the ``timeSignature`` of every
    measure is inspected and the non-None ones are collected. The first part
    yielding at least one signature determines the result. When no part has
    any, ``[None]`` is returned.
    """
    for part in score.parts:
        ratios = []
        for measure in part.getElementsByClass(m21.stream.Measure):
            signature = measure.timeSignature
            if signature is not None:
                ratios.append(signature.ratioString)
        if ratios:
            return ratios

    return [None]

In [None]:
# Hard way to extract the tempo (duration-weighted average over all metronome marks):
# metronome_data = pd.DataFrame(score.metronomeMarkBoundaries(), columns=['start', 'end', 'tempo'])
# metronome_data['duration'] = metronome_data['end'] - metronome_data['start']
# metronome_data['bpm'] = metronome_data['tempo'].map(lambda x: x.number)
# metronome_data['weighted_duration'] = metronome_data['bpm'] * metronome_data['duration']
# tempo = metronome_data['weighted_duration'].sum() /  metronome_data['duration'].sum()
# tempo

In [11]:
def extract_stats(score):
    """Collect key, time signatures, duration and tempo for ``score``.

    The key comes from ``score.analyze('key')`` and is not guarded: an error
    there propagates. The other three values are obtained through
    ``call_safe``, so a failure is logged and yields ``None`` (or, for the
    time signatures, a primary of ``None`` and no secondary ones).

    Returns a dict with the keys 'key', 'primary_time_signature',
    'secondary_time_signatures', 'duration' and 'tempo'.
    """
    def last_end_seconds(s):
        # Largest 'endTimeSeconds' among the entries of the score's secondsMap.
        return max([entry['endTimeSeconds'] for entry in s.secondsMap])

    def first_tempo_number(s):
        # `.number` of the third item of the first metronome-mark boundary.
        return s.metronomeMarkBoundaries()[0][2].number

    detected_key = score.analyze('key')
    signatures = call_safe(extract_time_signatures, [score], default=[None])
    primary_ts, *alternative_ts = signatures
    duration = call_safe(last_end_seconds, [score])
    tempo = call_safe(first_tempo_number, [score])

    stats = {}
    stats['key'] = str(detected_key)
    stats['primary_time_signature'] = primary_ts
    stats['secondary_time_signatures'] = alternative_ts
    stats['duration'] = duration
    stats['tempo'] = tempo
    return stats

In [12]:
def set_extension(filename, new_ext):
    """Replace the extension of ``filename`` with ``new_ext`` (given without a dot).

    A name that has no extension simply gets '.' + ``new_ext`` appended.
    """
    stem = os.path.splitext(filename)[0]
    return '.'.join((stem, new_ext))

In [13]:
def parse_file(input_path):
    """Parse a score file with music21, converting it through MuseScore if needed.

    Files ending in '.xml' or '.mxl' are parsed directly. Anything else is
    first converted by running ``mscore <input> -o <input>.tmp.xml`` and the
    resulting file is parsed instead.

    The converter's exit status is not checked; if the conversion produced no
    output, the subsequent parse is what fails.
    """
    # Imported here because no earlier cell imports subprocess, so on a fresh
    # kernel the module-level name would not exist yet.
    import subprocess

    _, ext = os.path.splitext(input_path)
    if ext in ('.xml', '.mxl'):
        return m21.converter.parseFile(input_path)

    tmp_path = input_path + '.tmp.xml'
    # Argument list instead of a shell string: paths containing quotes, '$' or
    # backticks are passed through verbatim.
    subprocess.call(['mscore', input_path, '-o', tmp_path])
    return m21.converter.parseFile(tmp_path)

In [None]:
import os
import json
import subprocess
    
def process_row(row, output_dir, global_time_started):
    """Convert one dataset row to a normalized MusicXML file and describe the result.

    The file at ``row['input_dir']/row['filename']`` is parsed, its stats are
    extracted, the score is transposed by ``normalize``, its metadata is
    updated from the row, and it is written as MusicXML into ``output_dir``
    under the same base name with an '.xml' extension.

    Any exception during these steps is logged and recorded as
    ``hasError=True``; the function still returns a row. Stats that were not
    computed before the failure stay ``None``.

    Args:
        row: Series with at least 'name', 'input_dir', 'filename', 'genre',
            'details_url', 'year' and 'composer'; ``row.name`` (the index
            label) is used in the progress log message.
        output_dir: directory the MusicXML file is written to.
        global_time_started: ``time.time()`` value used to report elapsed time.

    Returns:
        pd.Series describing the processed file.
    """
    hasError = False
    output_path = None
    # NOTE(review): `url` is never assigned anything else, so 'source_url' is
    # always None in the result — confirm whether row['url'] was intended.
    url = None
    score_stats = {
        'key': None,
        'primary_time_signature': None,
        'secondary_time_signatures': None,
        'duration': None,
        'tempo': None
    }

    try:
        logger.debug('Processing %s...', row['name'])
        time_started = time.time()

        input_path = os.path.join(row['input_dir'], row['filename'])
        output_path = os.path.join(output_dir, set_extension(row['filename'], 'xml'))

        score = parse_file(input_path)
        logger.debug('\t...file parsed')

        score_stats = extract_stats(score)
        logger.debug('\t...stats extractes: %s', json.dumps(score_stats))

        normalize(score, score_stats['key'])
        update_metadata(score, row)
        logger.debug('\t...score normalized and updated')

        score.write('musicxml', output_path)
        logger.debug('\t...file processed. Processing time: %s', time.time() - time_started)
        logger.info('Processed file %i. Time elapsed %s', row.name, time.time() - global_time_started)

    except Exception as e:
        logger.error('Exception occured')
        logger.debug('Exception details: ', exc_info=e)
        hasError = True

    return pd.Series({
            'genre': row['genre'],
            'input_dir': row['input_dir'],
            'details_url': row['details_url'],
            # Fixed: this previously read `rowp['filename']`, a NameError on every call.
            'filename_mid' : row['filename'],
            'name': row['name'],
            'year': row['year'],
            'composer': row['composer'],
            'filename_xml': output_path,
            'source_url': url,
            'key': score_stats['key'],
            'primary_time_signature': score_stats['primary_time_signature'],
            'secondary_time_signatures': score_stats['secondary_time_signatures'],
            'duration': score_stats['duration'],
            'tempo': score_stats['tempo'],
            'hasError': hasError
        })

In [None]:
import time

def process_dataframe(dataframe, output_dir):
    """Run ``process_row`` over every row of ``dataframe``.

    Args:
        dataframe: rows to process.
        output_dir: directory handed to ``process_row`` for the generated
            files. It is created if missing. Previously this argument was
            ignored and '../data/output/' was always used.

    Returns:
        The frame produced by applying ``process_row`` row-wise.
    """
    time_started = time.time()
    total_rows = len(dataframe.index)
    logger.info('Processing files in dataframe. Total items: %i', total_rows)

    # Writing into a missing directory would make every row fail.
    os.makedirs(output_dir, exist_ok=True)
    result = dataframe.apply(lambda row: process_row(row, output_dir, time_started), axis=1)
    logger.info('Finished processing dataframe')

    return result

In [None]:
# Stack the four sources and renumber the rows 0..n-1, discarding the
# per-source index values.
dataset = pd.concat(
    [mutopia_data, kern_data, pianomidi_data, yamaha_data]
).reset_index(drop=True)
dataset.describe()

In [None]:
# Process the dataset in fixed-size chunks, writing a cumulative backup CSV
# after each chunk so a crash does not lose the work already done.
metadata = pd.DataFrame()
CHUNK_SIZE = 100

# Results are collected in a list and concatenated from it, instead of
# concatenating onto an initially empty DataFrame.
processed_chunks = []
total_rows = len(dataset.index)

for i in range(0, total_rows, CHUNK_SIZE):
    chunk = dataset.iloc[i:i+CHUNK_SIZE]
    # Clamp the reported upper bound so the last chunk does not claim rows past the end.
    logger.info('processing chunk {0} to {1}'.format(i, min(i+CHUNK_SIZE, total_rows)))
    processed_chunks.append(process_dataframe(chunk, '../data/output/xml'))
    metadata = pd.concat(processed_chunks)
    # Backup file name (including its spelling) is kept unchanged.
    metadata.to_csv('metatada.backup.{0}.csv'.format(i))
    logger.info('...done')

In [None]:
def get_filename_abc(input_filename):
    """Map an input file path to its '.abc' path under '../data/output/abc/'.

    Only the base name of ``input_filename`` is used; its extension is
    replaced by '.abc'.
    """
    stem = os.path.splitext(os.path.basename(input_filename))[0]
    return ''.join(('../data/output/abc/', stem, '.abc'))

In [None]:
def get_semitones_to_cdur(key_string):
    """Return the semitone distance from the given key to C major / A minor.

    The first whitespace-separated token of ``key_string`` is passed to
    ``m21.key.Key``. For a major key the interval from its tonic to C is
    measured, for a minor key the interval to A.

    Returns:
        The interval's ``semitones`` value, or ``None`` (after logging an
        error) when the key string cannot be interpreted or the mode is
        neither major nor minor.
    """
    # Target tonic per supported mode.
    target_tonics = {'major': 'C', 'minor': 'A'}

    try:
        key = m21.key.Key(key_string.split()[0])
        if key.mode in target_tonics:
            target = m21.pitch.Pitch(target_tonics[key.mode])
            return m21.interval.Interval(key.tonic, target).semitones
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
        # no longer swallowed; ordinary failures fall through to the error log.
        pass

    logger.error('Bad key string: %s', key_string)
    return None

In [None]:
import subprocess

# Transpose every ABC file with abc2abc by the number of semitones returned by
# get_semitones_to_cdur, writing the result under ../data/output/abc_transposed/.
# Rows whose key cannot be interpreted are skipped.
# NOTE(review): no visible cell creates the 'filename_abc' column — confirm
# where it is added before this loop runs.
for index, row in metadata.iterrows():
    filename = row['filename_abc']
    basename = os.path.basename(filename)
    semitones = get_semitones_to_cdur(row['key'])

    if semitones is not None:
        # Argument list plus an explicit output file instead of a shell string
        # with `>`: file names containing quotes, '$' or backticks are no
        # longer interpreted by a shell.
        command = ['abc2abc', filename, '-d', '-t', str(semitones)]
        with open(os.path.join('../data/output/abc_transposed', basename), 'wb') as transposed_file:
            subprocess.call(command, stdout=transposed_file)

In [None]:
# checks
# Keep only rows that were processed without error, then verify that every
# referenced file exists; one error line is logged per column with a missing file.
metadata = metadata[~metadata['hasError']]

for column, message in (
    ('filename_abc', 'missing abc files!'),
    ('filename_xml', 'missing xml files!'),
    ('filename_mid', 'missing mid files!'),
):
    if not metadata[column].map(os.path.isfile).all():
        logger.error(message)

In [14]:
# Replace the in-memory `metadata` frame with the copy stored on disk.
metadata = pd.read_csv('../data/output/metadata.csv')

In [15]:
# Set both time-signature columns to None for every row, overwriting any
# values loaded from the CSV.
metadata['time_signature'] = None
metadata['secondary_time_signatures'] = None

In [16]:
def parse_time_signature(row):
    """Return the primary and secondary time signatures for one metadata row.

    When ``row['time_signature']`` is already set (not ``None``) the stored
    values are passed through unchanged. Otherwise the MusicXML file named by
    ``row['filename_xml']`` is parsed and the signatures are extracted from
    it; the secondary ones are joined with '; '.

    Returns a Series with 'time_signature' and 'secondary_time_signatures'.
    """
    stored_ts = row['time_signature']

    if stored_ts is None:
        xml_path = row['filename_xml']
        parsed_score = m21.converter.parseFile(xml_path)
        signatures = call_safe(extract_time_signatures, [parsed_score], default=[None])
        first_ts, *other_ts = signatures
        logger.debug('%s: extracted primary time signature: %s', xml_path, first_ts)
        fields = {
            'time_signature': first_ts,
            'secondary_time_signatures': '; '.join(other_ts)
        }
    else:
        fields = {
            'time_signature': stored_ts,
            'secondary_time_signatures': row['secondary_time_signatures']
        }

    return pd.Series(fields)

In [17]:
# Apply parse_time_signature to every row; `result` holds the
# 'time_signature' / 'secondary_time_signatures' values it returns.
result = metadata.apply(parse_time_signature, axis=1)

In [20]:
# Preview the stored primary time signatures. A bare `head()` is a NameError;
# the recorded output (Name: primary_time_signature) shows which column was meant.
metadata['primary_time_signature'].head()

0    2/4
1    3/4
2    2/2
3    6/8
4    6/8
Name: primary_time_signature, dtype: object

In [23]:

# Remove the 'primary_time_signature' column from `metadata` in place.
# Re-running this cell raises KeyError because the column is already gone.
del metadata['primary_time_signature']

In [25]:
# Swap the placeholder time-signature columns for the freshly parsed ones in `result`.
metadata = pd.concat(
    [metadata.drop(columns=['secondary_time_signatures', 'time_signature']), result],
    axis=1,
)

In [26]:
# Display summary statistics of the updated metadata frame.
metadata.describe()

Unnamed: 0,duration,hasError,source_url,tempo,year,year_exact
count,2187.0,2187,0.0,2187.0,822.0,2187
mean,223.170912,0,,115.157293,1819.451338,0.275263
std,221.95901,0,,42.066431,60.970446,0.446749
min,8.727273,False,,18.0,1720.0,False
25%,90.0,0,,84.0,1775.0,0
50%,160.76145,0,,112.0,1828.0,0
75%,272.497926,0,,141.0,1850.0,1
max,2156.893319,False,,300.0,2008.0,True


In [27]:
# Write the updated metadata to disk (the frame's index is written as the first column).
# NOTE(review): the frame was read from '../data/output/metadata.csv' but is
# saved to '../output/metadata.csv' — confirm the different directory is intended.
metadata.to_csv('../output/metadata.csv')