In [19]:
import pandas as pd
import music21 as m21
import os
import re

In [24]:
def get_header_line(lines, prefix):
    """Return the value of the first line in ``lines`` that starts with ``prefix``.

    Parameters
    ----------
    lines : iterable of str
        Lines to search, in order.
    prefix : str
        Field marker to look for at the start of a line (e.g. ``'M:'``).

    Returns
    -------
    str or None
        The text after the leading ``prefix``, with surrounding whitespace
        stripped, or ``None`` when no line starts with ``prefix``.
    """
    for line in lines:
        if line.startswith(prefix):
            # Slice off only the leading prefix; str.replace would also delete
            # any later occurrence of the same characters inside the value.
            return line[len(prefix):].strip()
    # No matching line: report "absent" instead of raising IndexError.
    return None


def parse_abc(filename):
    """Extract a few summary fields from an ABC file.

    Parameters
    ----------
    filename : object
        Path of the file to read; it is converted with ``str()`` first.

    Returns
    -------
    pandas.Series
        Object-dtype Series with the keys

        * ``'time_signature'`` - value of the first ``M:`` line,
        * ``'tempo'`` - value of the first ``Q:`` line,
        * ``'part_count'`` - number of lines starting with ``V:``,
        * ``'first_part'`` - text between the first and second ``V:`` line
          (or the end of the file), with newlines, backslashes and
          ``x<digit>`` tokens removed.

        When ``filename`` is not an existing file the string fields are
        ``None`` and ``part_count`` is ``0``.
    """
    filename = str(filename)
    time_signature = None
    tempo = None
    part_count = 0
    first_part = None

    if os.path.isfile(filename):
        with open(filename, 'r') as f:
            # Lines starting with '%' are dropped before any parsing.
            lines = [l for l in f if not l.startswith('%')]

        time_signature = get_header_line(lines, 'M:')
        tempo = get_header_line(lines, 'Q:')

        part_start_lines = [
            index for index, line in enumerate(lines) if line.startswith('V:')
        ]
        # Count the voices before deriving any slice bounds, so that no
        # end-of-file sentinel inflates the count.
        part_count = len(part_start_lines)

        if part_start_lines:
            first_start = part_start_lines[0] + 1
            first_end = part_start_lines[1] if part_count > 1 else len(lines)
            first_part = ''.join(lines[first_start:first_end])
            first_part = first_part.replace('\n', '').replace('\\', '')
            # NOTE(review): the original pattern was '|x\d' with an unescaped
            # '|' (alternation with the empty string), so it never removed bar
            # characters; on current Python 3 its effect is to strip
            # 'x<digit>' tokens, which is what is kept here. Confirm whether
            # '\|x\d' was intended instead.
            first_part = re.sub(r'x\d', '', first_part)

    # dtype=object keeps None as None even when no string value is present.
    return pd.Series({
        'time_signature': time_signature,
        'tempo': tempo,
        'part_count': part_count,
        'first_part': first_part,
    }, dtype=object)

In [25]:
# Load the metadata table, using the first CSV column as the row index.
metadata = pd.read_csv('../output/metadata.csv', index_col=0)
# Preview the first rows.
metadata.head()

Unnamed: 0,composer,details_url,duration,filename_xml,genre,hasError,key,name,source_url,tempo,year,filename,filename_mid,filename_abc,year_exact,period,musical_form,secondary_time_signatures,time_signature
0,adam,piece-info.cgi?id=897,72.0,../output/xml/6e355a07d5531f372a2f7fb9ac084a12...,Classical,False,G major,Giselle - Pas de deux (1er Acte),,80,,6e355a07d5531f372a2f7fb9ac084a120c34440a.mid,../output/mid/6e355a07d5531f372a2f7fb9ac084a12...,../output/abc/6e355a07d5531f372a2f7fb9ac084a12...,False,classical,,,2/4
1,albeniz,piece-info.cgi?id=898,262.0,../output/xml/cf0e37cd297b4b030a506f1e1e2c6373...,Romantic,False,d minor,Rumores de la Caleta,,60,1887.0,cf0e37cd297b4b030a506f1e1e2c637352dcc12d.mid,../output/mid/cf0e37cd297b4b030a506f1e1e2c6373...,../output/abc/cf0e37cd297b4b030a506f1e1e2c6373...,True,romantic,,3/8; 4/4; 3/8; 4/4,3/4
2,andre,piece-info.cgi?id=207,95.769231,../output/xml/9235e4d8f77349089d45f475fb852ab7...,Classical,False,C major,Sonatine,,104,1750.0,9235e4d8f77349089d45f475fb852ab71c954de8.mid,../output/mid/9235e4d8f77349089d45f475fb852ab7...,../output/abc/9235e4d8f77349089d45f475fb852ab7...,False,classical,,2/4,2/2
3,anonymous,piece-info.cgi?id=679,33.626374,../output/xml/6ece453ca4108e48ea6e6d0fec699644...,Popular / Dance,False,D major,La Native,,91,1860.0,6ece453ca4108e48ea6e6d0fec6996441adc09b5.mid,../output/mid/6ece453ca4108e48ea6e6d0fec699644...,../output/abc/6ece453ca4108e48ea6e6d0fec699644...,False,popular / dance,,,6/8
4,anonymous,piece-info.cgi?id=680,41.860465,../output/xml/96b2c5072caf99d739a42bad844e11b6...,Popular / Dance,False,G major,Les Graces,,86,1860.0,96b2c5072caf99d739a42bad844e11b6003445aa.mid,../output/mid/96b2c5072caf99d739a42bad844e11b6...,../output/abc/96b2c5072caf99d739a42bad844e11b6...,False,popular / dance,,,6/8


In [28]:
# Rewrite every ABC path from 'abc/' to 'abc_merged/' and run parse_abc on each
# resulting path.
# NOTE(review): the recorded output shows part_count 0, an empty first_part and
# identical tempo / time signature for every previewed row - confirm that the
# merged files actually contain 'V:' lines and per-piece headers.
abc_data = metadata['filename_abc'].str.replace('abc/', 'abc_merged/').apply(parse_abc)
abc_data.head()

Unnamed: 0,first_part,part_count,tempo,time_signature
0,,0,1/4=120,4/4
1,,0,1/4=120,4/4
2,,0,1/4=120,4/4
3,,0,1/4=120,4/4
4,,0,1/4=120,4/4


In [5]:
# Show the column labels of the metadata frame.
# NOTE(review): the recorded column list differs from the columns in the
# metadata preview elsewhere in this notebook - this output looks stale.
metadata.columns

Index(['composer', 'details_url', 'duration', 'filename_xml', 'genre',
       'hasError', 'key', 'name', 'primary_time_signature',
       'secondary_time_signatures', 'source_url', 'tempo', 'year', 'filename',
       'filename_mid', 'filename_abc', 'Composer', 'year_exact', 'period',
       'musical_form'],
      dtype='object')

In [26]:
# Write the metadata frame to CSV without its index.
# NOTE(review): the metadata is read from '../output/metadata.csv' with
# index_col=0 but written here to '../data/output/metadata.csv' with
# index=False; reloading this file the same way would turn the first data
# column into the index. Confirm the intended path and index handling.
metadata.to_csv('../data/output/metadata.csv', index=False)

In [72]:
# Write the string held in `result` to 44.txt.
# NOTE(review): `result` is not defined in any cell visible in this notebook,
# so this cell depends on leftover kernel state and will fail on a fresh run.
with open('../data/prepared/44.txt', 'w') as f:
    f.write(result)

In [25]:
# Directory prefix that prepare() prepends to every input filename.
BASEDIR = '../data/result/'

def prepare(files, output, m=None, basedir=None):
    """Write the first voice of each ABC file to ``output``.

    For every input file, the lines between the first ``V:`` line and the next
    ``V:`` line (or the end of the file) are concatenated without separators.
    A newline is written before each file's voice, so the output starts with
    an empty line and then holds one file's voice per line.

    Parameters
    ----------
    files : pandas.Series or mapping
        Input filenames, relative to ``basedir``; only the values are used.
    output : str
        Path of the text file to (over)write.
    m : str, optional
        When given, reading of a file stops at the first ``M:`` line whose
        value differs from ``m`` and a message is printed.
    basedir : str, optional
        Directory the input filenames are resolved against. Defaults to the
        module-level ``BASEDIR``.

    Raises
    ------
    OSError
        If ``output`` or any input file cannot be opened.
    """
    if basedir is None:
        basedir = BASEDIR

    with open(output, 'w') as output_file:
        # .items() works on every pandas version; Series.iteritems() was
        # removed in pandas 2.0.
        for _, input_filename in files.items():
            part_started = False
            # Context manager so the input handle is closed after each file.
            with open(os.path.join(basedir, input_filename), 'r') as input_file:
                for line in input_file:
                    line = line.replace('\\', '').strip()

                    if line.startswith('%'):
                        continue

                    if m is not None and line.startswith('M:'):
                        file_m = line[len('M:'):].strip()
                        if m != file_m:
                            print('M: {0} != {1}'.format(m, file_m))
                            break

                    if line.startswith('V:'):
                        if part_started:
                            # A second voice begins: only the first is kept.
                            break
                        output_file.write('\n')
                        part_started = True
                        continue

                    if part_started:
                        output_file.write(line)

In [25]:
# Run prepare over every non-null ABC filename with no 'M:' filter (m=None),
# writing the result to all.txt.
prepare(metadata['filename_abc'].dropna(), '../data/prepared/all.txt', m=None)

In [6]:
# Select the non-null ABC filenames of rows whose period is 'romanticism' and
# export them with prepare.
# NOTE(review): the metadata preview in this notebook shows period labels such
# as 'romantic' and 'classical' - confirm 'romanticism' matches the labels in
# the metadata actually loaded. The Series output recorded under this cell
# cannot come from prepare(), which returns nothing; it looks stale.
romanticism_data = metadata[metadata['period'] == 'romanticism']['filename_abc'].dropna()
prepare(romanticism_data, '../data/prepared/romantic.txt')

0     abc/859c17427b612d62aac51ab14ad809c7ff62d722.abc
1     abc/c05aff259adc395acca7cb42cff8b941f63dfb0a.abc
13    abc/27c5556cf82b3420c9bd407dcbd8dbcf544f1470.abc
22    abc/bd5b0d48c4f11614d12af67eb5d03d964070845e.abc
23    abc/9f900dd904c27e2c0864b028071015cb4d967d4f.abc
Name: filename_abc, dtype: object

In [5]:
# Select the non-null ABC filenames of rows whose period is 'baroque' and
# export them with prepare.
# NOTE(review): the Series output recorded under this cell cannot come from
# prepare(), which returns nothing; it looks stale.
baroque_data = metadata[metadata['period'] == 'baroque']['filename_abc'].dropna()
prepare(baroque_data, '../data/prepared/baroque.txt')

2     abc/cd1f471606732412448ecafc853a2b68da0c95f8.abc
4     abc/5e5c35310383699098841f2a7760a6f6e0a09a36.abc
5     abc/7d0d99d532e9c92a6e5ee054bddf8a4cc6a22a1c.abc
21    abc/9fa36529f99c7700aaa454ecb0bf020d87c871c3.abc
55    abc/cae436575bba126ddd2c8013e3e527bff6638db3.abc
Name: filename_abc, dtype: object

In [10]:
# Select the non-null ABC filenames of rows whose period is 'classicism' and
# export them with prepare. The selection gets its own name so it no longer
# silently rebinds `baroque_data` to non-baroque rows.
# NOTE(review): the metadata preview in this notebook shows the label
# 'classical' - confirm 'classicism' matches the labels in the loaded metadata.
classicism_data = metadata[metadata['period'] == 'classicism']['filename_abc'].dropna()
prepare(classicism_data, '../data/prepared/classicism.txt')

In [12]:
# Run prepare over every non-null ABC filename, writing to all_data.txt.
# NOTE(review): same inputs as the cell that writes all.txt, only the output
# path differs - confirm whether both files are needed.
prepare(metadata['filename_abc'].dropna(), '../data/prepared/all_data.txt')