In [None]:
import pandas as pd
import music21 as m21
import os
import re

In [None]:
def get_header_line(lines, prefix):
    """Return the value of the first line that starts with ``prefix``.

    Args:
        lines: Iterable of text lines.
        prefix: Marker the line must start with, e.g. ``'M:'`` or ``'Q:'``.

    Returns:
        The text that follows the prefix on the first matching line, with
        surrounding whitespace stripped.

    Raises:
        IndexError: If no line starts with ``prefix``.
    """
    for line in lines:
        if line.startswith(prefix):
            # Remove only the leading marker; str.replace would also delete
            # any later occurrence of the prefix inside the value itself.
            return line[len(prefix):].strip()
    raise IndexError('no line starts with {0!r}'.format(prefix))

def preprocess_lines(lines):
    """Drop comment lines and collapse empty bars.

    A line is treated as a comment when it starts with ``%``. An "empty bar"
    is a bar whose only content is ``x`` followed by a single digit, i.e. the
    text ``|x<digit>|``; each one is replaced by a single ``|``.

    Args:
        lines: Iterable of text lines.

    Returns:
        A new list with the comment lines removed and the empty bars
        collapsed in the remaining lines. The input is not modified.
    """
    # The pipes must be escaped: unescaped, '|' is regex alternation, which
    # makes the pattern match the empty string and never stop matching.
    empty_bar_regex = re.compile(r'\|x\d\|')

    cleaned = []
    for line in lines:
        if line.startswith('%'):
            continue
        # Adjacent empty bars share a bar line ('|x2|x3|'), so a single pass
        # of non-overlapping substitution can leave one behind. Every pass
        # shortens the line, so this loop always terminates.
        while empty_bar_regex.search(line):
            line = empty_bar_regex.sub('|', line)
        cleaned.append(line)

    return cleaned

def parse_abc(row):
    """Extract header fields and voice parts from the ABC file named in ``row``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a
            ``'filename_abc'`` entry holding the path of the file to read.

    Returns:
        pandas.Series with the entries:
            ``time_signature``: value of the first ``M:`` line, or None.
            ``tempo``: value of the first ``Q:`` line, or None.
            ``part_count``: number of lines starting with ``V:``.
            ``parts``: list with one string per ``V:`` line, holding the
                text between that line and the next ``V:`` line (or the
                end of the file for the last one).
        When the file does not exist the header fields are None,
        ``part_count`` is 0 and ``parts`` is empty.
    """
    filename = str(row['filename_abc'])
    header_values = {'M:': None, 'Q:': None}
    part_strings = []
    part_start_lines = []

    if os.path.isfile(filename):
        with open(filename, 'r') as f:
            lines = preprocess_lines(f.readlines())

        for prefix in header_values:
            try:
                header_values[prefix] = get_header_line(lines, prefix)
            except IndexError:
                # A missing header leaves the field as None rather than
                # aborting the whole DataFrame.apply run.
                pass

        part_start_lines = [
            index for index, line in enumerate(lines) if line.startswith('V:')
        ]
        # Each part runs up to the next 'V:' line; the last one runs to EOF.
        part_end_lines = part_start_lines[1:] + [len(lines)]
        part_strings = [
            ''.join(lines[start + 1:end])
            for start, end in zip(part_start_lines, part_end_lines)
        ]

    return pd.Series({
        'time_signature': header_values['M:'],
        'tempo': header_values['Q:'],
        'part_count': len(part_start_lines),
        'parts': part_strings
    })

In [None]:
# Smoke-test parse_abc on the first metadata row only.
# NOTE(review): `metadata` is not defined in any cell visible in this section;
# it must already exist in the kernel for this cell to run.
abc_data = metadata[:1].apply(parse_abc, axis=1)
print(abc_data)

In [129]:
# Preview the first rows of abc_data.
# NOTE(review): the saved output below shows raw file text rather than the
# fields parse_abc returns - presumably from an earlier run; re-run to refresh.
abc_data.head()

0    % uppera\n% control track\n% creator: \n% GNU ...
1    % upper\nx2\n% control track\n% creator: \n% G...
2    % up:VcI\n%%MIDI program 0\n%%MIDI program 0\n...
3    % upper\n% Creator: GNU LilyPond 2.6.4.3      ...
4    % upper\n% Creator: GNU LilyPond 2.6.4.3      ...
dtype: object

In [24]:
# Directory prefix prepended to every input filename handed to prepare().
BASEDIR = '../data/result/'

def prepare(files, output, m=None):
    """Concatenate the voice contents of several ABC files into one text file.

    For each input file, lines are read with backslashes removed and
    surrounding whitespace stripped. Lines starting with ``%`` are skipped.
    Every ``V:`` line writes a newline to the output and switches on copying;
    each following line is then written without any separator, so the text
    of one voice ends up on a single output line.

    Args:
        files: Iterable of filenames relative to ``BASEDIR`` (for example a
            pandas Series of paths).
        output: Path of the text file to create or overwrite.
        m: Optional meter string. When given, reading of a file stops at the
            first ``M:`` line whose value differs from ``m`` and a message is
            printed.
    """
    with open(output, 'w') as output_file:
        # Iterate the values directly: Series.iteritems() was removed in
        # pandas 2.0 and the index was never used here.
        for input_filename in files:
            part_started = False
            # Context manager so each input handle is closed, including on
            # the early `break` below.
            with open(BASEDIR + input_filename, 'r') as input_file:
                for line in input_file:
                    line = line.replace('\\', '').strip()

                    if line.startswith('%'):
                        continue

                    if m is not None and line.startswith('M:'):
                        file_m = line.replace('M:', '').strip()
                        if m != file_m:
                            print('M: {0} != {1}'.format(m, file_m))
                            break

                    if line.startswith('V:'):
                        output_file.write('\n')
                        part_started = True
                        continue

                    if part_started:
                        output_file.write(line)

In [25]:
# Export the voices of every file that has an ABC filename, with no meter
# filter (m=None), into all.txt.
prepare(metadata['filename_abc'].dropna(), '../data/prepared/all.txt', m=None)

In [6]:
# Select the ABC filenames of rows whose period is 'romanticism' (rows
# without a filename are dropped) and export them to their own file.
romanticism_data = metadata[metadata['period'] == 'romanticism']['filename_abc'].dropna()
prepare(romanticism_data, '../data/prepared/romantic.txt')

0     abc/859c17427b612d62aac51ab14ad809c7ff62d722.abc
1     abc/c05aff259adc395acca7cb42cff8b941f63dfb0a.abc
13    abc/27c5556cf82b3420c9bd407dcbd8dbcf544f1470.abc
22    abc/bd5b0d48c4f11614d12af67eb5d03d964070845e.abc
23    abc/9f900dd904c27e2c0864b028071015cb4d967d4f.abc
Name: filename_abc, dtype: object

In [5]:
# Select the ABC filenames of rows whose period is 'baroque' (rows without a
# filename are dropped) and export them to their own file.
baroque_data = metadata[metadata['period'] == 'baroque']['filename_abc'].dropna()
prepare(baroque_data, '../data/prepared/baroque.txt')

2     abc/cd1f471606732412448ecafc853a2b68da0c95f8.abc
4     abc/5e5c35310383699098841f2a7760a6f6e0a09a36.abc
5     abc/7d0d99d532e9c92a6e5ee054bddf8a4cc6a22a1c.abc
21    abc/9fa36529f99c7700aaa454ecb0bf020d87c871c3.abc
55    abc/cae436575bba126ddd2c8013e3e527bff6638db3.abc
Name: filename_abc, dtype: object

In [10]:
# Select the ABC filenames of rows whose period is 'classicism' (rows without
# a filename are dropped) and export them to their own file. Stored under its
# own name so the baroque selection is not silently overwritten.
classicism_data = metadata[metadata['period'] == 'classicism']['filename_abc'].dropna()
prepare(classicism_data, '../data/prepared/classicism.txt')

In [12]:
# Export every file that has an ABC filename into all_data.txt.
# NOTE(review): same inputs as the earlier export to all.txt, only the output
# path differs - confirm both files are still needed.
prepare(metadata['filename_abc'].dropna(), '../data/prepared/all_data.txt')