In [1]:
import pandas as pd
import music21 as m21
import os
import re

In [2]:
def get_header_line(lines, prefix):
    return [l.replace(prefix, '').strip() for l in lines if l.startswith(prefix)][0]


def parse_abc(row):
    filename = str(row['filename_abc'])
    time_signature = None,
    tempo = None,
    part_count = None,
    parts = None
    first_part = None
    part_start_lines = []
    
    if os.path.isfile(filename):
        with open(filename, 'r') as f:
            lines = f.readlines()
            lines = [l for l in lines if not l.startswith('%')]
            
            time_signature = get_header_line(lines, 'M:')
            tempo = get_header_line(lines, 'Q:')
            
            for index, line in enumerate(lines):
                if line.startswith('V:'):
                    part_start_lines.append(index)
            
            if part_start_lines:
                part_start_lines.append(len(lines))
                first_part = ''.join(lines[part_start_lines[0] + 1 : part_start_lines[1]])
                first_part = re.sub('\n', '', first_part)
                first_part = re.sub('\\\\', '', first_part)
                first_part = re.sub('|x\d', '', first_part)
                
                
    return pd.Series({
       'time_signature': time_signature,
       'tempo': tempo,
       'part_count': len(part_start_lines),
       'first_part': first_part
    })

In [3]:
metadata = pd.read_csv('../data/output/metadata.csv')
metadata.head()

OSError: File b'../data/output/metadata.csv' does not exist

In [4]:
abc_data = metadata.apply(parse_abc, axis=1)
abc_data.head()

Unnamed: 0,first_part,part_count,tempo,time_signature
0,"E4 [F4B,4] D4 A,4| B,4 C4 E4 G,4| A,4 G,4 A,4 ...",5,1/4=80,2/4
1,x2(3_ABe[e'-_a]e'-| e'6 x2M:3/8L:1/16_A--| _A3...,3,1/4=60,3/4
2,G4 c4 B4 c4| d8 f8| e4 c4 d4 B4| c8 G8|e4 B4 c...,3,1/4=104,2/2
3,EF2<G2A| G2G3A G2c4| G2G4 EF2<G2A| G2G3A G2d2d...,3,1/4=91,6/8
4,G2F2G2| c4c2 [=FE-]ED2E2| G4G2 c2G2^G2| A2D2E2...,3,1/4=86,6/8


In [5]:
metadata.columns

Index(['composer', 'details_url', 'duration', 'filename_xml', 'genre',
       'hasError', 'key', 'name', 'primary_time_signature',
       'secondary_time_signatures', 'source_url', 'tempo', 'year', 'filename',
       'filename_mid', 'filename_abc', 'Composer', 'year_exact', 'period',
       'musical_form'],
      dtype='object')

In [26]:
metadata.to_csv('../data/output/metadata.csv', index=False)

In [72]:
with open('../data/prepared/44.txt', 'w') as f:
    f.write(result)

In [25]:
BASEDIR = '../data/result/'

def prepare(files, output, m=None):
    with open(output, 'w') as output_file:
        for i, input_filename in files.iteritems():
            part_started = False
            for line in open(BASEDIR + input_filename, 'r'):
                line = line.replace('\\', '').strip()
                
                if line.startswith('%'):
                    continue
                
                if m is not None and line.startswith('M:'):
                    file_m = line.replace('M:', '').strip()
                    if m != file_m:
                        print('M: {0} != {1}'.format(m, file_m))
                        break
                        
                if line.startswith('V:'):
                    output_file.write('\n')
                    part_started = True
                    continue
                        
                if part_started:
                    if lines.startswith('V:'):
                        break
                        
                    output_file.write(line)

In [25]:
prepare(metadata['filename_abc'].dropna(), '../data/prepared/all.txt', m=None)

In [6]:
romanticism_data = metadata[metadata['period'] == 'romanticism']['filename_abc'].dropna()
prepare(romanticism_data, '../data/prepared/romantic.txt')

0     abc/859c17427b612d62aac51ab14ad809c7ff62d722.abc
1     abc/c05aff259adc395acca7cb42cff8b941f63dfb0a.abc
13    abc/27c5556cf82b3420c9bd407dcbd8dbcf544f1470.abc
22    abc/bd5b0d48c4f11614d12af67eb5d03d964070845e.abc
23    abc/9f900dd904c27e2c0864b028071015cb4d967d4f.abc
Name: filename_abc, dtype: object

In [5]:
baroque_data = metadata[metadata['period'] == 'baroque']['filename_abc'].dropna()
prepare(baroque_data, '../data/prepared/baroque.txt')

2     abc/cd1f471606732412448ecafc853a2b68da0c95f8.abc
4     abc/5e5c35310383699098841f2a7760a6f6e0a09a36.abc
5     abc/7d0d99d532e9c92a6e5ee054bddf8a4cc6a22a1c.abc
21    abc/9fa36529f99c7700aaa454ecb0bf020d87c871c3.abc
55    abc/cae436575bba126ddd2c8013e3e527bff6638db3.abc
Name: filename_abc, dtype: object

In [10]:
baroque_data = metadata[metadata['period'] == 'classicism']['filename_abc'].dropna()
prepare(baroque_data, '../data/prepared/classicism.txt')

In [12]:
prepare(metadata['filename_abc'].dropna(), '../data/prepared/all_data.txt')