In [24]:
import shutil
from pathlib import Path
import os
from tqdm import tqdm
import re
from PIL import Image
import numpy as np

In [25]:
# NOTE(review): these are absolute, machine-specific paths; adjust before running elsewhere.
# Root of the PrIMuS corpus; iterated below as <package>/<incipit>/ directories.
PRIMUS_LOCATION = Path('/home/macosta/ttmp/primus-data/primus/')
# Output tree: one sub-directory per incipit, receiving the .mei/.agnostic/.png copies
# and, in a later cell, the matching .txt file.
PMA_PATH = Path('/home/macosta/ttmp/primus-data/primus-mei-agnostic-png/')
# Directory of per-incipit text files (named <incipit>.txt, judging by the [:-4] slicing
# used where they are copied) - presumably the cropped images dumped as text; TODO confirm.
CROPPED_PATH = Path('/home/macosta/ttmp/primus-data/cropped/cropped-txt/')
# Only the leaf directory is created (no parents=True), so its parent must already exist.
PMA_PATH.mkdir(exist_ok=True)

In [14]:
# Collect each incipit's .mei, .agnostic and .png files from the PrIMuS package
# tree into one flat directory per incipit under PMA_PATH.
for package in os.listdir(PRIMUS_LOCATION):
    for incipit_dir in tqdm(os.listdir(PRIMUS_LOCATION / package)):
        incipit_path = PRIMUS_LOCATION / package / incipit_dir
        # Hidden entries (names starting with '.') are never candidates.
        files = [f for f in os.listdir(incipit_path) if not f.startswith('.')]
        # Resolve all three sources before touching the output tree; the first
        # match per extension is used and a missing file raises IndexError.
        mei_file = incipit_path / [f for f in files if f.endswith('.mei')][0]
        agnostic_file = incipit_path / [f for f in files if f.endswith('.agnostic')][0]
        png_file = incipit_path / [f for f in files if f.endswith('.png')][0]
        incipit_savepath = PMA_PATH / incipit_dir
        incipit_savepath.mkdir(exist_ok=True)
        for source_file in (mei_file, agnostic_file, png_file):
            shutil.copyfile(source_file, incipit_savepath / source_file.name)

  7%|██████▌                                                                                            | 2922/44084 [00:56<13:12, 51.97it/s]


KeyboardInterrupt: 

In [46]:
# Drop every cropped .txt file into the per-incipit directory that shares its
# name (file name minus the 4-character extension). Non-file entries are skipped.
for txt_name in tqdm(os.listdir(CROPPED_PATH)):
    source = CROPPED_PATH / txt_name
    if os.path.isfile(source):
        destination = PMA_PATH / txt_name[:-4] / txt_name
        shutil.copyfile(source, destination)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 87679/87679 [02:54<00:00, 503.56it/s]


In [26]:
class StafflineTokenizer():
    """Cut a space-separated sequence of column strings into tokens.

    An item is a string of columns separated by single spaces. Columns listed
    in ``ignore`` are removed, and every run of at least ``n_rep`` consecutive
    columns listed in ``stafflines`` acts as a separator between tokens.

    Args:
        stafflines: column strings that count as separator columns.
        ignore: column strings that are deleted before tokenizing.
        n_rep: minimum number of consecutive staff-line columns that forms a
            separator (shorter runs stay inside the surrounding token).
    """
    def __init__(self, stafflines, ignore, n_rep=1):
        # Every column in an item is followed by one space, so the patterns match
        # "<column> " units. re.escape keeps the columns literal even if they
        # contain regex metacharacters.
        stafflines_w_spaces = [f"{re.escape(str(x))} " for x in stafflines]
        ignore_w_spaces = [f"{re.escape(str(x))} " for x in ignore]
        self.staff_pattern = re.compile(f"({'|'.join(stafflines_w_spaces)})" + "{" + str(n_rep) + ",}")
        self.ignore_pattern = re.compile(f"({'|'.join(ignore_w_spaces)})+")
    def __call__(self, items):
        """Lazily tokenize each item; returns a generator of token lists."""
        return (self.tokenize_item(x) for x in items)
    def tokenize_item(self, item):
        """Return the tokens of ``item``, each with its columns joined by '_'.

        An empty item yields an empty list.
        """
        # Guard the empty string: there is no last character to inspect.
        if not item:
            return []
        # Make sure the final column is also followed by a space so it can match.
        if not item.endswith(' '):
            item += ' '
        item = self.ignore_pattern.sub("", item)
        # Replace each separator run by a marker, then split on that marker.
        staffline_stripped = self.staff_pattern.sub("& ", item)
        tokens = staffline_stripped.split("& ")
        tokens = [t.strip().replace(' ', '_') for t in tokens if t]
        return tokens

In [27]:
# Column strings handed to StafflineTokenizer. Each is a string of '0'/'1' characters -
# presumably one vertical pixel column of an incipit image, with '1' marking ink
# (show_token inverts the digits before rendering); TODO confirm.
# STAFFLINE_A has five isolated '1's; presumably a column crossing five thin staff lines.
STAFFLINE_A = "0000000000000000000000000000000000000000000000000001000000000000000001000000000000000001000000000000000001000000000000000001000000000000000000000000000000000000000000000000000"
# STAFFLINE_B has five runs of '111'; presumably the same column with thicker staff lines.
STAFFLINE_B = "0000000000000000000000000000000000000000000000000011100000000000000011100000000000000011100000000000000011100000000000000011100000000000000000000000000000000000000000000000000"
# IGNORE is all zeros; columns equal to it are removed by the tokenizer before splitting.
IGNORE = "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"
# n_rep=3: only runs of at least three consecutive staff-line columns separate tokens.
staffline_tokenizer = StafflineTokenizer(stafflines=[STAFFLINE_A, STAFFLINE_B], n_rep=3, ignore=[IGNORE])


In [28]:
def show_token(token):
    """Open an image viewer showing ``token``.

    ``token`` is a '_'-joined sequence of digit strings; each string fills one
    175-entry row of the buffer and, after transposing, becomes one vertical
    column of the image. A '1' digit is drawn as 0 and a '0' digit as 255.
    """
    columns = token.split('_')
    pixels = np.zeros((len(columns), 175), dtype=np.uint8)
    for row, column in enumerate(columns):
        # Invert the digits so that 1 maps to the dark value.
        inverted = [1 - int(digit) for digit in column]
        pixels[row] = inverted
    image = Image.fromarray(pixels.T * 255)
    image.show()

In [29]:
# Vocabulary of the agnostic symbol strings used by the alignment loop.

# Symbol types that the alignment loop compares against the text before the '.'.
CLEF, NOTE, DIGIT = 'clef', 'note', 'digit'
# Compared against the first four characters of each symbol.
SLUR = 'slur'

# Clef details (the text after the '.'); C and F clefs get special handling.
C_CLEF, G_CLEF, F_CLEF = 'C', 'G', 'F'

# Note details: BEAMED is tested as a prefix, BEAM_LEFT with its last character dropped.
BEAMED = 'beamed'
BEAM_LEFT, BEAM_RIGHT, BEAM_BOTH = 'beamedLeft', 'beamedRight', 'beamedBoth'

# Not referenced by active code in this notebook (WHOLE and SPACE appear only
# in a commented-out branch); presumably further symbol names - TODO confirm.
ACCIDENTAL, BARLINE, DOT = 'accidental', 'barline', 'dot'
START = 'start'
WHOLE = 'whole'
SPACE = 'S'

In [30]:
# Align every incipit's agnostic symbols with the tokens cut from its .txt file
# and report the share of incipits where both sequences are consumed exactly.
i = 0
correct = 0      # incipits whose symbols and tokens were both fully consumed
attempted = 0    # incipits aligned without raising (slurred incipits are skipped)
for incipit_dir in tqdm(os.listdir(PMA_PATH)):
#     if incipit_dir != '000105782-1_1_1':
#         continue
    incipit_path = PMA_PATH / incipit_dir
    agnostic_file = incipit_path / f"{incipit_dir}.agnostic"
    png_file = incipit_path / f"{incipit_dir}.png"
#     Image.open(png_file).show()
    txt_file = incipit_path / f"{incipit_dir}.txt"
    with open(agnostic_file, "r") as f:
        agnostic_contents = f.read()
    with open(txt_file, "r") as f:
        txt_contents = f.read()
    agnostic_contents = re.sub(' +', ' ', agnostic_contents)
    agnostic_symbols = agnostic_contents.strip().split()
    # Incipits containing any slur symbol are left out entirely.
    if SLUR in [symbol[:4] for symbol in agnostic_symbols]:
        continue
    tokens = staffline_tokenizer.tokenize_item(txt_contents)
#     for token in tokens:
#         show_token(token)
    # Pairs of (token, agnostic symbol or space-joined group of symbols).
    rhythmic_tokens = []
    token_index = 0
    agnostic_index = 0
    try:
        # Walk both sequences in lockstep; each branch may consume extra items
        # from either side before the shared increments at the bottom.
        while agnostic_index < len(agnostic_symbols) and token_index < len(tokens):
            agnostic_symbol = agnostic_symbols[agnostic_index]
            token = tokens[token_index]
            # Symbols look like "<type>[.<details>]-<location>".
            split_on_hyphen = agnostic_symbol.split('-')
            symbol_type, location = split_on_hyphen[0], ''.join(split_on_hyphen[1:])
            if '.' in symbol_type:
                symbol_type, details = symbol_type.split('.')
                if symbol_type == CLEF:
                    # C and F clefs span two tokens because of whitespace in between
                    if details == C_CLEF or details == F_CLEF:
                        clef_token = '_'.join(tokens[token_index:token_index+2])
                        token_index += 1
                    # Other clefs are a single token
                    else:
                        clef_token = token
                    rhythmic_tokens.append((clef_token, agnostic_symbol))
                elif symbol_type == DIGIT:
                    # Digits are only their own token in the time signature; combine two agnostic symbols
                    if agnostic_index < len(agnostic_symbols) - 1 \
                    and agnostic_symbols[agnostic_index + 1].split('.')[0] == DIGIT and \
                    agnostic_symbols[agnostic_index + 1].split('-')[1] != location:
                        time_sig = ' '.join(agnostic_symbols[agnostic_index:agnostic_index+2])
                        agnostic_index += 1
                        rhythmic_tokens.append((token, time_sig))
                    else:
                        # Any other digit consumes no token: cancel the increment below.
                        token_index -= 1
                elif symbol_type == NOTE:
                    if len(details) > len(BEAMED) and details[:len(BEAMED)] == BEAMED:
                        # A beamed group maps to one token: gather symbols up to and
                        # including the first one whose details are BEAM_LEFT plus one character.
                        a_symbols = []
                        while agnostic_index < len(agnostic_symbols):
                            agnostic_symbol_ = agnostic_symbols[agnostic_index]
                            split_on_hyphen = agnostic_symbol_.split('-')
                            symbol_type_, location_ = split_on_hyphen[0], ''.join(split_on_hyphen[1:])
                            if '.' not in symbol_type_:
                                a_symbols.append(agnostic_symbol_)
                                agnostic_index += 1
                                continue
                            symbol_type_, details_ = symbol_type_.split('.')
                            if len(details_) > len(BEAM_LEFT) and \
                            details_[:-1] == BEAM_LEFT:
                                a_symbols.append(agnostic_symbol_)
                                break
                            a_symbols.append(agnostic_symbol_)
                            agnostic_index += 1
                        rhythmic_tokens.append((token, ' '.join(a_symbols)))
#                     elif details == WHOLE and location[0] == SPACE and 1 <= int(location[1]) <= 4:
#                         whole_note = '_'.join(tokens[token_index:token_index+2])
#                         token_index += 1
#                         rhythmic_tokens.append((whole_note, agnostic_symbol))
                    else:
                        rhythmic_tokens.append((token, agnostic_symbol))
                else:
                    rhythmic_tokens.append((token, agnostic_symbol))
            else:
                rhythmic_tokens.append((token, agnostic_symbol))
            token_index += 1
            agnostic_index += 1
        # The alignment counts as correct only when both sequences end together.
        if token_index == len(tokens) and agnostic_index == len(agnostic_symbols):
            correct += 1
#         else:
#             print(png_file)
#             print(agnostic_symbols)
#             Image.open(png_file).show()
#             for t, symbol in rhythmic_tokens:
#                 show_token(t)
#                 print(symbol)
        attempted += 1
        i += 1
        if i % 5000 == 0:
            print(f"{correct}/{attempted} {correct * 100 / attempted}")
    except Exception as e:
        # Name the incipit so a failure can be traced back to its files.
        print(f"{incipit_dir}: {e}")
# Guard the summary: if nothing was attempted (empty directory, or every incipit
# skipped or failed) the percentage would raise ZeroDivisionError.
if attempted:
    print(f"{correct}/{attempted} {correct * 100 / attempted}")
else:
    print("0/0 (no incipits attempted)")

  6%|██▍                                   | 5556/87678 [02:24<36:28, 37.53it/s]

3698/5000 73.96


 13%|████▋                                | 11101/87678 [04:48<31:11, 40.91it/s]

7467/10000 74.67


 19%|███████                              | 16617/87678 [07:13<32:35, 36.34it/s]

11153/15000 74.35333333333334


 25%|█████████▎                           | 22164/87678 [09:38<24:10, 45.16it/s]

14853/20000 74.265


 32%|███████████▋                         | 27727/87678 [12:03<26:44, 37.36it/s]

18595/25000 74.38


 38%|██████████████                       | 33294/87678 [14:27<25:35, 35.41it/s]

22286/30000 74.28666666666666


 44%|████████████████▍                    | 38832/87678 [16:52<24:16, 33.54it/s]

25954/35000 74.15428571428572


 51%|██████████████████▋                  | 44385/87678 [19:17<20:01, 36.03it/s]

29692/40000 74.23


 57%|█████████████████████                | 49927/87678 [21:41<16:35, 37.92it/s]

33392/45000 74.20444444444445


 63%|███████████████████████▍             | 55491/87678 [24:06<15:39, 34.27it/s]

37071/50000 74.142


 70%|█████████████████████████▊           | 61051/87678 [26:31<11:04, 40.08it/s]

40763/55000 74.11454545454545


 76%|████████████████████████████         | 66640/87678 [28:56<09:04, 38.61it/s]

44458/60000 74.09666666666666


 82%|██████████████████████████████▍      | 72200/87678 [31:21<07:11, 35.88it/s]

48102/65000 74.00307692307692


 89%|████████████████████████████████▊    | 77790/87678 [33:46<03:53, 42.42it/s]

51761/70000 73.94428571428571


 95%|███████████████████████████████████▏ | 83325/87678 [36:11<01:49, 39.73it/s]

55412/75000 73.88266666666667


100%|█████████████████████████████████████| 87678/87678 [38:03<00:00, 38.39it/s]


58338/78916 73.92417253788838
