In [1]:
import re
from pathlib import Path
import os
from tqdm import tqdm
from PIL import Image
import numpy as np

In [2]:
# Directory whose entries are read as input text files by the conversion loop below.
TXT_DIR = Path('/home/macosta/ttmp/primus-data/leipzig-filtered/leipzig-txt/')
# Directory that receives one tokenized output file per input file (same file name).
# NOTE(review): the "delim-7" suffix presumably mirrors the n_rep=7 tokenizer setting — confirm.
SAVEDIR = Path('/home/macosta/ttmp/primus-data/leipzig-filtered/leipzig-delim-7/')
# exist_ok=True makes re-running this cell harmless; the parent directory must already exist.
SAVEDIR.mkdir(exist_ok=True)

In [3]:
class StafflineTokenizer():
    """Split a space-separated sequence of column strings at runs of staffline columns.

    An item is a string of columns separated by single spaces. Every run of at
    least ``n_rep`` consecutive columns that each equal one of ``stafflines``
    is treated as a delimiter and removed. The columns between two delimiters
    form one token, with its columns joined by ``_``; tokens are joined by a
    single space.
    """

    def __init__(self, stafflines, n_rep=1):
        """Compile the delimiter pattern.

        Args:
            stafflines: iterable of column strings that count as "empty staff".
            n_rep: minimum number of consecutive staffline columns that form a
                delimiter; shorter runs stay inside the surrounding token.

        Raises:
            ValueError: if ``stafflines`` is empty or ``n_rep`` is below 1
                (either would yield a pattern that matches the empty string).
        """
        stafflines = list(stafflines)
        if not stafflines:
            raise ValueError("stafflines must contain at least one column pattern")
        if int(n_rep) < 1:
            raise ValueError("n_rep must be at least 1")
        # Each alternative is "<column><space>"; re.escape keeps columns literal
        # even if they ever contain regex metacharacters.
        alternatives = '|'.join(re.escape(f"{x} ") for x in stafflines)
        # (?<!\S) anchors the run at a column boundary so a staffline can never
        # match the tail of a longer column. The group is non-capturing so that
        # pattern.split() returns only the text between delimiters.
        self.staff_pattern = re.compile(
            r"(?<!\S)(?:" + alternatives + "){" + str(n_rep) + ",}"
        )

    def __call__(self, items):
        """Lazily tokenize every item of ``items`` (returns a generator)."""
        return (self.tokenize_item(x) for x in items)

    def tokenize_item(self, item):
        """Return the tokenized form of one item.

        Trailing whitespace (including a final newline from ``file.read()``) is
        normalised to a single space so the last column can match a staffline.
        An empty or whitespace-only item yields an empty string.
        """
        # Every column, including the last, must be followed by exactly one
        # space for the "<column><space>" alternatives to match it.
        item = item.rstrip() + ' '
        chunks = self.staff_pattern.split(item)
        # Strip before filtering so whitespace-only chunks do not survive as
        # empty tokens (which would leak stray spaces into the output).
        tokens = (chunk.strip().replace(' ', '_') for chunk in chunks)
        return ' '.join(t for t in tokens if t)

In [4]:
# Column strings ('0'/'1' characters) that the tokenizer treats as delimiter columns.
# STAFFLINE_A contains five evenly spaced "111" groups; show_whitespaced renders '1' as
# black and '0' as white.
# NOTE(review): presumably a pixel column of an empty five-line staff with 3-pixel-thick lines — confirm.
STAFFLINE_A = "0000000000000000000000000000000000000000000000000011100000000000000011100000000000000011100000000000000011100000000000000011100000000000000000000000000000000000000000000000000"
# Same layout as STAFFLINE_A but each group is a single "1" (centred where A has "111").
STAFFLINE_B = "0000000000000000000000000000000000000000000000000001000000000000000001000000000000000001000000000000000001000000000000000001000000000000000000000000000000000000000000000000000"
# Both variants are accepted interchangeably inside a delimiter run.
STAFFLINES = [STAFFLINE_A, STAFFLINE_B]

In [5]:
# Tokenizer used for the whole dataset: a run of at least 7 consecutive staffline
# columns (any mix of the STAFFLINES variants) is treated as a token boundary.
st = StafflineTokenizer(STAFFLINES, n_rep=7)

In [6]:
def show_whitespaced(txt):
    """Open an image of a column string in the default image viewer.

    ``txt`` is a sequence of equal-length columns made of '0'/'1' characters,
    separated by spaces and/or underscores (so both raw and tokenized text can
    be displayed). Each column becomes one vertical pixel column; '1' is drawn
    black and '0' white.

    Raises:
        ValueError: if ``txt`` contains no columns, a column contains a
            non-digit character, or the columns differ in length.
    """
    # split() without an argument ignores leading/trailing/repeated whitespace
    # (including a final newline), which split(' ') would turn into empty or
    # newline-bearing columns and a ragged array.
    cols = txt.replace('_', ' ').split()
    if not cols:
        raise ValueError("txt contains no columns to display")
    bits = np.array([[int(x) for x in col] for col in cols], dtype=np.uint8)
    # Invert so that 1 -> 0 (black) and 0 -> 255 (white).
    pixels = (1 - bits) * 255
    # Rows of `pixels` are text columns; transpose so they run vertically.
    Image.fromarray(pixels.T).show()

In [8]:
# Tokenize every text file in TXT_DIR and write the result to SAVEDIR under the
# same file name. Entries are sorted so the processing order is reproducible.
for file in tqdm(sorted(os.listdir(TXT_DIR))):
    src_path = TXT_DIR / file
    # os.listdir also returns subdirectories (e.g. `.ipynb_checkpoints`);
    # reading one would raise IsADirectoryError and abort the whole run.
    if not src_path.is_file():
        continue
    contents = src_path.read_text()
    tokenized = st.tokenize_item(contents)
    savepath = SAVEDIR / file
    savepath.write_text(tokenized)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29155/29155 [04:23<00:00, 110.62it/s]
