In [48]:
import os
from pathlib import Path
from PIL import Image
import numpy as np
from tqdm import tqdm
import re

In [49]:
TXT_LOCATION = Path('/home/macosta/ttmp/primus-data/leipzig-filtered/leipzig-delim-7/')
TOKENIZER_SAVEDIR = Path('/home/macosta/ttmp/primus-data/primus-semantic/semantic-tokenizer/')
TOKENIZER_SAVEDIR.mkdir(exist_ok=True)

In [145]:
TXT_LOCATION = Path('/home/macosta/ttmp/primus-data/primus-semantic/semantic/')
CLEANED_TXT_LOCATION = Path('/home/macosta/ttmp/primus-data/primus-semantic/semantic-cleaned-v2/')
CLEANED_TXT_LOCATION.mkdir(exist_ok=True)
for file in tqdm(os.listdir(TXT_LOCATION)):
    contents = open(TXT_LOCATION / file, 'r').read()
    contents = re.sub('\s+', ' ', contents)
    contents = re.sub('-|_', ' ', contents)
    contents = re.sub('thirty second', 'thirty_second', contents)
    contents = re.sub('sixty fourth', 'sixty_fourth', contents)
    contents = re.sub('quadruple whole', 'quadruple_whole', contents)
    contents = re.sub('double whole', 'double_whole', contents)
    contents = re.sub('clef (\w+)', r'clef-\1', contents)
    contents = re.sub('keySignature (\w+)', r'keySignature-\1', contents)
    contents = re.sub('timeSignature (\w+)', r'timeSignature-\1', contents)
    contents = re.sub('\.\.', ' dotdot', contents)
    contents = re.sub('\.', ' dot', contents)
    with open(CLEANED_TXT_LOCATION / file, "w") as f:
        f.write(contents)

100%|███████████████████████████████████| 87678/87678 [00:11<00:00, 7419.53it/s]


In [23]:
from tokenizers import Tokenizer, Regex
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import WhitespaceSplit, Split

In [37]:
tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))

In [38]:
SPECIAL_TOKENS = [
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
#     "<mask>",
]
VOCAB_SIZE = 30000
trainer = WordLevelTrainer(show_progress=True, special_tokens=SPECIAL_TOKENS, vocab_size=VOCAB_SIZE)

In [39]:
tokenizer.pre_tokenizer = WhitespaceSplit()
# tokenizer.pre_tokenizer = Split(pattern=Regex('_|-|(\s+)'), behavior='removed')

In [40]:
files = [str(TXT_LOCATION / path) for path in os.listdir(TXT_LOCATION) if not os.path.isdir(TXT_LOCATION / path)]

In [41]:
tokenizer.train(files, trainer)

In [42]:
with open(files[11], "r") as f:
    text = f.read()

In [43]:
text

'0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 0000000000000000000000000000000000000000000000000011100000000000000011100000000000000011111111111111111111111111111111111111111111111111111111111111111111111110000000000000000_0000000000000000000000000000000000000000000000000011100000000000000011100000000000000011111111111111111111111111111111111111111111111111111111111111111111111110000000000000000_0000000000000000000000000000000000000000000000000011100000000000000011100000000000000011111111111111111111111111111111111111111111111111111111111111111111111110000000000000000_0000000000000000000000000000000000000000000000000011100000000000000011100000000000000011111111111111111111111111111111111111111111111111111111111111111111111110000000000000000_00000000000000000000000000000000000000000000000000111000000000000000111000000000000000111111111111111111111111111111111

In [44]:
len(text.split())

30

In [45]:
encoding = tokenizer.encode(text)

In [46]:
decoded = tokenizer.decode(encoding.ids)
print(decoded.split())

['0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000', '0000000000000000000000000000000000000000000000000011100000000000000011100000000000000011111111111111111111111111111111111111111111111111111111111111111111111110000000000000000_0000000000000000000000000000000000000000000000000011100000000000000011100000000000000011111111111111111111111111111111111111111111111111111111111111111111111110000000000000000_0000000000000000000000000000000000000000000000000011100000000000000011100000000000000011111111111111111111111111111111111111111111111111111111111111111111111110000000000000000_0000000000000000000000000000000000000000000000000011100000000000000011100000000000000011111111111111111111111111111111111111111111111111111111111111111111111110000000000000000_0000000000000000000000000000000000000000000000000011100000000000000011100000000000000011111111111111111111111111111

In [47]:
tokenizer.save(str(TOKENIZER_SAVEDIR / 'tokenizer.json'))

In [50]:
tokenizer = Tokenizer.from_file(str(TOKENIZER_SAVEDIR / 'tokenizer.json'))

In [59]:
words = [x[0] for x in tokenizer.get_vocab().items()]

In [63]:
keysigs = [w for w in words if w[:3] == 'key']

In [64]:
clefs = [w for w in words if w[:4] == 'clef']

In [72]:
import random

KEYSIGS = [
    'keySignature-FM',
    'keySignature-DbM',
    'keySignature-GM',
    'keySignature-EM',
    'keySignature-DM',
    'keySignature-C#M',
    'keySignature-AM',
    'keySignature-AbM',
    'keySignature-F#M',
    'keySignature-CM',
    'keySignature-GbM',
    'keySignature-BM',
    'keySignature-BbM',
    'keySignature-EbM'
]

CLEFS = [
    'clef-G2',
    'clef-C5',
    'clef-C2',
    'clef-F4',
    'clef-C1',
    'clef-C4',
    'clef-C3',
    'clef-F3',
    'clef-F5',
    'clef-G1'
]

def generate_randomized_semantic_seed(timesig, start_token="<s>"):
    keysig = KEYSIGS[int(random.random() * len(KEYSIGS))]
    clef = CLEFS[int(random.random() * len(CLEFS))]
    return ' '.join([start_token, clef, keysig, timesig])

In [84]:
generate_randomized_semantic_seed('timeSignature-3/8')

'<s> clef-C1 keySignature-F#M timeSignature-3/8'

In [88]:
KEYSIGS = [
    'key.sig= "1f" ',
    'key.sig= "5f" ',
    'key.sig= "1s" ',
    'key.sig= "4s" ',
    'key.sig= "2s" ',
    'key.sig= "7s" ',
    'key.sig= "3s" ',
    'key.sig= "4f" ',
    'key.sig= "6s" ',
    '',
    'key.sig= "6f" ',
    'key.sig= "5s" ',
    'key.sig= "2f" ',
    'key.sig= "3f" '
]

CLEFS = [
    'clef.shape= "G" clef.line= "2"',
    'clef.shape= "C" clef.line= "5"',
    'clef.shape= "C" clef.line= "2"',
    'clef.shape= "F" clef.line= "4"',
    'clef.shape= "C" clef.line= "1"',
    'clef.shape= "C" clef.line= "4"',
    'clef.shape= "C" clef.line= "3"',
    'clef.shape= "F" clef.line= "3"',
    'clef.shape= "F" clef.line= "5"',
    'clef.shape= "G" clef.line= "1"'
]

TIMESIGS = [
    'meter.count= "2" meter.unit= "4"',
    'meter.count= "3" meter.unit= "4"',
    'meter.sym= "common"',
    'meter.count= "5" meter.unit= "4"',
    'meter.count= "6" meter.unit= "4"',
    'meter.count= "7" meter.unit= "4"',
    'meter.count= "3" meter.unit= "8"',
    'meter.count= "6" meter.unit= "8"',
]

def generate_randomized_mei_seed(timesig, start_token="<s>"):
    keysig = KEYSIGS[int(random.random() * len(KEYSIGS))]
    clef = CLEFS[int(random.random() * len(CLEFS))]
    return f'{start_token} <music > <body > <mdiv > <score > <scoreDef {keysig}{timesig} > <staffGrp > <staffDef {clef}'

In [102]:
generate_randomized_mei_seed(TIMESIGS[0])

'<s> <music > <body > <mdiv > <score > <scoreDef meter.count= "2" meter.unit= "4" > <staffGrp > <staffDef clef.shape= "C" clef.line= "1"'