In [12]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config

import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from pathlib import Path
import numpy as np
from PIL import Image
import os
import json
import re
import random

In [2]:
# Vocabulary size handed to GPT2Config as `vocab_size`.
VOCAB_SIZE = 30000
# Maximum sequence length: used as GPT2Config `n_positions` and as the
# tokenizer's `max_len`.
MAX_LEN = 128

In [3]:
# NOTE(review): these are absolute, user-specific paths; the notebook only runs
# on a machine where they exist.
# Directory the GPT-2 tokenizer is loaded from.
TOKENIZER_SAVEDIR = Path('/home/macosta/ttmp/primus-data/primus-semantic/semantic-tokenizer/')
# Directory the language-model checkpoint is loaded from.
LM_MODEL_SAVEDIR = Path('/home/macosta/ttmp/primus-models/gpt2-lm-semantic-norhythm/')
# Directory whose entries are collected into TXT_FILES.
PRIMUS_TXT_FILES = Path('/home/macosta/ttmp/primus-data/primus-semantic/semantic-cleaned/')

In [4]:
# GPT-2 configuration built from the notebook constants, with 12 attention heads.
# NOTE(review): the model cell goes through `from_pretrained` without passing
# this object on, so the configuration stored with the checkpoint is presumably
# the one actually in effect — confirm whether this object is still needed.
config = GPT2Config(
    vocab_size=VOCAB_SIZE,
    n_positions=MAX_LEN,
    n_head=12,
)

In [5]:
# `from_pretrained` is a classmethod that builds the model from the checkpoint
# directory. Calling it on a freshly constructed `GPT2LMHeadModel(config=config)`
# instance first allocated and randomly initialised an entire model that was
# thrown away immediately, so call it on the class instead.
model = GPT2LMHeadModel.from_pretrained(str(LM_MODEL_SAVEDIR))

In [6]:
# Load the trained tokenizer from TOKENIZER_SAVEDIR, capping sequences at MAX_LEN.
# NOTE(review): newer transformers releases name this keyword `model_max_length`;
# confirm that `max_len` is still honoured by the installed version.
tokenizer = GPT2TokenizerFast.from_pretrained(TOKENIZER_SAVEDIR, max_len=MAX_LEN)

file /home/macosta/ttmp/primus-data/primus-semantic/semantic-tokenizer/config.json not found
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Every entry of the cleaned PrIMuS directory as a Path. Directory listings come
# back in arbitrary, filesystem-dependent order, so sort them: index-based
# lookups such as TXT_FILES[10] then refer to the same file on every run.
TXT_FILES = sorted(PRIMUS_TXT_FILES.iterdir())

In [8]:
# Show one training file as a sample of the text format. `read_text` opens and
# closes the file itself; the bare `open(...).read()` left the handle open.
print(TXT_FILES[10].read_text())

clef-G2 keySignature-GM timeSignature-3/4 note D5 eighth dot note E5 sixteenth barline note E5 quarter dot note D5 eighth note D5 eighth dot note E5 sixteenth barline note E5 quarter note D5 quarter note D5 eighth note D5 eighth barline note G5 quarter dot note B4 eighth note F#5 eighth dot note E5 sixteenth barline note D5 quarter note C#5 quarter 


In [9]:
def join_notes(text):
    """Glue the space-separated pieces of note and rest tokens together.

    A `note X ` sequence becomes `note-X_` (the text up to the next space is
    attached with a dash and followed by an underscore), and every `rest `
    becomes `rest-`. All other text is returned untouched.
    """
    with_notes_joined = re.compile(r'note (.*?) ').sub(r'note-\1_', text)
    return re.compile(r'rest ').sub(r'rest-', with_notes_joined)

In [10]:
def pseudo_semantic_to_semantic(text):
    """Rewrite the space-separated token text into the joined token form.

    The rewrites are applied in this order:
      1. `note X ` -> `note-X_`
      2. `rest `   -> `rest-`
      3. ` dotdot` -> `..`
      4. ` dot`    -> `.`
    """
    # Order matters: ` dotdot` has to be rewritten before ` dot`, otherwise the
    # shorter pattern would consume the first half of it.
    rewrite_rules = (
        (r'note (.*?) ', r'note-\1_'),
        (r'rest ', r'rest-'),
        (' dotdot', '..'),
        (' dot', '.'),
    )
    converted = text
    for pattern, replacement in rewrite_rules:
        converted = re.sub(pattern, replacement, converted)
    return converted

In [10]:
import rhythm_util as ru

In [11]:
# Time-signature tokens for which samples are generated, in generation order.
TIMESIGS = [
    f'timeSignature-{meter}'
    for meter in ('3/8', '6/8', '2/4', '3/4', 'C', '5/4', '6/4', '7/4')
]

In [13]:
# Key-signature tokens a randomized seed can pick from.
KEYSIGS = [
    f'keySignature-{key}'
    for key in (
        'FM', 'DbM', 'GM', 'EM', 'DM', 'C#M', 'AM',
        'AbM', 'F#M', 'CM', 'GbM', 'BM', 'BbM', 'EbM',
    )
]

# Clef tokens a randomized seed can pick from.
CLEFS = [
    f'clef-{clef}'
    for clef in ('G2', 'C5', 'C2', 'F4', 'C1', 'C4', 'C3', 'F3', 'F5', 'G1')
]

def generate_randomized_semantic_seed(timesig, start_token="<s>"):
    """Build a generation prompt with a random clef and key signature.

    Args:
        timesig: time-signature token placed last in the prompt.
        start_token: token placed first in the prompt.

    Returns:
        The string "<start_token> <clef> <keysig> <timesig>", where the clef is
        drawn uniformly from CLEFS and the key signature from KEYSIGS.
    """
    # random.choice replaces the hand-rolled int(random.random() * len(...)) index.
    # The key signature is drawn before the clef, as in the original code.
    keysig = random.choice(KEYSIGS)
    clef = random.choice(CLEFS)
    return ' '.join([start_token, clef, keysig, timesig])

In [None]:
# Sanity check: print ten randomized seeds, cycling through the time signatures.
# The previous body was the bare name `print9`, which raises NameError.
for i in range(10):
    print(generate_randomized_semantic_seed(TIMESIGS[i % len(TIMESIGS)]))

In [33]:
def save_timesig_generated(n, timesig, savedir):
    """Sample `n` sequences for one time signature and write each to a text file.

    Every sample is prompted with its own seed from
    `generate_randomized_semantic_seed(timesig)`, so clef and key signature vary
    between files while the time signature stays fixed. The decoded output is
    converted with `pseudo_semantic_to_semantic`, a leading `<s> ` and trailing
    ` </s>` are removed, and the result is written to
    `savedir / "generated_<i>.txt"`.

    Args:
        n: number of sequences to generate; files are numbered 0 .. n-1.
        timesig: time-signature token used in every seed.
        savedir: pathlib.Path of the output directory; created if missing.
    """
    savedir.mkdir(exist_ok=True, parents=True)
    for i in tqdm(range(n)):
        # Draw a new seed per sample. Drawing it once before the loop gave all
        # n files of a time signature the same clef and key signature.
        seed = generate_randomized_semantic_seed(timesig)
        input_ids = tokenizer.encode(seed, return_tensors='pt')
        # eos_token_id=2 decodes to `</s>` with this tokenizer; pad_token_id=1 is
        # presumably its padding token — TODO confirm.
        generated_ids = model.generate(input_ids,
                                       pad_token_id=1,
                                       eos_token_id=2,
                                       temperature=1,
                                       max_length=128,
                                       do_sample=True)[0]
        # split() + join collapses any run of whitespace into single spaces.
        decoded = ' '.join(tokenizer.decode(generated_ids).split())
        # Same note/rest/dot rewrites that were previously spelled out inline.
        output = pseudo_semantic_to_semantic(decoded)
        if output.endswith(' </s>'):
            output = output[:-5]
        if output.startswith('<s> '):
            output = output[4:]
        with open(savedir / f"generated_{i}.txt", "w") as f:
            f.write(output)

In [34]:
# Number of samples to generate per time signature.
# NOTE(review): the progress bars recorded under this cell show 10/10, so the
# saved output comes from a run with a smaller N than the one set here.
N = 2000
# One output directory per time signature; "/" in the token is replaced by "-"
# so that it can be used as a directory name.
for t in TIMESIGS:
    save_timesig_generated(N, t, Path(f'/home/macosta/ttmp/generated-semantic-final/{t.replace("/","-")}/'))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.35s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.31s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.17s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.17s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:09<00:00,  1.03it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:12<00:00,  1.27s/it]
100%|███████████████████████████████████████████████

In [None]:
# N = 5000
# for t in TIMESIGS:
#     correct = check_correctness(N, t, log_frequency=100)
#     print(f'\nNUM CORRECT FOR {t}: {correct} / {N}\n')

In [88]:
# import semantic_to_pae as stp

In [89]:
def create_incipits(n, directory):
    """Sample `n` unconditioned sequences and save each through `semantic_to_pae`.

    Generation is prompted with the bare `<s>` token. The decoded output is
    converted with `pseudo_semantic_to_semantic`, a leading `<s> ` and trailing
    ` </s>` are removed, and the result is passed to `stp.convert_and_save`
    together with the target path `directory / "output-<i>.json"`.

    Args:
        n: number of sequences to generate; files are numbered 0 .. n-1.
        directory: output directory (str or Path); created if missing.
    """
    # Imported here because the notebook-level import of this module is
    # commented out, which left `stp` undefined and made every call fail with
    # NameError.
    import semantic_to_pae as stp

    out_dir = Path(directory)
    out_dir.mkdir(parents=True, exist_ok=True)
    input_ids = tokenizer.encode("<s>", return_tensors="pt")
    for i in tqdm(range(n)):
        # eos_token_id=2 decodes to `</s>` with this tokenizer; pad_token_id=1 is
        # presumably its padding token — TODO confirm.
        generated_ids = model.generate(input_ids,
                                       pad_token_id=1,
                                       eos_token_id=2,
                                       temperature=1,
                                       max_length=128,
                                       do_sample=True)[0]
        # split() + join collapses any run of whitespace into single spaces.
        decoded = ' '.join(tokenizer.decode(generated_ids).split())
        # Same note/rest/dot rewrites that were previously spelled out inline.
        output = pseudo_semantic_to_semantic(decoded)
        if output.endswith(' </s>'):
            output = output[:-5]
        if output.startswith('<s> '):
            output = output[4:]
        stp.convert_and_save(output, out_dir / f"output-{i}.json")

In [91]:
# create_incipits(30, '/home/macosta/ttmp/generated-semantic-nonrhythmic/')

100%|███████████████████████████████████████████| 30/30 [00:57<00:00,  1.93s/it]


In [24]:
# input_str = tokenizer.encode('clef-G2 keySignature-GM timeSignature-3/4 note D5 eighth dot note E5 sixteenth barline note E5 quarter dot note D5 eighth note D5 eighth dot note E5 sixteenth barline note E5 quarter note D5 quarter note D5 eighth note D5 eighth barline note G5 quarter dot note B4 eighth note F#5 eighth dot note E5 sixteenth barline note D5 quarter note C#5 quarter', return_tensors='pt')

In [25]:
# input_str

tensor([[21, 45, 40,  4,  9,  5, 14,  4, 16,  8,  7,  4, 16,  6, 14,  4,  9,  5,
          4,  9,  5, 14,  4, 16,  8,  7,  4, 16,  6,  4,  9,  6,  4,  9,  5,  4,
          9,  5,  7,  4, 19,  6, 14,  4, 17,  5,  4, 28,  5, 14,  4, 16,  8,  7,
          4,  9,  6,  4, 26,  6]])

In [14]:
# output_tokens = model.generate(input_str, 
#                                pad_token_id=1,
#                                eos_token_id=2,
#                                temperature=1,
#                                max_length=128,
#                                do_sample=True)

In [15]:
# output_tokens

tensor([[21, 45, 40,  4,  9,  5, 14,  4, 16,  8,  7,  4, 16,  6, 14,  4,  9,  5,
          4,  9,  5, 14,  4, 16,  8,  7,  4, 16,  6,  4,  9,  6,  4,  9,  5,  4,
          9,  5,  7,  4, 19,  6, 14,  4, 17,  5,  4, 28,  5, 14,  4, 16,  8,  7,
          4,  9,  6,  4, 26,  6,  2]])

In [23]:
# tokenizer.decode(output_tokens[0])

'clef-G2 keySignature-GM timeSignature-3/4 note D5 eighth dot note E5 sixteenth barline note E5 quarter dot note D5 eighth note D5 eighth dot note E5 sixteenth barline note E5 quarter note D5 quarter note D5 eighth note D5 eighth barline note G5 quarter dot note B4 eighth note F#5 eighth dot note E5 sixteenth barline note D5 quarter note C#5 quarter </s>'