In [9]:
import pickle
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
import glob
import time
import random

# import encoders

In [10]:
username = ""  # your username
data_path = Path(f"/home/{username}/ttmp/PBSCR")
repo_path = data_path / "piano_bootleg_scores"
piano_bootleg_scores_path = repo_path / "imslp_bootleg_dir-v1.1"

In [11]:
# Grab all file locations
piece_names = glob.glob(str(Path(piano_bootleg_scores_path) / "**/*"))

# This gets only one version of each piece
# fnames = [glob.glob(str(Path(piece_name)/"*.pkl"))[0] for piece_name in piece_names if len(glob.glob(str(Path(piece_name)/"*.pkl"))) != 0]

# This gets every version of every piece
fnames = glob.glob(str(piano_bootleg_scores_path / "**/*.pkl"), recursive=True)


print(len(fnames))

29156


In [12]:
filler_file = "../filler.tsv"
filler = {}
with open(filler_file, "r") as f:
    for line in f:
        parts = line.strip("\n").split("\t")
        if not parts[0] in filler:
            filler[parts[0]] = []
        if float(parts[2]) >= 0.5:
            filler[parts[0]].append(int(parts[1]))

In [13]:
## HELPER
def ints_to_binary_matrix(score_seq):  # converts integer sequence to n x 62 matrix
    matrix = []
    for event in score_seq:
        binary_rep = list(np.binary_repr(event, 62))
        matrix.append(binary_rep)
    np_mat = np.array(matrix, dtype=np.uint8)
    # np_mat = np.flip(np_mat, axis=0)  # flip to have least significant bit at the front
    return np_mat


# CONVERTING THE DATA TO BINARY MATRICES - MIGHT TAKE A MINUTE

t0 = time.time()


# List of tuples
# Tuples contain (binary_score, composer)
pieces = []

for fname in tqdm(fnames):
    # Load the pages
    pages = pd.read_pickle(fname)

    filler_key = fname.split("imslp_bootleg_dir-v1.1/")[1].strip(".pkl")

    filler_pages = filler[filler_key] if filler_key in filler.keys() else []

    # Convert them into binary matrices
    bscores = [
        ints_to_binary_matrix(page)
        for i, page in enumerate(pages)
        if i not in filler_pages
    ]
    bscores = [page for page in bscores if len(page.shape) == 2 and page.shape[1] == 62]

    # If there were binary scores, then combine them into one and append to dataset.
    if len(bscores) > 0:
        piece = np.concatenate(bscores, axis=0)
        pieces.append(piece)

print("Seconds to complete:", round(time.time() - t0))

100%|█████████████████████████████████████| 29156/29156 [06:23<00:00, 76.07it/s]

Seconds to complete: 383





In [14]:
print(sum(len(piece[0]) for piece in pieces))

1738852


In [15]:
def create_dataset(pieces, valid_split=0.15, test_split=0.15):
    """
    Creates a train / valid / test split dataset of pieces.
    pieces: The list of binary_matrices to sample from
    valid_split: The proportion of data to use for valid
    test_split: The proportion of data to use for valid

    returns:
    x & y lists for train, valid, and test sets
    """

    # For repeatability
    np.random.seed(42)

    # shuffle pieces
    piece_list = [piece for piece in pieces]
    np.random.shuffle(piece_list)

    # Calculate starting places of each section - order is (test, valid, train)
    train_start = round((valid_split + test_split) * len(piece_list))
    valid_start = round(test_split * len(piece_list))

    # Go through and separate pieces into train, valid, test
    train_pieces = piece_list[train_start:]
    valid_pieces = piece_list[valid_start:train_start]
    test_pieces = piece_list[:valid_start]

    return train_pieces, valid_pieces, test_pieces

In [16]:
train, valid, test = create_dataset(pieces, valid_split=0.2, test_split=0)

In [17]:
print(len(train))
print(len(valid))

22437
5609


### Dense encodings


In [18]:
# Dense Encoder

# Continuous line of 256 unicode characters
start = 10060  # 931
dense_characters = [
    chr(i).encode("utf-8").decode("utf-8") for i in range(start, start + 512)
]


# This code divides the fragment into blocks (and discards any remaining info at the very edges)
# Then it uses einsum with a filter of powers of 2 to convert from binary to an integer.  Then converts integers into
# unicode characters


def merge_staff_overlaps(bscores):
    """
    Takes in either one binary score or a batch of them and merges the left and right hands
    """

    # Lower middle c is index 23
    # Upper middle c is index 33
    lower = 23
    upper = 33
    middle = (lower + upper) // 2

    # Total notes is 52
    total = 52

    # Pad out upper hand and lower hand and combine them
    padded_lower = np.concatenate(
        [bscores[..., :middle], np.zeros((*bscores.shape[:-1], total - middle))],
        axis=-1,
    )
    padded_upper = np.concatenate(
        [
            np.zeros((*bscores.shape[:-1], middle - bscores.shape[-1] + total)),
            bscores[..., middle:],
        ],
        axis=-1,
    )
    # Logical or
    merged = padded_lower + padded_upper - padded_lower * padded_upper
    return merged


def dense_encoder(fragment, block_size=[1, 1]):
    fragment = merge_staff_overlaps(fragment)
    # Rewrote this to be much faster but looks complicated
    # This filter has powers of 2 which is how the binary is turned to ints
    filter_ = np.power(2, np.arange(np.prod(block_size))).reshape(block_size)

    # The fragment is split into blocks here
    xblocks = np.stack(
        np.split(
            fragment[:, : (fragment.shape[1] // block_size[1]) * block_size[1]],
            fragment.shape[1] // block_size[1],
            axis=1,
        )
    )
    xyblocks = np.stack(
        np.split(
            xblocks[:, : (xblocks.shape[1] // block_size[0]) * block_size[0]],
            xblocks.shape[1] // block_size[0],
            axis=1,
        )
    )

    # The blocks are multiplied so they are ints
    numbers = np.einsum("ijkl,kl->ij", xyblocks, filter_)

    # The ints are turned into corresponding characters
    characters = (numbers + start).astype(np.int32).view("U1")
    return " ".join(["".join(t) for t in characters])

In [19]:
train_encoded = []
for piece in tqdm(train):
    train_encoded.append(dense_encoder(piece, block_size=[1, 8]))

valid_encoded = []
for piece in tqdm(valid):
    valid_encoded.append(dense_encoder(piece, block_size=[1, 8]))

# Data for LM pretraining
lm_pretraining_dir = data_path / "LM_pretraining_data"
lm_pretraining_dir.mkdir(exist_ok=True)
with open(data_path / "LM_pretraining_data/dense_1_8-train.txt", "w") as f:
    f.write("\n\n".join(train_encoded))
with open(data_path / "LM_pretraining_data/dense_1_8-valid.txt", "w") as f:
    f.write("\n\n".join(valid_encoded))

100%|█████████████████████████████████████| 22437/22437 [03:58<00:00, 94.11it/s]
100%|███████████████████████████████████████| 5609/5609 [01:01<00:00, 91.63it/s]
