In [11]:
import sys
sys.path.append('../../source')

from utils import *

import pickle
import json
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import glob
import time
import random

import encoders

In [12]:
# This is the path to the labeled dataset
data_path = Path("/home/abunn/ttmp")
repo_path = data_path/"piano_bootleg_scores"
piano_bootleg_scores_path = repo_path/"imslp_bootleg_dir-v1"

In [13]:
# Grab all file locations
piece_names = glob.glob(str(Path(piano_bootleg_scores_path)/"**/*"))

# This gets only one version of each piece
# fnames = [glob.glob(str(Path(piece_name)/"*.pkl"))[0] for piece_name in piece_names if len(glob.glob(str(Path(piece_name)/"*.pkl"))) != 0]

# This gets every version of every piece
fnames = glob.glob(str(piano_bootleg_scores_path/"**/*.pkl"), recursive=True)

print(len(fnames))

31834


In [14]:
filler_file = '../../cfg_files/filler_imslp.txt'
filler = {}
with open(filler_file, 'r') as f:
    for line in f:
        parts = line.strip("\n").split('\t')
        assert len(parts) == 2

        filler[parts[0]] = [int(i) for i in parts[1].split(",")]

In [15]:
# CONVERTING THE DATA TO BINARY MATRICES - MIGHT TAKE A MINUTE

t0 = time.time()


# List of tuples
# Tuples contain (binary_score, composer)
pieces = []

for fname in tqdm(fnames):
    # Load the pages
    pages = load_pkl(fname)

    filler_key = fname.split("imslp_bootleg_dir-v1/")[1].strip(".pkl")

    filler_pages = filler[filler_key] if filler_key in filler.keys() else []

    # Convert them into binary matrices
    bscores = [ints_to_binary_matrix(page) for i, page in enumerate(pages) if i not in filler_pages]
    bscores = [page for page in bscores if len(page.shape) == 2 and page.shape[1] == 62]

    # If there were binary scores, then combine them into one and append to dataset.
    if len(bscores) > 0:
        piece = np.concatenate(bscores, axis=0)
        pieces.append(piece)

print("Seconds to complete:", round(time.time() - t0))

100%|██████████| 31834/31834 [06:31<00:00, 81.33it/s] 

Seconds to complete: 391





In [16]:
print(sum(len(piece[0]) for piece in pieces))

1738852


In [None]:
print(sum(len(piece[0]) for piece in pieces))

1807672


In [17]:
def create_dataset(pieces, valid_split=.15, test_split=.15):
    """
    Creates a train / valid / test split dataset of pieces.
    pieces: The list of binary_matrices to sample from
    valid_split: The proportion of data to use for valid
    test_split: The proportion of data to use for valid
    
    returns:
    x & y lists for train, valid, and test sets
    """
    
    # For repeatability
    np.random.seed(42)

    # shuffle pieces
    piece_list = [piece for piece in pieces]
    np.random.shuffle(piece_list)
    
    # Calculate starting places of each section - order is (test, valid, train)
    train_start = round((valid_split+test_split)*len(piece_list))
    valid_start = round(test_split*len(piece_list))
    
    # Go through and separate pieces into train, valid, test
    train_pieces = piece_list[train_start:]
    valid_pieces = piece_list[valid_start:train_start]
    test_pieces = piece_list[:valid_start]
    
    return train_pieces, valid_pieces, test_pieces

In [18]:
train, valid, test = create_dataset(pieces, valid_split=.2, test_split=0)

In [19]:
print(len(train))
print(len(valid))

22437
5609


### Sparse encodings

In [20]:
for enc_name, enc in encoders.sparse_encoders.items():
    train_encoded = []
    for piece in tqdm(train):
        train_encoded.append(enc(piece))
        
    valid_encoded = []
    for piece in tqdm(valid):
        valid_encoded.append(enc(piece))
    
    # Data for LM pretraining
    with open(f"LM_pretraining_data/{enc_name}-train.txt", "w") as f:
        f.write("\n\n".join(train_encoded))
    with open(f"LM_pretraining_data/{enc_name}-valid.txt", "w") as f:
        f.write("\n\n".join(valid_encoded))

100%|██████████| 22437/22437 [05:25<00:00, 68.89it/s] 
100%|██████████| 5609/5609 [01:22<00:00, 67.78it/s] 
100%|██████████| 22437/22437 [05:41<00:00, 65.69it/s] 
100%|██████████| 5609/5609 [01:28<00:00, 63.50it/s] 
100%|██████████| 22437/22437 [06:31<00:00, 57.30it/s]
100%|██████████| 5609/5609 [01:37<00:00, 57.28it/s]


### Dense encodings

In [21]:
block_sizes = [
    [1, 1],
    [1, 2],
    [1, 4],
    [1, 8],
]

for block_size in block_sizes:
    train_encoded = []
    for piece in tqdm(train):
        train_encoded.append(encoders.dense_encoder(piece, block_size=block_size))
        
    valid_encoded = []
    for piece in tqdm(valid):
        valid_encoded.append(encoders.dense_encoder(piece, block_size=block_size))
    
    # Data for LM pretraining
    with open(f"LM_pretraining_data/dense_{block_size[0]}_{block_size[1]}-train.txt", "w") as f:
        f.write("\n\n".join(train_encoded))
    with open(f"LM_pretraining_data/dense_{block_size[0]}_{block_size[1]}-valid.txt", "w") as f:
        f.write("\n\n".join(valid_encoded))

100%|██████████| 22437/22437 [14:48<00:00, 25.24it/s]
100%|██████████| 5609/5609 [03:44<00:00, 24.96it/s]
100%|██████████| 22437/22437 [09:39<00:00, 38.71it/s]
100%|██████████| 5609/5609 [02:26<00:00, 38.34it/s]
100%|██████████| 22437/22437 [06:34<00:00, 56.85it/s]
100%|██████████| 5609/5609 [01:40<00:00, 55.68it/s]
100%|██████████| 22437/22437 [04:01<00:00, 93.07it/s] 
100%|██████████| 5609/5609 [01:01<00:00, 90.56it/s] 
