# Dataset Creation

9-way and 100-way dataset generation

Imports

In [1]:
import os
import pickle
import random
from bisect import bisect_left
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import utils

Directory to save data

In [2]:
# Root directory for large files (the cloned score repository and the generated datasets).
# Set the PBSCSR_DATA_DIR environment variable to relocate it; the default is the
# location originally used by this notebook.
data_path = Path(os.environ.get("PBSCSR_DATA_DIR", "/home/ajain/ttmp/PBSCSR_data/"))  # path to store large files

Get bootleg score data

In [3]:
# Clone IMSLP bootleg scores
repo_path = data_path/"piano_bootleg_scores"
!git clone https://github.com/HMC-MIR/piano_bootleg_scores.git {repo_path}
piano_bootleg_scores_path = repo_path/"imslp_bootleg_dir-v1"

fatal: destination path '/home/ajain/ttmp/PBSCSR_data/piano_bootleg_scores' already exists and is not an empty directory.


Random seeding

In [4]:
# Seed both random number generators used below: Python's `random` module and
# NumPy's global RNG.
seed = 42
random.seed(seed)
np.random.seed(seed)

Get list of composers for 9-way and 100-way

In [5]:
# Load the composer names (one per line) for the 9-way and 100-way tasks
nine_way_composers = Path('../9_way_list.txt').read_text().splitlines()
hundred_way_composers = Path('../100_way_list.txt').read_text().splitlines()

Filter filler, choose longest PDF for each piece, and create pool of pieces attached to composer

In [6]:
def process_filler(filler_file, imslp_bootleg_path, filler_threshold=0.5):
    """Group the non-filler pages of every score PDF by composer and piece.

    Reads a tab-separated file whose rows are ``path<TAB>page<TAB>score``, where
    ``path`` has the form ``composer/piece.../pdf_id``. A page is kept as valid
    when its score is strictly below ``filler_threshold``.

    filler_file: path of the TSV file to read
    imslp_bootleg_path: directory (a ``Path``) holding ``<path>.pkl`` for each TSV ``path``
    filler_threshold: pages whose score is >= this value are treated as filler

    Returns a nested dict ``composer -> piece -> pdf_id -> {"valid_pages", "count"}``
    where ``valid_pages`` lists the kept page indices and ``count`` is the summed
    length of those pages. A PDF without any valid page still gets an entry
    (empty list, count 0).
    """
    composer_dict = {}
    # Remember the most recently loaded pickle: consecutive rows that belong to the
    # same PDF then reuse it instead of re-reading the file for every page.
    loaded_path, loaded_pages = None, None
    with open(filler_file, "r") as fin:
        for line in fin:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            path, page, score = line.rstrip("\n").split("\t")
            parts = path.split("/")
            composer, piece, pdf_id = parts[0], "/".join(parts[1:-1]), parts[-1]
            entry = (
                composer_dict
                .setdefault(composer, {})
                .setdefault(piece, {})
                .setdefault(pdf_id, {"valid_pages": [], "count": 0})
            )
            if float(score) < filler_threshold:
                if path != loaded_path:
                    # NOTE: unpickling can execute arbitrary code; only point this at trusted data.
                    loaded_pages = pd.read_pickle(imslp_bootleg_path/f"{path}.pkl")
                    loaded_path = path
                page_idx = int(page)
                bscore_page = loaded_pages[page_idx]
                entry["valid_pages"].append(page_idx)
                entry["count"] += len(bscore_page)
    return composer_dict


In [7]:
## HELPER
def ints_to_binary_matrix(score_seq):
    """Expand a sequence of integers into an n x 62 matrix of binary digits.

    Every integer becomes one row holding the characters of its 62-wide binary
    representation (as produced by ``np.binary_repr``), stored as uint8 values.
    An empty sequence yields an empty 1-D array.
    """
    bit_rows = [list(np.binary_repr(value, width=62)) for value in score_seq]
    return np.array(bit_rows, dtype=np.uint8)

    
# Filter out filler and choose longest score PDF for each unique piece
composer_dict = process_filler("../filler.tsv", piano_bootleg_scores_path, filler_threshold=0.5)

# valid_pdfs[composer][piece] -> {"id", "valid_pages", "count"} of the PDF with the
# largest non-filler count. Pieces whose PDFs all have a count of 0 get no entry.
valid_pdfs = {}
for composer_name, pieces_of_composer in composer_dict.items():
    for piece_name, pdf_versions in pieces_of_composer.items():
        max_count = 0
        # `pdf_id` rather than `id`: avoids shadowing the builtin for the rest of the notebook
        for pdf_id, pdf_info in pdf_versions.items():
            if pdf_info["count"] > max_count:
                max_count = pdf_info["count"]
                valid_pdfs.setdefault(composer_name, {})[piece_name] = {
                    "id": pdf_id,
                    "valid_pages": pdf_info["valid_pages"],
                    "count": max_count,
                }

# Create pool of bootleg score binary matrices
# List of tuples containing (binary_score, composer, pickle_path)
pieces = []
# Only take the required composers; sorted so the pool order does not depend on set hashing
required_composers = sorted(set(nine_way_composers) | set(hundred_way_composers))
for composer_name in tqdm(required_composers):
    for piece_name, pdf_info in valid_pdfs[composer_name].items():
        pkl = piano_bootleg_scores_path/f"{composer_name}/{piece_name}/{pdf_info['id']}.pkl"
        page_scores = pd.read_pickle(pkl)

        page_matrices = [ints_to_binary_matrix(page_scores[page_idx])
                         for page_idx in pdf_info["valid_pages"]]
        # Keep only well-formed (n, 62) pages; an empty page converts to a 1-D array and is dropped
        page_matrices = [matrix for matrix in page_matrices
                         if len(matrix.shape) == 2 and matrix.shape[1] == 62]

        # Separate name for the concatenated matrix so the loop variable is not overwritten
        piece_matrix = np.concatenate(page_matrices, axis=0)
        pieces.append((piece_matrix, composer_name, pkl))
            

  0%|          | 0/101 [00:00<?, ?it/s]

In [15]:
# Number of pieces recorded in composer_dict for the 100-way composers
hundred_way_composers = Path("../100_way_list.txt").read_text().splitlines()
piece_count = sum(len(composer_dict[name]) for name in hundred_way_composers)

print("Piece count:", piece_count)

Piece count: 4997


In [21]:
# Total number of pages (filler included) in the selected PDF of every piece
# by the 100-way composers
hundred_way_composers = Path("../100_way_list.txt").read_text().splitlines()
page_count = sum(
    len(pd.read_pickle(piano_bootleg_scores_path/f"{name}/{piece_name}/{pdf_info['id']}.pkl"))
    for name in hundred_way_composers
    for piece_name, pdf_info in valid_pdfs[name].items()
)
print("Page count:", page_count)

Page count: 70440


In [22]:

# Number of non-filler pages in the selected PDF of every piece by the 100-way composers
with open("../100_way_list.txt") as f:
    hundred_way_composers = f.read().splitlines()

nonfiller_page_count = sum(
    len(pdf_info["valid_pages"])
    for composer in hundred_way_composers
    for pdf_info in valid_pdfs[composer].values()
)

# Label names the statistic actually printed (it previously repeated the label of
# the total page count)
print("Non-filler page count:", nonfiller_page_count)

Page count: 64129


In [23]:
# Sum of the "count" field of the selected PDF of every piece by the 100-way composers
hundred_way_composers = Path("../100_way_list.txt").read_text().splitlines()
bscore_count = sum(
    pdf_info["count"]
    for name in hundred_way_composers
    for pdf_info in valid_pdfs[name].values()
)

print("Bscore count:", bscore_count)

Bscore count: 12108749


Create new IMSLP dataset with filler pages replaced with []

In [19]:
import os

# Copy of the dataset in a sibling "<dataset dir>.1" directory: same layout, but every
# page that is not listed as valid is replaced by an empty list.
filtered_root = Path(str(piano_bootleg_scores_path) + ".1")
os.makedirs(filtered_root, exist_ok=True)

for composer, info in composer_dict.items():
    for piece_name, piece_info in info.items():
        piece_dir = filtered_root / composer / piece_name
        os.makedirs(piece_dir, exist_ok=True)  # creates the composer directory as well
        for id_, valid_info in piece_info.items():
            source_pkl = piano_bootleg_scores_path / composer / piece_name / f"{id_}.pkl"
            kept_pages = set(valid_info["valid_pages"])

            with open(source_pkl, "rb") as src:
                pages = pickle.load(src)

            new_pages = [page if idx in kept_pages else [] for idx, page in enumerate(pages)]

            with open(piece_dir / f"{id_}.pkl", "wb") as dst:
                pickle.dump(new_pages, dst)

Helper functions for sampling fragments for dataset

In [18]:
    
# Shuffles three parallel lists the same way
def co_shuffle(list1, list2, list3):
    """Shuffle three parallel lists with one shared random permutation.

    Elements that share an index before the call still share an index afterwards.
    Uses NumPy's global RNG. The inputs are not modified; three new lists are
    returned. Empty input yields three empty lists.
    """
    temp = list(zip(list1, list2, list3))
    if not temp:
        # zip(*[]) produces nothing to unpack, so handle the empty case explicitly
        return [], [], []
    np.random.shuffle(temp)
    res1, res2, res3 = (list(column) for column in zip(*temp))
    return res1, res2, res3

def fragment_data(pieces, composer_list, samples=60_000, fragment_len=64):
    """Sample fixed-length fragments from the given pieces, the same number per composer.

    pieces: list of ((binary_matrix, path), composer) entries to sample from, where
        path is the pickle file whose page lengths are used for the page metadata
    composer_list: composers to sample for; pieces by other composers are ignored
    samples: the number of samples to gather (divided evenly over composer_list)
    fragment_len: the length (number of rows) of each fragment

    Returns (x_fragments, y_fragments, metadata): the fragments, the matching composer
    labels and, per fragment, a tuple (pickle file stem, page index, offset within page).
    """
    if not composer_list:
        return [], [], []

    # Organize pieces by composer; only pieces longer than a fragment can be sampled from
    composer_pieces = {}
    for (piece, path), composer in pieces:
        if len(piece) > fragment_len:
            composer_pieces.setdefault(composer, []).append((piece, path))

    x_fragments = []
    y_fragments = []
    metadata = []

    # Per pickle file: index of the last row of every page. Cached so each file is
    # read once instead of once per sampled fragment.
    page_ends_by_path = {}

    fragments_per_composer = round(samples / len(composer_list))
    for composer, piece_list in composer_pieces.items():
        if composer not in composer_list:
            continue

        for _ in range(fragments_per_composer):
            # Get random piece by that composer
            piece, path = random.choice(piece_list)

            # Get random fragment from piece
            start = np.random.randint(len(piece)-fragment_len)
            fragment = piece[start:start+fragment_len].copy()

            # Get page num to start fragment from
            # NOTE(review): page boundaries come from every page stored in the pickle at
            # `path`. If `piece` was assembled from only a subset of those pages, `start`
            # does not line up with these boundaries — verify against the caller.
            if path not in page_ends_by_path:
                page_lengths = [len(page) for page in pd.read_pickle(path)]
                page_ends_by_path[path] = [int(total) - 1 for total in np.cumsum(page_lengths)]
            page_ends = page_ends_by_path[path]

            # First page whose last row index is >= start
            page = bisect_left(page_ends, start)
            # The page begins one row after the previous page's last row, so the
            # offset is 0-based on every page (not only on the first one)
            page_offset = start - (page_ends[page-1] + 1) if page > 0 else start

            x_fragments.append(fragment)
            y_fragments.append(composer)
            metadata.append((Path(path).stem, page, page_offset))

    return x_fragments, y_fragments, metadata
        

def create_fragment_dataset(pieces, composer_list, valid_split = 0.15, test_split = 0.15, samples=60_000, fragment_len=64):
    """
    Creates a train / valid / test dataset of fragments, split at the piece level.
    pieces: The list of (binary_matrix, composer, path) to sample from
    composer_list: The composers to build the dataset for
    valid_split: The proportion of each composer's pieces, and of samples, used for validation
    test_split: The proportion of each composer's pieces, and of samples, used for testing (the rest is for train)
    samples: The number of samples to gather (train + valid + test)
    fragment_len: The length of each fragment

    Returns nine lists: x, y, metadata for train, then for valid, then for test.
    """

    # Group the pieces that are longer than a fragment by composer
    by_composer = {name: [] for name in composer_list}
    for matrix, owner, pkl_path in pieces:
        if owner in by_composer and len(matrix) > fragment_len:
            by_composer[owner].append((matrix, pkl_path))

    # Go through each composer and separate pieces into test, valid, train (in that order)
    train_pieces, valid_pieces, test_pieces = [], [], []
    for name in composer_list:
        candidates = by_composer[name]
        np.random.shuffle(candidates)

        # Attach the composer to each piece
        labelled = [(entry, name) for entry in candidates]
        total = len(labelled)

        # Section boundaries
        test_end = round(test_split*total)
        valid_end = round((valid_split+test_split)*total)

        test_pieces.extend(labelled[:test_end])
        valid_pieces.extend(labelled[test_end:valid_end])
        train_pieces.extend(labelled[valid_end:])

    # Fragment the pieces (all three sets are sampled before any set is shuffled)
    train_set = fragment_data(train_pieces, composer_list, samples=round((1-(valid_split+test_split))*samples), fragment_len=fragment_len)
    valid_set = fragment_data(valid_pieces, composer_list, samples=round(valid_split*samples), fragment_len=fragment_len)
    test_set = fragment_data(test_pieces, composer_list, samples=round(test_split*samples), fragment_len=fragment_len)

    # Reshuffle each set, keeping fragments, labels and metadata aligned
    x_train_fragments, y_train_fragments, meta_train = co_shuffle(*train_set)
    x_valid_fragments, y_valid_fragments, meta_valid = co_shuffle(*valid_set)
    x_test_fragments, y_test_fragments, meta_test = co_shuffle(*test_set)

    return (x_train_fragments, y_train_fragments, meta_train,
            x_valid_fragments, y_valid_fragments, meta_valid,
            x_test_fragments, y_test_fragments, meta_test)


Create and save 9-way dataset

In [20]:
# Build the 9-way splits (40,000 fragments in total)
(x_train_fragments, y_train_fragments, meta_train,
 x_valid_fragments, y_valid_fragments, meta_valid,
 x_test_fragments, y_test_fragments, meta_test) = create_fragment_dataset(
    pieces, nine_way_composers, samples=40_000
)

# Stored order: x/y for train, valid and test, followed by the three metadata lists
with (data_path/"9_way_dataset.pkl").open("wb") as f:
    pickle.dump(
        (
            x_train_fragments, y_train_fragments,
            x_valid_fragments, y_valid_fragments,
            x_test_fragments, y_test_fragments,
            meta_train, meta_valid, meta_test,
        ),
        f,
    )

Create and save 100-way dataset

In [21]:
# Build the 100-way splits (100,000 fragments in total)
(x_train_fragments, y_train_fragments, meta_train,
 x_valid_fragments, y_valid_fragments, meta_valid,
 x_test_fragments, y_test_fragments, meta_test) = create_fragment_dataset(
    pieces, hundred_way_composers, samples=100_000
)

# Stored order: x/y for train, valid and test.
# NOTE(review): unlike the 9-way file, the metadata lists are not written here —
# confirm whether that is intended before changing the stored tuple layout.
with (data_path/"100_way_dataset.pkl").open("wb") as f:
    pickle.dump(
        (
            x_train_fragments, y_train_fragments,
            x_valid_fragments, y_valid_fragments,
            x_test_fragments, y_test_fragments,
        ),
        f,
    )