In [1]:
import os
import numpy as np
import muspy
from tqdm import tqdm
from matplotlib import pyplot as plt
import tensorflow as tf
import pickle

import config
import utils

# Name of the configuration preset handed to config.Config below.
config_string = "single_instruments_type"
# Absolute path of the parent of the current working directory
# (the notebook is presumably run from a sub-folder of the project root — confirm).
ROOT_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Project configuration object used by every following cell
# (dataset paths, INPUT_RANGES, output location, ...).
conf = config.Config(config_string, ROOT_PATH)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the genre vectors stored next to the "lmd_matched" dataset.
# Entries may be None (songs without a genre are skipped later on).
# SECURITY NOTE: pickle.load can execute arbitrary code — only load this file
# if it comes from a trusted source.
with open(os.path.join(conf.dataset_paths["lmd_matched"], "lakh_matched_genre_vectors.pickle"), "rb") as f:
    genre_vectors = pickle.load(f)

# Songs of the "lmd_matched" dataset; later cells zip them with genre_vectors,
# so both are presumably aligned index-by-index — TODO confirm.
dataset = utils.get_dataset("lmd_matched", conf)
# Displayed as the cell output: number of songs in the dataset.
len(dataset)

115190

## Study to choose the final sequence length of the dataset

In [3]:
# Histogram of converted-song lengths: lengths[k] is the number of songs whose
# converted representation has exactly k entries. The array is deliberately
# oversized; a later cell trims it to the longest length actually observed.
lengths = np.zeros(10000000)

# Number of rejected songs per reason code.
discarded_samples = {
    0:0, # empty song
    1:0, # time_signatures not all acceptable
    2:0, # n_measures too big
    3:0, # too many notes
    4:0, # no genre
}

# The description never changes, so it is passed once through `desc=` instead of
# being re-set on every iteration. zip() stops at the shorter of its two inputs.
for song, genre in tqdm(
    zip(dataset, genre_vectors),
    total=len(dataset),
    desc="Long dataset matched",
):
    if genre is None:
        discarded_samples[4] += 1
        continue

    converted_song = utils.transform_representation(song, conf)

    if len(converted_song) <= 1:
        # A result with at most one entry is a rejection: its first element is
        # used as the reason code (a key of discarded_samples).
        # NOTE(review): a zero-length result would raise IndexError here.
        discarded_samples[converted_song[0]] += 1
        continue

    # Only successfully converted songs enter the histogram. Counting the
    # rejection codes as "songs of length 1" would inflate every
    # kept-songs statistic computed from `lengths` afterwards.
    lengths[len(converted_song)] += 1


Long dataset matched: : 69031it [43:55, 22.51it/s] 

In [None]:
# Display the per-reason rejection counters filled by the study loop
# (keys 0-4, see the comments where the dict is created).
discarded_samples

In [None]:
# Index of the last non-empty histogram bin, i.e. the longest song length seen
# (0 when the histogram is empty). Done with a vectorised lookup instead of a
# Python-level loop over all ten million bins.
# NOTE: the name `max` shadows the builtin; it is kept because the next cell
# reads it. Do not call the builtin max() anywhere after this cell.
nonzero_bins = np.flatnonzero(lengths > 0)
max = int(nonzero_bins[-1]) if nonzero_bins.size else 0

In [None]:
# Drop the unused tail of the histogram: keep bins 0..max, where `max` is the
# notebook variable holding the last non-empty bin (not the builtin).
lengths = lengths[:max+1]

In [None]:
# Number of songs (y) for every sequence length (x). Labels and a title are
# added so that the figure can be read on its own.
plt.scatter(np.arange(len(lengths)), lengths)
plt.xlabel("Song length (tokens)")
plt.ylabel("Number of songs")
plt.title("Distribution of converted song lengths")
plt.show()

In [None]:
# Share of songs that survive a hard length cut-off, for two candidate lengths.
# The printed value is a fraction in [0, 1].
for chosen_len in (1024, 2048):
    # lengths[k] counts songs of exactly k tokens. A song of exactly chosen_len
    # tokens is not "longer than" chosen_len, so bin chosen_len itself must be
    # part of the kept slice (hence the + 1).
    print("Percentage of song kept if discarding every song longer than {}: {}".format(chosen_len, np.sum(lengths[:chosen_len + 1])/np.sum(lengths)))

In [None]:
# Total number of tokens contributed by each length bin: (songs of length k) * k.
# NumPy has no `uint256` dtype (the original raised AttributeError); int64 is
# ample for song counts times lengths, and matching both operands to int64
# keeps the product an exact integer instead of being promoted to float.
# The (misspelled) variable name is kept because the next cell reads it.
weighed_lenghts = lengths.astype(np.int64) * np.arange(len(lengths), dtype=np.int64)

In [None]:
# Share of all tokens that is retained when every song is truncated to
# chosen_len tokens: shorter songs keep all their tokens, every other song
# contributes exactly chosen_len tokens. The printed value is a fraction.
for chosen_len in (1024, 2048):
    print(
        "Percentage of dataset information by cutting every song to {} tokens: {}".format(
            chosen_len,
            (np.sum(weighed_lenghts[:chosen_len]) + chosen_len * np.sum(lengths[chosen_len:]))
            / np.sum(weighed_lenghts),
        )
    )

In [None]:
# Build the final dataset: every song with a genre whose converted
# representation has at most CHOSEN_LEN rows is padded to exactly CHOSEN_LEN
# rows and stored together with its genre vector and per-feature label columns.
CHOSEN_LEN = 2048

samples = []
genres = []
# One list of label sequences per input feature; the i-th key receives the
# i-th column of each sample.
labels = {key:[] for key in conf.INPUT_RANGES.keys()}

# Histogram of converted-song lengths (lengths[k] = songs with exactly k rows).
lengths = np.zeros(10000000)

# Number of rejected songs per reason code.
discarded_samples = {
    0:0, # empty song
    1:0, # time_signatures not all acceptable
    2:0, # n_measures too big
    3:0, # too many notes
    4:0, # no genre
}

for song, genre in tqdm(
    zip(dataset, genre_vectors),
    total=len(dataset),
    desc="Long dataset matched",
):
    if genre is None:
        discarded_samples[4] += 1
        continue

    converted_song = utils.transform_representation(song, conf)

    if len(converted_song) <= 1:
        # A result with at most one entry is a rejection: its first element is
        # used as the reason code (a key of discarded_samples).
        # NOTE(review): a zero-length result would raise IndexError here.
        discarded_samples[converted_song[0]] += 1
        continue

    # Only successfully converted songs enter the length histogram.
    lengths[len(converted_song)] += 1

    if len(converted_song) > CHOSEN_LEN:
        discarded_samples[3] += 1
        continue

    # Padding rows have 7 in the first feature and 0 in the other ten.
    # NOTE(review): presumably 7 is the padding/end event type of the
    # representation — confirm against utils.transform_representation.
    # A song of exactly CHOSEN_LEN rows simply gets zero padding rows, so it
    # goes through the same path (the original special case called
    # list.append with a dtype keyword and dict.append, both of which raise).
    n_padding = CHOSEN_LEN - len(converted_song)
    padding = np.zeros((n_padding, 11), dtype=np.uint8)
    padding[:, 0] = 7

    # Pure NumPy: no round-trip through TensorFlow tensors is needed here.
    sample = np.concatenate((
        np.asarray(converted_song, dtype=np.uint8),
        padding,
    ))

    samples.append(sample)
    genres.append(genre)

    # Labels come from the padded sample (the raw `song` object was sliced
    # here before, so labels would not line up with the stored samples).
    for i, key in enumerate(labels.keys()):
        labels[key].append(sample[:, i])

# NOTE(review): this rebinds `dataset` from the source songs to the final
# tf.data.Dataset; re-run the loading cell before re-running this one.
dataset = tf.data.Dataset.from_tensor_slices(((samples, genres), labels))
dataset.save(conf.lmd_matched_final)

In [None]:
# Summary of the dataset construction: share of songs kept and, among the
# rejected ones, the share of each rejection reason.
tot_discarded = sum(discarded_samples.values())

# Every processed song was either appended to `samples` or counted in
# `discarded_samples`, so the two together give the number of processed songs.
# (`dataset` cannot be used here: it has been rebound to the final dataset,
# which has exactly len(samples) elements.)
tot_processed = len(samples) + tot_discarded
kept_percentage = len(samples)/tot_processed*100 if tot_processed else 0.0
print("Kept {}% of the songs".format(kept_percentage))

# Avoid a ZeroDivisionError when nothing was discarded (all shares are then 0).
# The builtin max() is not used because an earlier cell rebinds `max`.
discard_denominator = tot_discarded if tot_discarded else 1
print("Of the discarded: ")
print("- {:.2f}% were empty".format(discarded_samples[0]/discard_denominator*100))
print("- {:.2f}% contained not accepted time signatures".format(discarded_samples[1]/discard_denominator*100))
print("- {:.2f}% had too many measures".format(discarded_samples[2]/discard_denominator*100))
print("- {:.2f}% had too many events/notes".format(discarded_samples[3]/discard_denominator*100))
print("- {:.2f}% had no accepted genre".format(discarded_samples[4]/discard_denominator*100))