In [None]:
import os
import random

import h5py
import numpy as np

import filesystem

def file_len(file):
    """Count the lines of an open file and rewind it to the start.

    Args:
        file: An open, seekable file object whose iteration yields lines.

    Returns:
        int: Number of lines read from the current position to the end of
        the file; 0 when there are none.
    """
    # sum() over the iterator avoids depending on a loop variable, which
    # would be unbound (UnboundLocalError) when the file has no lines.
    count = sum(1 for _ in file)
    # Rewind so the caller can iterate over the same handle again.
    file.seek(0, 0)
    return count

In [None]:
def copy_dataset(old_storage_path, old_label, new_storage_path, new_label):
    """Copy one HDF5 object from a source file into a destination file.

    Args:
        old_storage_path: Path of the HDF5 file to read from (opened 'r').
        old_label: Name of the object inside the source file.
        new_storage_path: Path of the HDF5 file to write to (opened 'a').
        new_label: Name given to the copy inside the destination file.
    """
    with h5py.File(old_storage_path, 'r') as f1:
        with h5py.File(new_storage_path, 'a') as f2:
            # Assigning an h5py object with f2[name] = obj requests a hard
            # link, which HDF5 does not allow between two different files.
            # Group.copy duplicates the object (data and attributes) instead.
            f1.copy(old_label, f2, name=new_label)

# NOTE(review): copy_dataset declares four required parameters (source path,
# source label, destination path, destination label); this bare call raises
# TypeError as written. Supply the arguments before running the cell.
copy_dataset()

In [None]:
def save_raw(data_folder_path, shuffle=False, label="", data_length=24000):
    """Save every audio file of a folder as one row of an HDF5 dataset.

    The dataset is created as "raw/" + label in the file returned by
    filesystem.get_dataset_path('r'). Each file contributes one row holding
    its first `data_length` values; a shorter file only fills the start of
    its row and the remainder is left unwritten. The file names are written,
    one per line and in row order, to "<dataset path>.txt" (opened with 'w',
    so an existing index file is overwritten).

    Args:
        data_folder_path: Folder whose entries are loaded with p.read.
        shuffle: If True, shuffle the file order before saving.
        label: Suffix appended to "raw/" to form the dataset name.
        data_length: Number of values stored per file (the row width).
            Defaults to 24000, the value that used to be hard-coded.
    """
    new_path = filesystem.get_dataset_path('r')
    with h5py.File(new_path, 'a') as f:
        file_list = os.listdir(data_folder_path)
        if shuffle:
            random.shuffle(file_list)
        length = len(file_list)
        dset = f.create_dataset("raw/" + label, dtype="float32",
                                shape=(length, data_length),
                                compression="lzf")
        with open(new_path + '.txt', 'w') as indexer:
            for index, filename in enumerate(file_list):
                file_path = os.path.join(data_folder_path, filename)
                data = p.read(file_path)
                # Long recordings are truncated to the row width; short ones
                # occupy only a prefix of the row.
                used = min(len(data), data_length)
                dset[index, :used] = data[:used]
                if (index + 1) % 100 == 0:
                    print(str(index + 1) + "/" + str(length))
                indexer.write(filename + '\n')
        print(str(length) + "/" + str(length))

# NOTE(review): save_raw requires data_folder_path; this bare call raises
# TypeError as written. Pass the folder holding the audio files.
save_raw()

In [None]:
def save_mix():
    """Build the mixed raw dataset described by "mix.txt".

    Reads cfg.datasets_root + "mix.txt" in groups of four lines. The first
    and fourth line of each group are skipped; the second and third each
    hold "<hdf5 path relative to cfg.datasets_root>,<row index>". For every
    such entry the first cfg.framerate values of that row of the source
    file's 'raw' dataset are copied into the next row of a new "raw/"
    dataset in the file returned by filesystem.get_dataset_path('r').
    """
    new_path = filesystem.get_dataset_path('r')
    # Source path -> open h5py.File, so every source file is opened once.
    sources = {}
    with open(cfg.datasets_root + "mix.txt", 'r') as f:
        # Two output rows per four-line group; the +1 still counts a final
        # group whose fourth line is missing.
        total = ((file_len(f) + 1) // 4) * 2
        with h5py.File(new_path, 'a') as dset_file:
            dset_mix = dset_file.create_dataset("raw/", dtype="float32",
                                                shape=(total, cfg.framerate),
                                                compression="lzf")
            written = 0
            try:
                for i, line in enumerate(f):
                    if i % 4 in (0, 3):
                        continue
                    # Split on the last comma only, so a comma inside the
                    # path does not break the unpacking.
                    path, index = line.rsplit(',', 1)
                    path = cfg.datasets_root + path
                    index = int(index)

                    if path not in sources:
                        sources[path] = h5py.File(path, 'r')
                    dset_mix[written] = sources[path]['raw'][index,
                                                             :cfg.framerate]
                    written += 1

                    # Report the number of rows actually written so far.
                    if written % 100 == 0:
                        print(str(written) + "/" + str(total))
                print(str(total) + "/" + str(total))
            finally:
                # Close the source files even if a line failed to parse.
                for source in sources.values():
                    source.close()

# Build the mixed dataset; save_mix takes no arguments and reads its inputs
# from cfg.datasets_root + "mix.txt".
save_mix()

In [None]:
def save_dynamic_time_raw(storage_path, folder_path, time_list, framerate):
    """Save the audio files of each duration subfolder as its own dataset.

    For every t in time_list the subfolder folder_path/<t> is handed to
    save_raw with label str(t): the subfolder name is a duration in seconds
    and doubles as the label of the resulting dataset.

    Args:
        storage_path: Kept for interface compatibility; not used here,
            because save_raw picks its output file itself through
            filesystem.get_dataset_path.
        folder_path: Folder containing one subfolder per entry of time_list.
        time_list: Durations whose string form names the subfolders.
        framerate: Kept for interface compatibility; see the note below.
    """
    for t in time_list:
        # The earlier call passed storage_path, the subfolder and
        # framerate * t positionally together with label=..., which collides
        # with save_raw's (data_folder_path, shuffle, label) parameters and
        # raises TypeError ("multiple values for argument 'label'").
        # NOTE(review): framerate * t (frames per file for this duration) is
        # not forwarded; confirm whether save_raw accepts a frame-count
        # argument and pass it by keyword if so.
        save_raw(os.path.join(folder_path, str(t)), label=str(t))

# NOTE(review): save_dynamic_time_raw declares four required parameters
# (storage_path, folder_path, time_list, framerate); this bare call raises
# TypeError as written.
save_dynamic_time_raw()

In [None]:
def join_dynamic_time_raw(old_storage_path, new_storage_path, time_list, framerate):
    """Merge the per-duration raw datasets into a single dataset.

    Each dataset "raw/<t>" of the source file is reshaped into rows of
    `framerate` values and written, in time_list order, into a new "raw"
    dataset of the destination file. The value t is printed after its
    dataset has been copied.
    """
    with h5py.File(old_storage_path, 'r') as source, \
            h5py.File(new_storage_path, 'a') as target:
        # Every source row of duration t is counted as t output rows.
        total_rows = sum(len(source["raw/" + str(t)]) * t for t in time_list)
        joined = target.create_dataset(
            "raw",
            dtype="float32",
            shape=(total_rows, framerate),
            maxshape=(None, framerate),
            compression="lzf",
        )
        offset = 0
        for t in time_list:
            chunk = source["raw/" + str(t)]
            stop = offset + len(chunk) * t
            joined[offset:stop] = np.reshape(chunk, (-1, framerate))
            offset = stop
            print(t)

# NOTE(review): join_dynamic_time_raw declares four required parameters
# (old_storage_path, new_storage_path, time_list, framerate); this bare call
# raises TypeError as written.
join_dynamic_time_raw()

In [None]:
def save_harmonics(label=""):
    """Preprocess every row of "raw/" + label into "harmonics/" + label.

    Reads from the file returned by filesystem.get_dataset_path('r') and
    writes to the one returned by filesystem.get_dataset_path('h'). The
    first cfg.framerate values of each row are passed to
    sp.complete_preprocess, whose result is stored as one entry of shape
    (cfg.preprocess_shape[0], cfg.preprocess_shape[1]).

    Args:
        label: Suffix appended to "raw/" and "harmonics/".
    """
    # Resolve the paths through `filesystem`, like the other save_* steps in
    # this file, so the 'h' file written here is the one save_embeddings
    # opens.
    old_path = filesystem.get_dataset_path('r')
    new_path = filesystem.get_dataset_path('h')
    with h5py.File(old_path, 'r') as f1:
        with h5py.File(new_path, 'a') as f2:
            dset1 = f1["raw/" + label]
            length = len(dset1)
            dset2 = f2.create_dataset("harmonics/" + label, dtype="float32",
                                      shape=(
                                          length, cfg.preprocess_shape[0], cfg.preprocess_shape[1]),
                                      compression="lzf")
            for i in range(length):
                dset2[i, :, :] = sp.complete_preprocess(
                    dset1[i, :cfg.framerate])
                # Progress report every 100 rows.
                if (i+1) % 100 == 0:
                    print(str(i+1)+"/"+str(length))
            print(str(length)+"/"+str(length))

# Run with the default empty label, i.e. on the "raw/" dataset.
save_harmonics()

In [None]:
def save_embeddings(label="", batch_size=500):
    """Encode the "harmonics/" + label dataset into "embeddings/" + label.

    Reads from the file returned by filesystem.get_dataset_path('h') and
    writes to the one returned by filesystem.get_dataset_path('e'). Rows are
    passed to sp.encode in batches of at most `batch_size`, and the number
    of rows done is printed after each batch.

    Args:
        label: Suffix appended to "harmonics/" and "embeddings/".
        batch_size: Rows handed to sp.encode per call. Defaults to 500, the
            value that used to be hard-coded.
    """
    old_path = filesystem.get_dataset_path('h')
    new_path = filesystem.get_dataset_path('e')
    with h5py.File(old_path, 'r') as f1:
        with h5py.File(new_path, 'a') as f2:
            dset1 = f1["harmonics/" + label]
            length = len(dset1)
            dset2 = f2.create_dataset("embeddings/" + label, dtype="float32",
                                      shape=(
                                          length, cfg.embedding_shape[0], cfg.embedding_shape[1]),
                                      compression="lzf")
            # range() visits every row, including a final batch of exactly
            # one row; the earlier `index != length - 1` tail check skipped
            # that case and left the last row unencoded.
            for start in range(0, length, batch_size):
                stop = min(start + batch_size, length)
                dset2[start:stop, :, :] = sp.encode(dset1[start:stop, :, :])
                print(str(stop)+"/"+str(length))

# Run with the default empty label, i.e. on the "harmonics/" dataset.
save_embeddings()

In [None]:
def save_patches(label="", batch_size=100):
    """Extract patches from "embeddings/" + label into "patches/" + label.

    Reads from the file returned by filesystem.get_dataset_path('e') and
    writes to the one returned by filesystem.get_dataset_path('p'). The
    output holds len(embeddings) * cfg.embedding_shape[0] rows of width
    cfg.embedding_shape[1] * cfg.embedding_overlap. Embeddings are passed to
    sp.extract_patches in batches of at most `batch_size`, and the number of
    output rows done is printed after each batch.

    Args:
        label: Suffix appended to "embeddings/" and "patches/".
        batch_size: Embeddings handed to sp.extract_patches per call.
            Defaults to 100, the value that used to be hard-coded.
    """
    old_path = filesystem.get_dataset_path('e')
    new_path = filesystem.get_dataset_path('p')
    with h5py.File(old_path, 'r') as f1:
        with h5py.File(new_path, 'a') as f2:
            dset1 = f1["embeddings/" + label]
            # Output rows written per embedding; the slicing below relies on
            # sp.extract_patches returning exactly this many rows for each
            # input embedding — NOTE(review): confirm against sp.
            rows_per_item = cfg.embedding_shape[0]
            item_count = len(dset1)
            length = item_count * rows_per_item
            dset2 = f2.create_dataset("patches/" + label, dtype="float32",
                                      shape=(
                                          length, cfg.embedding_shape[1]*cfg.embedding_overlap),
                                      compression="lzf")
            # range() visits every embedding, including a final batch that
            # produces exactly one output row; the earlier
            # `new_index != length - 1` tail check skipped that case.
            for old_start in range(0, item_count, batch_size):
                old_stop = min(old_start + batch_size, item_count)
                new_start = old_start * rows_per_item
                new_stop = old_stop * rows_per_item
                dset2[new_start:new_stop] = sp.extract_patches(
                    dset1[old_start:old_stop])
                print(str(new_stop)+"/"+str(length))

# Run with the default empty label, i.e. on the "embeddings/" dataset.
save_patches()