# Creating the dataset for DL

In [1]:
import joblib
import pretty_midi
import visual_midi
import pandas as pd
import numpy as np
from os import listdir
from os.path import getsize
from MIDIComposingAI.utils import piano_roll_to_pretty_midi
from MIDIComposingAI.create_csv_dataset import create_simple_dataset
from scipy.sparse import csr_matrix
from tensorflow import convert_to_tensor

2021-12-01 20:29:03.663181: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-01 20:29:03.663274: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
def separate_pitch_velocity(target):
    """
    Separate pitch and velocity within the target

    Every sample is read as a 2-D array whose columns are frames (the frames
    are the rows of ``sample.T``). For each frame the pitch is the row index
    of the highest value and the velocity is that value.

    Args:
        target: an iterable of 2-D arrays, one per sample
            (values are assumed to be non-negative -- TODO confirm)
    Returns:
        A tuple (sample_pitches, sample_velocities) of two lists holding, for
        each sample, the list of per-frame pitch indices and the list of
        per-frame velocities. A frame made only of zeros gives pitch 0 and
        velocity 0.
    """
    # Lists of each velocities and pitches for each sample
    sample_velocities = []
    sample_pitches = []

    for sample in target:
        roll = np.asarray(sample)

        # Searching for the frame *sum* inside the frame only works when a single
        # value is non-zero: with two simultaneous notes the sum is not an element
        # of the frame and the lookup raised ValueError. Taking the maximum gives
        # the same answer for one-note and silent frames and keeps the loudest
        # note otherwise (first one on ties).
        sample_pitches.append(roll.argmax(axis=0).tolist())
        sample_velocities.append(roll.max(axis=0).tolist())

    return (sample_pitches, sample_velocities)

def create_dataframe_file(file, name, dataset_type='matrix', store=True):
    """
    Build a dataframe dataset from a file and save it within a directory

    Args:
        file: a pretty_midi file, handed as is to create_simple_dataset
        name: name of the file written under
            ../raw_data/pandas_dataframes/simple_dataset/ when store is True
        dataset_type: 'matrix' to wrap every entry in a scipy csr_matrix,
            'array' to keep the accompaniment untouched and store the two
            targets as numpy arrays
        store: when True the dataframe is dumped with joblib and nothing is
            returned, when False the dataframe is returned instead
    Returns:
        The dataframe when store is False, None otherwise
    Raises:
        ValueError: if dataset_type is neither 'matrix' nor 'array'
    """
    # Fail fast: an unknown type used to leave the three columns empty and the
    # empty dataframe was stored anyway
    if dataset_type not in ('matrix', 'array'):
        raise ValueError(
            f"dataset_type must be 'matrix' or 'array', got {dataset_type!r}"
        )

    # First we create a dataset
    X, y = create_simple_dataset(file)

    # We create an empty dataframe
    df = pd.DataFrame(columns=['accompaniment', 'melody_pitches', 'melody_velocities'])

    # We separate pitches and velocities from the melody
    pitches, velocities = separate_pitch_velocity(y)

    if dataset_type == 'matrix':
        # We add the feature, accompaniment
        df['accompaniment'] = [csr_matrix(accompaniment) for accompaniment in X]

        # Then we add the two target to the dataframe
        df['melody_pitches'] = [csr_matrix(pitch) for pitch in pitches]
        df['melody_velocities'] = [csr_matrix(velocity) for velocity in velocities]
    else:
        # We add the feature, accompaniment
        df['accompaniment'] = [accompaniment for accompaniment in X]

        # Then we add the two target to the dataframe
        df['melody_pitches'] = [np.array(pitch) for pitch in pitches]
        df['melody_velocities'] = [np.array(velocity) for velocity in velocities]

    # Then we store the data
    if store:
        joblib.dump(df, f'../raw_data/pandas_dataframes/simple_dataset/{name}')
        # No explicit `del` of the locals: they are released when the function
        # returns, so calling this in a loop does not pile datasets up in RAM
        return None

    return df

def create_nparray_dataset(file, directory, name, store=True):
    """
    Build a dataset made of three numpy arrays from a file

    Args:
        file: input forwarded to create_simple_dataset
        directory: sub-directory of ../raw_data/pandas_dataframes/ used when storing
        name: name of the stored file
        store: dump the dataset with joblib when True, return it otherwise
    Returns:
        The tuple (X_accompaniment, y_pitch, y_velocity) when store is False,
        None otherwise
    """
    features, target = create_simple_dataset(file)
    sample_pitches, sample_velocities = separate_pitch_velocity(target)

    # Each accompaniment is transposed before being stacked into one array
    transposed = [accompaniment.T for accompaniment in features]

    # The two targets, one array per sample
    pitch_arrays = [np.array(pitch) for pitch in sample_pitches]
    velocity_arrays = [np.array(velocity) for velocity in sample_velocities]

    dataset = (np.array(transposed), np.array(pitch_arrays), np.array(velocity_arrays))

    if not store:
        return dataset

    joblib.dump(dataset, f'../raw_data/pandas_dataframes/{directory}/{name}')

In [3]:
# # Let's take some examples
# examples_files = []
# path = '../raw_data/pretty_midi'
# directory = listdir(path)

# for file in directory:
#     if getsize(f'{path}/{file}') < 300000: # We don't want too big files
#         examples_files.append(joblib.load(f'{path}/{file}'))
#     if len(examples_files) >= 10:
#         break

In [4]:
# for i, file in enumerate(examples_files):
#     create_nparray_dataset(file, f'nparray{i}')

In [5]:
# Collect at most 50 stored files whose size on disk is in the wanted range
bigger_examples_files = []
path = '../raw_data/pretty_midi'
directory = listdir(path)

for file in directory:
    file_size = getsize(f'{path}/{file}')
    # We don't want too big or too little files: strictly between 100,000 and 200,000 bytes
    if 100_000 < file_size < 200_000:
        bigger_examples_files.append(joblib.load(f'{path}/{file}'))
    if len(bigger_examples_files) >= 50:
        break

In [6]:
# Build and store one nparray dataset per collected file.
# We iterate over the list itself rather than range(50): fewer than 50 files may
# have passed the size filter, and indexing past the end raised IndexError.
for i, midi_file in enumerate(bigger_examples_files):
    create_nparray_dataset(midi_file, 'bigger_dataset', f'nparray{i}')

## Let's add another feature
A binary flag: is a note being played? (0 = no, 1 = yes)

In [7]:
# Reload one of the datasets stored by the loop above (index 4) to inspect it;
# it is the tuple (X_accompaniment, y_pitch, y_velocity) dumped by create_nparray_dataset
test_file = joblib.load('../raw_data/pandas_dataframes/bigger_dataset/nparray4')

In [8]:
# test_file[1] is the pitch target (y_pitch): shape of its first sample
test_file[1][0].shape

(500,)

In [9]:
def is_there_a_note(array):
    """
    Return an array of 0 and 1, 0 when no note is played, 1 when a note is played

    A value counts as a note when its integer part is different from zero, so
    fractional values strictly between -1 and 1 are reported as 0.

    Args:
        array : a numpy array (any number of dimensions, values assumed finite)
    Returns:
        A new float array of 0. and 1. with the same shape as the input
    """
    # Vectorised form of "int(value) != 0" applied to every element: the cast
    # truncates toward zero like int() does, and the comparison keeps the
    # input's shape so no flatten / reshape round trip is needed
    return (array.astype(int) != 0).astype(float)