# Creating the dataset for deep learning (DL)

In [1]:
from os import listdir, makedirs
from os.path import getsize

import joblib
import numpy as np
import pandas as pd
import pretty_midi
import visual_midi
from scipy.sparse import csr_matrix
from tensorflow import convert_to_tensor

from MIDIComposingAI.create_csv_dataset import create_simple_dataset
from MIDIComposingAI.utils import piano_roll_to_pretty_midi

2021-12-01 10:45:12.640597: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-01 10:45:12.640700: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
def separate_pitch_velocity(target):
    """
    Separate pitch and velocity within the target.

    Each frame (column) of each sample is reduced to a single pitch (the row
    index of its largest value) and a single velocity (that largest value).

    For a frame holding exactly one non-zero note this is that note's row and
    value; for an all-zero frame it is pitch 0 with velocity 0. A frame that
    holds several non-zero notes is resolved to its loudest note instead of
    raising ``ValueError``.

    Args:
        target: iterable of 2-D array-likes, one per sample, where rows are
            indexed as pitches and columns are iterated as frames.
    Returns:
        A tuple ``(sample_pitches, sample_velocities)``: two lists holding one
        inner list per sample, each inner list holding one plain Python number
        per frame.
    """
    sample_pitches = []
    sample_velocities = []

    for sample in target:
        roll = np.asarray(sample)
        # Reduce along the pitch axis: argmax yields the row of the strongest
        # note in every frame, max yields the value stored at that row.
        sample_pitches.append(roll.argmax(axis=0).tolist())
        sample_velocities.append(roll.max(axis=0).tolist())

    return (sample_pitches, sample_velocities)

def create_data_file(file, name, mode='separate',
                     output_dir='../raw_data/pandas_dataframes/simple_dataset'):
    """
    Build a dataset from a MIDI file and save it as a dataframe within a directory.

    The dataframe has one row per sample and three columns, each cell holding
    a scipy CSR sparse matrix: 'accompaniment', 'melody_pitches' and
    'melody_velocities'.

    Args:
        file: a pretty_midi file, passed as-is to create_simple_dataset
        name: file name of the dump inside `output_dir`
        mode: currently unused; kept so existing calls remain valid
        output_dir: directory receiving the dump; created when missing.
            Defaults to the location previously hard-coded here.
    """
    # First we create a dataset (features X, targets y)
    X, y = create_simple_dataset(file)

    # We separate pitches and velocities from the melody
    pitches, velocities = separate_pitch_velocity(y)

    # We fill an initially empty dataframe column by column; everything is
    # stored sparse to keep the dump small
    df = pd.DataFrame(columns=['accompaniment', 'melody_pitches', 'melody_velocities'])
    df['accompaniment'] = [csr_matrix(accompaniment) for accompaniment in X]
    df['melody_pitches'] = [csr_matrix(pitch) for pitch in pitches]
    df['melody_velocities'] = [csr_matrix(velocity) for velocity in velocities]

    # Make sure the target directory exists, then persist the dataframe
    makedirs(output_dir, exist_ok=True)
    joblib.dump(df, f'{output_dir}/{name}')

    # No explicit `del` is needed: the local references are dropped when the
    # function returns, so calling this in a loop does not accumulate data.

In [3]:
# Let's take some examples: the first MAX_EXAMPLES files (by name) that are small enough
MAX_FILE_SIZE = 300000  # bytes; we don't want too big files
MAX_EXAMPLES = 10

examples_files = []
path = '../raw_data/pretty_midi'
# listdir returns entries in arbitrary order; sorting keeps the selection
# identical from one run (and one machine) to the next
directory = sorted(listdir(path))

for file in directory:
    if getsize(f'{path}/{file}') < MAX_FILE_SIZE:
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code -- only load files from a trusted source
        examples_files.append(joblib.load(f'{path}/{file}'))
    if len(examples_files) >= MAX_EXAMPLES:
        break

In [4]:
# Generation step: dumps one dataframe file ('dataframe_<i>') per loaded example.
# The loading cell below reads dataframe_0 ... dataframe_9 from disk, so this
# must have been run at least once.
# NOTE(review): presumably left commented out because the files already exist
# and regenerating them is expensive -- confirm, and uncomment on a fresh checkout.
# for i, file in enumerate(examples_files):
#     create_data_file(file, f'dataframe_{i}')

In [10]:
# Load the ten saved dataframes and stack them in a single pass.
# Concatenating once (instead of growing a frame inside the loop) avoids
# re-copying all previous rows at every step; ignore_index=True gives the
# result a fresh 0..n-1 index.
data = pd.concat(
    [
        joblib.load(f'../raw_data/pandas_dataframes/simple_dataset/dataframe_{i}')
        for i in range(10)
    ],
    ignore_index=True,
)

In [11]:
# Sanity check: shapes of the first accompaniment and of the first melody-pitch entry
data.loc[0, 'accompaniment'].shape, data.loc[0, 'melody_pitches'].shape

((128, 10000), (1, 10000))

In [13]:
# Replace every sparse matrix by a dense float32 tensor, one whole column at a time.
# Assigning the full column avoids chained indexing (data[column][i] = ...),
# which pandas may apply to a temporary copy and silently discard.
def _to_tensor(cell):
    """Densify a sparse cell into a float32 tensor; leave anything else untouched."""
    # Entries without .todense() were already converted: skipping them makes
    # this cell safe to run more than once.
    if hasattr(cell, 'todense'):
        return convert_to_tensor(cell.todense(), dtype='float32')
    return cell

for column in data.columns:
    data[column] = data[column].map(_to_tensor)

In [15]:
# Check which type the first accompaniment entry has after the conversion
type(data.loc[0, 'accompaniment'])

tensorflow.python.framework.ops.EagerTensor