In [1]:
import os
import time
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import tqdm
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Masking, Bidirectional, LSTM, TimeDistributed, Dense, Activation

2023-01-10 19:43:06.170661: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
BATCH_SIZE = 128
VALIDATION_RATIO = 0.1

PIECE_LEN = 1024
n_feature = 43
n_hidden = 200
n_pitch = 53
learning_rate = 0.001

## Build dataset

In [3]:
def load_npy_data(x_path, y_path, offset):
    x = np.load(x_path)
    y = np.load(y_path)
    if x.shape[0] >= offset+PIECE_LEN:
        return x[offset:offset+PIECE_LEN].astype(np.float32), y[offset:offset+PIECE_LEN].astype(np.float32)
    else:
        pad_count = offset + PIECE_LEN - x.shape[0]
        x = np.pad(x[offset:], ((0, pad_count), (0, 0)), 'constant', constant_values=-1).astype(np.float32)
        y = np.pad(y[offset:], ((0, pad_count), (0, 0)), 'constant', constant_values=-1).astype(np.float32)
        return x, y

def generate_dataset(input_dir: str):
    # using tf.data.Dataset API to create dataset
    x_paths = [] # input path
    y_paths = [] # ans file path
    offsets = [] # starting point of a piece
    for file_name in sorted(os.listdir(input_dir)):
        if file_name.endswith(".ans.npy"):
            y_path = str(os.path.join(input_dir, file_name))
            x_path = str(os.path.join(input_dir, file_name.removesuffix(".ans.npy") + ".npy"))
            assert os.path.exists(x_path), f"corresponding input file {x_path} doesn't exist"

            # split and pad data into PIECE_LEN
            y_content = np.load(y_path)
            for offset in range(0, y_content.shape[0], PIECE_LEN):
                y_paths.append(y_path)
                x_paths.append(x_path)
                offsets.append(offset)

    
    train_dataset = tf.data.Dataset.from_tensor_slices((x_paths, y_paths, offsets)).shuffle(100000)
    train_dataset = train_dataset.map(lambda x_path, y_path, offset: tf.numpy_function(load_npy_data, [x_path, y_path, offset], [tf.float32, tf.float32]))
    train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(-1).cache()
    
    return train_dataset


train_dataset = generate_dataset("preprocessed_dataset/irealpro_dataset_v2")

2023-01-10 19:43:28.307646: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## build model

In [4]:
input = tf.keras.Input(shape=(PIECE_LEN, n_feature))  
x = Masking(mask_value=-1, input_shape=(PIECE_LEN, n_feature))(input) # Ignore Padded Data
state = Bidirectional(LSTM(units=n_hidden, input_shape=(1, n_feature), return_sequences=True))(x)

pitch_prob = TimeDistributed(Dense(n_pitch, activation="softmax"))(state)
is_onset = TimeDistributed(Dense(1, activation="sigmoid"))(state)
start_correction = TimeDistributed(Dense(1))(state)
end_correction = TimeDistributed(Dense(1))(state)
velocity = TimeDistributed(Dense(1))(state)

output = tf.keras.layers.Concatenate(axis=2)([pitch_prob, is_onset, start_correction, end_correction, velocity])
model = tf.keras.Model(inputs=input, outputs=output)

print(model.summary())

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1024, 43)]   0           []                               
                                                                                                  
 masking (Masking)              (None, 1024, 43)     0           ['input_1[0][0]']                
                                                                                                  
 bidirectional (Bidirectional)  (None, 1024, 400)    390400      ['masking[0][0]']                
                                                                                                  
 time_distributed (TimeDistribu  (None, 1024, 53)    21253       ['bidirectional[0][0]']          
 ted)                                                                                         

## loss/accuracy function

In [5]:
def masked_loss_function(y_true, y_pred):
    mask = tf.math.not_equal(tf.reduce_sum(y_true[:,:,:n_pitch], axis=2), -1*n_pitch)  # false if it is a padding time step
    pitch_loss = tf.losses.categorical_crossentropy(y_true[:,:,:n_pitch], y_pred[:,:,:n_pitch])
    onset_loss = tf.losses.binary_crossentropy(y_true[:,:,n_pitch:n_pitch+1], y_pred[:,:,n_pitch:n_pitch+1])
    start_loss = tf.square(y_true[:,:,n_pitch+1] - y_pred[:,:,n_pitch+1])
    end_loss = tf.square(y_true[:,:,n_pitch+2] - y_pred[:,:,n_pitch+2])
    velocity_loss = tf.square(y_true[:,:,n_pitch+3] - y_pred[:,:,n_pitch+3])
    total_loss = tf.reduce_sum([pitch_loss, onset_loss, start_loss, end_loss, velocity_loss], axis=0)
    total_loss *= tf.cast(mask, total_loss.dtype)
    return tf.reduce_mean(total_loss)

def masked_accuracy(y_true, y_pred):
    mask = tf.math.not_equal(tf.reduce_sum(y_true[:,:,:n_pitch], axis=2), -1*n_pitch)  # false if it is a padding time step
    mask = tf.cast(mask, tf.float32)
    pitch_acc = tf.reduce_mean(mask * tf.metrics.categorical_accuracy(y_true[:,:,:n_pitch], y_pred[:,:,:n_pitch]))
    onset_acc = tf.reduce_mean(mask * tf.metrics.binary_accuracy(y_true[:,:,n_pitch:n_pitch+1], y_pred[:,:,n_pitch:n_pitch+1]))
    start_loss = tf.reduce_mean(mask * tf.square(y_true[:,:,n_pitch+1] - y_pred[:,:,n_pitch+1]))
    end_loss = tf.reduce_mean(mask * tf.square(y_true[:,:,n_pitch+2] - y_pred[:,:,n_pitch+2]))
    velocity_loss = tf.reduce_mean(mask * tf.square(y_true[:,:,n_pitch+3] - y_pred[:,:,n_pitch+3]))

    return pitch_acc, onset_acc, start_loss, end_loss, velocity_loss

## training

In [6]:
@tf.function
def train_step(x, y):
    loss = 0
    with tf.GradientTape() as tape:
        pred = model(x)
        
        loss = masked_loss_function(y, pred)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    acc = masked_accuracy(y, pred)

    return loss, acc

In [7]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()

    total_loss = 0
    total_pitch_acc = 0
    total_onset_acc = 0
    total_start_loss = 0
    total_end_loss = 0
    total_velocity_loss = 0
    steps_per_epoch = 0

    for x, y in tqdm.tqdm(train_dataset):
        batch_loss, batch_acc = train_step(x, y)
        total_loss += batch_loss
        pitch_acc, onset_acc, start_loss, end_loss, velocity_loss = batch_acc
        total_pitch_acc += pitch_acc
        total_onset_acc += onset_acc
        total_start_loss += start_loss
        total_end_loss += end_loss
        total_velocity_loss += velocity_loss
        steps_per_epoch += 1
    
    print(f'Epoch {epoch+1} Loss {total_loss / steps_per_epoch:.4f} PitchAcc {total_pitch_acc / steps_per_epoch:.4f}')
    print(f"OnsetAcc {total_onset_acc / steps_per_epoch} StartLoss {total_start_loss / steps_per_epoch} EndLoss {total_end_loss / steps_per_epoch} VelLoss {total_velocity_loss / steps_per_epoch}")
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

  4%|▍         | 2/52 [00:35<13:48, 16.58s/it]2023-01-10 19:44:46.477991: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
  4%|▍         | 2/52 [00:51<21:17, 25.55s/it]


KeyboardInterrupt: 

## predict

In [38]:
def slice_per_step(a):
    # add -1 to the end of each sample to make them the same length per step(piece_length)
    original_length = a.shape[0]
    pad_count = PIECE_LEN - (original_length % PIECE_LEN)
    print(a.shape, end=' ')
    if pad_count!=PIECE_LEN : 
        print('pad by',pad_count, end=' ')
        a = np.pad(a, ((0, pad_count), (0, 0)), 'constant', constant_values=-1)
    # reshape into per step
    a = np.reshape(a, (-1, PIECE_LEN, a.shape[1]))
    print('to',a.shape)
    return a, original_length

In [39]:
import midi_np_translation.output2midi_v2 as output2midi
test_file = np.load("preprocessed_dataset/irealpro_dataset_v2/Autumn Leaves_o0.mid.npy")
padded_input, original_length = slice_per_step(test_file)
test_result = np.reshape(model.predict(padded_input), (-1, 57))[:original_length]
print(test_result.shape)
# test_result = np.argmax(test_result, axis=2)
output2midi.output_to_midi(test_result, ref_midi_path="input_midi/irealpro_transposed/Autumn Leaves_o0.mid", output_path="yo_al.mid")
# output2midi_lagacy.output_to_midi(np.reshape(test_result, (-1, 57)), ref_midi_path="input_midi/irealpro_transposed/Autumn Leaves_o0.mid", output_path="yo_al.mid")

(1560, 43) pad by 488 to (2, 1024, 43)
(1560, 57)
