In [41]:
! pip3 install tensorflow-gpu
import tensorflow as tf
print(tf.__version__)

! pip3 install tensorflow_addons
import tensorflow_addons as tfa

import os

import numpy as np

! pip3 install tqdm
from tqdm import tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2.11.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# **Parameter**

In [42]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
# Original Data Info.
n_pitch = 53
n_feature = 43

# Model Parameter
input_size = n_feature
target_size = n_pitch + 4 # (+4 addition) are oneset + start correction + end correction + velocity

# Training Parameter
BATCH_SIZE = 32          # Batch Sizes
VALIDATION_RATIO = 0.1  # Validation Ratio to Input Data
learning_rate = 0.001   # Learning Rate
n_hidden = 100            # Hidden Units number

# Data Parameter
PIECE_LEN = 200          # (auto set by data) Extra +1 for START timestamp
is_onset_index = 53
start_correction_index = 54
end_correction_index = 55
velocity_index = 56
key_order = ['pitch', 'onset', 'start', 'end', 'velocity']

ROOT = r'/content/drive/MyDrive/ML'

In [44]:
os.chdir(ROOT)
print(os.getcwd())

/content/drive/MyDrive/ML


In [45]:
from zipfile import ZipFile
# Delete incomplete unzipped folder and run this 
if 'irealpro_dataset_v2' not in os.listdir():
    print('Creating Folder \'irealpro_dataset_v2\'')
    os.mkdir('../irealpro_dataset_v2')
    print('Extract to Folder \'irealpro_dataset_v2\'')
    with ZipFile(r"/content/drive/MyDrive/ML/irealpro_dataset_v2.zip", 'r') as zObject:
        zObject.extractall(path=r"/content/drive/MyDrive/ML/irealpro_dataset_v2")

In [46]:
np.random.seed(10000)

# **Data Preparation**

`MAX_DATA`: control how many songs files as the input (assign with -1 to use all songs)

In [47]:
MAX_DATA = 10

In [48]:
dec_init_input = np.zeros((1, target_size))
dec_init_input[0, start_correction_index] = 1.

count_pad = 0 
count_spilled = 0
def load_npy_data(x_path, y_path, offset):
    global count_pad, count_spilled, dec_init_input
    x = np.load(x_path)
    y = np.load(y_path)

    if offset == 0:
        Z_ = dec_init_input
    else:
        Z_ = y[offset-1].astype(np.float32)
        Z_ = np.expand_dims(Z_, axis=0)
        count_spilled += 1

    if x.shape[0] >= offset+PIECE_LEN:
        X_ = x[offset:offset+PIECE_LEN].astype(np.float32)
        Y_ = y[offset:offset+PIECE_LEN].astype(np.float32)
    else:
        pad_count = offset + PIECE_LEN - x.shape[0]
        X_ = np.pad(x[offset:], ((0, pad_count), (0, 0)), 'constant', constant_values=-1).astype(np.float32)
        Y_ = np.pad(y[offset:], ((0, pad_count), (0, 0)), 'constant', constant_values=-1).astype(np.float32)
        count_pad += 1
    try:
        assert X_.shape == (PIECE_LEN, input_size)
        assert Z_.shape == (1, target_size)
        assert Y_.shape == (PIECE_LEN, target_size)
    except:
        print('You got',X_.shape, Z_.shape, Y_.shape)
        raise ValueError

    return X_, Z_, Y_

def generate_dataset(input_dir: str, data_size=-1):
    # using tf.data.Dataset API to create dataset
    x_paths = [] # input path
    y_paths = [] # ans file path
    offsets = [] # starting point of a piece
    data_cnt = 0
    for file_name in os.listdir(input_dir): # 'Scan Files'
        if data_cnt==data_size-1: break
        if file_name.endswith(".ans.npy"):
            data_cnt+=1
            y_path = str(os.path.join(input_dir, file_name))
            x_path = str(os.path.join(input_dir, file_name[:-8] + ".npy"))
            assert os.path.exists(x_path), f"corresponding input file {x_path} doesn't exist"

            # split and pad data into PIECE_LEN
            y_content = np.load(y_path)
            for offset in range(0, y_content.shape[0], PIECE_LEN):
                y_paths.append(y_path)
                x_paths.append(x_path)
                offsets.append(offset)

    dataset = []
    for i in zip(x_paths, y_paths, offsets): # 'Read Files'
        dataset.append(load_npy_data(*i))
    print(len(dataset))
    dataset = list(zip(*dataset))
    # print(len(dataset[0]))
    # print(len(dataset[1]))
    # print(len(dataset[2]))
    encoder_input_data = np.array(dataset[0])
    decoder_input_data = np.array(dataset[1])
    decoder_target_data = np.array(dataset[2])
    
    perm_id = np.random.shuffle(np.arange(encoder_input_data.shape[0]))
    encoder_input_data = encoder_input_data[perm_id]
    decoder_input_data = decoder_input_data[perm_id]
    decoder_target_data = decoder_target_data[perm_id]
    
    print(encoder_input_data.shape)
    print(decoder_input_data.shape)
    print(decoder_target_data.shape)

    pitch_target, onset_target, start_target, end_target, velocity_target = np.split(decoder_target_data, [n_pitch+i for i in range(4)], axis=-1)
    decoder_target_data = (pitch_target, onset_target, start_target, end_target, velocity_target)

    return encoder_input_data, decoder_input_data, decoder_target_data

encoder_input_data, decoder_input_data, decoder_target_data = generate_dataset(ROOT+"/irealpro_dataset_v2", MAX_DATA)

print('Data Padded:', count_pad)
print('Data with Non Start Decoder Input:', count_spilled)
print('Data PIECE_LEN:', PIECE_LEN)

86
(1, 86, 200, 43)
(1, 86, 1, 57)
(1, 86, 200, 57)
Data Padded: 9
Data with Non Start Decoder Input: 77
Data PIECE_LEN: 200


In [49]:
if encoder_input_data.shape[0] == 1:
    print("Remove Unwanted Dimension and Split Labels")
    print(encoder_input_data.shape)
    print(decoder_input_data.shape)
    for target in decoder_target_data:
        print(target.shape)
    encoder_input_data = tf.reshape(encoder_input_data, encoder_input_data.shape[1:])
    decoder_input_data = tf.reshape(decoder_input_data, decoder_input_data.shape[1:])
    decoder_target_data = [tf.reshape(target, target.shape[1:]) for target in decoder_target_data]
    decoder_target_data = {key: decoder_target_data[i] for i, key in enumerate(key_order)} # Split target data to each key_order (labels)
    print(encoder_input_data.shape)
    print(decoder_input_data.shape)
    for key in decoder_target_data:
        print(decoder_target_data[key].shape)

inputs = tf.data.Dataset.zip(tuple([tf.data.Dataset.from_tensor_slices((encoder_input_data)), tf.data.Dataset.from_tensor_slices((decoder_input_data))]))
outputs = tf.data.Dataset.from_tensor_slices((decoder_target_data))

train_dataset = tf.data.Dataset.zip((inputs, outputs))
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)

Remove Unwanted Dimension and Split Labels
(1, 86, 200, 43)
(1, 86, 1, 57)
(1, 86, 200, 53)
(1, 86, 200, 1)
(1, 86, 200, 1)
(1, 86, 200, 1)
(1, 86, 200, 1)
(86, 200, 43)
(86, 1, 57)
(86, 200, 53)
(86, 200, 1)
(86, 200, 1)
(86, 200, 1)
(86, 200, 1)


In [50]:
inputs.element_spec

(TensorSpec(shape=(200, 43), dtype=tf.float32, name=None),
 TensorSpec(shape=(1, 57), dtype=tf.float64, name=None))

In [51]:
outputs.element_spec

{'pitch': TensorSpec(shape=(200, 53), dtype=tf.float32, name=None),
 'onset': TensorSpec(shape=(200, 1), dtype=tf.float32, name=None),
 'start': TensorSpec(shape=(200, 1), dtype=tf.float32, name=None),
 'end': TensorSpec(shape=(200, 1), dtype=tf.float32, name=None),
 'velocity': TensorSpec(shape=(200, 1), dtype=tf.float32, name=None)}

# **Model**

## *Define Loss, Accuracy, and Optimizer*

In [91]:
# Define Loss, Accuracy, and Optimizer
def masked_loss_function(y_true: tf.Tensor, y_pred: tf.Tensor):
    mask = tf.math.not_equal(tf.reduce_sum(y_true[:,:,:n_pitch], axis=2), -1*n_pitch)  # false if it is a padding time step
    pitch_loss = tf.losses.categorical_crossentropy(y_true[:,:,:n_pitch], y_pred[:,:,:n_pitch])
    onset_loss = tf.losses.binary_crossentropy(y_true[:,:,n_pitch:n_pitch+1], y_pred[:,:,n_pitch:n_pitch+1])
    start_loss = tf.square(y_true[:,:,n_pitch+1] - y_pred[:,:,n_pitch+1])
    end_loss = tf.square(y_true[:,:,n_pitch+2] - y_pred[:,:,n_pitch+2])
    velocity_loss = tf.square(y_true[:,:,n_pitch+3] - y_pred[:,:,n_pitch+3])
    total_loss = tf.reduce_sum([pitch_loss, onset_loss, start_loss, end_loss, velocity_loss], axis=0)
    total_loss *= tf.cast(mask, total_loss.dtype)
    return tf.reduce_mean(total_loss)

def mse_with_positive_pressure(y_true: tf.Tensor, y_pred: tf.Tensor):
    mse = (y_true - y_pred) ** 2
    positive_pressure = 10 * tf.maximum(-y_pred, 0.0)
    return tf.reduce_mean(mse + positive_pressure)

def pitch_loss(y_true: tf.Tensor, y_pred: tf.Tensor):
    mask = tf.math.not_equal(tf.reduce_sum(y_true, axis=2), -1*n_pitch)  # false if it is a padding time step
    pitch_l = tf.losses.categorical_crossentropy(y_true, y_pred)
    return pitch_l * tf.cast(mask, pitch_l.dtype) 
def onset_loss(y_true: tf.Tensor, y_pred: tf.Tensor):
    mask = tf.math.not_equal(tf.reduce_sum(y_true, axis=2), -1*n_pitch)  # false if it is a padding time step
    onset_l = tf.losses.binary_crossentropy(y_true, y_pred)
    return onset_l * tf.cast(mask, onset_l.dtype) 
def start_loss(y_true: tf.Tensor, y_pred: tf.Tensor):
    mask = tf.math.not_equal(tf.reduce_sum(y_true, axis=2), -1*n_pitch)  # false if it is a padding time step
    start_l = tf.square(y_true - y_pred)
    return start_l * tf.cast(mask, start_l.dtype) 
def end_loss(y_true: tf.Tensor, y_pred: tf.Tensor):
    mask = tf.math.not_equal(tf.reduce_sum(y_true, axis=2), -1*n_pitch)  # false if it is a padding time step
    end_l = tf.square(y_true - y_pred)
    return end_l * tf.cast(mask, end_l.dtype) 
def velocity_loss(y_true: tf.Tensor, y_pred: tf.Tensor):
    mask = tf.math.not_equal(tf.reduce_sum(y_true, axis=2), -1*n_pitch)  # false if it is a padding time step
    velocity_l = tf.square(y_true - y_pred)
    return velocity_l * tf.cast(mask, velocity_l.dtype) 

def pitch_accuracy(y_true: tf.Tensor, y_pred: tf.Tensor):
    mask = tf.math.not_equal(tf.reduce_sum(y_true, axis=2), -1*n_pitch)  # false if it is a padding time step
    mask = tf.cast(mask, tf.float32)
    pitch_acc = tf.reduce_mean(mask * tf.metrics.categorical_accuracy(y_true, y_pred))
    return pitch_acc
def onset_accuracy(y_true: tf.Tensor, y_pred: tf.Tensor):
    mask = tf.math.not_equal(tf.reduce_sum(y_true, axis=2), -1*n_pitch)  # false if it is a padding time step
    mask = tf.cast(mask, tf.float32)
    onset_acc = tf.reduce_mean(mask * tf.metrics.binary_accuracy(y_true, y_pred))
    return onset_acc
def start_accuracy(y_true: tf.Tensor, y_pred: tf.Tensor):
    mask = tf.math.not_equal(tf.reduce_sum(y_true, axis=2), -1*n_pitch)  # false if it is a padding time step
    mask = tf.cast(mask, tf.float32)
    start_acc = tf.reduce_mean(mask * tf.square(y_true - y_pred))
    return start_acc
def end_accuracy(y_true: tf.Tensor, y_pred: tf.Tensor):
    mask = tf.math.not_equal(tf.reduce_sum(y_true, axis=2), -1*n_pitch)  # false if it is a padding time step
    mask = tf.cast(mask, tf.float32)
    end_acc = tf.reduce_mean(mask * tf.square(y_true - y_pred))
    return end_acc
def velocity_accuracy(y_true: tf.Tensor, y_pred: tf.Tensor):
    mask = tf.math.not_equal(tf.reduce_sum(y_true, axis=2), -1*n_pitch)  # false if it is a padding time step
    mask = tf.cast(mask, tf.float32)
    velocity_acc = tf.reduce_mean(mask * tf.square(y_true - y_pred))
    return velocity_acc

# TODO: Fix Losses
bundled_loss = {
    'pitch': tf.keras.losses.CategoricalCrossentropy(), 
    'onset': mse_with_positive_pressure, 
    'start': mse_with_positive_pressure, 
    'end': mse_with_positive_pressure, 
    'velocity': mse_with_positive_pressure,
}
# TODO: Fix Accuracy
bundled_metrics = {
    'pitch': pitch_accuracy, 
    'onset': onset_accuracy, 
    'start': start_accuracy, 
    'end': end_accuracy, 
    'velocity': velocity_accuracy,
}
optimizer = tf.keras.optimizers.Adam()

In [53]:
checkpoint_filepath = './checkpoint'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    )

## Reinjection Technique

In [54]:
input_shape = (None, PIECE_LEN, input_size)
sta_shape = (None, 1, input_size)

output_shape = (None, PIECE_LEN, target_size)
print('n_hidden:', n_hidden)
print('input_shape:', input_shape)
print('output_shape:', output_shape)


n_hidden: 100
input_shape: (None, 200, 43)
output_shape: (None, 200, 57)


In [71]:
from keras import Model
from keras import backend as K
from keras.layers import Input, Concatenate, Dense, LSTM, Embedding, GRU, Attention, Lambda, Permute, Flatten, BatchNormalization, RepeatVector

# Encoder
encoder_inputs = Input(shape=(input_shape[1],input_shape[2]), name='Input_Encoder')
encoder = LSTM(n_hidden, activation=tfa.activations.mish, dropout=0.2, recurrent_dropout=0.2, return_sequences=True, return_state=True, name='Encoder')
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
state_h = BatchNormalization(momentum=0.6)(state_h)
state_c = BatchNormalization(momentum=0.6)(state_c)
states = [state_h, state_c]

attention = Attention(n_hidden, name='Attention')
attention_bn =BatchNormalization(momentum=0.6, name='BN_Attention')
decoder_inputs = Input(shape=(1, target_size), name='Input_Decoder')
decoder_lstm = LSTM(n_hidden, activation=tfa.activations.mish, dropout=0.2, recurrent_dropout=0.2, return_sequences=True, return_state=True, name='Decoder')
decoder_dense = Dense(target_size, activation=tfa.activations.mish, name='Dense_Timestamp_Output')

all_outputs = []
all_attention = []
dec_input = decoder_inputs
for _ in tqdm(range(PIECE_LEN), 'Building Re-inject Decoder Model'):
    # Attention Update
    context_vector, attention_weights = attention([tf.expand_dims([state_h, state_c],2), encoder_outputs], return_attention_scores=True)
    all_attention.append(attention_weights)
    context_vector = attention_bn(context_vector) # (2, None, 1, PIECE_LEN)
    context_vector_h, context_vector_c = context_vector # (2, None, 1, n_hidden)
    context_vector_h = Flatten()(context_vector_h)
    context_vector_c = Flatten()(context_vector_c)

    state_h = Concatenate(axis=-1, name='Concat_StateH_ContextH')([state_h,context_vector_h])
    state_c = Concatenate(axis=-1, name='Concat_StateC_ContextC')([state_c,context_vector_c])
    # Decoder Reinject
    output_dec, state_h, state_c = decoder_lstm(dec_input, initial_state=states)
    # Output
    output_dec = decoder_dense(output_dec)
    all_outputs.append(output_dec)
    # Update Decoder Input & State
    dec_input = output_dec
    states = [state_h, state_c]

# Concatenate all predictions
decoder_outputs = Lambda(lambda x: (K.concatenate(x, axis=1)), name='Concat_Timestamp')(all_outputs)
pitch_outputs, onset_outputs, start_outputs, end_outputs, velocity_outputs = tf.split(decoder_outputs, [n_pitch, 1, 1, 1, 1], axis=2)
decoder_outputs = {
    'pitch': pitch_outputs, 
    'onset': onset_outputs, 
    'start': start_outputs, 
    'end': end_outputs, 
    'velocity': velocity_outputs
}


# Define and compile model as previously
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)\

assert encoder_inputs.shape == input_shape

Building Re-inject Decoder Model: 100%|██████████| 200/200 [00:34<00:00,  5.77it/s]


In [96]:
model.compile(
    optimizer='rmsprop', 
    loss=bundled_loss,          # TODO: Fix Losses
    # metrics=bundled_metrics,  # TODO: Fix Accuracy
)

In [58]:
outputs.element_spec

{'pitch': TensorSpec(shape=(200, 53), dtype=tf.float32, name=None),
 'onset': TensorSpec(shape=(200, 1), dtype=tf.float32, name=None),
 'start': TensorSpec(shape=(200, 1), dtype=tf.float32, name=None),
 'end': TensorSpec(shape=(200, 1), dtype=tf.float32, name=None),
 'velocity': TensorSpec(shape=(200, 1), dtype=tf.float32, name=None)}

In [79]:
decoder_outputs

{'pitch': <KerasTensor: shape=(None, 200, 53) dtype=float32 (created by layer 'tf.split')>,
 'onset': <KerasTensor: shape=(None, 200, 1) dtype=float32 (created by layer 'tf.split')>,
 'start': <KerasTensor: shape=(None, 200, 1) dtype=float32 (created by layer 'tf.split')>,
 'end': <KerasTensor: shape=(None, 200, 1) dtype=float32 (created by layer 'tf.split')>,
 'velocity': <KerasTensor: shape=(None, 200, 1) dtype=float32 (created by layer 'tf.split')>}

In [97]:
model.evaluate(train_dataset, return_dict=True)



{'loss': 34119870316544.0,
 'tf.split_loss': 27191897227264.0,
 'tf.split_1_loss': 342791815168.0,
 'tf.split_2_loss': -13.189253807067871,
 'tf.split_3_loss': 931137650688.0,
 'tf.split_4_loss': 5654042378240.0}

In [None]:
# masked_loss_function = tf.keras.losses.BinaryFocalCrossentropy(from_logits=True)
# masked_loss_function = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
# masked_loss_function = tf.keras.losses.MeanAbsoluteError()

In [24]:
masked_loss_function

<function __main__.masked_loss_function(y_true, y_pred)>

In [None]:
model.compile(
    optimizer='rmsprop', 
    loss=bundled_loss, 
    loss_weights=1e-30,
    metrics=bundled_metrics,
)

In [None]:
model.summary()
tf.keras.utils.plot_model(
    model,
    to_file='seq2seq_lstm_attention.png',
    show_shapes=True,
    show_layer_names=True,
    rankdir='LR',
    expand_nested=False,
    dpi=96,
    show_layer_activations=True
)

In [None]:
model.fit(train_dataset,
          batch_size=BATCH_SIZE,
          epochs=10,
        #   validation_split=VALIDATION_RATIO,
          verbose=1,
        )

### Predict Function

In [None]:
def predict(enc_inputs):
    # Init Decoder Input
    dec_state = np.zeros((1, target_size))
    dec_state[0, start_correction_index] = 1.
    all_outputs = []
    for enc_input in enc_inputs:
        # Format Model Input: (Encoder, Decoder)
        format_enc_input = np.expand_dims(enc_input, 0)
        format_dec_state = np.expand_dims(dec_state, 0)
        # print(format_enc_input.shape)
        # print(format_dec_state.shape)

        # Model Output
        outputs = model.predict([format_enc_input, format_dec_state])
        outputs = np.squeeze(outputs)
        # print('output:', outputs.shape)
        all_outputs.append(np.squeeze(outputs))
        dec_state = outputs[-1]
        dec_state = np.expand_dims(dec_state, 0)
        # print(dec_state.shape)
    return np.concatenate(all_outputs, axis=0)

# **Midi Format Output**

In [None]:
def generate_dataset(input_dir: str, data_size=-1, batch_per_files=True):
    # using tf.data.Dataset API to create dataset
    x_paths = [] # input path
    y_paths = [] # ans file path
    offsets = [] # starting point of a piece
    titles = []
    data_cnt = 0
    for file_name in os.listdir(input_dir): # 'Scan Files'
        if data_cnt==data_size-1: break
        if file_name.endswith(".ans.npy"):
            data_cnt+=1
            y_path = str(os.path.join(input_dir, file_name))
            x_path = str(os.path.join(input_dir, file_name[:-8] + ".npy"))
            assert os.path.exists(x_path), f"corresponding input file {x_path} doesn't exist"

            # split and pad data into PIECE_LEN
            y_content = np.load(y_path)
            for offset in range(0, y_content.shape[0], PIECE_LEN):
                y_paths.append(y_path)
                x_paths.append(x_path)
                offsets.append(offset)
                titles.append(file_name[:-8])

    paths = list(zip(x_paths, y_paths, offsets))
    i = 0
    while(i < len(paths)): # Batch per Files Handler
        dataset = []
        temp_dataset = []
        flag = False
        while(i < len(paths)):
            if not batch_per_files:
                dataset.append(load_npy_data(*paths[i]))
            else:
                offset = paths[i][2]
                if offset == 0:
                    temp_dataset = load_npy_data(*paths[i])
                    flag = True
                    i += 1
                    break
                elif flag:
                    dataset = []    # Reset Batch
                    dataset.append(temp_dataset)
                    dataset.append(load_npy_data(*paths[i]))
                else:
                    dataset.append(load_npy_data(*paths[i]))
            i += 1
        
        # Skip kickstart dataset+
        if len(dataset) == 0: continue

        # Package
        dataset = list(zip(*dataset))
        encoder_input_data = np.array(dataset[0])
        decoder_input_data = np.array(dataset[1])
        decoder_target_data = np.array(dataset[2])
        print(encoder_input_data.shape)
        print(decoder_input_data.shape)
        print(decoder_target_data.shape)

        # Shuffle
        perm_id = np.random.shuffle(np.arange(encoder_input_data.shape[0]))
        encoder_input_data = encoder_input_data[perm_id]
        decoder_input_data = decoder_input_data[perm_id]
        decoder_target_data = decoder_target_data[perm_id]

        yield encoder_input_data, decoder_input_data, decoder_target_data, titles[i-1]
    yield encoder_input_data, decoder_input_data, decoder_target_data, titles[-1]

In [None]:
# import midi_np_translation.output2midi as output2midi

for test_encoder_input_data, _, _, title in generate_dataset(ROOT+"/irealpro_dataset_v2", 5, batch_per_files=True):
    test_encoder_input_data = np.reshape(test_encoder_input_data, test_encoder_input_data.shape[1:])
    # print(test_encoder_input_data.shape)
    print(f'Generating: {title}')
    pred_result = predict(test_encoder_input_data)
    # output2midi.output_to_midi(bass_ndarr=test_result.reshape(-1,52), ref_midi_path=f"input_midi/irealpro_transposed/{title}.mid", output_path=f"{title}_al.mid")