# Import

In [2]:
import os, sys
import fnmatch
import cv2
import numpy as np
import string
import time
import json
import math

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, LSTM, Reshape, BatchNormalization, Input, Conv2D, MaxPool2D, Lambda, Bidirectional
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.activations import relu, sigmoid, softmax
import tensorflow.keras.backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import layers

# Data

##### Generator

In [2]:
def batch_gen(instrument='marimba', max_label_len=51):
    """Yield one training batch per sub-directory of the instrument's ``x`` folder.

    Every sub-directory of ``D:/AMT/all/batches/<instrument>/x`` is treated as
    one batch. Each file in it is read as a grayscale image, and the label
    array with the same base name (``.npy``) is loaded from the matching
    sub-directory under ``D:/AMT/all/batches/<instrument>/y``.

    Args:
        instrument: Folder name under ``D:/AMT/all/batches``. Defaults to
            ``'marimba'``, the value that used to be hard-coded.
        max_label_len: ``maxlen`` passed to ``pad_sequences`` for the label
            sequences (post-padded). Defaults to 51, the previous constant.

    Yields:
        Tuple ``(images, padded_labels, input_lengths, label_lengths)`` of
        NumPy arrays with one entry per file in the batch directory.

    Raises:
        OSError: If ``cv2.imread`` cannot read an image file.
    """
    path_x = f'D:/AMT/all/batches/{instrument}/x'
    path_y = f'D:/AMT/all/batches/{instrument}/y'
    for dirname in os.listdir(path_x):
        training_img = []
        training_txt = []
        train_input_length = []
        train_label_length = []
        for filename in os.listdir(f'{path_x}/{dirname}'):
            # Images
            img_path = f'{path_x}/{dirname}/{filename}'
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                # cv2.imread signals failure by returning None instead of raising.
                raise OSError(f'Could not read image: {img_path}')
            # (H, W) --> (H, W, 1)
            img = np.expand_dims(img, axis=2)
            # Normalize image
            img = img / 255.
            training_img.append(img)

            # Text Targets
            # Strip the extension from the file name only, so a dot in a
            # directory name can no longer truncate the label path.
            stem = filename.split('.')[0]
            text = np.load(f'{path_y}/{dirname}/{stem}.npy')
            training_txt.append(text)
            train_label_length.append(len(text))
            # Width minus one. NOTE(review): presumably the number of time steps
            # left after the unpadded (2, 2) convolution of the CRNN - confirm.
            train_input_length.append(img.shape[1] - 1)
        train_padded_txt = pad_sequences(training_txt, padding='post', maxlen=max_label_len)

        # NOTE(review): np.array() over the image list assumes every image in a
        # batch directory has the same shape - confirm against the dataset.
        yield np.array(training_img), np.array(train_padded_txt), np.array(train_input_length), np.array(train_label_length)

In [10]:
def val_gen(instrument, max_label_len=51):
    """Yield one validation batch per sub-directory of the instrument's ``validate_x`` folder.

    Every sub-directory of ``D:/AMT/all/batches/<instrument>/validate_x`` is
    treated as one batch. Each file in it is read as a grayscale image, and
    the label array with the same base name (``.npy``) is loaded from the
    matching sub-directory under ``.../validate_y``.

    Args:
        instrument: Folder name under ``D:/AMT/all/batches``.
        max_label_len: ``maxlen`` passed to ``pad_sequences`` for the label
            sequences (post-padded). Defaults to 51, the previous constant.

    Yields:
        Tuple ``(images, padded_labels, input_lengths, label_lengths)`` of
        NumPy arrays with one entry per file in the batch directory.

    Raises:
        OSError: If ``cv2.imread`` cannot read an image file.
    """
    path_x = f'D:/AMT/all/batches/{instrument}/validate_x'
    path_y = f'D:/AMT/all/batches/{instrument}/validate_y'
    for dirname in os.listdir(path_x):
        validate_img = []
        validate_txt = []
        validate_input_length = []
        validate_label_length = []
        for filename in os.listdir(f'{path_x}/{dirname}'):
            # Images
            img_path = f'{path_x}/{dirname}/{filename}'
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                # cv2.imread signals failure by returning None instead of raising.
                raise OSError(f'Could not read image: {img_path}')
            # (H, W) --> (H, W, 1)
            img = np.expand_dims(img, axis=2)
            # Normalize image
            img = img / 255.
            validate_img.append(img)

            # Text Targets
            # Strip the extension from the file name only, so a dot in a
            # directory name can no longer truncate the label path.
            stem = filename.split('.')[0]
            text = np.load(f'{path_y}/{dirname}/{stem}.npy')
            validate_txt.append(text)
            validate_label_length.append(len(text))
            # Width minus one. NOTE(review): presumably the number of time steps
            # left after the unpadded (2, 2) convolution of the CRNN - confirm.
            validate_input_length.append(img.shape[1] - 1)
        validate_padded_txt = pad_sequences(validate_txt, padding='post', maxlen=max_label_len)

        # NOTE(review): np.array() over the image list assumes every image in a
        # batch directory has the same shape - confirm against the dataset.
        yield np.array(validate_img), np.array(validate_padded_txt), np.array(validate_input_length), np.array(validate_label_length)

In [3]:
# Load the symbol -> index vocabulary; the context manager closes the file
# handle that the bare open() call used to leak.
with open('C:/Users/trifo/Desktop/AMT/w2i_all.json', 'r') as w2i_file:
    w2i = json.load(w2i_file)
len(w2i)

1634

# Model

##### CRNN Model

In [11]:
# CRNN: a convolutional feature extractor, two bidirectional LSTMs and a
# per-time-step softmax. The input is a single-channel image whose first
# spatial axis is fixed at 128 and whose second spatial axis is variable.
inputs = Input(shape=(128, None, 1))

# Settings shared by every 3x3 convolution and every pooling layer below.
conv_kwargs = dict(activation='relu', padding='same')
pool_shape = (2, 1)  # halves the first spatial axis only

# convolution layer with kernel size (3,3)
conv_1 = Conv2D(64, (3, 3), **conv_kwargs)(inputs)
# pooling layer with kernel size (2,1)
pool_1 = MaxPool2D(pool_size=pool_shape)(conv_1)
conv_2 = Conv2D(128, (3, 3), **conv_kwargs)(pool_1)
pool_2 = MaxPool2D(pool_size=pool_shape)(conv_2)
conv_3 = Conv2D(256, (3, 3), **conv_kwargs)(pool_2)
conv_4 = Conv2D(256, (3, 3), **conv_kwargs)(conv_3)
pool_4 = MaxPool2D(pool_size=pool_shape)(conv_4)
conv_5 = Conv2D(512, (3, 3), **conv_kwargs)(pool_4)
pool_5 = MaxPool2D(pool_size=pool_shape)(conv_5)
conv_6 = Conv2D(512, (3, 3), **conv_kwargs)(pool_5)
pool_6 = MaxPool2D(pool_size=pool_shape)(conv_6)
conv_7 = Conv2D(512, (3, 3), **conv_kwargs)(pool_6)
# Batch normalization layer
batch_norm_5 = BatchNormalization()(conv_7)
conv_8 = Conv2D(512, (3, 3), **conv_kwargs)(batch_norm_5)
batch_norm_6 = BatchNormalization()(conv_8)
pool_7 = MaxPool2D(pool_size=pool_shape)(batch_norm_6)
# Unpadded 2x2 convolution: the six (2, 1) poolings reduce 128 to 2 and this
# layer reduces it to 1, so axis 1 can be squeezed away afterwards.
conv_9 = Conv2D(512, (2, 2), activation='relu')(pool_7)
squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv_9)

# bidirectional LSTM layers with units=128
blstm_1 = Bidirectional(LSTM(128, return_sequences=True, dropout=0.2))(squeezed)
blstm_2 = Bidirectional(LSTM(128, return_sequences=True, dropout=0.2))(blstm_1)

# One class per vocabulary entry plus one extra (presumably the CTC blank).
outputs = Dense(len(w2i) + 1, activation='softmax')(blstm_2)

# model to be used at test time
act_model = Model(inputs, outputs)

In [23]:
# Print the layer-by-layer summary of the inference model.
act_model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 128, None, 1)]    0         
                                                                 
 conv2d_9 (Conv2D)           (None, 128, None, 64)     640       
                                                                 
 max_pooling2d_6 (MaxPooling  (None, 64, None, 64)     0         
 2D)                                                             
                                                                 
 conv2d_10 (Conv2D)          (None, 64, None, 128)     73856     
                                                                 
 max_pooling2d_7 (MaxPooling  (None, 32, None, 128)    0         
 2D)                                                             
                                                                 
 conv2d_11 (Conv2D)          (None, 32, None, 256)     2951

In [12]:
# Extra inputs that the CTC loss needs at training time: the padded label
# sequence and the per-sample lengths of predictions and labels.
max_label_len = 51
labels = Input(name='the_labels', shape=[max_label_len], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')


def ctc_lambda_func(args):
    """Return the CTC batch cost for ``[y_pred, labels, input_length, label_length]``."""
    predictions, targets, prediction_lengths, target_lengths = args
    return K.ctc_batch_cost(targets, predictions, prediction_lengths, target_lengths)


loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(
    [outputs, labels, input_length, label_length]
)

# model to be used at training time: its only output is already the CTC loss,
# so the compiled loss simply passes that output through unchanged.
model = Model(inputs=[inputs, labels, input_length, label_length], outputs=loss_out)
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer='adam')

# Keep only the checkpoint with the lowest validation loss seen so far.
filepath = "best_model.hdf5"
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)
callbacks_list = [checkpoint]

In [9]:
# Print the layer-by-layer summary of the training model (CRNN plus the CTC loss inputs).
# NOTE(review): the saved output below lists a fixed input width of 512, whereas the
# model cell declares Input(shape=(128, None, 1)) - the output looks stale; re-run to confirm.
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 128, 512, 1  0           []                               
                                )]                                                                
                                                                                                  
 conv2d_9 (Conv2D)              (None, 128, 512, 64  640         ['input_2[0][0]']                
                                )                                                                 
                                                                                                  
 max_pooling2d_6 (MaxPooling2D)  (None, 64, 512, 64)  0          ['conv2d_9[0][0]']               
                                                                                            

##### Model Training

In [None]:
batch_size = 32
epochs = 10

for i in range(epochs):
    gen = batch_gen()

    # The first batch was always discarded by this loop and never trained on.
    # It is now kept as the validation batch, so the cell no longer depends on
    # `valid_*` variables left behind by some other cell (NameError on a fresh
    # kernel).
    # NOTE(review): assumes the discarded batch was meant as the hold-out set -
    # switch to val_gen(...) if a separate validation split is intended.
    held_out_batch = next(gen, None)
    if held_out_batch is None:
        raise RuntimeError('batch_gen() produced no batches; check the training data directory')
    valid_img, valid_padded_txt, valid_input_length, valid_label_length = held_out_batch

    # enumerate() ends when the generator is exhausted, so no progress line is
    # printed for a batch that does not exist.
    for index, next_batch in enumerate(gen, start=1):
        print(f'Epoch: {i+1} | Batch: {index}')
        training_img, train_padded_txt, train_input_length, train_label_length = next_batch
        # The model's output is the CTC loss itself, so the targets are dummy zeros.
        history = model.fit(
            x=[training_img, train_padded_txt, train_input_length, train_label_length],
            y=np.zeros(len(training_img)),
            batch_size=batch_size,
            epochs=1,
            validation_data=(
                [valid_img, valid_padded_txt, valid_input_length, valid_label_length],
                [np.zeros(len(valid_img))],
            ),
            verbose=1,
            callbacks=callbacks_list,
        )

    # Save at the end of the epoch so the weights of the final epoch are also
    # written (saving at the start skipped the last epoch).
    model.save_weights('my_model_weights.h5')
    

##### Model Testing

In [23]:
from Levenshtein import distance
instrument = 'piano'
val = val_gen(instrument)
act_model.load_weights(f'model/{instrument}.h5')
# Index -> symbol vocabulary; the context manager closes the file handle.
with open('i2w_all.json') as i2w_file:
    i2w = json.load(i2w_file)

# predict outputs on validation images
metric = []
# zip() with range(4) caps the evaluation at four batches and stops cleanly
# when the generator holds fewer, instead of raising StopIteration.
for i, (valid_img, valid_padded_txt, _, _) in zip(range(4), val):
    prediction = act_model.predict(valid_img[:])
    print(f"\nBatch {i+1} | Shape {valid_img.shape[2]}\n")

    # use CTC decoder (greedy; every sample uses the full prediction length)
    out = K.get_value(K.ctc_decode(prediction, input_length=np.ones(prediction.shape[0])*prediction.shape[1], beam_width=500,
                            greedy=True)[0][0])
    for idx, x in enumerate(out):
        # Decode each sequence once: 0 is the label padding value and -1 is
        # the filler the decoder emits after the end of a sequence.
        og_tokens = [i2w[str(y)] for y in valid_padded_txt[idx] if y != 0]
        pred_tokens = [i2w[str(y)] for y in x if int(y) != -1]

        og = ''.join(og_tokens)
        pred = ''.join(pred_tokens)
        if og:
            # Character-level edit distance normalised by the reference length.
            accuracy = 1 - distance(og, pred)/len(og)
        else:
            # Empty reference: avoid dividing by zero; perfect only if the
            # prediction is empty as well.
            accuracy = 1.0 if not pred else 0.0
        print(f'Accuracy {accuracy:.4f}')
        metric.append(accuracy)
        og = ' | '.join(og_tokens)
        print(f"Original: {og}\n")
        pred = ' | '.join(pred_tokens)
        print(f"Predicted: {pred}\n")
        print('----------------------------------------')
print(f"{instrument} mean accuracy: {np.mean(metric):.4f}")


Batch 1 | Shape 1497

Accuracy 0.9848
Original: note-G4_whole | note-B4_whole | note-C5_whole | note-A4_whole | note-D5_whole | note-B4_whole | note-G4_whole | note-D5_whole | note-E5_whole | note-D5_whole | note-C5_whole | note-C5_whole | note-B4_double_whole | note-A4_double_whole.

Predicted: note-G4_whole | note-Bb4_whole | note-C5_whole | note-A4_whole | note-D5_whole | note-Bb4_whole | note-G4_whole | note-D5_whole | note-E5_whole | note-D5_whole | note-C5_whole | note-C5_whole | note-B4_double_whole | note-A4_double_whole

----------------------------------------
Accuracy 0.9125
Original: rest-quadruple_whole | rest-quadruple_whole | note-D4_double_whole | note-F4_whole | note-G4_whole | note-D4_half | note-F4_whole | note-E4_quarter | note-D4_quarter | note-D4_whole | note-D4_quarter | note-E4_quarter | note-F4_quarter | note-G4_quarter | note-A4_half | note-Bb4_half.

Predicted: rest-quadruple_whole | note-D4_double_whole | note-F4_whole | note-G4_whole | note-D4_half | note-

acoustic_guitar mean accuracy: 0.9373
marimba mean accuracy: 0.9360 (works better overall; one bad example)
recorder mean accuracy: 0.8869
electric_bass mean accuracy: 0.6174
piano mean accuracy: 0.9454

In [18]:
# Run the piano model on a single image and print the decoded symbols.
act_model.load_weights('model/piano.h5')
# Index -> symbol vocabulary; the context manager closes the file handle.
with open('i2w_all.json') as i2w_file:
    i2w = json.load(i2w_file)

image_path = 'experiment/jojo.png'
valid_img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
if valid_img is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise OSError(f'Could not read image: {image_path}')
# (H, W) --> (1, H, W, 1): add the batch axis and, explicitly, the channel
# axis that the model input declares, matching what the generators feed it.
valid_img = valid_img[np.newaxis, :, :, np.newaxis]
# Normalize image
valid_img = valid_img / 255.
prediction = act_model.predict(valid_img)

# use CTC decoder (greedy; every sample uses the full prediction length)
out = K.get_value(K.ctc_decode(prediction, input_length=np.ones(prediction.shape[0])*prediction.shape[1], beam_width=500,
                        greedy=True)[0][0])
for x in out:
    print("predicted text = ", end = '')
    for p in x:
        # -1 is the filler the decoder emits after the end of a sequence.
        if int(p) != -1:
            print(i2w[str(p)], end = ', ')
    print('\n')

predicted text = note-F#4_quarter, note-E4_quarter, note-Bb3_eighth, note-E4_eighth, note-G4_eighth, note-Eb4_eighth, note-F4_sixteenth, note-D4_thirty_second, note-F4_thirty_second, note-Ab4_thirty_second, note-F4_sixteenth, note-Eb4_eighth, note-F4_quarter, note-Bb4_quarter, note-Bb3_eighth, note-F4_thirty_second, note-D4_thirty_second, note-E4_sixteenth, note-D4_thirty_second, note-Eb4_eighth, note-F4_sixteenth, note-D4_thirty_second, note-F4_thirty_second, note-Ab4_thirty_second, note-G4_eighth, note-G4_eighth, note-F4_quarter, note-E4_quarter, rest-quarter, 

