In [1]:
import numpy as np
import tensorflow as tf
from music21 import *
import os
import time
import tensorflow.keras.backend as K
from tensorflow.keras.utils import to_categorical
from tqdm.notebook import tqdm

In [2]:
path = os.getcwd()
filenames = os.listdir("./maestro-v2.0.0/2018")[:10]

In [3]:
filenames

['MIDI-Unprocessed_Chamber1_MID--AUDIO_07_R3_2018_wav--2.midi',
 'MIDI-Unprocessed_Chamber2_MID--AUDIO_09_R3_2018_wav--1.midi',
 'MIDI-Unprocessed_Chamber2_MID--AUDIO_09_R3_2018_wav--3.midi',
 'MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi',
 'MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--2.midi',
 'MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--3.midi',
 'MIDI-Unprocessed_Chamber4_MID--AUDIO_11_R3_2018_wav--1.midi',
 'MIDI-Unprocessed_Chamber4_MID--AUDIO_11_R3_2018_wav--3.midi',
 'MIDI-Unprocessed_Chamber5_MID--AUDIO_18_R3_2018_wav--1.midi',
 'MIDI-Unprocessed_Chamber5_MID--AUDIO_18_R3_2018_wav--2.midi']

In [4]:
def _parseMidi(list_filenames):
    print("Loading samples.....")
    samples = [converter.parse(path + "/maestro-v2.0.0/2018/" + str(file)) for file in tqdm(list_filenames)]
    sam_mono = [sample.chordify() for sample in tqdm(samples) if len(instrument.partitionByInstrument(sample).parts) == 1]
    print("Samples loaded and chordified.....")
    list_chords = [[] for _ in sam_mono]
    list_durations = [[] for _ in sam_mono]
    list_keys = [[]]
    print("Preparing data....")
    for i, song in enumerate(sam_mono):
        list_keys.append(song.analyze("key"))
        for element in song:
            if isinstance(element, note.Note):
                list_chords[i].append(element.pitch)
                list_durations[i].append(element.duration.quarterLength)
            elif isinstance(element, chord.Chord):
                list_chords[i].append(".".join(str(n) for n in element.pitches))
                list_durations[i].append(element.duration.quarterLength)
    return list_chords, list_durations, list_keys

In [5]:
list_chords, list_durations, list_keys = _parseMidi(filenames)

Loading samples.....


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Samples loaded and chordified.....
Preparing data....


In [6]:
#Find the number of unique Chords
unique_chords = np.unique([i for s in list_chords for i in s])
print(len(unique_chords))
chord_to_int = dict(zip(unique_chords, range(0, len(unique_chords))))
#Find the number of unique durations
unique_durations = np.unique([i for s in list_durations for i in s])
print(len(unique_durations))
duration_to_int = dict(zip(unique_durations, range(0, len(unique_durations))))

20285
79


In [7]:
#Create the reverse dict
int_to_chord = {i:j for j, i in chord_to_int.items()}
int_to_durations = {i:j for j, i in duration_to_int.items()}

In [8]:
# Define the sequence length
sequence_length = 32

# Define the empty lists
train_chords = []
train_durations = []
target_chords = []
target_durations = []

#Genrate the train and the target data
for i in range(len(list_chords)):
    chordList = [chord_to_int[chord] for chord in list_chords[i]]
    durationList = [duration_to_int[chord] for chord in list_durations[i]]
    for j in range(len(chordList) - sequence_length - 1):
        train_chords.append(chordList[j:j + sequence_length])
        train_durations.append(durationList[j:j + sequence_length])
        target_chords.append(chordList[j + sequence_length + 1])
        target_durations.append(durationList[j + sequence_length + 1])

In [9]:
train_chords = np.asarray(train_chords)
train_durations = np.asarray(train_durations)
target_chords = np.asarray(target_chords)
target_durations = np.asarray(target_durations)

In [10]:
print(train_chords.shape)
print(train_durations.shape)
print(target_chords.shape)
print(target_durations.shape)

(83874, 32)
(83874, 32)
(83874,)
(83874,)


In [11]:
embed_dim = 64

In [12]:
target_chords = to_categorical(target_chords)
target_durations = to_categorical(target_durations)

In [13]:
target_chords.shape

(83874, 20285)

In [14]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, *kwargs):
        super(BahdanauAttention, self).__init__(*kwargs)
        
    def build(self, input_shape):
        # eg. input shape = (64, 512, 1024)
        self.W = self.add_weight(name = 'att_weight', shape = (input_shape[-1], 1), initializer="normal")
        self.b = self.add_weight(name = 'att_bias', shape = (input_shape[1], 1), initializer="normal")
        super(BahdanauAttention, self).build(input_shape)
    
    def call(self, x):
        et = K.tanh(K.dot(x, self.W) + self.b)
        et = K.softmax(K.squeeze(et, axis = -1))
        at = K.expand_dims(et, axis = -1)
        output = x * at
        return K.sum(output, axis = 1)
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])
    
    def get_config(self):
        return super(BahdanauAttention,self).get_config()

In [15]:
# Input layers
chord_input = tf.keras.layers.Input((sequence_length,))
duration_input = tf.keras.layers.Input((sequence_length,))

# Embedding layers
embedding_chord = tf.keras.layers.Embedding(input_dim=len(chord_to_int), output_dim=embed_dim,
                                            input_length=sequence_length)(chord_input)
embedding_durations = tf.keras.layers.Embedding(input_dim=len(duration_to_int), output_dim=embed_dim, 
                                               input_length=sequence_length)(duration_input)

# Concat these 2 layers
concat = tf.keras.layers.Concatenate(axis = 1)([embedding_chord, embedding_durations])

# Define the single LSTM layer with 512 units
lstm_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512, return_sequences=True))(concat)

lstm_layer2 = tf.keras.layers.LSTM(512, return_sequences=True)(lstm_layer)

attention = BahdanauAttention()(lstm_layer2)

flatten = tf.keras.layers.Flatten()(attention)

# Define the intermediate dense layer
dense = tf.keras.layers.Dense(256)(flatten)

# Define the final output layers
dense1 = tf.keras.layers.Dense(len(chord_to_int), activation = "softmax")(dense)
dense2 = tf.keras.layers.Dense(len(duration_to_int), activation = "softmax")(dense)

# Define the model 
model = tf.keras.models.Model([chord_input, duration_input], [dense1, dense2])

In [16]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 32)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 32)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 32, 64)       1298240     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 32, 64)       5056        input_2[0][0]                    
______________________________________________________________________________________________

In [17]:
model.compile(loss = "categorical_crossentropy", optimizer = "rmsprop")
model.fit([train_chords, train_durations], [target_chords, target_durations], batch_size=64, epochs = 30)

Train on 83874 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x272a10c6da0>

In [23]:
model.save("att_modelv1.0.hdf5")

In [20]:
def generate_samples(n_samples):
    def predict_next(chord_seq, dur_seq):
        out_chord, out_dur = model.predict([chord_seq, dur_seq])
        return out_chord, out_dur
    
    def make_predictions(num_steps, init_chord, init_dur, model):
        for i in range(num_steps):
            chord, dur = predict_next(np.asarray(init_chord[-32:]).reshape(1,-1), np.asarray(init_dur[-32:]).reshape(1,-1))
            chord_out = np.argmax(chord)
            dur_out = np.argmax(dur)
            init_chord.append(chord_out)
            init_dur.append(dur_out)
        return init_chord[32:], init_dur[32:]
    
    for n in tqdm(range(n_samples)):
        seed = np.random.randint(low = 0, high = train_chords.shape[0])
        CHORD, DURATION = make_predictions(100, train_chords[seed].tolist(), train_durations[seed].tolist(), model)
        CHORD = [int_to_chord[c] for c in CHORD]
        DURATION = [int_to_durations[d] for d in DURATION]
        generated_stream = stream.Stream()
        generated_stream.append(instrument.Piano())
        for i in range(len(CHORD)):
            try:
                generated_stream.append(note.Note(CHORD[i].replace(".", " "), quaterType = DURATION[i]))
            except:
                generated_stream.append(chord.Chord(CHORD[i].replace(".", " "), quaterType = DURATION[i]))
        generated_stream.write('midi', fp=path+'/generated/song{0}.mid'.format(n))

In [21]:
generate_samples(10)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


