In [1]:
# Run the `nvidia-smi` shell command (IPython `!` escape) to report the GPU
# visible to this runtime before any model is built.
!nvidia-smi

Thu Apr 15 02:25:56 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive

# Mount Google Drive under /content/drive so the project files are reachable.
drive.mount('/content/drive')

# Make the project folder the working directory; the relative paths used
# later in this notebook (./data, ./output, ./generate, ./weights-*.h5)
# are resolved against it.
%cd /content/drive/MyDrive/MuseGAN

Mounted at /content/drive
/content/drive/MyDrive/MuseGAN


In [3]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.initializers import RandomNormal

In [4]:
# Alternative dataset, kept for reference:
# data_ints = np.load('./data/lift_every_voice_amplified.npy')
data_ints = np.load('./data/Jsb16thSeparated.npy')

# NaN entries are mapped to an extra integer class placed directly above the
# highest pitch index (83), so that they can be one-hot encoded like any
# other value and then dropped again below.
max_note = 83 + 1
where_are_NaNs = np.isnan(data_ints)
data_ints[where_are_NaNs] = max_note
data_ints = data_ints.astype(int)

# Pitch classes 0..83 plus the extra NaN class.
num_classes = max_note + 1

# One-hot encode along a new trailing axis, using -1/+1 instead of 0/1.
data_binary = np.eye(num_classes)[data_ints] * 2 - 1

# Remove the channel of the extra NaN class: those positions end up as -1
# for every remaining pitch.
data_binary = np.delete(data_binary, max_note, axis = -1)

# Swap the last two axes so the one-hot (pitch) axis comes before the axis
# that was last in the loaded array.
data_binary = np.swapaxes(data_binary, 3, 4)

In [5]:
# Display the shape of the encoded dataset as a sanity check.
# NOTE(review): axes are presumably (examples, bars, steps per bar, pitches,
# tracks) -- confirm against the dataset description.
data_binary.shape

(229, 2, 16, 84, 4)

In [6]:
# The encoded dataset is used for training as-is; the per-example shape is
# everything after the leading (example) axis.
train_images = data_binary
IMG_SHAPE = train_images.shape[1:]

BATCH_SIZE = 64

# Size of the noise vector
noise_dim = 32

# Shared kernel initializer for the convolutional / dense layers below.
WEIGHT_INIT = RandomNormal(stddev=0.02, mean=0.)

print(f"Number of examples: {len(train_images)}")
print(f"Shape of the images in the dataset: {train_images.shape[1:]}")

Number of examples: 229
Shape of the images in the dataset: (2, 16, 84, 4)


In [7]:
def get_discriminator_model():
  """Build the critic: a stack of 3D convolutions followed by two dense layers.

  The network takes a tensor of shape ``IMG_SHAPE``, zero-pads it by one on
  each side of the three leading feature axes, runs eight
  ``Conv3D(128) + LeakyReLU`` stages, flattens, and ends in
  ``Dense(1024) + LeakyReLU`` and a single linear output unit (an
  unbounded score, no activation).

  Returns:
    A ``keras.Model`` named ``'discriminator'``.
  """
  n_bars = train_images.shape[1]

  # One (kernel_size, strides, padding) entry per Conv3D stage, in order.
  conv_stages = [
      ((2, 1, 1),          (1, 1, 1),  'valid'),
      ((n_bars - 1, 1, 1), (1, 1, 1),  'valid'),
      ((1, 1, 12),         (1, 1, 12), 'same'),
      ((1, 1, 7),          (1, 1, 7),  'same'),
      ((1, 2, 1),          (1, 2, 1),  'same'),
      ((1, 2, 1),          (1, 2, 1),  'same'),
      ((1, 4, 1),          (1, 2, 1),  'same'),
      ((1, 3, 1),          (1, 2, 1),  'same'),
  ]

  critic_input = layers.Input(shape = IMG_SHAPE, name = 'discriminator_input')
  features = layers.ZeroPadding3D((1, 1, 1))(critic_input)

  for kernel, stride, pad_mode in conv_stages:
    features = layers.Conv3D(filters = 128,
                             kernel_size = kernel,
                             strides = stride,
                             kernel_initializer = WEIGHT_INIT,
                             padding = pad_mode)(features)
    features = layers.LeakyReLU()(features)

  features = layers.Flatten()(features)

  features = layers.Dense(1024, kernel_initializer = WEIGHT_INIT)(features)
  features = layers.LeakyReLU()(features)

  # Linear output: the critic produces a raw score, not a probability.
  critic_output = layers.Dense(1, activation = None, kernel_initializer = WEIGHT_INIT)(features)

  return keras.models.Model(critic_input, critic_output, name  = 'discriminator')

# Build the critic once at notebook scope (reused when the WGAN is assembled)
# and print its layer-by-layer summary.
d_model = get_discriminator_model()
d_model.summary()

Model: "discriminator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
discriminator_input (InputLa [(None, 2, 16, 84, 4)]    0         
_________________________________________________________________
zero_padding3d (ZeroPadding3 (None, 4, 18, 86, 4)      0         
_________________________________________________________________
conv3d (Conv3D)              (None, 3, 18, 86, 128)    1152      
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 3, 18, 86, 128)    0         
_________________________________________________________________
conv3d_1 (Conv3D)            (None, 3, 18, 86, 128)    16512     
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 3, 18, 86, 128)    0         
_________________________________________________________________
conv3d_2 (Conv3D)            (None, 3, 18, 8, 128)   

In [8]:
def temporal_network():
  """Build a model that expands one noise vector into one vector per bar.

  The ``(noise_dim,)`` input is reshaped to ``(1, 1, noise_dim)`` and grown
  along the first axis by two 'valid' transposed convolutions (kernel
  heights 2 and ``n_bars - 1``), each followed by batch normalisation and
  ReLU. The result is reshaped to ``(n_bars, noise_dim)``, where ``n_bars``
  is ``train_images.shape[1]``.

  Returns:
    A ``keras.Model`` mapping ``(noise_dim,)`` to ``(n_bars, noise_dim)``.
  """
  n_bars = train_images.shape[1]

  noise = layers.Input(shape = (noise_dim, ), name = 'temporal_input')
  hidden = layers.Reshape([1, 1, noise_dim])(noise)

  # (filters, kernel height) for the two transposed-convolution stages.
  for n_filters, kernel_height in ((1024, 2), (noise_dim, n_bars - 1)):
    hidden = layers.Conv2DTranspose(filters = n_filters,
                                    kernel_size = (kernel_height, 1),
                                    padding = 'valid',
                                    strides = (1, 1),
                                    kernel_initializer = WEIGHT_INIT)(hidden)
    hidden = layers.BatchNormalization(momentum = 0.9)(hidden)
    hidden = layers.Activation('relu')(hidden)

  per_bar = layers.Reshape([n_bars, noise_dim])(hidden)

  return keras.models.Model(noise, per_bar)


def BarGenerator():
  """Build a model that turns a ``4 * noise_dim`` vector into one bar of one track.

  A dense layer produces 1024 features that are reshaped to ``(2, 1, 512)``.
  Three stride-2 transposed convolutions double the first axis each time
  (2 -> 4 -> 8 -> 16), then two more expand the second axis by 7 and by 12
  (1 -> 7 -> 84). The final layer has a single filter with ``tanh``
  activation, and the output is reshaped to
  ``(1, train_images.shape[2], train_images.shape[3], 1)``.

  Returns:
    A ``keras.Model`` from ``(noise_dim * 4,)`` to that 4-D bar tensor.
  """
  latent = layers.Input(shape = (noise_dim * 4, ), name = 'bar_generator_input')

  hidden = layers.Dense(1024)(latent)
  hidden = layers.BatchNormalization(momentum = 0.9)(hidden)
  hidden = layers.Activation('relu')(hidden)

  hidden = layers.Reshape([2, 1, 512])(hidden)

  # (filters, kernel) per upsampling stage; the stride always equals the
  # kernel, so each stage multiplies one axis by the kernel size.
  upsampling_stages = (
      (512, (2, 1)),
      (256, (2, 1)),
      (256, (2, 1)),
      (256, (1, 7)),
  )
  for n_filters, kernel in upsampling_stages:
    hidden = layers.Conv2DTranspose(filters = n_filters,
                                    kernel_size = kernel,
                                    padding = 'same',
                                    strides = kernel,
                                    kernel_initializer = WEIGHT_INIT)(hidden)
    hidden = layers.BatchNormalization(momentum = 0.9)(hidden)
    hidden = layers.Activation('relu')(hidden)

  # Output stage: one channel, tanh, no batch normalisation.
  hidden = layers.Conv2DTranspose(filters = 1,
                                  kernel_size = (1, 12),
                                  padding = 'same',
                                  strides = (1, 12),
                                  kernel_initializer = WEIGHT_INIT)(hidden)
  hidden = layers.Activation('tanh')(hidden)

  bar = layers.Reshape([1, train_images.shape[2], train_images.shape[3], 1])(hidden)

  return keras.models.Model(latent, bar)


def get_generator_model():
  """Build the multi-track, multi-bar generator.

  A single ``(noise_dim,)`` noise vector feeds four identity ``Lambda``
  layers that play the roles of the chords, style, melody and groove
  inputs. Chords and (per track) melody are expanded over bars by
  ``temporal_network()`` models. For every bar and track, the four pieces
  are concatenated into a ``4 * noise_dim`` vector and passed to that
  track's ``BarGenerator()``. Track outputs are concatenated on the last
  axis and bars on axis 1.

  The bar-selecting ``Lambda`` layers bind the bar index as a default
  argument. Keras calls a Lambda's function again every time the model is
  run, so a plain closure over the loop variable would see its final value
  and make every bar read the last time step.
  NOTE(review): weights trained with the previous closure-based version
  were presumably fitted with all bars reading the last time step, so
  retraining after this change is advisable.

  Returns:
    A ``keras.Model`` from ``(noise_dim,)`` to a tensor shaped like one
    training example (``train_images.shape[1:]``).
  """
  n_bars = train_images.shape[1]
  n_tracks = train_images.shape[4]

  noise_input = layers.Input(shape = (noise_dim, ), name = 'noise_input')

  # All four conceptual inputs are identity views of the same noise vector.
  chords_input = layers.Lambda(lambda x: x[:, :])(noise_input)
  style_input = layers.Lambda(lambda x: x[:, :])(noise_input)
  melody_input = layers.Lambda(lambda x: x[:, :])(noise_input)
  groove_input = layers.Lambda(lambda x: x[:, :])(noise_input)

  # Chords: one temporal network shared by all tracks.
  chords_temporal_network = temporal_network()
  chords_over_time = chords_temporal_network(chords_input)

  # Melody: one temporal network per track (instrument).
  melody_over_time = [None] * n_tracks
  melody_temporal_network = [None] * n_tracks
  for track in range(n_tracks):
    melody_temporal_network[track] = temporal_network()
    melody_track = layers.Lambda(lambda x: x[:, :])(melody_input)
    melody_over_time[track] = melody_temporal_network[track](melody_track)

  # Create one bar generator per track.
  barGen = [None] * n_tracks
  for track in range(n_tracks):
    barGen[track] = BarGenerator()

  # Generate the output for every bar and every track.
  bars_output = [None] * n_bars
  for bar in range(n_bars):
    track_output = [None] * n_tracks

    # `index=bar` freezes the current bar for this layer (see docstring).
    c = layers.Lambda(lambda x, index=bar: x[:, index, :],
                      name = 'chords_input_bar_' + str(bar))(chords_over_time)
    s = style_input

    for track in range(n_tracks):
      m = layers.Lambda(lambda x, index=bar: x[:, index, :])(melody_over_time[track])
      g = layers.Lambda(lambda x: x[:, :])(groove_input)

      z_input = layers.Concatenate(axis = 1,
                                   name = 'total_input_bar_{}_track_{}'.format(bar, track))([c, s, m, g])
      track_output[track] = barGen[track](z_input)

    # Stack the tracks of this bar on the last axis.
    bars_output[bar] = layers.Concatenate(axis = -1)(track_output)

  # Stack the bars on axis 1.
  generator_output = layers.Concatenate(axis = 1, name = 'concat_bars')(bars_output)

  g_model = keras.models.Model(noise_input, generator_output)

  return g_model


# Build the generator once at notebook scope (reused for training and for
# sampling) and print its layer-by-layer summary.
g_model = get_generator_model()
g_model.summary()

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
noise_input (InputLayer)        [(None, 32)]         0                                            
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None, 32)           0           noise_input[0][0]                
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 32)           0           noise_input[0][0]                
__________________________________________________________________________________________________
lambda_4 (Lambda)               (None, 32)           0           lambda_2[0][0]                   
____________________________________________________________________________________________

In [9]:
class WGAN(keras.Model):
    """Wasserstein GAN with gradient penalty, trained through ``fit()``.

    Wraps a critic (``discriminator``) and a ``generator`` and overrides
    ``train_step`` so that each batch runs ``discriminator_extra_steps``
    critic updates followed by one generator update.

    Args:
        discriminator: Keras model scoring a batch of samples.
        generator: Keras model mapping ``(batch, latent_dim)`` noise to samples.
        latent_dim: Length of the noise vector fed to the generator.
        tracks_dim: Stored on the instance; not used by this class.
        discriminator_extra_steps: Critic updates per generator update.
        gp_weight: Multiplier applied to the gradient penalty.
    """

    def __init__(
        self,
        discriminator,
        generator,
        latent_dim,
        tracks_dim,
        discriminator_extra_steps=3,
        gp_weight=10.0,
    ):
        super().__init__()
        self.discriminator = discriminator
        self.generator = generator
        self.latent_dim = latent_dim
        self.tracks_dim = tracks_dim
        self.d_steps = discriminator_extra_steps
        self.gp_weight = gp_weight

    def compile(self, d_optimizer, g_optimizer, d_loss_fn, g_loss_fn):
        """Store the optimizers and loss callables used by ``train_step``.

        Args:
            d_optimizer: Optimizer for the discriminator variables.
            g_optimizer: Optimizer for the generator variables.
            d_loss_fn: Called as ``d_loss_fn(real_img=..., fake_img=...)``
                with the critic outputs for real and generated batches.
            g_loss_fn: Called with the critic outputs for generated samples.
        """
        super().compile()
        self.d_optimizer = d_optimizer
        self.g_optimizer = g_optimizer
        self.d_loss_fn = d_loss_fn
        self.g_loss_fn = g_loss_fn

    def gradient_penalty(self, batch_size, real_images, fake_images):
        """Calculate the gradient penalty.

        The penalty is evaluated on random interpolations between real and
        generated samples and is added to the discriminator loss.

        Args:
            batch_size: Number of samples in the batch.
            real_images: Batch of real samples.
            fake_images: Batch of generated samples, same shape as
                ``real_images``.

        Returns:
            Scalar tensor: mean over the batch of ``(||grad|| - 1) ** 2``.
        """
        # One interpolation coefficient per sample, drawn uniformly from
        # [0, 1) and broadcast over every non-batch axis, so each
        # interpolated sample lies on the segment between real and fake.
        n_feature_axes = len(real_images.shape) - 1
        alpha = tf.random.uniform([batch_size] + [1] * n_feature_axes, 0.0, 1.0)
        diff = fake_images - real_images
        interpolated = real_images + alpha * diff

        with tf.GradientTape() as gp_tape:
            gp_tape.watch(interpolated)
            # 1. Get the discriminator output for the interpolated samples.
            pred = self.discriminator(interpolated, training=True)

        # 2. Calculate the gradients w.r.t. the interpolated samples.
        grads = gp_tape.gradient(pred, [interpolated])[0]
        # 3. Calculate one gradient norm per sample by reducing over ALL
        #    non-batch axes (the inputs here have more than three of them).
        feature_axes = list(range(1, len(grads.shape)))
        norm = tf.sqrt(tf.reduce_sum(tf.square(grads), axis=feature_axes))
        gp = tf.reduce_mean((norm - 1.0) ** 2)
        return gp

    def train_step(self, real_images):
        """Run ``d_steps`` critic updates and one generator update.

        Args:
            real_images: Batch of real samples; if a tuple is passed, its
                first element is used.

        Returns:
            Dict with ``"d_loss"`` (from the last critic step, penalty
            included) and ``"g_loss"``.
        """
        if isinstance(real_images, tuple):
            real_images = real_images[0]

        # Get the batch size
        batch_size = tf.shape(real_images)[0]

        # For each batch we perform the following steps:
        # 1. Train the discriminator `d_steps` times; each time
        #    a. compute the discriminator loss on real and generated samples,
        #    b. compute the gradient penalty,
        #    c. multiply the penalty by a constant weight and add it to the
        #       discriminator loss.
        # 2. Train the generator once and get the generator loss.
        # 3. Return the generator and discriminator losses as a loss dictionary.

        # Train the discriminator first. The original paper recommends training
        # the discriminator for `x` more steps (typically 5) as compared to
        # one step of the generator. Here we will train it for 3 extra steps
        # as compared to 5 to reduce the training time.
        for _ in range(self.d_steps):
            # Get the latent vector
            random_latent_vectors = tf.random.normal(
                shape=(batch_size, self.latent_dim)
            )

            with tf.GradientTape() as tape:
                # Generate fake images from the latent vector
                fake_images = self.generator(random_latent_vectors, training=True)
                # Get the logits for the fake images
                fake_logits = self.discriminator(fake_images, training=True)
                # Get the logits for the real images
                real_logits = self.discriminator(real_images, training=True)

                # Calculate the discriminator loss using the fake and real image logits
                d_cost = self.d_loss_fn(real_img=real_logits, fake_img=fake_logits)
                # Calculate the gradient penalty
                gp = self.gradient_penalty(batch_size, real_images, fake_images)
                # Add the gradient penalty to the original discriminator loss
                d_loss = d_cost + gp * self.gp_weight

            # Get the gradients w.r.t the discriminator loss
            d_gradient = tape.gradient(d_loss, self.discriminator.trainable_variables)
            # Update the weights of the discriminator using the discriminator optimizer
            self.d_optimizer.apply_gradients(
                zip(d_gradient, self.discriminator.trainable_variables)
            )

        # Train the generator
        # Get the latent vector
        random_latent_vectors = tf.random.normal(shape=(batch_size, self.latent_dim))
        with tf.GradientTape() as tape:
            # Generate fake images using the generator
            generated_images = self.generator(random_latent_vectors, training=True)
            # Get the discriminator logits for fake images
            gen_img_logits = self.discriminator(generated_images, training=True)
            # Calculate the generator loss
            g_loss = self.g_loss_fn(gen_img_logits)

        # Get the gradients w.r.t the generator loss
        gen_gradient = tape.gradient(g_loss, self.generator.trainable_variables)
        # Update the weights of the generator using the generator optimizer
        self.g_optimizer.apply_gradients(
            zip(gen_gradient, self.generator.trainable_variables)
        )
        return {"d_loss": d_loss, "g_loss": g_loss}

In [10]:
from music21 import *

class GANMonitor(keras.callbacks.Callback):
    """Every 100 epochs, write a generated MIDI sample and save both networks' weights.

    Args:
        latent_dim: Length of the noise vector passed to ``self.model.generator``.
    """

    def __init__(self, latent_dim=128):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.latent_dim = latent_dim

    def on_epoch_end(self, epoch, logs=None):
        """Sample one score, write it as MIDI and checkpoint the weights.

        Does nothing unless ``epoch`` is a multiple of 100. Files written:
        ``./output/sample_{epoch}.midi``, ``./weights-g_{epoch}.h5`` and
        ``./weights-d_{epoch}.h5``.
        """
        if epoch % 100 != 0:
            return

        random_latent_vectors = tf.random.normal(shape=(1, self.latent_dim))
        # Inference call: not a training step.
        output = self.model.generator(random_latent_vectors, training=False)

        # Axis 3 of the generator output holds one value per pitch; the most
        # active pitch is taken for every (bar, step, track). Computed once
        # for the whole batch instead of once per score.
        max_pitches = np.argmax(output, axis = 3)
        n_steps = train_images.shape[1] * train_images.shape[2]
        n_tracks = train_images.shape[4]

        for score_num in range(len(output)):
            # Merge the bar and step axes into one time axis: (time, track).
            midi_note_score = max_pitches[score_num].reshape([n_steps, n_tracks])
            parts = self._build_score(midi_note_score)
            parts.write('midi', fp = './output/sample_{}.midi'.format(epoch))

        # The file names depend only on the epoch, so save once per call.
        self.model.generator.save_weights('./weights-g_{}.h5'.format(epoch))
        self.model.discriminator.save_weights('./weights-d_{}.h5'.format(epoch))

    @staticmethod
    def _append_note(part, pitch, quarter_length):
        """Append one note of the given pitch number and duration to ``part``."""
        n = note.Note(pitch)
        n.duration = duration.Duration(quarter_length)
        part.append(n)

    @staticmethod
    def _build_score(midi_note_score):
        """Convert a (time step, track) array of pitch numbers into a music21 score.

        Each time step counts as 0.25 of a quarter note. Consecutive equal
        pitches are merged into one longer note, but a note is always cut
        at every 4th step.
        """
        parts = stream.Score()
        parts.append(tempo.MetronomeMark(number=66))
        for track in range(midi_note_score.shape[1]):
            track_pitches = [int(p) for p in midi_note_score[:, track]]
            part = stream.Part()
            last_pitch = track_pitches[0]
            dur = 0
            for idx, pitch in enumerate(track_pitches):
                if idx > 0 and (pitch != last_pitch or idx % 4 == 0):
                    GANMonitor._append_note(part, last_pitch, dur)
                    dur = 0
                last_pitch = pitch
                dur = dur + 0.25
            # Flush the note that is still open at the end of the track.
            GANMonitor._append_note(part, last_pitch, dur)
            parts.append(part)
        return parts

In [11]:
# Instantiate the optimizer for both networks
# (learning_rate=0.0002, beta_1=0.5 are recommended)
# Two separate Adam instances with identical settings, created in the
# order generator, discriminator.
generator_optimizer, discriminator_optimizer = (
    keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5, beta_2=0.9)
    for _ in range(2)
)

# Define the loss function for the discriminator,
# which should be (fake_loss - real_loss).
# The gradient penalty is added to this value later.
def discriminator_loss(real_img, fake_img):
    """Return mean(fake_img) - mean(real_img).

    Args:
        real_img: Tensor whose mean is subtracted.
        fake_img: Tensor whose mean is added.
    """
    mean_on_real = tf.reduce_mean(real_img)
    mean_on_fake = tf.reduce_mean(fake_img)
    return mean_on_fake - mean_on_real


# Define the loss function for the generator.
def generator_loss(fake_img):
    """Return the negated mean of ``fake_img``."""
    mean_on_fake = tf.reduce_mean(fake_img)
    return -mean_on_fake


# Set the number of epochs for training.
epochs = 2000

# Instantiate the custom `GANMonitor` Keras callback.
cbk = GANMonitor(latent_dim=noise_dim)

# Instantiate the WGAN model.
# `tracks_dim` is the number of tracks, i.e. the size of the last axis of the
# training data (not the training example at index 4).
wgan = WGAN(
    discriminator=d_model,
    generator=g_model,
    latent_dim=noise_dim,
    tracks_dim=train_images.shape[4],
    discriminator_extra_steps=3,
)

# Compile the WGAN model.
wgan.compile(
    d_optimizer=discriminator_optimizer,
    g_optimizer=generator_optimizer,
    g_loss_fn=generator_loss,
    d_loss_fn=discriminator_loss,
)

In [12]:
def _pitches_to_score(midi_note_score):
  """Convert a (time step, track) array of pitch numbers into a music21 score.

  Each time step counts as 0.25 of a quarter note. Consecutive equal pitches
  are merged into one longer note, but a note is always cut at every 4th
  step.
  """
  def append_note(part, pitch, quarter_length):
    n = note.Note(pitch)
    n.duration = duration.Duration(quarter_length)
    part.append(n)

  parts = stream.Score()
  parts.append(tempo.MetronomeMark(number=66))
  for track in range(midi_note_score.shape[1]):
    track_pitches = [int(p) for p in midi_note_score[:, track]]
    part = stream.Part()
    last_pitch = track_pitches[0]
    dur = 0
    for idx, pitch in enumerate(track_pitches):
      if idx > 0 and (pitch != last_pitch or idx % 4 == 0):
        append_note(part, last_pitch, dur)
        dur = 0
      last_pitch = pitch
      dur = dur + 0.25
    # Flush the note that is still open at the end of the track.
    append_note(part, last_pitch, dur)
    parts.append(part)
  return parts


# Restore generator weights from a saved checkpoint.
g_model.load_weights('/content/drive/MyDrive/MuseGAN/weights-g_1900.h5')

# Number of MIDI files to generate.
n_samples = 50
n_steps = train_images.shape[1] * train_images.shape[2]
n_tracks = train_images.shape[4]

for m in range(n_samples):
  random_latent_vectors = tf.random.normal(shape=(1, noise_dim))
  # Inference call: not a training step.
  output = g_model(random_latent_vectors, training=False)

  # Axis 3 holds one value per pitch; keep the most active pitch for every
  # (bar, step, track). Computed once for the whole batch.
  max_pitches = np.argmax(output, axis = 3)

  for score_num in range(len(output)):
    # Merge the bar and step axes into one time axis: (time, track).
    midi_note_score = max_pitches[score_num].reshape([n_steps, n_tracks])
    parts = _pitches_to_score(midi_note_score)
    parts.write('midi', fp = './generate/sample_{}.midi'.format(m))
    print('./generate/sample_{}.midi'.format(m))

./generate/sample_0.midi
./generate/sample_1.midi
./generate/sample_2.midi
./generate/sample_3.midi
./generate/sample_4.midi
./generate/sample_5.midi
./generate/sample_6.midi
./generate/sample_7.midi
./generate/sample_8.midi
./generate/sample_9.midi
./generate/sample_10.midi
./generate/sample_11.midi
./generate/sample_12.midi
./generate/sample_13.midi
./generate/sample_14.midi
./generate/sample_15.midi
./generate/sample_16.midi
./generate/sample_17.midi
./generate/sample_18.midi
./generate/sample_19.midi
./generate/sample_20.midi
./generate/sample_21.midi
./generate/sample_22.midi
./generate/sample_23.midi
./generate/sample_24.midi
./generate/sample_25.midi
./generate/sample_26.midi
./generate/sample_27.midi
./generate/sample_28.midi
./generate/sample_29.midi
./generate/sample_30.midi
./generate/sample_31.midi
./generate/sample_32.midi
./generate/sample_33.midi
./generate/sample_34.midi
./generate/sample_35.midi
./generate/sample_36.midi
./generate/sample_37.midi
./generate/sample_38.m