<a href="https://colab.research.google.com/github/GuilhermeRLDev/DeepLearningExperiments/blob/main/AMTTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
#Import relevant libraries to build model
!pip install pretty_midi
!pip install wget
!pip install jams

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pretty_midi
  Downloading pretty_midi-0.2.9.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mido>=1.1.16
  Downloading mido-1.2.10-py2.py3-none-any.whl (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.1/51.1 KB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pretty_midi
  Building wheel for pretty_midi (setup.py) ... [?25l[?25hdone
  Created wheel for pretty_midi: filename=pretty_midi-0.2.9-py3-none-any.whl size=5591954 sha256=34eea26d8811585637c13470636b371c1379cae78f8c683e23be1d942d0e0d85
  Stored in directory: /root/.cache/pip/wheels/2a/5a/e3/30eeb9a99350f3f7e21258fcb132743eef1a4f49b3505e76b6
Successfully built pretty_midi
Installing collected packages: mido,

In [8]:
import os
import random
from glob import glob
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import librosa as lr 

In [9]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
class TokenEmbedding(layers.Layer):
    def __init__(self, num_vocab=1000, maxlen=100, num_hid=64):
        super().__init__()
        self.emb = tf.keras.layers.Embedding(num_vocab, num_hid)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        x = self.emb(x)
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        return x + positions


class AudioFeatureEmbeddingCNN(layers.Layer):
    def __init__(self, num_hid=64, maxlen=100):
        super().__init__()
        self.conv1 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.conv2 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.conv3 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid)

    def call(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        return self.conv3(x)

In [3]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [4]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, dropout_rate=0.1):
        super().__init__()
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        self.self_att = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.enc_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.self_dropout = layers.Dropout(0.5)
        self.enc_dropout = layers.Dropout(0.1)
        self.ffn_dropout = layers.Dropout(0.1)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )

    def causal_attention_mask(self, batch_size, n_dest, n_src, dtype):
        """Masks the upper half of the dot product matrix in self attention.

        This prevents flow of information from future tokens to current token.
        1's in the lower triangle, counting from the lower right corner.
        """
        i = tf.range(n_dest)[:, None]
        j = tf.range(n_src)
        m = i >= j - n_src + n_dest
        mask = tf.cast(m, dtype)
        mask = tf.reshape(mask, [1, n_dest, n_src])
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
        )
        return tf.tile(mask, mult)

    def call(self, enc_out, target):
        input_shape = tf.shape(target)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = self.causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        target_att = self.self_att(target, target, attention_mask=causal_mask)
        target_norm = self.layernorm1(target + self.self_dropout(target_att))
        enc_out = self.enc_att(target_norm, enc_out)
        enc_out_norm = self.layernorm2(self.enc_dropout(enc_out) + target_norm)
        ffn_out = self.ffn(enc_out_norm)
        ffn_out_norm = self.layernorm3(enc_out_norm + self.ffn_dropout(ffn_out))
        return ffn_out_norm

In [5]:
class Transformer(keras.Model):
    def __init__(
        self,
        num_hid=64,
        num_head=2,
        num_feed_forward=128,
        source_maxlen=100,
        target_maxlen=100,
        num_layers_enc=4,
        num_layers_dec=1,
        num_classes=10,
    ):
        super().__init__()
        self.loss_metric = keras.metrics.Mean(name="loss")
        self.num_layers_enc = num_layers_enc
        self.num_layers_dec = num_layers_dec
        self.target_maxlen = target_maxlen
        self.num_classes = num_classes

        self.enc_input = AudioFeatureEmbeddingCNN(num_hid=num_hid, maxlen=source_maxlen)
        self.dec_input = TokenEmbedding(
            num_vocab=num_classes, maxlen=target_maxlen, num_hid=num_hid
        )

        self.encoder = keras.Sequential(
            [self.enc_input]
            + [
                TransformerEncoder(num_hid, num_head, num_feed_forward)
                for _ in range(num_layers_enc)
            ]
        )

        for i in range(num_layers_dec):
            setattr(
                self,
                f"dec_layer_{i}",
                TransformerDecoder(num_hid, num_head, num_feed_forward),
            )

        self.classifier = layers.Dense(num_classes)

    def decode(self, enc_out, target):
        y = self.dec_input(target)
        for i in range(self.num_layers_dec):
            y = getattr(self, f"dec_layer_{i}")(enc_out, y)
        return y

    def call(self, inputs):
        source = inputs[0]
        target = inputs[1]
        x = self.encoder(source)
        y = self.decode(x, target)
        return self.classifier(y)

    @property
    def metrics(self):
        return [self.loss_metric]

    def train_step(self, batch):
        """Processes one batch inside model.fit()."""
        source = batch["source"]
        target = batch["target"]
        dec_input = target[:, :-1]
        dec_target = target[:, 1:]
        with tf.GradientTape() as tape:
            preds = self([source, dec_input])
            one_hot = tf.one_hot(dec_target, depth=self.num_classes)
            mask = tf.math.logical_not(tf.math.equal(dec_target, 0))
            loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result()}

    def test_step(self, batch):
        source = batch["source"]
        target = batch["target"]
        dec_input = target[:, :-1]
        dec_target = target[:, 1:]
        preds = self([source, dec_input])
        one_hot = tf.one_hot(dec_target, depth=self.num_classes)
        mask = tf.math.logical_not(tf.math.equal(dec_target, 0))
        loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result()}

    def generate(self, source, target_start_token_idx):
        """Performs inference over one batch of inputs using greedy decoding."""
        bs = tf.shape(source)[0]
        enc = self.encoder(source)
        dec_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx
        dec_logits = []
        for i in range(self.target_maxlen - 1):
            dec_out = self.decode(enc, dec_input)
            logits = self.classifier(dec_out)
            logits = tf.argmax(logits, axis=-1, output_type=tf.int32)
            last_logit = tf.expand_dims(logits[:, -1], axis=-1)
            dec_logits.append(last_logit)
            dec_input = tf.concat([dec_input, last_logit], axis=-1)
        return dec_input

In [39]:
import jams

#Processing data for encoder
#Constants 
GUITAR_DATASET = "gdrive/MyDrive/datasets"

#Loading and preprocessing functions
def load_dataset(path, total_samples):
  '''
    Return x number of samples from disk 
  '''
  data_path = f"{path}/data"
  labels_path = f"{path}/labels"
  data = os.listdir(data_path)
  labels = os.listdir(labels_path)

  files = []
  i = 0
  for file in labels:

    if i == total_samples:
      break

    file_name = file.split(".")[0]
    if file_name in files:
      continue

    # Load data and anotation for the samples
    files.append({"audio":f"{data_path}/{file_name}_hex.wav", "label":f"{labels_path}/{file_name}.jams"})
    i += 1

  return files

In [38]:
dataset = load_dataset(GUITAR_DATASET, 2)

print(dataset)

[{'data': 'gdrive/MyDrive/datasets/data/03_SS1-100-C#_comp_hex.wav', 'label': 'gdrive/MyDrive/datasets/labels/03_SS1-100-C#_comp.jams'}, {'data': 'gdrive/MyDrive/datasets/data/04_Jazz2-110-Bb_solo_hex.wav', 'label': 'gdrive/MyDrive/datasets/labels/04_Jazz2-110-Bb_solo.jams'}]


In [42]:
import IPython

for m in dataset: 
  print(m)
  value, sr= lr.load(m['data'])
  IPython.display.display(IPython.display.Audio(value, rate=sr))

{'data': 'gdrive/MyDrive/datasets/data/03_SS1-100-C#_comp_hex.wav', 'label': 'gdrive/MyDrive/datasets/labels/03_SS1-100-C#_comp.jams'}


{'data': 'gdrive/MyDrive/datasets/data/04_Jazz2-110-Bb_solo_hex.wav', 'label': 'gdrive/MyDrive/datasets/labels/04_Jazz2-110-Bb_solo.jams'}


In [15]:
!pip install ddsp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ddsp
  Downloading ddsp-3.5.0-py2.py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.6/212.6 KB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting crepe<=0.0.12
  Downloading crepe-0.0.12.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting note-seq<0.0.4
  Downloading note_seq-0.0.3-py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.1/210.1 KB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hmmlearn<=0.2.7
  Downloading hmmlearn-0.2.7-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (129 kB)


In [17]:
import matplotlib.pyplot as plt
from ddsp import spectral_ops
import numpy as np

In [48]:
def pre_process_audio(path_to_audio):
  '''
    Function build according to MT3 paper
  '''
  #Follow same thresholds define by 
  SAMPLE_RATE = 16000
  HOP_SIZE = 128
  FFT_SIZE = 2048

  #Load file using librosa

  signal, sr = lr.load(path_to_audio)

  if sr != SAMPLE_RATE:
    signal = lr.resample(signal, sr, SAMPLE_RATE)

  frames_per_second = SAMPLE_RATE / HOP_SIZE
  spec = lr.stft(signal)

  signal = np.pad(signal, [0, HOP_SIZE - len(signal) % HOP_SIZE], mode='constant')
  frames = tf.signal.frame(signal, frame_length=HOP_SIZE,frame_step=HOP_SIZE, pad_end=True)

  times = np.arange(len(frames))/ frames_per_second

  overlap = 1 - HOP_SIZE / FFT_SIZE

  #Generate mel_spectograms
  spectograms = spectral_ops.compute_logmel(
      frames,
      bins=128,
      lo_hz=20.0,
      overlap=overlap,
      fft_size=FFT_SIZE,
      sample_rate=SAMPLE_RATE)

  return spectograms, times

for file in dataset:
  spectograms, times = pre_process_audio(file['data'])

#print(len(spectograms[0]))


tf.Tensor(
[[[ -4.058028   -4.0049324  -3.943265  ...  -5.876467   -5.8179274
    -5.7658195]]

 [[ -5.2469893  -5.1641407  -5.043145  ...  -5.160206   -5.1746902
    -5.15285  ]]

 [[ -5.6670547  -5.5902505  -5.4964986 ...  -6.756909   -6.7218456
    -6.749571 ]]

 ...

 [[ -8.393268   -8.399989   -8.433564  ...  -9.542396   -9.583026
    -9.476732 ]]

 [[ -8.472454   -8.489106   -8.538403  ...  -9.387893   -9.229505
   -10.180226 ]]

 [[ -8.674822   -8.677942   -8.707562  ...  -9.495348   -9.883925
   -11.060667 ]]], shape=(4363, 1, 128), dtype=float32)


In [None]:
print()

In [22]:
HOP_SIZE - len(signal) % HOP_SIZE

96