<a href="https://colab.research.google.com/github/Jonipeloni/musicgeneration/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pretty_midi numpy

Collecting pretty_midi
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mido>=1.1.16 (from pretty_midi)
  Downloading mido-1.3.2-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting packaging~=23.1 (from mido>=1.1.16->pretty_midi)
  Downloading packaging-23.2-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pretty_midi
  Building wheel for pretty_midi (setup.py) ... [?25l[?25hdone
  Created wheel for pretty_midi: filename=pretty_midi-0.2.10-py3-none-any.whl size=5592289 sha256=0418da3f7e5c58df2ba31381898838a87c3d32e638864584af12bc9381159ae6
  Stored in direc

In [2]:
import os
import numpy as np
import tensorflow as tf
import pretty_midi

# Assuming '/content/drive/My Drive/Midi' contains your MIDI files.
midi_folder_path = '/content/drive/My Drive/Midi'

def midi_to_tokens(midi_file):
    midi_data = pretty_midi.PrettyMIDI(midi_file)
    notes = []
    for instrument in midi_data.instruments:
        for note in instrument.notes:
            notes.append((note.start, note.end, note.pitch, note.velocity))
    notes.sort(key=lambda note: note[0])  # Sort by start time
    tokens = []
    last_end_time = 0
    for note in notes:
        start_time, end_time, pitch, velocity = note
        time_shift = round(start_time - last_end_time, 2)
        if time_shift > 0:
            tokens.append(f"TimeShift_{time_shift}")
        tokens.append(f"NoteOn_{pitch}_{velocity}")
        duration = round(end_time - start_time, 2)
        tokens.append(f"Duration_{duration}")
        last_end_time = end_time
    return tokens

def process_midi_folder(folder_path):
    all_tokens = []
    for midi_file in os.listdir(folder_path):
        if midi_file.endswith(('.mid', '.midi')):
            tokens = midi_to_tokens(os.path.join(folder_path, midi_file))
            all_tokens.extend(tokens)
    return all_tokens

def create_dataset(all_tokens, seq_length=100, batch_size=64):
    # Convert tokens to IDs
    token_to_id = np.load('/content/drive/My Drive/token_to_id.npy', allow_pickle=True).item()
    token_ids = [token_to_id.get(token, token_to_id['UNK']) for token in all_tokens]

    dataset = tf.data.Dataset.from_tensor_slices(token_ids)
    sequences = dataset.batch(seq_length+1, drop_remainder=True)

    def split_input_target(chunk):
        input_text = chunk[:-1]
        target_text = chunk[1:]
        return input_text, target_text

    dataset = sequences.map(split_input_target)
    dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
    return dataset

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/Midi'

In [None]:
#Main execution
all_tokens = process_midi_folder(midi_folder_path)
dataset = create_dataset(all_tokens)


In [None]:
from collections import Counter
import numpy as np

# Assuming 'all_tokens' is a list of all tokens you've processed from your MIDI files
all_tokens = process_midi_folder(midi_folder_path)  # From your MIDI processing

# Count unique tokens and create a mapping to integers
token_counts = Counter(all_tokens)
unique_tokens = sorted(token_counts.keys())
token_to_id = {token: id for id, token in enumerate(unique_tokens, start=1)}

# Optionally, include a special token for unknown tokens (if your model needs to handle unseen tokens)
token_to_id['UNK'] = len(token_to_id) + 1

# Save the mapping for later use
np.save('/content/drive/My Drive/token_to_id.npy', token_to_id)


In [None]:
#Create the Positional Encodings which are added to the Embeddings
def pos_enc(length, d_model):
    pos = np.arange(length)[:, np.newaxis]
    j = np.arange(d_model)[np.newaxis, :]
    #distribute the angles according to the formula for Positional Encodings
    angle_rates = 1 / np.power(10000, (2 * (j//2)) / np.float32(d_model))
    angle_rads = pos * angle_rates
    #even coordinates get sin, odd coordinates get cos
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    pos_encoding = angle_rads[np.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)

The formulas for Positional Encodings from the original Attention Paper are $$PE_{(pos, 2j)} = \sin(pos / 10000^{2j / d_{\text{model}}})$$
$$PE_{(pos, 2j+1)} = \cos(pos / 10000^{2j / d_{\text{model}}})
$$ so we have to implement two formulas, one for the even and one for the odd coordinates.

In [None]:
import tensorflow as tf

#Create the Embeddings our Transformer can use
class Embedding(tf.keras.layers.Layer):
    def __init__(self, voc, d_model, block_length, dropout_rate):
        super(Embedding, self).__init__()
        self.voc = voc
        self.block_length = block_length
        self.d_model = d_model
        self.emb = tf.keras.layers.Embedding(input_dim=voc,
                          output_dim=d_model,
                          input_length=block_length)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.pos_enc = pos_enc(block_length, d_model)

    #Add the Embeddings and the Encodings
    @tf.function
    def call(self, x):
        x = self.emb(x) + self.pos_enc[:, :self.block_length, :]
        x = self.dropout(x)
        return x

In [None]:
#SwigLu
class SwiGLu(tf.keras.layers.Layer):
    def __init__(self, d_model, units):
        super(SwiGLu, self).__init__()
        self.dense1 = tf.keras.layers.Dense(units=(units//3)*2, activation=tf.nn.silu, use_bias=False)
        self.dense2 = tf.keras.layers.Dense(units=(units//3)*2, use_bias=False)
        self.dense3 = tf.keras.layers.Dense(units=d_model, use_bias=False)

    @tf.function
    def call(self, x):
        w = self.dense1(x)
        v = self.dense2(x)
        x = self.dense3(w*v)
        return x

SwigLu is a special type of Activation Function, where the name Activation Function is actually a bit misleading, because it consists of multiply NN Layers. At first, the input is put into two different Dense Layers having $\frac{2}{3} \cdot d_{model}$ units. The outputs of those layers are then multiplied element-wise, and the result is put into a final Dense Layers, so the output dimensionality is $d_{model}$ again. The SiLu Activation Function in the first Dense Layer is defined as $SiLu(x) = x \cdot Sigmoid(x)$.

In [None]:
#Implement the classic attention layer
class Attention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dropout_rate):
        super(Attention, self).__init__()
        #Choose head_size in a way that the total dimensionality does not change
        self.size_heads = d_model // num_heads
        self.query = tf.keras.layers.Dense(units=self.size_heads, use_bias=False)
        self.key = tf.keras.layers.Dense(units=self.size_heads, use_bias=False)
        self.value = tf.keras.layers.Dense(units=self.size_heads, use_bias=False)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    #Implement the Attention function
    @tf.function
    def call(self, x):
        B, T, C = x.shape
        Q = self.query(x)
        K = self.key(x)
        V = self.value(x)
        #Compute the Attention matrix
        scores = tf.matmul(Q,tf.transpose(K, perm=[0, 2, 1])) / tf.math.sqrt(tf.cast(self.size_heads, tf.float32))
        #Mask for training
        tril   = tf.linalg.band_part(tf.ones((T, T)), -1, 0)
        scores = tf.where(tril == 0, tf.fill(tril.shape, -float('inf')), scores)
        #Continue Computing the Attention Values
        scores = tf.nn.softmax(scores, axis=-1)
        scores = self.dropout(scores)
        return tf.matmul(scores, V)

The Attention formula is $\text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$, where the softmax is applied to each row of the matrix. First, we compute $\frac{QK^T}{\sqrt{d_k}}$, then after masking, we put it into the softmax, apply dropout and multiply by V.


In [None]:
#Implement Multi-Head-Attention
class MultiHead(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dropout_rate):
        super(MultiHead, self).__init__()
        self.heads = [Attention(d_model, num_heads, dropout_rate) for _ in range(num_heads)]
        self.dense = tf.keras.layers.Dense(units=d_model, use_bias=False)
    #Write the results in one concatenated matrix to keep the dimensionality
    @tf.function
    def call(self, x):
        x = tf.concat([head(x) for head in self.heads], axis=-1)
        x = self.dense(x)
        return x

In [None]:
#Transformer block
class Tblock(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, units, dropout_rate):
        super(Tblock, self).__init__()
        self.ffn = SwiGLu(d_model, units)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.attention = MultiHead(d_model, num_heads, dropout_rate)
        self.layernorm1 = tf.keras.layers.LayerNormalization()
        self.layernorm2 = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

    @tf.function
    def call(self, x):
        res = x
        x = self.layernorm1(x)
        x = self.attention(x)
        x = self.dropout(x)
        x = self.add([res, x])
        res = x
        x = self.layernorm2(x)
        x = self.ffn(x)
        x = self.dropout(x)
        x = self.add([res, x])
        return x


In [None]:
#Decoder
class Decoder(tf.keras.layers.Layer):
    def __init__(self, voc, d_model, num_heads, units, dropout_rate, num_layers, block):
        super(Decoder, self).__init__()
        self.embedding = Embedding(voc, d_model, block, dropout_rate)
        self.layers = [Tblock(d_model, num_heads, units, dropout_rate) for _ in range(num_layers)]


    @tf.function
    def call(self, x):
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x)
        return x

In [None]:
#Full Transformer
class Transformer(tf.keras.Model):
    def __init__(self, voc, d_model, num_heads, units, dropout_rate, num_layers, block):
        super(Transformer, self).__init__()
        self.decoder = Decoder(voc, d_model, num_heads, units, dropout_rate, num_layers, block)
        self.final_dense = tf.keras.layers.Dense(units=voc)

    @tf.function
    def call(self, x):
        x = self.decoder(x)
        logits = self.final_dense(x)
        return logits

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)  # Ensure d_model is a float
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)  # Cast step to float to avoid type issues
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)



In [None]:
import tensorflow as tf

transformer = Transformer(
    voc=1024,
    d_model=512,
    num_heads=8,
    units=2048,
    dropout_rate=0.1,
    num_layers=6,
    block=100
)

learning_rate = CustomSchedule(d_model=512)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

#Loss
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

#Training Step
train_loss = tf.keras.metrics.Mean(name='train_loss')

@tf.function
def train_step(inp, tar):
    with tf.GradientTape() as tape:
        predictions = transformer(inp, training=True)
        loss = loss_function(tar, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)

#Training loop
EPOCHS = 20

for epoch in range(EPOCHS):
    for (batch, (inp, tar)) in enumerate(dataset):
        train_step(inp, tar)

    print(f'Epoch {epoch + 1}, Loss: {train_loss.result()}')
    train_loss.reset_states()


Epoch 1, Loss: 0.4231685996055603
Epoch 2, Loss: 8.616084414825309e-06
Epoch 3, Loss: 2.0085099095012993e-07
Epoch 4, Loss: 6.788434259163978e-09
Epoch 5, Loss: 4.221442639895656e-10
Epoch 6, Loss: 1.1678954678351339e-11
Epoch 7, Loss: 2.6352518257832802e-12
Epoch 8, Loss: 1.976436592512898e-12
Epoch 9, Loss: 1.197842416618758e-13
Epoch 10, Loss: 0.0
Epoch 11, Loss: 0.0
Epoch 12, Loss: 0.0
Epoch 13, Loss: 0.0
Epoch 14, Loss: 0.0
Epoch 15, Loss: 1.796763557165501e-13
Epoch 16, Loss: 0.0
Epoch 17, Loss: 0.0
Epoch 18, Loss: 0.0
Epoch 19, Loss: 0.0
Epoch 20, Loss: 5.989212760720147e-14
