In [None]:
!pip install -U transformers tensorflow

# Imports

In [2]:
from transformers import GPT2Config, TFGPT2Model
import tensorflow as tf
import numpy as np
import os

# Config

In [3]:
# Hyper-parameters shared by the cells below
BATCH_SIZE = 2     # sequences per batch
SEQ_LEN = 4096     # tokens kept per sequence; also passed to the decoder as n_positions
TOKEN_DIM = 512    # width of each token vector; also passed to the decoder as n_embd

Decoder creation

In [4]:
# GPT-2 configured to act as a plain transformer decoder.
# vocab_size=0: presumably because the decoder is fed `inputs_embeds`
# instead of token ids -- TODO confirm.
config = GPT2Config(
    vocab_size=0,
    n_positions=SEQ_LEN,
    n_embd=TOKEN_DIM,
    n_layer=6,
    n_head=8,
    activation_function='relu',
)
# Build the decoder from the configuration above
decoder = TFGPT2Model(config)

Testing the decoder on dummy (all-ones) inputs

In [5]:
# Smoke-test the decoder with an all-ones batch of embeddings
output = decoder({
    'inputs_embeds': tf.ones(shape=(BATCH_SIZE, SEQ_LEN, TOKEN_DIM)),
})
output['last_hidden_state'].shape

TensorShape([2, 4096, 512])

# Dataset

In [6]:
# Choose the dataset location: Google Drive when running on Colab,
# otherwise the relative ../data/tf_data folder.
try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable, so we are not on Colab: use the local copy
    DATASET_PATH = os.path.join('..', 'data', 'tf_data')
else:
    # Only the import is guarded: a failing mount is reported instead of
    # silently falling back to the local path.
    drive.mount('/content/drive')
    DATASET_PATH = '/content/drive/MyDrive/Uni/Magistrale/AI4I/Project/Dataset/tf_data'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Input pipeline: load -> batch -> cache -> shuffle -> prefetch.
# NOTE(review): shuffle runs after batch, so whole batches are permuted
# rather than individual samples -- confirm this is intended.
dataset = (
    tf.data.Dataset.load(DATASET_PATH)
    .batch(BATCH_SIZE)
    .cache()
    .shuffle(256)
    .prefetch(32)
)
dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(None, 13450, 11), dtype=tf.uint8, name=None), TensorSpec(shape=(None,), dtype=tf.uint8, name=None))>

In [8]:
# Pull one batch as NumPy arrays so the layers below can be tried on real data.
# Per the dataset's element_spec, X is (batch, 13450, 11) and y is (batch,).
X, y = next(dataset.as_numpy_iterator())
print(X.shape, y.shape)

(2, 13450, 11) (2,)


# Embedding layers

In [9]:
# Number of distinct values per token feature; each one is used as the
# input dimension of that feature's embedding layer.
TYPE_RANGE = 8
MEASURE_RANGE = 256
BEAT_RANGE = 133
POSITION_RANGE = 128
DURATION_RANGE = 136
PITCH_RANGE = 256
INSTRUMENT_RANGE = 129
VELOCITY_RANGE = 128
KEY_SIGN_RANGE = 24
TIME_SIGN_RANGE = 153
TEMPO_RANGE = 49

# Output width of every per-feature embedding layer
OUTPUT_SIZE = 64

In [10]:
# One embedding layer per token feature; layer i is later applied to
# column i of the last axis of the input batch.
embedding_layers = [
    tf.keras.layers.Embedding(feature_range, OUTPUT_SIZE, input_length=SEQ_LEN)
    for feature_range in (
        TYPE_RANGE,        # type
        MEASURE_RANGE,     # measure
        BEAT_RANGE,        # beat
        POSITION_RANGE,    # position
        DURATION_RANGE,    # duration
        PITCH_RANGE,       # pitch
        INSTRUMENT_RANGE,  # instrument
        VELOCITY_RANGE,    # velocity
        KEY_SIGN_RANGE,    # key signature
        TIME_SIGN_RANGE,   # time signature
        TEMPO_RANGE,       # tempo
    )
]

In [11]:
# Embed every feature column of the batch (truncated to SEQ_LEN tokens) with
# its own embedding layer. Plain Python ints are used as indices: `tf.range`
# would yield scalar tensors, which are a poor fit for indexing a Python list
# and a NumPy array.
outputs = [
    embedding_layers[i](X[:, :SEQ_LEN, i])
    for i in range(X.shape[2])
]

## Embedding concatenation

In [12]:
# Join the per-feature embeddings along axis 2, giving one vector per token
concat_layer = tf.keras.layers.Concatenate(axis=2)
concat_outputs = concat_layer(outputs)
concat_outputs.shape

TensorShape([2, 4096, 704])

## Embedding resizing

In [13]:
# Dense layer with TOKEN_DIM units: resizes the concatenated embedding to the
# width the decoder was configured with (n_embd)
dense_layer = tf.keras.layers.Dense(TOKEN_DIM)
encoding = dense_layer(concat_outputs)
encoding.shape

TensorShape([2, 4096, 512])

## Positional encoding

In [14]:
import math

def get_positional_embedding_matrix(seq_len=SEQ_LEN, dim=TOKEN_DIM):
    """Build the sinusoidal positional-encoding matrix.

    From "Attention is all you need", https://arxiv.org/pdf/1706.03762.pdf:
    ``PE[pos, 2i] = sin(pos / 10000**(2i/dim))`` and
    ``PE[pos, 2i+1] = cos(pos / 10000**(2i/dim))``.

    Args:
        seq_len: number of positions, i.e. rows of the matrix.
        dim: encoding width, i.e. columns of the matrix. When odd, the last
            column has no sin/cos partner and is left at zero.

    Returns:
        A float64 NumPy array of shape ``(seq_len, dim)``.
    """
    n_pairs = dim // 2
    # Column vector of positions and row vector of per-pair divisors; their
    # broadcast quotient holds every sin/cos argument at once.
    positions = np.arange(seq_len, dtype=np.float64)[:, np.newaxis]
    divisors = np.power(10000.0, 2 * np.arange(n_pairs) / dim)
    angles = positions / divisors

    PE = np.zeros((seq_len, dim))
    PE[:, 0:2 * n_pairs:2] = np.sin(angles)  # even columns
    PE[:, 1:2 * n_pairs:2] = np.cos(angles)  # odd columns
    return PE

In [15]:
# Called with the defaults, so the matrix has SEQ_LEN rows and TOKEN_DIM columns
positional_encoding_matrix = get_positional_embedding_matrix()

In transformers, it is common to add the positional encoding to the element embeddings.

In [16]:
# Add the positional encoding to the token embeddings.
sum_layer = tf.keras.layers.Add()
# Expand the (SEQ_LEN, TOKEN_DIM) matrix to the shape of `encoding` itself
# rather than to a hard-coded BATCH_SIZE, and cast it explicitly from the
# float64 NumPy matrix to the dtype of `encoding`.
positional_encoding = tf.broadcast_to(
    tf.cast(positional_encoding_matrix, encoding.dtype),
    tf.shape(encoding),
)
final_encoding = sum_layer([encoding, positional_encoding])
final_encoding.shape

TensorShape([2, 4096, 512])

# Output management

In [17]:
# Run the decoder on the embedded batch; this rebinds `output`, which earlier
# held the result of the all-ones smoke test
output = decoder({'inputs_embeds': final_encoding})
output['last_hidden_state'].shape

TensorShape([2, 4096, 512])

We need a dense + softmax layer for each token feature in order to reconstruct the input.

In [18]:
# One softmax head per token feature, sized to that feature's value range
# and listed in the same order as the embedding layers.
output_dense_layers = [
    tf.keras.layers.Dense(feature_range, activation='softmax')
    for feature_range in (
        TYPE_RANGE,        # type
        MEASURE_RANGE,     # measure
        BEAT_RANGE,        # beat
        POSITION_RANGE,    # position
        DURATION_RANGE,    # duration
        PITCH_RANGE,       # pitch
        INSTRUMENT_RANGE,  # instrument
        VELOCITY_RANGE,    # velocity
        KEY_SIGN_RANGE,    # key signature
        TIME_SIGN_RANGE,   # time signature
        TEMPO_RANGE,       # tempo
    )
]

In [19]:
# Apply every softmax head to the decoder's last hidden state
out_scores = [head(output['last_hidden_state']) for head in output_dense_layers]

# One (batch, sequence, feature range) score tensor per head
for scores in out_scores:
    print(scores.shape)

(2, 4096, 8)
(2, 4096, 256)
(2, 4096, 133)
(2, 4096, 128)
(2, 4096, 136)
(2, 4096, 256)
(2, 4096, 129)
(2, 4096, 128)
(2, 4096, 24)
(2, 4096, 153)
(2, 4096, 49)


## Ground-truth vectors definition

In [20]:
# Targets: the input feature columns themselves, truncated to SEQ_LEN tokens,
# one vector per output head
gt_vectors = [X[:, :SEQ_LEN, feature] for feature in range(len(out_scores))]

for target in gt_vectors:
    print(target.shape)

(2, 4096)
(2, 4096)
(2, 4096)
(2, 4096)
(2, 4096)
(2, 4096)
(2, 4096)
(2, 4096)
(2, 4096)
(2, 4096)
(2, 4096)


## Loss definition

In [21]:
# Integer targets against softmax probabilities, hence the sparse categorical loss
loss_function = tf.keras.losses.SparseCategoricalCrossentropy()

# Report the loss of each feature head against its target vector
for target, scores in zip(gt_vectors, out_scores):
    print(loss_function(target, scores))

tf.Tensor(2.3346944, shape=(), dtype=float32)
tf.Tensor(6.237889, shape=(), dtype=float32)
tf.Tensor(4.7969894, shape=(), dtype=float32)
tf.Tensor(5.776598, shape=(), dtype=float32)
tf.Tensor(5.461115, shape=(), dtype=float32)
tf.Tensor(5.213531, shape=(), dtype=float32)
tf.Tensor(4.542588, shape=(), dtype=float32)
tf.Tensor(3.9674158, shape=(), dtype=float32)
tf.Tensor(4.785807, shape=(), dtype=float32)
tf.Tensor(6.601122, shape=(), dtype=float32)
tf.Tensor(5.358327, shape=(), dtype=float32)


When defining the whole Keras model for training, we can set up multiple outputs and assign a different weight to each of the losses.