In [16]:
# Imports
from transformers import GPT2Config, TFGPT2Model
import tensorflow as tf
import numpy as np
import os

# Config

In [17]:
# Number of sequences processed together in one forward pass
BATCH_SIZE = 2
# Number of tokens in every input sequence (also the decoder's n_positions)
SEQ_LEN = 8_192
# Width of the vector representing one token inside the decoder (n_embd)
TOKEN_DIM = 512

Decoder creation

In [18]:
# GPT-2 is used as a plain transformer decoder: it is fed pre-computed
# embeddings, hence the empty vocabulary (vocab_size=0).
config = GPT2Config(
    vocab_size=0,
    n_positions=SEQ_LEN,
    n_embd=TOKEN_DIM,
    n_layer=6,
    n_head=8,
    activation_function='relu',
)
# Instantiate the decoder from that configuration
decoder = TFGPT2Model(config)

Testing the decoder on dummy (all-ones) inputs

In [19]:
# Smoke test: push a batch of all-ones embeddings through the decoder and
# check that the last hidden state keeps the (batch, sequence, dim) shape.
dummy_embeddings = tf.ones((BATCH_SIZE, SEQ_LEN, TOKEN_DIM))
output = decoder({'inputs_embeds': dummy_embeddings})
output['last_hidden_state'].shape

TensorShape([2, 8192, 512])

# Dataset

In [20]:
# Folder holding the saved tf.data dataset, relative to the working directory
DATASET_PATH = os.path.join(
    '..',
    'data',
    'lmda_genres_tf_data',
)

Load the dataset from disk and process it (batching, shuffling, ...)

In [21]:
# Input pipeline. The order of the stages matters:
#   1. cache()    - keep the individual examples read from disk in memory;
#   2. shuffle()  - shuffle single examples (the buffer size counts examples).
#                   Shuffling after batch() would only reorder whole batches,
#                   so the same examples would always end up together;
#   3. batch()    - group BATCH_SIZE examples;
#   4. prefetch() - prepare upcoming batches while the current one is consumed.
dataset = (
    tf.data.Dataset.load(DATASET_PATH)
    .cache()
    .shuffle(256)
    .batch(BATCH_SIZE)
    .prefetch(32)
)
dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(None, 8192, 11), dtype=tf.uint8, name=None), TensorSpec(shape=(None, 18), dtype=tf.float64, name=None))>

In [22]:
# Take one batch out of the pipeline as NumPy arrays to experiment with
batch_iterator = dataset.as_numpy_iterator()
X, y = next(batch_iterator)
print(X.shape, y.shape)

(2, 8192, 11) (2, 18)


2022-11-17 21:28:10.419478: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


# Embedding layers

The inputs need to be encoded by some embedding layer (a specific embedding layer for each token type).

In [23]:
## Ranges and dimensions for embedding layers
# Each *_RANGE is the number of distinct ids of one field, i.e. the number of
# rows of its embedding table and the width of its softmax output.
TYPE_RANGE = 8
MEASURE_RANGE = 256
BEAT_RANGE = 132
POSITION_RANGE = 128
DURATION_RANGE = 136
PITCH_RANGE = 256
INSTRUMENT_RANGE = 129
VELOCITY_RANGE = 128
KEY_SIGN_RANGE = 24
TIME_SIGN_RANGE = 153
TEMPO_RANGE = 49
GENRE_RANGE = 18

# Width of the vector produced by every field embedding
OUTPUT_SIZE = 64
# Width of the vector produced by the genre embedding module
GENRE_DIM = 64

In [24]:
# Number of ids of every field, in the order in which the fields are embedded:
# type, measure, beat, position, duration, pitch, instrument, velocity,
# key sign, time sign, tempo.
field_ranges = (
    TYPE_RANGE,
    MEASURE_RANGE,
    BEAT_RANGE,
    POSITION_RANGE,
    DURATION_RANGE,
    PITCH_RANGE,
    INSTRUMENT_RANGE,
    VELOCITY_RANGE,
    KEY_SIGN_RANGE,
    TIME_SIGN_RANGE,
    TEMPO_RANGE,
)

# One embedding table per field, each producing OUTPUT_SIZE-wide vectors
embedding_layers = [
    tf.keras.layers.Embedding(field_range, OUTPUT_SIZE, input_length=SEQ_LEN)
    for field_range in field_ranges
]

Run the embedding layers on our inputs

In [25]:
# Embed every field (last axis of X) with its dedicated embedding layer.
# The indices select items of a Python list and columns of a NumPy array, so a
# plain Python `range` is used instead of building a TensorFlow tensor with
# `tf.range` and converting each element back to an integer.
outputs = [
    embedding_layers[field](X[:, :SEQ_LEN, field])
    for field in range(X.shape[2])
]

We also need to encode the genre using some layers.

In [26]:
# Two-layer MLP turning the genre vector into a GENRE_DIM-wide embedding
genre_embedding_module = tf.keras.Sequential()
genre_embedding_module.add(tf.keras.layers.Dense(256, activation='relu'))
genre_embedding_module.add(tf.keras.layers.Dense(GENRE_DIM, activation='relu'))

In [27]:
# Run the genre module on the genre vectors `y` of the sampled batch and
# display the shape of the result
genre_embedding = genre_embedding_module(y)
genre_embedding.shape

TensorShape([2, 64])

## Embedding concatenation

We concatenate the output embeddings into a single tensor

In [28]:
# Concatenate along the last (feature) axis of the (batch, sequence, feature) tensors
concat_layer = tf.keras.layers.Concatenate(axis=2)
# The genre embedding has one vector per batch element: copy it SEQ_LEN times
# so that it can be attached to every position of the sequence
repeat_layer = tf.keras.layers.RepeatVector(SEQ_LEN)
# The repeated tensor gets its own name. Overwriting `genre_embedding` made the
# cell non-idempotent: on a second run RepeatVector would receive the already
# repeated 3D tensor instead of the 2D one it expects.
genre_embedding_seq = repeat_layer(genre_embedding)
concat_outputs = concat_layer(outputs + [genre_embedding_seq])
concat_outputs.shape

TensorShape([2, 8192, 768])

Then we need to resize them into a known dimensionality

In [29]:
# Linear projection (no activation) of the concatenated features to TOKEN_DIM
dense_layer = tf.keras.layers.Dense(units=TOKEN_DIM)
encoding = dense_layer(concat_outputs)
encoding.shape

TensorShape([2, 8192, 512])

## Positional encoding

We also add positional encodings, which encode the position of each token in the sequence.

In [30]:
import math

def get_positional_embedding_matrix(seq_len=SEQ_LEN, dim=TOKEN_DIM):
    """Build the sinusoidal positional-encoding matrix.

    From "Attention is all you need", https://arxiv.org/pdf/1706.03762.pdf:

        PE[pos, 2i]   = sin(pos / 10000**(2i / dim))
        PE[pos, 2i+1] = cos(pos / 10000**(2i / dim))

    The matrix is computed with vectorized NumPy operations instead of a
    Python loop over every (position, column) pair.

    Args:
        seq_len: number of positions (rows of the matrix).
        dim: width of the encoding (columns of the matrix). When `dim` is odd
            the last column has no sine/cosine pair and is left at zero.

    Returns:
        A float64 `np.ndarray` of shape `(seq_len, dim)`.
    """
    PE = np.zeros((seq_len, dim))
    num_pairs = dim // 2
    # Column vector of positions, shape (seq_len, 1)
    positions = np.arange(seq_len, dtype=np.float64)[:, np.newaxis]
    # One divisor per (sin, cos) pair, shape (num_pairs,); computed once for all positions
    divisors = 10000.0 ** (2 * np.arange(num_pairs, dtype=np.float64) / dim)
    # Broadcasting gives the full (seq_len, num_pairs) grid of angles
    angles = positions / divisors
    # Sines go to the even columns, cosines to the odd ones
    PE[:, 0:2 * num_pairs:2] = np.sin(angles)
    PE[:, 1:2 * num_pairs:2] = np.cos(angles)
    return PE

In [31]:
# Compute the (SEQ_LEN, TOKEN_DIM) positional table once; it does not depend on the data
positional_encoding_matrix = get_positional_embedding_matrix(seq_len=SEQ_LEN, dim=TOKEN_DIM)

In transformers, it is common to add the positional embeddings to the element embeddings.

In [32]:
# Element-wise sum of the token encodings and the positional encodings
sum_layer = tf.keras.layers.Add()
# One copy of the positional matrix per element of the batch
positional_encoding = tf.stack(
    [positional_encoding_matrix for _ in range(BATCH_SIZE)]
)
final_encoding = sum_layer([encoding, positional_encoding])
final_encoding.shape

TensorShape([2, 8192, 512])

# Output management

In [33]:
# Feed the final encodings to the decoder as ready-made input embeddings
decoder_inputs = {'inputs_embeds': final_encoding}
output = decoder(decoder_inputs)
output['last_hidden_state'].shape

TensorShape([2, 8192, 512])

We need a dense + softmax layer for each of the token fields in order to try to reconstruct the input.

In [34]:
# Number of classes of every output head, in field order: type, measure, beat,
# position, duration, pitch, instrument, velocity, key sign, time sign, tempo.
output_field_ranges = (
    TYPE_RANGE,
    MEASURE_RANGE,
    BEAT_RANGE,
    POSITION_RANGE,
    DURATION_RANGE,
    PITCH_RANGE,
    INSTRUMENT_RANGE,
    VELOCITY_RANGE,
    KEY_SIGN_RANGE,
    TIME_SIGN_RANGE,
    TEMPO_RANGE,
)

# One dense + softmax head per field
output_dense_layers = [
    tf.keras.layers.Dense(field_range, activation='softmax')
    for field_range in output_field_ranges
]

In [35]:
# Apply every output head to the decoder's last hidden state
hidden_states = output['last_hidden_state']
out_scores = [head(hidden_states) for head in output_dense_layers]

for scores in out_scores:
    print(scores.shape)

(2, 8192, 8)
(2, 8192, 256)
(2, 8192, 132)
(2, 8192, 128)
(2, 8192, 136)
(2, 8192, 256)
(2, 8192, 129)
(2, 8192, 128)
(2, 8192, 24)
(2, 8192, 153)
(2, 8192, 49)


## Groundtruth vectors definition

In [36]:
# The ground truth of every output head is the matching field of the input.
# NOTE(review): the targets are taken at the same positions as the inputs
# (no shift by one) - confirm that this is the intended training objective.
gt_vectors = [X[:, :SEQ_LEN, field] for field in range(len(out_scores))]

for target in gt_vectors:
    print(target.shape)

(2, 8192)
(2, 8192)
(2, 8192)
(2, 8192)
(2, 8192)
(2, 8192)
(2, 8192)
(2, 8192)
(2, 8192)
(2, 8192)
(2, 8192)


 ## Loss definition

We can use a simple sparse categorical crossentropy loss function 
- Note: can we use regularizers or other constraint-enforcing methods for some of the fields? For example, we know that the type field of the events must follow a strict order (start of song, start of events, ..., notes and end of song). Can we enforce this structure?

In [37]:
# Targets are integer class ids, predictions are per-class probabilities
loss_function = tf.keras.losses.SparseCategoricalCrossentropy()

# One loss value per field
for target, scores in zip(gt_vectors, out_scores):
    print(loss_function(target, scores))

tf.Tensor(2.5348933, shape=(), dtype=float32)
tf.Tensor(5.649028, shape=(), dtype=float32)
tf.Tensor(5.0095873, shape=(), dtype=float32)
tf.Tensor(5.7224555, shape=(), dtype=float32)
tf.Tensor(5.680168, shape=(), dtype=float32)
tf.Tensor(6.1693664, shape=(), dtype=float32)
tf.Tensor(5.9566793, shape=(), dtype=float32)
tf.Tensor(5.0991096, shape=(), dtype=float32)
tf.Tensor(2.6551266, shape=(), dtype=float32)
tf.Tensor(6.261841, shape=(), dtype=float32)
tf.Tensor(5.4754124, shape=(), dtype=float32)


When defining the whole Keras model for training, we can set up multiple outputs and give different weights for the multiple losses.

# Single model

Let's wrap everything that the steps above do into a single callable model.

In [None]:
# Imports
from transformers import GPT2Config, TFGPT2Model
import tensorflow as tf
import numpy as np
import os
import math

### CONSTANTS ###
# Number of sequences per batch on a single device
BATCH_SIZE = 2
# Sequence length fed to the model (8192 was the value used previously)
SEQ_LEN = 6_144
# Width of the vector representing one token inside the decoder
TOKEN_DIM = 512
# Number of fields of every event (size of the last axis of the songs tensor)
EVENTS_ELEMS = 11

## Ranges and dimensions for embedding layers
# Each *_RANGE is the number of distinct ids of one field, i.e. the number of
# rows of its embedding table and the width of its softmax output.
TYPE_RANGE = 8
MEASURE_RANGE = 256
BEAT_RANGE = 132
POSITION_RANGE = 128
DURATION_RANGE = 136
PITCH_RANGE = 256
INSTRUMENT_RANGE = 129
VELOCITY_RANGE = 128
KEY_SIGN_RANGE = 24
TIME_SIGN_RANGE = 153
TEMPO_RANGE = 49
# Size of the genre vector given for every song
GENRE_RANGE = 18
# Width of the vector produced by every field embedding
OUTPUT_SIZE = 64
# Width of the vector produced by the genre embedding module
GENRE_DIM = 64


def get_positional_embedding_matrix(seq_len=SEQ_LEN, dim=TOKEN_DIM):
    """Build the sinusoidal positional-encoding matrix.

    From "Attention is all you need", https://arxiv.org/pdf/1706.03762.pdf:

        PE[pos, 2i]   = sin(pos / 10000**(2i / dim))
        PE[pos, 2i+1] = cos(pos / 10000**(2i / dim))

    The matrix is computed with vectorized NumPy operations instead of a
    Python loop over every (position, column) pair.

    Args:
        seq_len: number of positions (rows of the matrix).
        dim: width of the encoding (columns of the matrix). When `dim` is odd
            the last column has no sine/cosine pair and is left at zero.

    Returns:
        A float64 `np.ndarray` of shape `(seq_len, dim)`.
    """
    PE = np.zeros((seq_len, dim))
    num_pairs = dim // 2
    # Column vector of positions, shape (seq_len, 1)
    positions = np.arange(seq_len, dtype=np.float64)[:, np.newaxis]
    # One divisor per (sin, cos) pair, shape (num_pairs,); computed once for all positions
    divisors = 10000.0 ** (2 * np.arange(num_pairs, dtype=np.float64) / dim)
    # Broadcasting gives the full (seq_len, num_pairs) grid of angles
    angles = positions / divisors
    # Sines go to the even columns, cosines to the odd ones
    PE[:, 0:2 * num_pairs:2] = np.sin(angles)
    PE[:, 1:2 * num_pairs:2] = np.cos(angles)
    return PE


def create_model(input_shape=(SEQ_LEN, EVENTS_ELEMS), num_genres=GENRE_RANGE, batch_size=BATCH_SIZE):
    """Build the genre-conditioned music generation model as one Keras model.

    Every field of an event is embedded by its own embedding layer; the genre
    vector is embedded by a small MLP and repeated along the sequence. All the
    embeddings are concatenated and projected to TOKEN_DIM, a sinusoidal
    positional encoding is added, and the result is given to a GPT-2 decoder
    through `inputs_embeds`. One dense + softmax head per field is applied to
    the decoder's last hidden state.

    Args:
        input_shape: `(sequence length, number of fields per event)` of one
            song.
        num_genres: size of the genre vector that accompanies every song.
        batch_size: no longer used; kept so that existing calls keep working.
            The positional encoding used to be stacked `batch_size` times
            inside the graph, which tied the model to that exact batch size
            (a smaller last batch, or the per-replica batches of a
            distribution strategy, could not be added to it). It is now a
            single constant that broadcasts over any batch size.

    Returns:
        A `tf.keras.Model` named 'music_generation_model' that takes
        `[songs, genres]` and returns a list with one tensor of class
        probabilities per field.
    """
    seq_len = input_shape[0]
    events_elements = input_shape[1]

    # (layer name prefix, number of ids) of every field, in column order of `songs`
    field_specs = [
        ('type', TYPE_RANGE),
        ('measure', MEASURE_RANGE),
        ('beat', BEAT_RANGE),
        ('position', POSITION_RANGE),
        ('duration', DURATION_RANGE),
        ('pitch', PITCH_RANGE),
        ('instrument', INSTRUMENT_RANGE),
        ('velocity', VELOCITY_RANGE),
        ('keysign', KEY_SIGN_RANGE),
        ('timesign', TIME_SIGN_RANGE),
        ('tempo', TEMPO_RANGE),
    ]

    # Instantiate decoder (vocab_size = 0: it only ever receives embeddings)
    config = GPT2Config(vocab_size = 0, n_positions = seq_len,
                        n_embd = TOKEN_DIM, n_layer = 6,
                        n_head = 8, activation_function='relu')
    decoder = TFGPT2Model(config, name='decoder')

    # Define inputs
    songs  = tf.keras.Input(shape=input_shape, name='songs')
    genres = tf.keras.Input(shape=num_genres , name='genres')

    # One embedding layer per field, named '<field>_embedding'
    embedding_layers = [
        tf.keras.layers.Embedding(field_range, OUTPUT_SIZE, input_length=seq_len,
                                  name=f'{field_name}_embedding')
        for field_name, field_range in field_specs
    ]

    genre_embedding_layer = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(GENRE_DIM)
    ], name='genre_embedding')

    # Input processing layers
    concat_layer              = tf.keras.layers.Concatenate(axis=2)
    repeat_layer              = tf.keras.layers.RepeatVector(seq_len)
    encoding_processing_layer = tf.keras.layers.Dense(TOKEN_DIM, name='encoding_processing')

    # Positional encoding: a single (1, seq_len, TOKEN_DIM) float32 constant.
    # The leading axis of size 1 broadcasts against the batch axis in the sum.
    positional_encoding_matrix = get_positional_embedding_matrix(seq_len=seq_len, dim=TOKEN_DIM)
    positional_encoding = tf.constant(
        positional_encoding_matrix[np.newaxis, ...].astype(np.float32)
    )
    sum_layer = tf.keras.layers.Add(name='final_encoding')

    # One dense + softmax head per field, named '<field>_probabilities'
    output_dense_layers = [
        tf.keras.layers.Dense(field_range, activation='softmax',
                              name=f'{field_name}_probabilities')
        for field_name, field_range in field_specs
    ]

    # Model dynamics
    embeddings        = [embedding_layers[i](songs[:,:,i]) for i in range(events_elements)]
    genre_embedding   = genre_embedding_layer(genres)
    genre_embedding   = repeat_layer(genre_embedding)
    concat_embeddings = concat_layer(embeddings + [genre_embedding])
    input_embedding   = encoding_processing_layer(concat_embeddings)
    input_embedding   = sum_layer([input_embedding, positional_encoding])
    model_output      = decoder({'inputs_embeds': input_embedding})['last_hidden_state']
    out_scores        = [head(model_output) for head in output_dense_layers]

    return tf.keras.Model(inputs=[songs, genres], outputs=out_scores, name='music_generation_model')

In [None]:
# Batch size seen by the input pipeline. It is defined for every branch because
# the dataset cell below batches with `global_batch_size`; previously it only
# existed on multi-GPU machines, so that cell raised a NameError elsewhere.
global_batch_size = BATCH_SIZE

if len(tf.config.list_physical_devices('GPU')) > 1:
    print("Using multiple GPUs with Mirrored Strategy")
    training_strategy = tf.distribute.MirroredStrategy()
    num_devices = training_strategy.num_replicas_in_sync
    # Every replica processes BATCH_SIZE examples per step
    global_batch_size = BATCH_SIZE * num_devices
    # Variables must be created inside the strategy scope
    with training_strategy.scope():
        model = create_model(batch_size=global_batch_size)
else:
    print("Using single GPU/CPU device")
    model = create_model()

In [3]:
# Print the layers of the model with their output shapes and parameter counts
model.summary()

Model: "music_generation_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 songs (InputLayer)             [(None, 6144, 11)]   0           []                               
                                                                                                  
 genres (InputLayer)            [(None, 18)]         0           []                               
                                                                                                  
 tf.__operators__.getitem (Slic  (None, 6144)        0           ['songs[0][0]']                  
 ingOpLambda)                                                                                     
                                                                                                  
 tf.__operators__.getitem_1 (Sl  (None, 6144)        0           ['songs[0][0

We can test the model with some inputs from our dataset

In [4]:
DATASET_PATH = os.path.join('..', 'data', 'lmda_genres_tf_data')
# cache -> shuffle -> batch -> prefetch: examples are cached and shuffled
# individually (the shuffle buffer counts examples) and only then grouped.
# Shuffling after batch() would merely reorder whole batches, so the same
# examples would always be trained together.
dataset = (
    tf.data.Dataset.load(DATASET_PATH)
    .cache()
    .shuffle(256)
    .batch(global_batch_size)
    .prefetch(32)
)

In [None]:
# Pull a single batch out of the pipeline as NumPy arrays
first_batch = dataset.take(1)
X, y = next(first_batch.as_numpy_iterator())

In [8]:
# Keep only the first SEQ_LEN positions of every sequence before calling the model
songs_batch = X[:, :SEQ_LEN, :]
output = model([songs_batch, y])
print([head_output.shape for head_output in output])

[TensorShape([4, 6144, 8]), TensorShape([4, 6144, 256]), TensorShape([4, 6144, 132]), TensorShape([4, 6144, 128]), TensorShape([4, 6144, 136]), TensorShape([4, 6144, 256]), TensorShape([4, 6144, 129]), TensorShape([4, 6144, 128]), TensorShape([4, 6144, 24]), TensorShape([4, 6144, 153]), TensorShape([4, 6144, 49])]
