In [1]:
# Imports
import os

import numpy as np
import tensorflow as tf
from tensorflow.keras import mixed_precision

from transformers import GPT2Config, TFGPT2Model

from config import Config

2022-11-26 18:14:23.734772: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-26 18:14:23.851201: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-11-26 18:14:23.882939: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-26 18:14:24.447294: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

# Config

In [2]:
ROOT_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
conf = Config("single_instruments_type", ROOT_PATH)

2022-11-26 18:14:25.498247: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


2022-11-26 18:14:26.588167: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30970 MB memory:  -> device: 0, name: Tesla V100S-PCIE-32GB, pci bus id: 0000:3b:00.0, compute capability: 7.0
2022-11-26 18:14:26.588759: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 30970 MB memory:  -> device: 1, name: Tesla V100S-PCIE-32GB, pci bus id: 0000:86:00.0, compute capability: 7.0


Decoder creation

In [3]:
decoder = conf.get_decoder()

Testing the decoder on random inputs

In [5]:
output = decoder({'inputs_embeds': tf.ones((conf.BATCH_SIZE, conf.SEQ_LEN, conf.TOKEN_DIM))})
output['last_hidden_state'].shape

TensorShape([12, 6144, 512])

# Dataset

Load the dataset from disk and process it (batching, shuffling, ...)

In [8]:
DATASET_PATH = os.path.join('..', 'data', 'tf_data7')
dataset = tf.data.Dataset.load(DATASET_PATH).batch(conf.BATCH_SIZE).cache().shuffle(conf.SHUFFLE_SIZE).prefetch(conf.PREFETCH_SIZE)
dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(None, 6143, 11), dtype=tf.uint8, name=None), TensorSpec(shape=(None, 3), dtype=tf.uint8, name=None))>

In [9]:
X, y = next(dataset.as_numpy_iterator())
print(X.shape, y.shape)

(12, 6143, 11) (12, 3)


2022-11-26 18:31:55.627353: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


# Embedding layers

The inputs need to be encoded by some embedding layer (a specific embedding layer for each token type).

In [12]:
embedding_layers = [
    # Type embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['type'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Measure embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['measure'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Beat embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['beat'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Position embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['position'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Duration embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['duration'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Pitch embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['pitch'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Instrument embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['instrument'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Velocity embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['velocity'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Key sign embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['key_sign'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Time sign embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['time_sign'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Tempo embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['tempo'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN)
]

Run the embedding layers on our inputs

In [15]:
outputs = []
for i in tf.range(X.shape[2]):
    outputs.append(embedding_layers[i](X[:, : ,i]))

We also need to encode the genre using some layers.

In [20]:
genre_embedding_module = tf.keras.Sequential([
    tf.keras.layers.Dense(conf.SINGLE_EMB_SIZE, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(conf.GENRE_DIM, activation='relu')
])

In [21]:
genre_embedding = genre_embedding_module(y)
genre_embedding.shape

TensorShape([12, 512])

## Embedding concatenation

We concatenate the output embeddings into a single tensor

In [22]:
types_concat_layer = tf.keras.layers.Concatenate(axis=2)
concat_outputs = types_concat_layer(outputs)
concat_outputs.shape

TensorShape([12, 6143, 704])

Then we need to resize them into a known dimensionality

In [23]:
dense_layer = tf.keras.layers.Dense(conf.TOKEN_DIM)
encoding = dense_layer(concat_outputs)
encoding.shape

TensorShape([12, 6143, 512])

Finally, we need to preprend the genre embedding token to the sequence

In [24]:
sequence_concat_layer = tf.keras.layers.Concatenate(axis=1)
final_sequence = sequence_concat_layer([genre_embedding[:, np.newaxis, :], encoding])
final_sequence.shape

TensorShape([12, 6144, 512])

## Positional encoding

We also add positional encodings to encode which is the position of each token in the sequence.

In [25]:
positional_encoding_matrix = conf.get_positional_embedding_matrix()

In transformers, it is common to add the positional embedding to the elements embeddings.

In [26]:
sum_layer = tf.keras.layers.Add()
positional_encoding = tf.repeat(positional_encoding_matrix[np.newaxis, :, :], tf.constant(conf.BATCH_SIZE), axis=0)
final_encoding = sum_layer([final_sequence, positional_encoding])
final_encoding.shape

TensorShape([12, 6144, 512])

# Output management

In [27]:
output = decoder({'inputs_embeds': final_encoding})
output['last_hidden_state'].shape

TensorShape([12, 6144, 512])

We need a dense + softmax layer for each of the tokens for trying to reconstruct the input.

In [28]:
output_dense_layers = [
    # Type
    tf.keras.layers.Dense(conf.INPUT_RANGES['type'], activation='softmax'),
    # Measure
    tf.keras.layers.Dense(conf.INPUT_RANGES['measure'], activation='softmax'),
    # Beat
    tf.keras.layers.Dense(conf.INPUT_RANGES['beat'], activation='softmax'),
    # Position
    tf.keras.layers.Dense(conf.INPUT_RANGES['position'], activation='softmax'),
    # Duration
    tf.keras.layers.Dense(conf.INPUT_RANGES['duration'], activation='softmax'),
    # Pitch
    tf.keras.layers.Dense(conf.INPUT_RANGES['pitch'], activation='softmax'),
    # Instrument
    tf.keras.layers.Dense(conf.INPUT_RANGES['instrument'], activation='softmax'),
    # Velocity
    tf.keras.layers.Dense(conf.INPUT_RANGES['velocity'], activation='softmax'),
    # Key sign
    tf.keras.layers.Dense(conf.INPUT_RANGES['key_sign'], activation='softmax'),
    # Time sign
    tf.keras.layers.Dense(conf.INPUT_RANGES['time_sign'], activation='softmax'),
    # Tempo
    tf.keras.layers.Dense(conf.INPUT_RANGES['tempo'], activation='softmax')
]

In [None]:
out_scores = [output_dense_layers[i](output['last_hidden_state']) 
              for i in range(len(output_dense_layers))]

for i in range(len(out_scores)):
    print(out_scores[i].shape)

(12, 6144, 8)
(12, 6144, 256)
(12, 6144, 131)
(12, 6144, 128)
(12, 6144, 136)
(12, 6144, 256)
(12, 6144, 129)
(12, 6144, 128)
(12, 6144, 25)
(12, 6144, 153)
(12, 6144, 49)


## Groundtruth vectors definition

In [30]:
gt_vectors = [X[:,:,i] for i in range(len(out_scores))]

for i in range(len(out_scores)):
    print(gt_vectors[i].shape)

(12, 6143)
(12, 6143)
(12, 6143)
(12, 6143)
(12, 6143)
(12, 6143)
(12, 6143)
(12, 6143)
(12, 6143)
(12, 6143)
(12, 6143)


 ## Loss definition

We can use a simple sparse categorical crossentropy loss function. The two distributions we are comparing are the input sequence (so we ignore the genre embedding token representation) and the output sequence up to the last token representation (`output[:-1]`)
- Note: can we use regularizers or other kinds of constraint enforcing methods for some of the fields? Like, we know that regarding the type field of events there is a strict order to follow (start of song, start of events, ..., notes and end of song). Can we enforce this structure?

In [31]:
loss_function = tf.keras.losses.SparseCategoricalCrossentropy()
losses = []
for i in range(len(out_scores)):
    losses.append(loss_function(gt_vectors[i], out_scores[i][:, :-1, :]))
losses

[<tf.Tensor: shape=(), dtype=float32, numpy=3.624619>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.1048756>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.639281>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.1507883>,
 <tf.Tensor: shape=(), dtype=float32, numpy=4.737446>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.3642745>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.920309>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.8808045>,
 <tf.Tensor: shape=(), dtype=float32, numpy=4.109704>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.3772664>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.1257977>]

To these loss terms we can add some regularization terms that can help the model produce a grammatically correct sequence.

In [32]:
types = gt_vectors[0]
max_pred_types = tf.argmax(out_scores[0], axis=2) # 6, 6144
# Use a StaticHashTable to map values to their consecutive version within Tensorflow
keys_tensor = tf.range(TYPE_RANGE, dtype=tf.int32)
vals_tensor = tf.constant([0,1,2,3,3,3,3,4], dtype=tf.int32)
table = tf.lookup.StaticHashTable(tf.lookup.KeyValueTensorInitializer(keys_tensor, vals_tensor), default_value=-1)
consecutive_gt_types   = table.lookup(tf.cast(types, tf.int32))
consecutive_pred_types = table.lookup(tf.cast(max_pred_types, tf.int32))
# Note: we assume that after token token type 7 all following token types are 7s
differences = consecutive_pred_types[:, 1:] - consecutive_pred_types[:, :-1]

In [33]:
# There are some constraint to pose for regularization
reg_term_1 = tf.reduce_sum(tf.math.maximum(0, -differences))                           # Difference between one element's type and the next is >= 0
reg_term_2 = tf.reduce_sum(tf.math.maximum(0, tf.math.maximum(1, differences) - 1))    # Difference between one element's type and the next is < 1

reg_term_1, reg_term_2

(<tf.Tensor: shape=(), dtype=int32, numpy=5554>,
 <tf.Tensor: shape=(), dtype=int32, numpy=1567>)

In [35]:
REG_SCALER = 0.001

total_loss = tf.reduce_sum(losses) + \
             REG_SCALER * tf.cast(reg_term_1, tf.float32) + \
             REG_SCALER * tf.cast(reg_term_2, tf.float32)
total_loss

<tf.Tensor: shape=(), dtype=float32, numpy=67.156166>

When defining the whole Keras model for training, we can set up multiple outputs and give different weights for the multiple losses.

# Single model

Let's try and define everything that this model does into a complete callable model.

In [1]:
# Imports
import os
import math

import numpy as np
import tensorflow as tf
from transformers import GPT2Config, TFGPT2Model

from config import Config

ROOT_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
conf = Config("single_instruments_type", ROOT_PATH)

# Custom intermediate layer for allowing types transformation (no parameters to be learnt)
class SubsequentTypeTransformationLayer(tf.keras.layers.Layer):

    def __init__(self):
        super(SubsequentTypeTransformationLayer, self).__init__()
        # Use a StaticHashTable to map values to their consecutive version within Tensorflow
        self.keys_tensor = tf.range(conf.INPUT_RANGES['type'])
        self.vals_tensor = tf.constant([0,1,2,3,3,3,3,4])
        self.table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(self.keys_tensor, self.vals_tensor), 
            default_value=-1)

    def call(self, inputs):
        return self.table.lookup(inputs)

# Model creation function (to be called within a scope in case of MultiGPU training)
def create_model(input_shape=(conf.SEQ_LEN-1, len(conf.INPUT_RANGES)), num_genres=len(conf.accepted_subgenres), 
                 use_regularization=True, reg_loss_scale=conf.REG_LOSS_SCALE):
    
    # Get input shapes
    seq_len = input_shape[0]
    events_elements = input_shape[1]
    
    # Instantiate transformer decoder (n_emb % n_head must be 0)
    decoder = conf.get_decoder()
    
    # Define inputs
    songs  = tf.keras.Input(shape=input_shape, name='songs',  dtype=tf.int32)
    genres = tf.keras.Input(shape=num_genres , name='genres', dtype=tf.float32)
    
    # Define loss
    loss_function = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    subsequent_type_transform_layer = SubsequentTypeTransformationLayer()
    reg_scaler = tf.constant(reg_loss_scale, dtype=tf.float32)
    
    # Embedding layers
    embedding_layers = [
        # Type embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['type'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='type_embeddings'),
        # Measure embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['measure'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='measure_embeddings'),
        # Beat embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['beat'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='beat_embeddings'),
        # Position embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['position'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='position_embeddings'),
        # Duration embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['duration'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='duration_embeddings'),
        # Pitch embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['pitch'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='pitch_embeddings'),
        # Instrument embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['instrument'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='instrument_embeddings'),
        # Velocity embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['velocity'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='velocity_embeddings'),
        # Key sign embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['key_sign'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='key_sign_embeddings'),
        # Time sign embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['time_sign'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='time_sign_embeddings'),
        # Tempo embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['tempo'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='tempo_embeddings')
    ]
    
    genre_embedding_layer = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(conf.GENRE_DIM)
    ], name='genre_embedding')
    
    # Input processing layers
    input_concat_layer         = tf.keras.layers.Concatenate(axis=2)
    sequence_concat_layer      = tf.keras.layers.Concatenate(axis=1)
    encoding_processing_layer  = tf.keras.layers.Dense(conf.TOKEN_DIM, name='encoding_processing')
    
    # Positional encoding
    positional_encoding_matrix = conf.get_positional_embedding_matrix()
    positional_encoding        = tf.repeat(positional_encoding_matrix[tf.newaxis, :, :], tf.shape(songs)[0], axis=0)
    sum_layer                  = tf.keras.layers.Add(name='final_encoding')

    # Output layers
    output_dense_layers = [
        # Type
        tf.keras.layers.Dense(conf.INPUT_RANGES['type'], name='type_scores'),
        # Measure
        tf.keras.layers.Dense(conf.INPUT_RANGES['measure'], name='measure_scores'),
        # Beat
        tf.keras.layers.Dense(conf.INPUT_RANGES['beat'], name='beat_scores'),
        # Position
        tf.keras.layers.Dense(conf.INPUT_RANGES['position'], name='position_scores'),
        # Duration
        tf.keras.layers.Dense(conf.INPUT_RANGES['duration'], name='duration_scores'),
        # Pitch
        tf.keras.layers.Dense(conf.INPUT_RANGES['pitch'], name='pitch_scores'),
        # Instrument
        tf.keras.layers.Dense(conf.INPUT_RANGES['instrument'], name='instrument_scores'),
        # Velocity
        tf.keras.layers.Dense(conf.INPUT_RANGES['velocity'], name='velocity_scores'),
        # Key sign
        tf.keras.layers.Dense(conf.INPUT_RANGES['key_sign'], name='keysign_scores'),
        # Time sign
        tf.keras.layers.Dense(conf.INPUT_RANGES['time_sign'], name='timesign_scores'),
        # Tempo
        tf.keras.layers.Dense(conf.INPUT_RANGES['tempo'], name='tempo_scores')
    ]
    
    output_probs_layers = [
        # Type
        tf.keras.layers.Softmax(name='type_probabilities'),
        # Measure
        tf.keras.layers.Softmax(name='measure_probabilities'),
        # Beat
        tf.keras.layers.Softmax(name='beat_probabilities'),
        # Position
        tf.keras.layers.Softmax(name='position_probabilities'),
        # Duration
        tf.keras.layers.Softmax(name='duration_probabilities'),
        # Pitch
        tf.keras.layers.Softmax(name='pitch_probabilities'),
        # Instrument
        tf.keras.layers.Softmax(name='instrument_probabilities'),
        # Velocity
        tf.keras.layers.Softmax(name='velocity_probabilities'),
        # Key sign
        tf.keras.layers.Softmax(name='keysign_probabilities'),
        # Time sign
        tf.keras.layers.Softmax(name='timesign_probabilities'),
        # Tempo
        tf.keras.layers.Softmax(name='tempo_probabilities')
    ]
    
    # Model dynamics
    embeddings        = [embedding_layers[i](songs[:,:,i]) for i in range(events_elements)]
    genre_embedding   = genre_embedding_layer(genres)
    input_embedding   = input_concat_layer(embeddings)
    input_embedding   = encoding_processing_layer(input_embedding)
    input_embedding   = sequence_concat_layer([genre_embedding[:, np.newaxis, :], input_embedding])
    input_embedding   = sum_layer([input_embedding, positional_encoding])
    model_output      = decoder({'inputs_embeds': input_embedding})['last_hidden_state']
    out_scores        = [output_dense_layers[i](model_output) for i in range(len(output_dense_layers))]
    # TODO: Here we should add the masking layer
    out_probabilities = [output_probs_layers[i](out_scores[i]) for i in range(len(output_dense_layers))]
    # TODO: In the line above we should add the masks computed in the masking layer

    # Create model
    model = tf.keras.Model(inputs=[songs, genres], outputs=out_probabilities, name='music_generation_model')
    
    # Define loss
    def custom_loss(songs, y_pred):
        gt_vectors = [songs[:,:,i] for i in range(len(conf.INPUT_RANGES))]
        # Base loss term
        losses = []
        for i in range(len(y_pred)):
            losses.append(tf.reduce_sum(
                tf.cast(loss_function(gt_vectors[i], y_pred[i][:, :-1, :]), tf.float32) * \
                (1. / conf.GLOBAL_BATCH_SIZE)))
        return tf.math.reduce_sum(losses)
    
    # Define regularizers
    def custom_regularizers(songs, y_pred):
        gt_vectors = [songs[:,:,i] for i in range(len(conf.INPUT_RANGES))]
        # Regularization loss: transform the actual vectors into consecutive-type representation
        types = gt_vectors[0]
        max_pred_types = tf.argmax(y_pred[0], axis=2, output_type=tf.int32)
        consecutive_gt_types   = subsequent_type_transform_layer(types)
        consecutive_pred_types = subsequent_type_transform_layer(max_pred_types)
        # Compute difference
        differences = consecutive_pred_types[:, 1:] - consecutive_pred_types[:, :-1]
        # Compute regularization terms
        # Difference between one element's type and the next is >= 0
        reg_term_1 = tf.math.reduce_sum(tf.math.maximum(0, -differences))
        # Difference between one element's type and the next is < 1
        reg_term_2 = tf.math.reduce_sum(tf.math.maximum(0, tf.math.maximum(1, differences) - 1))
        return reg_scaler * tf.cast(reg_term_1, tf.float32) + reg_scaler * tf.cast(reg_term_2, tf.float32)
    
    # Add losses
    model.add_loss(custom_loss(songs, out_scores))
    if use_regularization:
        model.add_loss(custom_regularizers(songs, out_scores))
    
    # Compile and return
    model.compile(optimizer="adam")
    return model

2022-11-26 19:01:56.645948: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-26 19:01:56.793264: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-11-26 19:01:56.826518: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-26 19:01:57.426598: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


2022-11-26 19:01:58.457744: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-26 19:01:59.532320: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30487 MB memory:  -> device: 0, name: Tesla V100S-PCIE-32GB, pci bus id: 0000:3b:00.0, compute capability: 7.0
2022-11-26 19:01:59.532862: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 30487 MB memory:  -> device: 1, name: Tesla V100S-PCIE-32GB, pci bus id: 0000:86:00.0, compute capability: 7.0


In [2]:
if conf.num_devices > 1:
    print("Using multiple GPUs with Mirrored Strategy")
    with conf.training_strategy.scope():
        model = create_model()
else:
    print("Using single GPU/CPU device")
    model = create_model()

Using multiple GPUs with Mirrored Strategy
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


In [3]:
model.summary()
# tf.keras.utils.plot_model(model, to_file=dot_img_file, show_shapes=True)

Model: "music_generation_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 songs (InputLayer)             [(None, 6143, 11)]   0           []                               
                                                                                                  
 tf.__operators__.getitem_1 (Sl  (None, 6143)        0           ['songs[0][0]']                  
 icingOpLambda)                                                                                   
                                                                                                  
 tf.__operators__.getitem_2 (Sl  (None, 6143)        0           ['songs[0][0]']                  
 icingOpLambda)                                                                                   
                                                                             

We can test the model with some inputs from our dataset

In [4]:
DATASET_PATH = os.path.join('..', 'data', 'tf_data7')
dataset = tf.data.Dataset.load(DATASET_PATH).batch(conf.BATCH_SIZE).cache().shuffle(conf.SHUFFLE_SIZE).prefetch(conf.PREFETCH_SIZE)

In [5]:
X, y = next(dataset.take(1).as_numpy_iterator())

2022-11-26 19:02:06.349462: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [6]:
output = model([X, y])
print([x.shape for x in output])

[TensorShape([12, 6144, 8]), TensorShape([12, 6144, 256]), TensorShape([12, 6144, 131]), TensorShape([12, 6144, 128]), TensorShape([12, 6144, 136]), TensorShape([12, 6144, 256]), TensorShape([12, 6144, 129]), TensorShape([12, 6144, 128]), TensorShape([12, 6144, 25]), TensorShape([12, 6144, 153]), TensorShape([12, 6144, 49])]


In [7]:
model.losses

[<tf.Tensor: shape=(), dtype=float32, numpy=389425.3>,
 <tf.Tensor: shape=(), dtype=float32, numpy=14.207001>]

TODO: There is something weird with the multi-GPU loss. I bet I have to divide for the global batch size or something.