In [None]:
# Imports
import os

import numpy as np
import tensorflow as tf
from tensorflow.keras import mixed_precision
from transformers import GPT2Config, TFGPT2Model

mixed_precision.set_global_policy('mixed_float16')
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for i in physical_devices:
    tf.config.experimental.set_memory_growth(i, True)

# Config

In [2]:
BATCH_SIZE = 6
SEQ_LEN = 6144
TOKEN_DIM = 512

Decoder creation

In [None]:
# Custom configuration for using GPT2 as a standard transformer decoder
config = GPT2Config(vocab_size = 0, n_positions = SEQ_LEN, n_embd = TOKEN_DIM, 
                    n_layer = 6, n_head = 8, activation_function='relu')
# Instantiate decoder
decoder = TFGPT2Model(config)

Testing the decoder on random inputs

In [None]:
output = decoder({'inputs_embeds': tf.ones((BATCH_SIZE, SEQ_LEN, TOKEN_DIM))})
output['last_hidden_state'].shape

# Dataset

In [5]:
DATASET_PATH = os.path.join('..', 'data', 'lmd_matched_6133_tf')

Load the dataset from disk and process it (batching, shuffling, ...)

In [6]:
dataset = tf.data.Dataset.load(DATASET_PATH).batch(BATCH_SIZE).cache().shuffle(256).prefetch(32)
dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(None, 6143, 11), dtype=tf.uint8, name=None), TensorSpec(shape=(None, 18), dtype=tf.float64, name=None))>

In [None]:
X, y = next(dataset.as_numpy_iterator())
print(X.shape, y.shape)

# Embedding layers

The inputs need to be encoded by some embedding layer (a specific embedding layer for each token type).

In [8]:
## Ranges and dimensions for embedding layers
TYPE_RANGE      = 8
MEASURE_RANGE   = 256
BEAT_RANGE      = 132
POSITION_RANGE  = 128
DURATION_RANGE  = 136
PITCH_RANGE     = 256
INSTRUMENT_RANGE= 129
VELOCITY_RANGE  = 128
KEY_SIGN_RANGE  = 24
TIME_SIGN_RANGE = 153
TEMPO_RANGE     = 49
GENRE_RANGE     = 18

OUTPUT_SIZE     = 64
GENRE_DIM       = 512

In [9]:
embedding_layers = [
    # Type embedding
    tf.keras.layers.Embedding(TYPE_RANGE, OUTPUT_SIZE, input_length=SEQ_LEN),
    # Measure embedding
    tf.keras.layers.Embedding(MEASURE_RANGE, OUTPUT_SIZE, input_length=SEQ_LEN),
    # Beat embedding
    tf.keras.layers.Embedding(BEAT_RANGE, OUTPUT_SIZE, input_length=SEQ_LEN),
    # Position embedding
    tf.keras.layers.Embedding(POSITION_RANGE, OUTPUT_SIZE, input_length=SEQ_LEN),
    # Duration embedding
    tf.keras.layers.Embedding(DURATION_RANGE, OUTPUT_SIZE, input_length=SEQ_LEN),
    # Pitch embedding
    tf.keras.layers.Embedding(PITCH_RANGE, OUTPUT_SIZE, input_length=SEQ_LEN),
    # Instrument embedding
    tf.keras.layers.Embedding(INSTRUMENT_RANGE, OUTPUT_SIZE, input_length=SEQ_LEN),
    # Velocity embedding
    tf.keras.layers.Embedding(VELOCITY_RANGE, OUTPUT_SIZE, input_length=SEQ_LEN),
    # Key sign embedding
    tf.keras.layers.Embedding(KEY_SIGN_RANGE, OUTPUT_SIZE, input_length=SEQ_LEN),
    # Time sign embedding
    tf.keras.layers.Embedding(TIME_SIGN_RANGE, OUTPUT_SIZE, input_length=SEQ_LEN),
    # Tempo embedding
    tf.keras.layers.Embedding(TEMPO_RANGE, OUTPUT_SIZE, input_length=SEQ_LEN)
]

Run the embedding layers on our inputs

In [10]:
outputs = []
for i in tf.range(X.shape[2]):
    outputs.append(embedding_layers[i](X[:,:SEQ_LEN,i]))

We also need to encode the genre using some layers.

In [11]:
genre_embedding_module = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(GENRE_DIM, activation='relu')
])

In [12]:
genre_embedding = genre_embedding_module(y)
genre_embedding.shape

TensorShape([6, 512])

## Embedding concatenation

We concatenate the output embeddings into a single tensor

In [13]:
types_concat_layer = tf.keras.layers.Concatenate(axis=2)
concat_outputs = types_concat_layer(outputs)
concat_outputs.shape

TensorShape([6, 6143, 704])

Then we need to resize them into a known dimensionality

In [14]:
dense_layer = tf.keras.layers.Dense(TOKEN_DIM)
encoding = dense_layer(concat_outputs)
encoding.shape

TensorShape([6, 6143, 512])

Finally, we need to preprend the genre embedding token to the sequence

In [15]:
sequence_concat_layer = tf.keras.layers.Concatenate(axis=1)
final_sequence = sequence_concat_layer([genre_embedding[:, np.newaxis, :], encoding])
final_sequence.shape

TensorShape([6, 6144, 512])

## Positional encoding

We also add positional encodings to encode which is the position of each token in the sequence.

In [16]:
import math

def get_positional_embedding_matrix(seq_len=SEQ_LEN, dim=TOKEN_DIM):
    # From "Attention is all you need", https://arxiv.org/pdf/1706.03762.pdf
    PE = np.zeros((seq_len, dim))
    for pos in range(seq_len):
        for i in range(int(dim/2)):
            PE[pos,2*i]   = math.sin(pos/(10000**(2*i/dim)))
            PE[pos,2*i+1] = math.cos(pos/(10000**(2*i/dim)))
    return PE

In [21]:
positional_encoding_matrix = get_positional_embedding_matrix()

In transformers, it is common to add the positional embedding to the elements embeddings.

In [27]:
sum_layer = tf.keras.layers.Add()
positional_encoding = tf.repeat(positional_encoding_matrix[np.newaxis, :, :], tf.constant(BATCH_SIZE), axis=0)
final_encoding = sum_layer([final_sequence, positional_encoding])
final_encoding.shape

TensorShape([6, 6144, 512])

# Output management

In [20]:
output = decoder({'inputs_embeds': final_encoding})
output['last_hidden_state'].shape

TensorShape([6, 6144, 512])

We need a dense + softmax layer for each of the tokens for trying to reconstruct the input.

In [21]:
output_dense_layers = [
    # Type
    tf.keras.layers.Dense(TYPE_RANGE, activation='softmax'),
    # Measure
    tf.keras.layers.Dense(MEASURE_RANGE, activation='softmax'),
    # Beat
    tf.keras.layers.Dense(BEAT_RANGE, activation='softmax'),
    # Position
    tf.keras.layers.Dense(POSITION_RANGE, activation='softmax'),
    # Duration
    tf.keras.layers.Dense(DURATION_RANGE, activation='softmax'),
    # Pitch
    tf.keras.layers.Dense(PITCH_RANGE, activation='softmax'),
    # Instrument
    tf.keras.layers.Dense(INSTRUMENT_RANGE, activation='softmax'),
    # Velocity
    tf.keras.layers.Dense(VELOCITY_RANGE, activation='softmax'),
    # Key sign
    tf.keras.layers.Dense(KEY_SIGN_RANGE, activation='softmax'),
    # Time sign
    tf.keras.layers.Dense(TIME_SIGN_RANGE, activation='softmax'),
    # Tempo
    tf.keras.layers.Dense(TEMPO_RANGE, activation='softmax')
]

In [22]:
out_scores = [output_dense_layers[i](output['last_hidden_state']) 
              for i in range(len(output_dense_layers))]

for i in range(len(out_scores)):
    print(out_scores[i].shape)

(6, 6144, 8)
(6, 6144, 256)
(6, 6144, 132)
(6, 6144, 128)
(6, 6144, 136)
(6, 6144, 256)
(6, 6144, 129)
(6, 6144, 128)
(6, 6144, 24)
(6, 6144, 153)
(6, 6144, 49)


In [9]:
import tensorflow as tf
import numpy as np

In [19]:
mask = tf.stack([tf.stack([tf.squeeze(tf.random.categorical(tf.math.log([[0.5,0.5]]), 8))]*6144)]*2)

In [None]:
tf.keras.layers.Softmax()(tf.random.uniform((6, 6144, 8)), mask)

## Groundtruth vectors definition

In [23]:
gt_vectors = [X[:,:,i] for i in range(len(out_scores))]

for i in range(len(out_scores)):
    print(gt_vectors[i].shape)

(6, 6143)
(6, 6143)
(6, 6143)
(6, 6143)
(6, 6143)
(6, 6143)
(6, 6143)
(6, 6143)
(6, 6143)
(6, 6143)
(6, 6143)


 ## Loss definition

We can use a simple sparse categorical crossentropy loss function. The two distributions we are comparing are the input sequence (so we ignore the genre embedding token representation) and the output sequence up to the last token representation (`output[:-1]`)
- Note: can we use regularizers or other kinds of constraint enforcing methods for some of the fields? Like, we know that regarding the type field of events there is a strict order to follow (start of song, start of events, ..., notes and end of song). Can we enforce this structure?

In [24]:
loss_function = tf.keras.losses.SparseCategoricalCrossentropy()
losses = []
for i in range(len(out_scores)):
    losses.append(loss_function(gt_vectors[i], out_scores[i][:, :-1, :]))
losses

[<tf.Tensor: shape=(), dtype=float16, numpy=inf>,
 <tf.Tensor: shape=(), dtype=float16, numpy=inf>,
 <tf.Tensor: shape=(), dtype=float16, numpy=inf>,
 <tf.Tensor: shape=(), dtype=float16, numpy=inf>,
 <tf.Tensor: shape=(), dtype=float16, numpy=inf>,
 <tf.Tensor: shape=(), dtype=float16, numpy=inf>,
 <tf.Tensor: shape=(), dtype=float16, numpy=inf>,
 <tf.Tensor: shape=(), dtype=float16, numpy=inf>,
 <tf.Tensor: shape=(), dtype=float16, numpy=inf>,
 <tf.Tensor: shape=(), dtype=float16, numpy=inf>,
 <tf.Tensor: shape=(), dtype=float16, numpy=inf>]

To these loss terms we can add some regularization terms that can help the model produce a grammatically correct sequence.

In [25]:
types = gt_vectors[0]
max_pred_types = tf.argmax(out_scores[0], axis=2) # 6, 6144
# Use a StaticHashTable to map values to their consecutive version within Tensorflow
keys_tensor = tf.range(TYPE_RANGE, dtype=tf.int32)
vals_tensor = tf.constant([0,1,2,3,3,3,3,4], dtype=tf.int32)
table = tf.lookup.StaticHashTable(tf.lookup.KeyValueTensorInitializer(keys_tensor, vals_tensor), default_value=-1)
consecutive_gt_types   = table.lookup(tf.cast(types, tf.int32))
consecutive_pred_types = table.lookup(tf.cast(max_pred_types, tf.int32))
# Note: we assume that after token token type 7 all following token types are 7s
differences = consecutive_pred_types[:, 1:] - consecutive_pred_types[:, :-1]

In [26]:
# There are some constraint to pose for regularization
reg_term_1 = tf.reduce_sum(tf.math.maximum(0, -differences))                           # Difference between one element's type and the next is >= 0
reg_term_2 = tf.reduce_sum(tf.math.maximum(0, tf.math.maximum(1, differences) - 1))    # Difference between one element's type and the next is < 1

reg_term_1, reg_term_2

(<tf.Tensor: shape=(), dtype=int32, numpy=1737>,
 <tf.Tensor: shape=(), dtype=int32, numpy=553>)

In [27]:
REG_SCALER = 0.001

total_loss = tf.reduce_sum(losses) + \
             REG_SCALER * tf.cast(reg_term_1, tf.float16) + \
             REG_SCALER * tf.cast(reg_term_2, tf.float16)
total_loss

<tf.Tensor: shape=(), dtype=float16, numpy=inf>

When defining the whole Keras model for training, we can set up multiple outputs and give different weights for the multiple losses.

# Single model

Let's try and define everything that this model does into a complete callable model.

In [None]:
# Imports
from transformers import GPT2Config, TFGPT2Model
import tensorflow as tf
import numpy as np
import os
import math
# Setting mixed float16 use for lower memory use
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')
# Setting memory growth for lower memory use
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for i in physical_devices:
    tf.config.experimental.set_memory_growth(i, True)
# Multi-GPU training strategy
training_strategy = tf.distribute.MirroredStrategy()
num_devices = training_strategy.num_replicas_in_sync

### CONSTANTS ###
BATCH_SIZE   = 12
GLOBAL_BATCH_SIZE = BATCH_SIZE * num_devices
SEQ_LEN      = 6144
TOKEN_DIM    = 512
EVENTS_ELEMS = 11

## Ranges and dimensions for embedding layers
TYPE_RANGE      = 8
MEASURE_RANGE   = 256
BEAT_RANGE      = 132
POSITION_RANGE  = 128
DURATION_RANGE  = 136
PITCH_RANGE     = 256
INSTRUMENT_RANGE= 129
VELOCITY_RANGE  = 128
KEY_SIGN_RANGE  = 24
TIME_SIGN_RANGE = 153
TEMPO_RANGE     = 49
GENRE_RANGE     = 18
# GENRE_RANGE     = 3
OUTPUT_SIZE     = 64
GENRE_DIM       = TOKEN_DIM

REG_LOSS_SCALE  = 0.001


# Custom intermediate layer for allowing types transformation (no parameters to be learnt)
class SubsequentTypeTransformationLayer(tf.keras.layers.Layer):

    def __init__(self):
        super(SubsequentTypeTransformationLayer, self).__init__()
        # Use a StaticHashTable to map values to their consecutive version within Tensorflow
        self.keys_tensor = tf.range(TYPE_RANGE)
        self.vals_tensor = tf.constant([0,1,2,3,3,3,3,4])
        self.table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(self.keys_tensor, self.vals_tensor), 
            default_value=-1)

    def call(self, inputs):
        return self.table.lookup(inputs)


# Custom numpy function to create positional embeddings
def get_positional_embedding_matrix(seq_len=SEQ_LEN, dim=TOKEN_DIM):
    # From "Attention is all you need", https://arxiv.org/pdf/1706.03762.pdf
    PE = np.zeros((seq_len, dim))
    for pos in range(seq_len):
        for i in range(int(dim/2)):
            PE[pos,2*i]   = math.sin(pos/(10000**(2*i/dim)))
            PE[pos,2*i+1] = math.cos(pos/(10000**(2*i/dim)))
    return PE


# Model creation function (to be called within a scope in case of MultiGPU training)
def create_model(input_shape=(SEQ_LEN-1, EVENTS_ELEMS), num_genres=GENRE_RANGE, 
                 use_regularization=True, reg_loss_scale=REG_LOSS_SCALE):
    
    # Get input shapes
    seq_len = input_shape[0]
    events_elements = input_shape[1]
    
    # Instantiate transformer decoder (n_emb % n_head must be 0)
    config = GPT2Config(vocab_size = 0, n_positions = seq_len, 
                        n_embd = TOKEN_DIM, n_layer = 6, 
                        n_head = 2, activation_function='relu',
                        reorder_and_upcast_attn = True)
    decoder = TFGPT2Model(config, name='decoder')
    
    # Define inputs
    songs  = tf.keras.Input(shape=input_shape, name='songs',  dtype=tf.int32)
    genres = tf.keras.Input(shape=num_genres , name='genres', dtype=tf.float32)
    
    # Define loss
    loss_function = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    subsequent_type_transform_layer = SubsequentTypeTransformationLayer()
    reg_scaler = tf.constant(reg_loss_scale, dtype=tf.float32)
    
    # Embedding layers
    embedding_layers = [
        # Type embedding
        tf.keras.layers.Embedding(TYPE_RANGE, OUTPUT_SIZE, input_length=seq_len, name='type_embedding'),
        # Measure embedding
        tf.keras.layers.Embedding(MEASURE_RANGE, OUTPUT_SIZE, input_length=seq_len, name='measure_embedding'),
        # Beat embedding
        tf.keras.layers.Embedding(BEAT_RANGE, OUTPUT_SIZE, input_length=seq_len, name='beat_embedding'),
        # Position embedding
        tf.keras.layers.Embedding(POSITION_RANGE, OUTPUT_SIZE, input_length=seq_len, name='position_embedding'),
        # Duration embedding
        tf.keras.layers.Embedding(DURATION_RANGE, OUTPUT_SIZE, input_length=seq_len, name='duration_embedding'),
        # Pitch embedding
        tf.keras.layers.Embedding(PITCH_RANGE, OUTPUT_SIZE, input_length=seq_len, name='pitch_embedding'),
        # Instrument embedding
        tf.keras.layers.Embedding(INSTRUMENT_RANGE, OUTPUT_SIZE, input_length=seq_len, name='instrument_embedding'),
        # Velocity embedding
        tf.keras.layers.Embedding(VELOCITY_RANGE, OUTPUT_SIZE, input_length=seq_len, name='velocity_embedding'),
        # Key sign embedding
        tf.keras.layers.Embedding(KEY_SIGN_RANGE, OUTPUT_SIZE, input_length=seq_len, name='keysign_embedding'),
        # Time sign embedding
        tf.keras.layers.Embedding(TIME_SIGN_RANGE, OUTPUT_SIZE, input_length=seq_len, name='timesign_embedding'),
        # Tempo embedding
        tf.keras.layers.Embedding(TEMPO_RANGE, OUTPUT_SIZE, input_length=seq_len, name='tempo_embedding')
    ]
    
    genre_embedding_layer = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(GENRE_DIM)
    ], name='genre_embedding')
    
    # Input processing layers
    input_concat_layer         = tf.keras.layers.Concatenate(axis=2)
    sequence_concat_layer      = tf.keras.layers.Concatenate(axis=1)
    encoding_processing_layer  = tf.keras.layers.Dense(TOKEN_DIM, name='encoding_processing')
    
    # Positional encoding
    positional_encoding_matrix = get_positional_embedding_matrix(seq_len=seq_len+1, dim=TOKEN_DIM)
    positional_encoding        = tf.repeat(positional_encoding_matrix[tf.newaxis, :, :], tf.shape(songs)[0], axis=0)
    sum_layer                  = tf.keras.layers.Add(name='final_encoding')

    # Output layers
    output_dense_layers = [
        # Type
        tf.keras.layers.Dense(TYPE_RANGE, name='type_scores'),
        # Measure
        tf.keras.layers.Dense(MEASURE_RANGE, name='measure_scores'),
        # Beat
        tf.keras.layers.Dense(BEAT_RANGE, name='beat_scores'),
        # Position
        tf.keras.layers.Dense(POSITION_RANGE, name='position_scores'),
        # Duration
        tf.keras.layers.Dense(DURATION_RANGE, name='duration_scores'),
        # Pitch
        tf.keras.layers.Dense(PITCH_RANGE, name='pitch_scores'),
        # Instrument
        tf.keras.layers.Dense(INSTRUMENT_RANGE, name='instrument_scores'),
        # Velocity
        tf.keras.layers.Dense(VELOCITY_RANGE, name='velocity_scores'),
        # Key sign
        tf.keras.layers.Dense(KEY_SIGN_RANGE, name='keysign_scores'),
        # Time sign
        tf.keras.layers.Dense(TIME_SIGN_RANGE, name='timesign_scores'),
        # Tempo
        tf.keras.layers.Dense(TEMPO_RANGE, name='tempo_scores')
    ]
    
    output_probs_layers = [
        # Type
        tf.keras.layers.Softmax(name='type_probabilities'),
        # Measure
        tf.keras.layers.Softmax(name='measure_probabilities'),
        # Beat
        tf.keras.layers.Softmax(name='beat_probabilities'),
        # Position
        tf.keras.layers.Softmax(name='position_probabilities'),
        # Duration
        tf.keras.layers.Softmax(name='duration_probabilities'),
        # Pitch
        tf.keras.layers.Softmax(name='pitch_probabilities'),
        # Instrument
        tf.keras.layers.Softmax(name='instrument_probabilities'),
        # Velocity
        tf.keras.layers.Softmax(name='velocity_probabilities'),
        # Key sign
        tf.keras.layers.Softmax(name='keysign_probabilities'),
        # Time sign
        tf.keras.layers.Softmax(name='timesign_probabilities'),
        # Tempo
        tf.keras.layers.Softmax(name='tempo_probabilities')
    ]
    
    # Model dynamics
    embeddings        = [embedding_layers[i](songs[:,:,i]) for i in range(events_elements)]
    genre_embedding   = genre_embedding_layer(genres)
    input_embedding   = input_concat_layer(embeddings)
    input_embedding   = encoding_processing_layer(input_embedding)
    input_embedding   = sequence_concat_layer([genre_embedding[:, np.newaxis, :], input_embedding])
    input_embedding   = sum_layer([input_embedding, positional_encoding])
    model_output      = decoder({'inputs_embeds': input_embedding})['last_hidden_state']
    out_scores        = [output_dense_layers[i](model_output) for i in range(len(output_dense_layers))]
    # TODO: Here we should add the masking layer
    out_probabilities = [output_probs_layers[i](out_scores[i]) for i in range(len(output_dense_layers))]
    # TODO: In the line above we should add the masks computed in the masking layer

    # Create model
    model = tf.keras.Model(inputs=[songs, genres], outputs=out_probabilities, name='music_generation_model')
    
    # Define loss
    def custom_loss(songs, y_pred):
        gt_vectors = [songs[:,:,i] for i in range(EVENTS_ELEMS)]
        # Base loss term
        losses = []
        for i in range(len(y_pred)):
            losses.append(tf.reduce_sum(
                tf.cast(loss_function(gt_vectors[i], y_pred[i][:, :-1, :]), tf.float32) * \
                (1. / GLOBAL_BATCH_SIZE)))
        return tf.math.reduce_sum(losses)
    
    # Define regularizers
    def custom_regularizers(songs, y_pred):
        gt_vectors = [songs[:,:,i] for i in range(EVENTS_ELEMS)]
        # Regularization loss: transform the actual vectors into consecutive-type representation
        types = gt_vectors[0]
        max_pred_types = tf.argmax(y_pred[0], axis=2, output_type=tf.int32)
        consecutive_gt_types   = subsequent_type_transform_layer(types)
        consecutive_pred_types = subsequent_type_transform_layer(max_pred_types)
        # Compute difference
        differences = consecutive_pred_types[:, 1:] - consecutive_pred_types[:, :-1]
        # Compute regularization terms
        # Difference between one element's type and the next is >= 0
        reg_term_1 = tf.math.reduce_sum(tf.math.maximum(0, -differences))
        # Difference between one element's type and the next is < 1
        reg_term_2 = tf.math.reduce_sum(tf.math.maximum(0, tf.math.maximum(1, differences) - 1))
        return reg_scaler * tf.cast(reg_term_1, tf.float32) + reg_scaler * tf.cast(reg_term_2, tf.float32)
    
    # Add losses
    model.add_loss(custom_loss(songs, out_scores))
    if use_regularization:
        model.add_loss(custom_regularizers(songs, out_scores))
    
    # Compile and return
    model.compile(optimizer="adam")
    return model

In [None]:
if len(tf.config.list_physical_devices('GPU')) > 1:
    print("Using multiple GPUs with Mirrored Strategy")
    with training_strategy.scope():
        model = create_model()
else:
    print("Using single GPU/CPU device")
    global_batch_size = BATCH_SIZE
    model = create_model()

In [None]:
model.summary()
# tf.keras.utils.plot_model(model, to_file=dot_img_file, show_shapes=True)

We can test the model with some inputs from our dataset

In [11]:
DATASET_PATH = os.path.join('..', 'data', 'lmd_matched_6133_tf')
dataset = tf.data.Dataset.load(DATASET_PATH).batch(GLOBAL_BATCH_SIZE).shuffle(256).cache().prefetch(32)

In [None]:
X, y = next(dataset.take(1).as_numpy_iterator())

In [13]:
output = model([X, y])
print([x.shape for x in output])

[TensorShape([24, 6144, 8]), TensorShape([24, 6144, 256]), TensorShape([24, 6144, 132]), TensorShape([24, 6144, 128]), TensorShape([24, 6144, 136]), TensorShape([24, 6144, 256]), TensorShape([24, 6144, 129]), TensorShape([24, 6144, 128]), TensorShape([24, 6144, 24]), TensorShape([24, 6144, 153]), TensorShape([24, 6144, 49])]


In [14]:
model.losses

[<tf.Tensor: shape=(), dtype=float32, numpy=345038.62>,
 <tf.Tensor: shape=(), dtype=float32, numpy=35.868004>]