In [1]:
# Imports
import os

import numpy as np
import tensorflow as tf
from tensorflow.keras import mixed_precision

from transformers import TransfoXLConfig, TFTransfoXLModel, GPT2Config, TFGPT2Model

from config import Config

2022-12-31 12:02:35.768989: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-31 12:02:35.905496: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-31 12:02:35.938379: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-12-31 12:02:36.516618: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

In [2]:
# Workaround for very high loads on GPUs
tf.config.set_visible_devices([], 'GPU')

# Config

In [3]:
MODEL_TYPE = 'XL' # We can chose another model type, for now the supported ones are GPT and XL

ROOT_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
conf = Config("single_instruments_type", ROOT_PATH, model_type=MODEL_TYPE)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


2022-12-31 12:02:37.608208: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Decoder creation

In [4]:
decoder = conf.get_decoder()

Testing the decoder on random inputs

In [None]:
output = decoder({'inputs_embeds': tf.ones((conf.BATCH_SIZE, conf.SEQ_LEN, conf.TOKEN_DIM))})
output['last_hidden_state'].shape

# Dataset

Load the dataset from disk and process it (batching, shuffling, ...)

In [6]:
DATASET_PATH = os.path.join('..', 'data', 'tf_data7dict')
dataset = tf.data.Dataset.load(DATASET_PATH).batch(conf.BATCH_SIZE).cache().shuffle(conf.SHUFFLE_SIZE).prefetch(conf.PREFETCH_SIZE)
dataset

<PrefetchDataset element_spec=((TensorSpec(shape=(None, 6143, 11), dtype=tf.uint8, name=None), TensorSpec(shape=(None, 3), dtype=tf.uint8, name=None)), {'pitch': TensorSpec(shape=(None, 6143), dtype=tf.uint8, name=None), 'tempo': TensorSpec(shape=(None, 6143), dtype=tf.uint8, name=None), 'beat': TensorSpec(shape=(None, 6143), dtype=tf.uint8, name=None), 'duration': TensorSpec(shape=(None, 6143), dtype=tf.uint8, name=None), 'velocity': TensorSpec(shape=(None, 6143), dtype=tf.uint8, name=None), 'position': TensorSpec(shape=(None, 6143), dtype=tf.uint8, name=None), 'key_sign': TensorSpec(shape=(None, 6143), dtype=tf.uint8, name=None), 'time_sign': TensorSpec(shape=(None, 6143), dtype=tf.uint8, name=None), 'measure': TensorSpec(shape=(None, 6143), dtype=tf.uint8, name=None), 'type': TensorSpec(shape=(None, 6143), dtype=tf.uint8, name=None), 'instrument': TensorSpec(shape=(None, 6143), dtype=tf.uint8, name=None)})>

In [7]:
X, y = next(dataset.take(1).as_numpy_iterator())
print(X[0].shape, X[1].shape, y.keys())

(8, 6143, 11) (8, 3) dict_keys(['pitch', 'tempo', 'beat', 'duration', 'velocity', 'position', 'key_sign', 'time_sign', 'measure', 'type', 'instrument'])


2022-12-24 02:21:23.968878: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


# Embedding layers

The inputs need to be encoded by some embedding layer (a specific embedding layer for each token type).

In [8]:
embedding_layers = [
    # Type embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['type'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Measure embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['measure'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Beat embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['beat'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Position embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['position'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Duration embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['duration'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Pitch embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['pitch'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Instrument embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['instrument'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Velocity embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['velocity'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Key sign embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['key_sign'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Time sign embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['time_sign'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN),
    # Tempo embedding
    tf.keras.layers.Embedding(conf.INPUT_RANGES['tempo'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN)
]

Run the embedding layers on our inputs

In [9]:
outputs = []
for i in tf.range(X[0].shape[2]):
    outputs.append(embedding_layers[i](X[0][:, : ,i]))

We also need to encode the genre using some layers.

In [10]:
genre_embedding_module = tf.keras.Sequential([
    tf.keras.layers.Dense(conf.SINGLE_EMB_SIZE, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(conf.GENRE_DIM, activation='relu')
])

In [11]:
genre_embedding = genre_embedding_module(X[1])
genre_embedding.shape

TensorShape([8, 512])

## Embedding concatenation

We concatenate the output embeddings into a single tensor

In [12]:
types_concat_layer = tf.keras.layers.Concatenate(axis=2)
concat_outputs = types_concat_layer(outputs)
concat_outputs.shape

TensorShape([8, 6143, 704])

Then we need to resize them into a known dimensionality

In [13]:
dense_layer = tf.keras.layers.Dense(conf.TOKEN_DIM)
encoding = dense_layer(concat_outputs)
encoding.shape

TensorShape([8, 6143, 512])

Finally, we need to preprend the genre embedding token to the sequence

In [14]:
sequence_concat_layer = tf.keras.layers.Concatenate(axis=1)
final_sequence = sequence_concat_layer([genre_embedding[:, np.newaxis, :], encoding])
final_sequence.shape

TensorShape([8, 6144, 512])

Note that Transformer-XL uses its own positional encoding (relative instead of absolute), so it's not needed to add it here. Otherwise:

In [15]:
if MODEL_TYPE == 'GPT':
    positional_encoding_matrix = conf.get_positional_embedding_matrix()

    sum_layer = tf.keras.layers.Add()
    positional_encoding = tf.repeat(positional_encoding_matrix[np.newaxis, :, :], 
                                    tf.constant(conf.BATCH_SIZE), axis=0)
    final_sequence = sum_layer([final_sequence, positional_encoding])
    final_sequence.shape

# Output management

In [16]:
output = decoder({'inputs_embeds': final_sequence})
output['last_hidden_state'].shape

TensorShape([8, 6144, 512])

We need a dense + softmax layer for each of the tokens for trying to reconstruct the input.

In [17]:
output_dense_layers = [
    # Type
    tf.keras.layers.Dense(conf.INPUT_RANGES['type'], activation='softmax'),
    # Measure
    tf.keras.layers.Dense(conf.INPUT_RANGES['measure'], activation='softmax'),
    # Beat
    tf.keras.layers.Dense(conf.INPUT_RANGES['beat'], activation='softmax'),
    # Position
    tf.keras.layers.Dense(conf.INPUT_RANGES['position'], activation='softmax'),
    # Duration
    tf.keras.layers.Dense(conf.INPUT_RANGES['duration'], activation='softmax'),
    # Pitch
    tf.keras.layers.Dense(conf.INPUT_RANGES['pitch'], activation='softmax'),
    # Instrument
    tf.keras.layers.Dense(conf.INPUT_RANGES['instrument'], activation='softmax'),
    # Velocity
    tf.keras.layers.Dense(conf.INPUT_RANGES['velocity'], activation='softmax'),
    # Key sign
    tf.keras.layers.Dense(conf.INPUT_RANGES['key_sign'], activation='softmax'),
    # Time sign
    tf.keras.layers.Dense(conf.INPUT_RANGES['time_sign'], activation='softmax'),
    # Tempo
    tf.keras.layers.Dense(conf.INPUT_RANGES['tempo'], activation='softmax')
]

In [18]:
out_scores = [output_dense_layers[i](output['last_hidden_state']) 
              for i in range(len(output_dense_layers))]

for i in range(len(out_scores)):
    print(out_scores[i].shape)

(8, 6144, 8)
(8, 6144, 256)
(8, 6144, 131)
(8, 6144, 128)
(8, 6144, 136)
(8, 6144, 256)
(8, 6144, 129)
(8, 6144, 128)
(8, 6144, 25)
(8, 6144, 153)
(8, 6144, 49)


## Groundtruth vectors definition

In [19]:
for k in y:
    print(f"{k}: {tf.shape(y[k])}")

pitch: [   8 6143]
tempo: [   8 6143]
beat: [   8 6143]
duration: [   8 6143]
velocity: [   8 6143]
position: [   8 6143]
key_sign: [   8 6143]
time_sign: [   8 6143]
measure: [   8 6143]
type: [   8 6143]
instrument: [   8 6143]


In [20]:
out_scores_dict = {
    key: out_scores[i] 
    for i, key in enumerate(conf.INPUT_RANGES)
}

 ## Loss definition

We can use a simple sparse categorical crossentropy loss function. The two distributions we are comparing are the input sequence (so we ignore the genre embedding token representation) and the output sequence up to the last token representation (`output[:-1]`)
- Note: can we use regularizers or other kinds of constraint enforcing methods for some of the fields? Like, we know that regarding the type field of events there is a strict order to follow (start of song, start of events, ..., notes and end of song). Can we enforce this structure?

In [21]:
def find_type_7(songs):
    idxs = []
    for song in songs:
        idxs.append(tf.math.reduce_min(tf.where(song[:, 0] == 7)))
    return tf.stack(idxs)

idx = tf.keras.layers.Lambda(find_type_7)(X[0])

mask = tf.cast(tf.stack([
    tf.concat([tf.ones(idx[i]), tf.zeros(conf.SEQ_LEN - 1 - idx[i])], axis=-1)
    for i in tf.range(tf.shape(idx)[0])
]), tf.bool)

In [22]:
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
losses = []
for key in y.keys():
    gt = tf.boolean_mask(y[key], mask)
    pred = tf.boolean_mask(out_scores_dict[key][:, :-1, :], mask)
    losses.append(loss_function(gt, pred))
losses

[<tf.Tensor: shape=(), dtype=float32, numpy=5.544826>,
 <tf.Tensor: shape=(), dtype=float32, numpy=3.8507602>,
 <tf.Tensor: shape=(), dtype=float32, numpy=4.869914>,
 <tf.Tensor: shape=(), dtype=float32, numpy=4.916162>,
 <tf.Tensor: shape=(), dtype=float32, numpy=4.852562>,
 <tf.Tensor: shape=(), dtype=float32, numpy=4.856603>,
 <tf.Tensor: shape=(), dtype=float32, numpy=3.2490582>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.0249686>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.544386>,
 <tf.Tensor: shape=(), dtype=float32, numpy=2.1158524>,
 <tf.Tensor: shape=(), dtype=float32, numpy=4.8589015>]

To these loss terms we can add some regularization terms that can help the model produce a grammatically correct sequence.

In [23]:
# Custom intermediate layer for allowing types transformation (no parameters to be learnt)
class SubsequentTypeTransformationLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(SubsequentTypeTransformationLayer, self).__init__()
        # Use a StaticHashTable to map values to their consecutive version within Tensorflow
        self.keys_tensor = tf.range(conf.INPUT_RANGES['type'])
        self.vals_tensor = tf.constant([0,1,2,3,3,3,3,4])
        self.table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(self.keys_tensor, self.vals_tensor), 
            default_value=-1)

    def call(self, inputs):
        return self.table.lookup(inputs)

    
class InstrumentsChecker(tf.keras.layers.Layer):
     def __init__(self):
        super(InstrumentsChecker, self).__init__()
    
     def call(self, inputs):
        max_pred_types, instrument_scores = inputs
        reg_term_2_list = []
        for b in tf.range(tf.shape(max_pred_types)[0]):
            instruments_in_batch = tf.argmax(
                tf.gather(instrument_scores[b], tf.where(max_pred_types[b] == 1)[:, 0]),
                axis=-1)
            unique_instruments_in_batch, _ = tf.unique(instruments_in_batch)
            instruments_in_notes = tf.argmax(
                tf.gather(instrument_scores[b], tf.where(max_pred_types[b] == 3)[:, 0]),
                axis=-1)
            unique_instruments_in_notes, _, count_of_instruments_in_notes = \
                tf.unique_with_counts(instruments_in_notes)
            undefined_instruments_in_notes = tf.sparse.to_dense(
                  tf.sets.difference(tf.expand_dims(unique_instruments_in_notes, axis=0), 
                                     tf.expand_dims(unique_instruments_in_batch, axis=0)))[0]
            indices_of_undefined_instruments = tf.where(
                tf.expand_dims(undefined_instruments_in_notes, axis=1) == unique_instruments_in_notes)[:, 1]
            count_of_undefined_instruments = tf.gather(count_of_instruments_in_notes, indices_of_undefined_instruments)
            # Difference between the number of selected instruments and the number of unique instruments
            # (AKA: number of duplicates)
            reg_term_2_1 = tf.shape(instruments_in_batch)[0] - tf.shape(unique_instruments_in_batch)[0]
            # Sum the number of undefined instruments in notes
            reg_term_2_2 = tf.math.reduce_sum(count_of_undefined_instruments)
            reg_term_2_list.append(reg_term_2_1 + reg_term_2_2)
        return tf.math.reduce_sum(reg_term_2_list)


class MiscTypeChecker(tf.keras.layers.Layer):
     def __init__(self):
        super(MiscTypeChecker, self).__init__()
    
     def call(self, inputs):
        max_pred_types = inputs
        # 1) First token must have type 0 (each batch element times 4 to keep it comparable)
        rg1 = tf.math.reduce_sum(tf.cast(max_pred_types[:, 0] != 0, tf.int32)*4)
        # 2) Second token must have type 1 (each batch element times 4 to keep it comparable)
        rg2 = tf.math.reduce_sum(tf.cast(max_pred_types[:, 1] != 1, tf.int32)*4)
        rg3s = []
        rg4s = []
        for b in tf.range(tf.shape(max_pred_types)[0]):
            ones = tf.cast(tf.where(max_pred_types[b] == 1), tf.int32)
            last_1 = -1
            if tf.size(ones)  > 0: last_1 = tf.squeeze(ones[-1])
            # 3) There should be at least one of each type (squared to be comparable to other losses)
            rg3s.append((conf.INPUT_RANGES['type'] - tf.size(tf.unique(max_pred_types[b])[0]))**2)
            # 4) From the last 1 type token there should be the following types pattern:
            #    ..., 1, 2, 4, 5, 6, 3, ...
            if 0 < last_1 < (tf.shape(max_pred_types)[1] - 5):
                rg4s.append(tf.cast(max_pred_types[b, last_1 + 1] != 2, tf.int32) + \
                            tf.cast(max_pred_types[b, last_1 + 2] != 4, tf.int32) + \
                            tf.cast(max_pred_types[b, last_1 + 3] != 5, tf.int32) + \
                            tf.cast(max_pred_types[b, last_1 + 4] != 6, tf.int32) + \
                            tf.cast(max_pred_types[b, last_1 + 5] != 3, tf.int32))
            else:
                # Something has gone wrong, so the error would be the maximum + 1
                rg4s.append(6)
        return rg1 + rg2 + tf.math.reduce_sum(rg3s) + tf.math.reduce_sum(rg4s)

In [24]:
subsequent_type_transform_layer = SubsequentTypeTransformationLayer()
instruments_checker = InstrumentsChecker()
misc_type_checker = MiscTypeChecker()
reg_scaler = 0.001

In [29]:
def custom_regularizers(y_pred):
    # Regularization loss: transform the actual vectors into consecutive-type representation
    max_pred_types = tf.argmax(y_pred[0], axis=2, output_type=tf.int32)
    ####### 0: MISC CONSTRAINTS ABOUT TOKEN TYPES ORDER #######
    reg_term_0 = misc_type_checker(max_pred_types) * 20   # *20 to keep it comparable to other losses
    ####### 1: PUNISHMENT FOR NON-CONSECUTIVE TYPES ##########
    consecutive_pred_types = subsequent_type_transform_layer(max_pred_types)
    # Compute difference
    differences = consecutive_pred_types[:, 1:] - consecutive_pred_types[:, :-1]
    # Compute regularization terms
    # Difference between one element's type and the next is >= 0
    reg_term_1_1 = tf.math.reduce_sum(tf.math.maximum(0, -differences))
    # Difference between one element's type and the next is < 1
    reg_term_1_2 = tf.math.reduce_sum(tf.math.maximum(0, tf.math.maximum(1, differences) - 1))  
    reg_term_1 = reg_term_1_1 + reg_term_1_2
    ####### 2: PUNISHMENT FOR NOTES WHOSE INSTRUMENT IS NOT DEFINED AND FOR DUPLICATE INSTRUMENTS ########
    reg_term_2 = instruments_checker([max_pred_types, y_pred[6]])
    ####### 3: PUNISHMENT FOR CONSECUTIVE EVENTS WITH NON-INCREASING TIMINGS ########
    # Get the predicted measures, beats and positions
    max_pred_measures = tf.argmax(y_pred[1], axis=2, output_type=tf.int32)
    max_pred_beats = tf.argmax(y_pred[2], axis=2, output_type=tf.int32)
    max_pred_positions = tf.argmax(y_pred[3], axis=2, output_type=tf.int32)
    # Use them to compute the "times" matrix
    times = max_pred_measures*conf.INPUT_RANGES['beat']*conf.INPUT_RANGES['position'] + \
        max_pred_beats*conf.INPUT_RANGES['position'] + \
        max_pred_positions
    # Normalize times
    times = times / ((conf.INPUT_RANGES['measure']+1)*conf.INPUT_RANGES['beat']*conf.INPUT_RANGES['position'])
    # Only consider the time matrix when the type is between 3 and 6
    times = tf.cast(tf.where(tf.logical_and(max_pred_types >= 3, max_pred_types <= 6), times, 0), tf.float32)
    # For type 7 fill with a very large value
    times = tf.where(max_pred_types == 7, 1e10, times)
    # Compute time differences between consecutive time steps
    time_sep = times[:, 1:] - times[:, :-1]
    # Count negative time seps
    reg_term_3 = tf.math.reduce_sum(tf.cast(time_sep < 0, tf.int32))
    return reg_scaler * ((tf.cast(reg_term_0, tf.float32)) + (tf.cast(reg_term_1, tf.float32)) + \
                         (tf.cast(reg_term_2, tf.float32)) + (tf.cast(reg_term_3, tf.float32))) 

In [30]:
custom_regularizers(out_scores)

<tf.Tensor: shape=(), dtype=float32, numpy=6.708>

When defining the whole Keras model for training, we can set up multiple outputs and give different weights for the multiple losses.

# Single model

Let's try and define everything that this model does into a complete callable model.

In [1]:
# Imports
import os
import math

import numpy as np
import tensorflow as tf

# # Workaround for very high loads on GPUs
tf.config.set_visible_devices([], 'GPU')
# # Or use single GPU
# gpus = tf.config.list_physical_devices('GPU')
# tf.config.set_visible_devices(gpus[0], 'GPU')

from config import Config

MODEL_TYPE = 'XL'

ROOT_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
conf = Config("single_instruments_type", ROOT_PATH, model_type=MODEL_TYPE)

2022-12-25 10:08:32.005356: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-25 10:08:32.142350: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-25 10:08:32.181277: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-12-25 10:08:32.826658: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


  from .autonotebook import tqdm as notebook_tqdm
2022-12-25 10:08:33.809456: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
### CUSTOM LAYERS
# Custom intermediate layer for allowing types transformation (no parameters to be learnt)
class SubsequentTypeTransformationLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(SubsequentTypeTransformationLayer, self).__init__()
        # Use a StaticHashTable to map values to their consecutive version within Tensorflow
        self.keys_tensor = tf.range(conf.INPUT_RANGES['type'])
        self.vals_tensor = tf.constant([0,1,2,3,3,3,3,4])
        self.table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(self.keys_tensor, self.vals_tensor), 
            default_value=-1)

    def call(self, inputs):
        return self.table.lookup(inputs)


# Custom intermediate layer for regularization that computes the loss related to 
# miscellaneous type errors that could happen in the generated song
class MiscTypeChecker(tf.keras.layers.Layer):
     def __init__(self):
        super(MiscTypeChecker, self).__init__()
    
     def call(self, inputs):
        max_pred_types = inputs
        # 1) First token must have type 0 (each batch element times 4 to keep it comparable)
        rg1 = tf.math.reduce_sum(tf.cast(max_pred_types[:, 0] != 0, tf.int32)*4)
        # 2) Second token must have type 1 (each batch element times 4 to keep it comparable)
        rg2 = tf.math.reduce_sum(tf.cast(max_pred_types[:, 1] != 1, tf.int32)*4)
        rg3s = tf.TensorArray(dtype=tf.int32, size=tf.shape(max_pred_types)[0])
        rg4s = tf.TensorArray(dtype=tf.int32, size=tf.shape(max_pred_types)[0])
        for b in tf.range(tf.shape(max_pred_types)[0]):
            ones = tf.cast(tf.where(max_pred_types[b] == 1), tf.int32)
            last_1 = -1
            if tf.size(ones)  > 0: last_1 = tf.squeeze(ones[-1])
            # 3) There should be at least one of each type (squared to be comparable to other losses)
            rg3s = rg3s.write(b, (conf.INPUT_RANGES['type'] - tf.size(tf.unique(max_pred_types[b])[0]))**2)
            # 4) From the last 1 type token there should be the following types pattern:
            #    ..., 1, 2, 4, 5, 6, 3, ...
            if 0 < last_1 < (tf.shape(max_pred_types)[1] - 5):
                rg4s = rg4s.write(b, (tf.cast(max_pred_types[b, last_1 + 1] != 2, tf.int32) + \
                                      tf.cast(max_pred_types[b, last_1 + 2] != 4, tf.int32) + \
                                      tf.cast(max_pred_types[b, last_1 + 3] != 5, tf.int32) + \
                                      tf.cast(max_pred_types[b, last_1 + 4] != 6, tf.int32) + \
                                      tf.cast(max_pred_types[b, last_1 + 5] != 3, tf.int32)))
            else:
                # Something has gone wrong, so the error would be the maximum + 1
                rg4s = rg4s.write(b, 6)
        return rg1 + rg2 + tf.math.reduce_sum(rg3s.stack()) + tf.math.reduce_sum(rg4s.stack())


# Custom intermediate layer for regularization that computes the loss related to duplicate instruments
# definition and instruments that are used wrongly in the notes.
class InstrumentsChecker(tf.keras.layers.Layer):
     def __init__(self):
        super(InstrumentsChecker, self).__init__()
    
     def call(self, inputs):
        max_pred_types, instrument_scores = inputs
        reg_term_2_list = tf.TensorArray(dtype=tf.int32, size=tf.shape(max_pred_types)[0])
        for b in tf.range(tf.shape(max_pred_types)[0]):
            instruments_in_batch = tf.argmax(
                tf.gather(instrument_scores[b], tf.where(max_pred_types[b] == 1)[:, 0]),
                axis=-1)
            unique_instruments_in_batch, _ = tf.unique(instruments_in_batch)
            instruments_in_notes = tf.argmax(
                tf.gather(instrument_scores[b], tf.where(max_pred_types[b] == 3)[:, 0]),
                axis=-1)
            unique_instruments_in_notes, _, count_of_instruments_in_notes = \
                tf.unique_with_counts(instruments_in_notes)
            undefined_instruments_in_notes = tf.sparse.to_dense(
                  tf.sets.difference(tf.expand_dims(unique_instruments_in_notes, axis=0), 
                                     tf.expand_dims(unique_instruments_in_batch, axis=0)))[0]
            indices_of_undefined_instruments = tf.where(
                tf.expand_dims(undefined_instruments_in_notes, axis=1) == unique_instruments_in_notes)[:, 1]
            count_of_undefined_instruments = tf.gather(count_of_instruments_in_notes, indices_of_undefined_instruments)
            # Difference between the number of selected instruments and the number of unique instruments
            # (AKA: number of duplicates)
            reg_term_2_1 = tf.shape(instruments_in_batch)[0] - tf.shape(unique_instruments_in_batch)[0]
            # Sum the number of undefined instruments in notes
            reg_term_2_2 = tf.math.reduce_sum(count_of_undefined_instruments)
            reg_term_2_list = reg_term_2_list.write(b, reg_term_2_1 + reg_term_2_2)
        return tf.math.reduce_sum(reg_term_2_list.stack())


In [3]:
# Model creation function (to be called within a scope in case of MultiGPU training)
def create_model(input_shape=(conf.SEQ_LEN-1, len(conf.INPUT_RANGES)), num_genres=len(conf.accepted_subgenres), 
                 use_regularization=True, use_masking_layers=False, reg_loss_scale=conf.REG_LOSS_SCALE):
    
    # Get input shapes
    seq_len = input_shape[0]
    events_elements = input_shape[1]
    
    # Instantiate transformer decoder (n_emb % n_head must be 0)
    decoder = conf.get_decoder()
    
    # Define inputs
    songs  = tf.keras.Input(shape=input_shape, name='songs',  dtype=tf.int32)
    genres = tf.keras.Input(shape=num_genres , name='genres', dtype=tf.float32)
    
    # Define loss
    loss_function = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    # Regularization layers
    subsequent_type_transform_layer = SubsequentTypeTransformationLayer()
    misc_type_checker = MiscTypeChecker()
    instruments_checker = InstrumentsChecker()
    reg_scaler = tf.constant(reg_loss_scale, dtype=tf.float32)
    
    # Embedding layers
    embedding_layers = [
        # Type embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['type'],       conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='type_embeddings'),
        # Measure embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['measure'],    conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='measure_embeddings'),
        # Beat embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['beat'],       conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='beat_embeddings'),
        # Position embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['position'],   conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='position_embeddings'),
        # Duration embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['duration'],   conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='duration_embeddings'),
        # Pitch embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['pitch'],      conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='pitch_embeddings'),
        # Instrument embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['instrument'], conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='instrument_embeddings'),
        # Velocity embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['velocity'],   conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='velocity_embeddings'),
        # Key sign embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['key_sign'],   conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='key_sign_embeddings'),
        # Time sign embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['time_sign'],  conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='time_sign_embeddings'),
        # Tempo embedding
        tf.keras.layers.Embedding(conf.INPUT_RANGES['tempo'],      conf.SINGLE_EMB_SIZE, input_length=conf.SEQ_LEN, name='tempo_embeddings')
    ]
    
    genre_embedding_layer = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(conf.GENRE_DIM)
    ], name='genre_embedding')
    
    # Input processing layers
    input_concat_layer         = tf.keras.layers.Concatenate(axis=2)
    sequence_concat_layer      = tf.keras.layers.Concatenate(axis=1)
    encoding_processing_layer  = tf.keras.layers.Dense(conf.TOKEN_DIM, name='encoding_processing')
    
    # Positional encoding
    if conf.model_type == 'GPT':
        positional_encoding_matrix = conf.get_positional_embedding_matrix()
        positional_encoding        = tf.repeat(positional_encoding_matrix[tf.newaxis, :, :], tf.shape(songs)[0], axis=0)
        sum_layer                  = tf.keras.layers.Add(name='final_encoding')

    # Output layers
    output_dense_layers = [
        # Type
        tf.keras.layers.Dense(conf.INPUT_RANGES['type'],       name='type_scores'),
        # Measure
        tf.keras.layers.Dense(conf.INPUT_RANGES['measure'],    name='measure_scores'),
        # Beat
        tf.keras.layers.Dense(conf.INPUT_RANGES['beat'],       name='beat_scores'),
        # Position
        tf.keras.layers.Dense(conf.INPUT_RANGES['position'],   name='position_scores'),
        # Duration
        tf.keras.layers.Dense(conf.INPUT_RANGES['duration'],   name='duration_scores'),
        # Pitch
        tf.keras.layers.Dense(conf.INPUT_RANGES['pitch'],      name='pitch_scores'),
        # Instrument
        tf.keras.layers.Dense(conf.INPUT_RANGES['instrument'], name='instrument_scores'),
        # Velocity
        tf.keras.layers.Dense(conf.INPUT_RANGES['velocity'],   name='velocity_scores'),
        # Key sign
        tf.keras.layers.Dense(conf.INPUT_RANGES['key_sign'],   name='keysign_scores'),
        # Time sign
        tf.keras.layers.Dense(conf.INPUT_RANGES['time_sign'],  name='timesign_scores'),
        # Tempo
        tf.keras.layers.Dense(conf.INPUT_RANGES['tempo'],      name='tempo_scores')
    ]
    
    output_probs_layers = [
        # Type
        tf.keras.layers.Softmax(name='type_probabilities'),
        # Measure
        tf.keras.layers.Softmax(name='measure_probabilities'),
        # Beat
        tf.keras.layers.Softmax(name='beat_probabilities'),
        # Position
        tf.keras.layers.Softmax(name='position_probabilities'),
        # Duration
        tf.keras.layers.Softmax(name='duration_probabilities'),
        # Pitch
        tf.keras.layers.Softmax(name='pitch_probabilities'),
        # Instrument
        tf.keras.layers.Softmax(name='instrument_probabilities'),
        # Velocity
        tf.keras.layers.Softmax(name='velocity_probabilities'),
        # Key sign
        tf.keras.layers.Softmax(name='keysign_probabilities'),
        # Time sign
        tf.keras.layers.Softmax(name='timesign_probabilities'),
        # Tempo
        tf.keras.layers.Softmax(name='tempo_probabilities')
    ]
    
    # Model dynamics
    embeddings        = [embedding_layers[i](songs[:,:,i]) for i in range(events_elements)]
    genre_embedding   = genre_embedding_layer(genres)
    input_embedding   = input_concat_layer(embeddings)
    input_embedding   = encoding_processing_layer(input_embedding)
    input_embedding   = sequence_concat_layer([genre_embedding[:, np.newaxis, :], input_embedding])
    if conf.model_type == 'GPT':
        input_embedding   = sum_layer([input_embedding, positional_encoding])
    model_output      = decoder({'inputs_embeds': input_embedding})['last_hidden_state']
    out_scores        = [output_dense_layers[i](model_output)[:,:-1,:] 
                         for i in range(len(output_dense_layers))]
    out_probabilities = [output_probs_layers[i](out_scores[i]) 
                         for i in range(len(output_dense_layers))]
                
    out_probabilities_dict = {
        key: out_probabilities[i] 
        for i, key in enumerate(conf.INPUT_RANGES)
    }

    # Create model
    model = tf.keras.Model(inputs=[songs, genres], 
                           outputs=out_probabilities_dict, 
                           name='music_generation_model')
    
    # Before computing losses, mask probabilities so that nothing after the first 7
    # in the original song counts.
    @tf.function
    def find_type_7(songs):
        mask = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
        for i in tf.range(tf.shape(songs)[0]):
            end_song_idx = tf.math.reduce_min(tf.where(songs[i, :, 0] == 7))
            mask = mask.write(i, tf.concat([
                tf.ones(end_song_idx), 
                tf.zeros(conf.SEQ_LEN - 1 - end_song_idx)], axis=-1))
        return mask.stack()

    end_song_mask = tf.keras.layers.Lambda(find_type_7)(songs)
    end_song_mask = tf.cast(end_song_mask, tf.bool)
    
    # Define loss
    def custom_loss(y_true, y_pred):
        return tf.math.reduce_sum(loss_function(y_true, y_pred) * \
            (1. / (conf.GLOBAL_BATCH_SIZE * tf.cast(tf.shape(y_true)[0], tf.float32))))
    
    # Define regularizers
    def custom_regularizers(y_pred):
        # Regularization loss: transform the actual vectors into consecutive-type representation
        max_pred_types = tf.argmax(y_pred[0], axis=2, output_type=tf.int32)
        
        ####### 0: MISC CONSTRAINTS ABOUT TOKEN TYPES ORDER #######
        reg_term_0 = misc_type_checker(max_pred_types) * 20   # *20 to keep it comparable to other losses
        
        ####### 1: PUNISHMENT FOR NON-CONSECUTIVE TYPES ##########
        consecutive_pred_types = subsequent_type_transform_layer(max_pred_types)
        # Compute difference
        differences = consecutive_pred_types[:, 1:] - consecutive_pred_types[:, :-1]
        # Compute regularization terms
        # Difference between one element's type and the next is >= 0
        reg_term_1_1 = tf.math.reduce_sum(tf.math.maximum(0, -differences))
        # Difference between one element's type and the next is < 1
        reg_term_1_2 = tf.math.reduce_sum(tf.math.maximum(0, tf.math.maximum(1, differences) - 1))  
        reg_term_1 = reg_term_1_1 + reg_term_1_2
        
        ####### 2: PUNISHMENT FOR NOTES WHOSE INSTRUMENT IS NOT DEFINED AND FOR DUPLICATE INSTRUMENTS ########
        reg_term_2 = instruments_checker([max_pred_types, y_pred[6]])
        
        ####### 3: PUNISHMENT FOR CONSECUTIVE EVENTS WITH NON-INCREASING TIMINGS ########
        # Get the predicted measures, beats and positions
        max_pred_measures = tf.argmax(y_pred[1], axis=2, output_type=tf.int32)
        max_pred_beats = tf.argmax(y_pred[2], axis=2, output_type=tf.int32)
        max_pred_positions = tf.argmax(y_pred[3], axis=2, output_type=tf.int32)
        # Use them to compute the "times" matrix
        times = max_pred_measures*conf.INPUT_RANGES['beat']*conf.INPUT_RANGES['position'] + \
            max_pred_beats*conf.INPUT_RANGES['position'] + \
            max_pred_positions
        # Normalize times
        times = times / ((conf.INPUT_RANGES['measure']+1)*conf.INPUT_RANGES['beat']*conf.INPUT_RANGES['position'])
        # Only consider the time matrix when the type is between 3 and 6
        times = tf.cast(tf.where(tf.logical_and(max_pred_types >= 3, max_pred_types <= 6), times, 0), tf.float32)
        # For type 7 fill with a very large value
        times = tf.where(max_pred_types == 7, 1e10, times)
        # Compute time differences between consecutive time steps
        time_sep = times[:, 1:] - times[:, :-1]
        # Count negative time seps
        reg_term_3 = tf.math.reduce_sum(tf.cast(time_sep < 0, tf.int32))
        
        ###### PUT TOGETHER THE REGULARIZATION TERMS #######
        return tf.math.reduce_sum(
            reg_scaler * ((tf.cast(reg_term_0, tf.float32)) + (tf.cast(reg_term_1, tf.float32)) + \
                          (tf.cast(reg_term_2, tf.float32)) + (tf.cast(reg_term_3, tf.float32)))
        )
    
    # Add losses
    for i, k in enumerate(conf.INPUT_RANGES):
        loss_name = f'{k}_loss'
        gt = tf.boolean_mask(songs[:,:,i], end_song_mask)
        pred = tf.boolean_mask(out_probabilities[i], end_song_mask)
        loss = custom_loss(y_true = gt, y_pred = pred)
        model.add_loss(loss)
        model.add_metric(loss, name=loss_name)
    
    if use_regularization:
        # Note: we don't mask in regularization, because we don't use a ground truth
        # Here we just make the model learn how to produce a syntactically good output.
        reg_loss = custom_regularizers(out_probabilities)
        model.add_loss(reg_loss)
        model.add_metric(reg_loss, name='regularization_loss')
    
    # Compile and return
    model.compile(optimizer="adam")
    return model

In [4]:
if conf.num_devices > 1:
    print("Using multiple GPUs with Mirrored Strategy")
    with conf.training_strategy.scope():
        model = create_model(num_genres=3)
else:
    print("Using single GPU/CPU device")
    model = create_model(num_genres=3)

Using single GPU/CPU device


We can test the model with some inputs from our dataset

In [5]:
DATASET_PATH = os.path.join('..', 'data', 'tf_data7dict')
dataset = tf.data.Dataset.load(DATASET_PATH).batch(conf.BATCH_SIZE-4).cache().shuffle(conf.SHUFFLE_SIZE).prefetch(conf.PREFETCH_SIZE)

In [6]:
X, y = next(dataset.take(1).as_numpy_iterator())

2022-12-24 02:23:01.818624: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [7]:
output = model(X)
print([v.shape for _, v in output.items()])

[TensorShape([4, 6143, 8]), TensorShape([4, 6143, 256]), TensorShape([4, 6143, 131]), TensorShape([4, 6143, 128]), TensorShape([4, 6143, 136]), TensorShape([4, 6143, 256]), TensorShape([4, 6143, 129]), TensorShape([4, 6143, 128]), TensorShape([4, 6143, 25]), TensorShape([4, 6143, 153]), TensorShape([4, 6143, 49])]


In [8]:
model.losses

[<tf.Tensor: shape=(), dtype=float32, numpy=0.27524054>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.80073893>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.847412>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.81821555>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.65256166>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.7401344>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.70549273>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.7405702>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.51437616>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.67991376>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.48278618>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.1692>]

# Music Generation Metrics

| Name | To do? | Implemented? |
|:----:|:------:|:------------:|
|Pitch class histogram entropy | Yes | Yes |
| Grooving Pattern Similarity | Maybe | No | 
| Poliphony | Yes | Yes |
| Tone Span | Yes | Yes |
| Perplexity | Maybe | No |
| Macro Overlapping Area | Maybe | No |
| RMSE | Yes | No |

We could look into audio related metrics as well if they're not too hard to implement.

## Definitions

In [2]:
def get_notes_in_measures(song, start_measure, end_measure):
    # If it's a note with the same measure and its pitch is not drums-related
    return [token for token in song 
            if (token[0] == 3 
                and start_measure <= token[1] < end_measure
                and token[5] < 128)]

def pitch_class_histogram_entropy_metric(song, window_size=1):
    # Compute the mean pitch class histogram entropy in a song
    # using the specified number of measures (window_size).
    # Usually, interesting metrics use window_size 1 and 4
    song_measures = np.unique(song[:,1])
    if len(song_measures) < window_size:
        # print(f"\tSong has too few measures for window size {window_size}:"
        #      f" reverting back to a window size of {len(song_measures)}")
        window_size = len(song_measures)
    # Slide the window over the song to compute the entropy of notes in those measures
    entropy_for_windows = []
    for st_measure in range(0, len(song_measures) - window_size + 1):
        end_measure = st_measure + window_size
        notes = get_notes_in_measures(song, st_measure, end_measure) 
        if len(notes) > 0:
            notes_pitches = np.array([n[5] for n in notes])
            notes_classes = notes_pitches % 12 # {C, C#, ..., Bb, B}
            hist, edges = np.histogram(notes_classes, bins=list(range(12)))
            hist = hist / np.sum(hist)   # Normalize by total note count in the period
            hist = hist + 1e-10 # Avoid log of 0
            entropy = -np.sum(hist * (np.log(hist) / np.log(2))) # Fast log2 implementation
            entropy_for_windows.append(entropy)
        else:
            # print(f"\tWindow from measures {st_measure} to {end_measure} has no notes.")
            continue
    if len(entropy_for_windows) > 0:
        return np.mean(entropy_for_windows)
    else:
        return None

In [34]:
def poliphony_metric(song):
    notes_by_start_time = {}
    notes_in_song = 0
    for token in song:
        if token[0] == 3:   # Notes
            notes_in_song += 1
            start_time = token[1]*conf.INPUT_RANGES['beat']*conf.INPUT_RANGES['position'] + \
                         token[2]*conf.INPUT_RANGES['position'] + token[3]
            if start_time in notes_by_start_time:
                notes_by_start_time[start_time].append(token)
            else:
                notes_by_start_time[start_time] = [token]
    return sum([len(notes_list) > 1 for notes_list in notes_by_start_time.values()]) / notes_in_song

In [38]:
def tone_span_metric(song):
    lowest_pitch = 128
    highest_pitch = 0
    for token in song:
        if token[0] == 3:  # Notes
            if token[5] <= 128:  # No drums
                if token[5] < lowest_pitch:
                    lowest_pitch = token[5]
                if token[5] > highest_pitch:
                    highest_pitch = token[5]
    return highest_pitch - lowest_pitch

## Metrics computation on Test Set

In [5]:
dataset = tf.data.Dataset.load(conf.dataset_paths['tf_data7dict']).\
            batch(8).\
            cache().\
            shuffle(conf.SHUFFLE_SIZE).\
            prefetch(conf.PREFETCH_SIZE)

train_dataset = dataset.take(int(len(dataset)/100*70))
val_dataset   = dataset.skip(int(len(dataset)/100*70)).take(int(len(dataset)/100*15))
test_dataset  = dataset.skip(int(len(dataset)/100*85))

### Pitch Class Histogram Entropy

In [None]:
%%time

its = 1
entropies = []
entropies_4 = []

# Iterate over the test dataset
test_iter = test_dataset.as_numpy_iterator()
for X, _ in test_iter:
    print(f"Iteration {its} of {len(test_dataset)}...")
    its += 1

    for song in X[0]:
        entropy = pitch_class_histogram_entropy_metric(song, window_size = 1)
        entropy_4 = pitch_class_histogram_entropy_metric(song, window_size = 4)
        if entropy is not None:
            entropies.append(entropy)
        if entropy_4 is not None:
            entropies_4.append(entropy_4)

In [None]:
print(f"Mean entropy in the dataset with window length = 1 is: {np.mean(entropies)}")
print(f"Mean entropy in the dataset with window length = 4 is: {np.mean(entropies_4)}")

### Polypohony metric

In [None]:
%%time

polyphonies = []
# Iterate over the test dataset
test_iter = test_dataset.as_numpy_iterator()
for X, _ in test_iter:   
    for song in X[0]:
        polyphonies.append(poliphony_metric(song))

In [37]:
print(f"Mean frequency of polyphonies in the dataset is: {np.mean(polyphonies)*100:.4f}%")

Mean frequency of polyphonies in the dataset is: 11.8873%


### Tone span metric

In [39]:
%%time

tone_spans = []
# Iterate over the test dataset
test_iter = test_dataset.as_numpy_iterator()
for X, _ in test_iter:   
    for song in X[0]:
        tone_spans.append(tone_span_metric(song))

CPU times: user 9.7 s, sys: 15.2 ms, total: 9.72 s
Wall time: 9.71 s


In [41]:
print(f"Mean tone span in the dataset is: {np.mean(tone_spans)}")

Mean tone span in the dataset is: 43.00477099236641
