In [1]:
import os

import numpy as np
import tensorflow as tf
from tensorflow import keras

from typing import List
from tqdm import trange

import config, music_model, utils

### CONFIGURATION ###

ROOT_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATASET_NAME = 'tf_data7dict'
USE_SMALL_GENRE_SET = DATASET_NAME == 'tf_data7dict'
USE_ONE_GPU = True

conf = config.Config("single_instruments_type", ROOT_PATH)

if USE_SMALL_GENRE_SET:
    conf.accepted_subgenres = ['folk', 'nes', 'maestro']
# If we need to use only the first GPU
if USE_ONE_GPU:
    conf.GPUS = tf.config.experimental.list_physical_devices('GPU')[0]
    conf.BATCH_SIZE = conf.BATCH_SIZE
    conf.GLOBAL_BATCH_SIZE = conf.BATCH_SIZE
    conf.num_devices = 1

### MODEL CREATION ###

if conf.num_devices > 1:
    print("Using multiple GPUs with Mirrored Strategy")
    with conf.training_strategy.scope():
        model = music_model.create_model(conf,
                                         num_genres=len(conf.accepted_subgenres),
                                         use_regularization=False,
                                         use_masking_layers=False)
else:
    print("Using single GPU/CPU device")
    model = music_model.create_model(conf,
                                     num_genres=len(conf.accepted_subgenres),
                                     use_regularization=False,
                                     use_masking_layers=False)

2023-04-21 01:09:03.145083: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm
2023-04-21 01:09:05.061436: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-21 01:09:05.077293: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Document

Using single GPU/CPU device


2023-04-21 01:09:05.153180: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-21 01:09:05.153353: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-21 01:09:05.153470: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

---

## Quick example

In [2]:
## Load the dataset
DATASET_PATH = os.path.join('..', 'data', 'tf_data7dict')
dataset = tf.data.Dataset.load(DATASET_PATH).batch(conf.BATCH_SIZE).cache().shuffle(conf.SHUFFLE_SIZE).prefetch(conf.PREFETCH_SIZE)

## Take the first batch of the dataset and trim its sequence length to 2047
X, y = next(dataset.take(1).as_numpy_iterator())
X = (X[0][:,:conf.SEQ_LEN-1,:], X[1])
y = {k: y[k][:,:conf.SEQ_LEN-1] for k in y}

print(X[0].shape, X[1].shape)
print(y.keys())

(4, 2047, 11) (4, 3)
dict_keys(['duration', 'type', 'tempo', 'measure', 'instrument', 'pitch', 'position', 'beat', 'time_sign', 'velocity', 'key_sign'])


2023-04-21 01:09:09.239455: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [8]:
# Function to choose a style for the song
def encode_styles(styles_array):
    return np.stack([
        utils.one_hot_encode_labels_nmf(style)
        for style in styles_array
    ], axis=0).astype(np.int8)

# Example:
styles = encode_styles(['folk', 'nes', 'maestro', 'nes'])
print(styles)

[[1 0 0]
 [0 1 0]
 [0 0 1]
 [0 1 0]]


In [4]:
preprocessing_model = keras.Model(inputs=model.input, outputs=model.get_layer('final_encoding').output)
transformer = model.get_layer('tfgpt2_model')
output_layers = [
    model.get_layer('type_scores'),
    model.get_layer('measure_scores'),
    model.get_layer('beat_scores'),
    model.get_layer('position_scores'),
    model.get_layer('duration_scores'),
    model.get_layer('pitch_scores'),
    model.get_layer('instrument_scores'),
    model.get_layer('velocity_values'),
    model.get_layer('keysign_scores'),
    model.get_layer('timesign_scores'),
    model.get_layer('tempo_scores')
]
activations = [tf.keras.activations.softmax] * 7 + [tf.keras.activations.relu] + [tf.keras.activations.softmax] * 3

## Test this pipeline
# Let's assume we need to only send the first 2 token.
# Therefore we will build an attention mask where only the first 2 values are 1 and the others are 0
CUR_IDX = 2
attention_mask = np.ones((conf.GEN_BATCH_SIZE, 1+CUR_IDX), dtype=np.int8)
padding_attention_mask = np.zeros((conf.GEN_BATCH_SIZE, conf.SEQ_LEN-1-CUR_IDX), dtype=np.int8)
attention_mask = np.concatenate([attention_mask, padding_attention_mask], axis=-1)

preprocessed_tensors = preprocessing_model(X)
out_transformer      = transformer({'inputs_embeds': preprocessed_tensors},
                                   attention_mask=attention_mask)['last_hidden_state']
out_scores           = [output_layers[i](out_transformer)[:,:-1,:] 
                        for i in range(len(output_layers))]
out_probs            = [activations[i](out_scores[i]) 
                        for i in range(len(activations))]

2023-04-21 01:09:09.922914: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


In [5]:
# Sample indices from the output probabilities
token_components = []
for head_idx in range(len(out_probs)):
    if head_idx == 7: token_components.append(
            np.asarray(out_probs[7][:, CUR_IDX+1, :], dtype=np.int32))
    else:
        batch = [[np.random.choice(
                    np.arange(out_probs[head_idx].shape[-1]), 
                    p=np.array(out_probs[head_idx][song, CUR_IDX+1, :])
                )] for song in range(conf.GEN_BATCH_SIZE)]
        token_components.append(np.asarray(batch, dtype=np.int32))
np.concatenate(token_components, axis=1)

# 2. Top-k sampling

array([[  5, 177, 104, 114,   5, 250,  23,   1,   2,  36,  29],
       [  5, 158,  32,  22,  67, 149,  69,   1,   8, 138,  46],
       [  1, 129,  12,  65,  26,  94, 109,   1,   8, 132,  31],
       [  5, 105,  95,  15,  25, 134, 128,   1,  14,  11,  41]],
      dtype=int32)

---

## Generation function

In [18]:
## Song generation function
def generate_songs(model, style_list:List[str], max_length:int=conf.SEQ_LEN-1, 
                   terminator_type:int=7, generation_mode:str='standard_sampling', 
                   temperature:float=1.0, top_k:int=10, top_p:float=0.9):
    
    # Check the validity of the parameters
    assert temperature > 0, "Temperature must be greater than 0"
    assert max_length <= conf.SEQ_LEN-1, f"The maximum length of the generated song must be less than {conf.SEQ_LEN-1}"
    assert generation_mode in ['standard_sampling', 'top_k_sampling', 'top_p_sampling'], \
        f"Parameter 'generation_mode' must be one of the following: 'standard_sampling', 'top_k_sampling', 'top_p_sampling'"

    # Collect explicitly the number of songs to generate
    num_songs = len(style_list)

    # Separate preprocessing model, transformer and output layers in order to be able to
    # inject the attention masks into the transformer and still use the loaded weights
    preprocessing_model = keras.Model(inputs=model.input, outputs=model.get_layer('final_encoding').output)
    transformer = model.get_layer('tfgpt2_model')
    output_layers = [
        model.get_layer('type_scores'), model.get_layer('measure_scores'), model.get_layer('beat_scores'),
        model.get_layer('position_scores'), model.get_layer('duration_scores'), model.get_layer('pitch_scores'),
        model.get_layer('instrument_scores'), model.get_layer('velocity_values'), model.get_layer('keysign_scores'),
        model.get_layer('timesign_scores'), model.get_layer('tempo_scores')
    ]
    activations = [tf.keras.activations.softmax] * 7 + [tf.keras.activations.relu] + [tf.keras.activations.softmax] * 3
    
    # Create the empty song array.
    # The first token of the songs is always the start token (0,...,0)
    # The rest of the tensor is filled with padding of zeroes, which will be masked by the attention masks
    generated_songs = np.zeros((num_songs, conf.SEQ_LEN-1, 11), dtype=np.int32)
    
    # Generate the one-hot encodings of the genres
    styles = encode_styles(style_list)

    # Start to generate the songs token by token
    for i in trange(1, max_length-1):

        # Create the attention mask (the first 2 tokens are always attended: genre and starting token)
        attention_mask = np.ones((num_songs, 1+i), dtype=np.int8)
        padding_attention_mask = np.zeros((num_songs, conf.SEQ_LEN-1-i), dtype=np.int8)
        attention_mask = np.concatenate([attention_mask, padding_attention_mask], axis=-1)

        # Preprocess the songs and pass them through the transformer
        preprocessed_tensors = preprocessing_model((generated_songs, styles))
        out_transformer      = transformer({'inputs_embeds': preprocessed_tensors},
                               attention_mask=attention_mask)['last_hidden_state']
        # Use the output layers to generate the probabilities for the next token
        # Output from transformer has SEQ_LEN tokens, so we trim it by removing the last one,
        # since it's the probability of a token that's out of our bounds.
        out_scores           = [output_layers[i](out_transformer)[:,:-1,:] / temperature
                                for i in range(len(output_layers))]
        out_probs            = [activations[i](out_scores[i]) 
                                for i in range(len(activations))]
        
        # Sample the next token from the output probabilities using the desired sampling mode:

        ## 1. Standard mapping: simply sample from the probability distributions with no additional frills.
        if generation_mode == 'standard_sampling':
            # Sample indices from the output probabilities
            token_components = []
            for head_idx in range(len(out_probs)):
                # Velocity is just a number, not a probability distribution
                if head_idx == 7: token_components.append(
                        np.asarray(out_probs[7][:, i+1, :], dtype=np.int32))
                else:
                    # For the other elements, sample an index using the prob distribution
                    batch = [[np.random.choice(
                                np.arange(out_probs[head_idx].shape[-1]), 
                                p=np.array(out_probs[head_idx][song, i+1, :])
                            )] for song in range(num_songs)]
                    token_components.append(np.asarray(batch, dtype=np.int32))
            new_tokens = np.concatenate(token_components, axis=1)

        ## 2. Top-k sampling: sample from the top-k most probable tokens
        elif generation_mode == 'top_k_sampling':
            raise NotImplementedError

        ## 3. Top-p sampling: sample from the most probable tokens until the cumulative probability exceeds p
        elif generation_mode == 'top_p_sampling':
            raise NotImplementedError
        
        # Add the new tokens to the songs
        generated_songs[:, i] = new_tokens

        ## TODO: if token has type of terminator, stop sampling and simply fill the rest of the song with end tokens 
        
    # Internally, the generated song is always SEQ_LEN - 1 long, so we cut it before returning it.
    return generated_songs[:, :max_length, :]

In [21]:
out_songs = generate_songs(model, ['folk'], max_length=512, temperature=0.9)

100%|██████████| 510/510 [00:39<00:00, 12.90it/s]


In [23]:
out_songs.shape, out_songs

((1, 512, 11),
 array([[[  0,   0,   0, ...,   0,   0,   0],
         [  0,   0,   0, ...,   0,   0,   0],
         [  5, 123,  53, ...,   9, 123,  37],
         ...,
         [  6, 200,  85, ...,  13,  39,  34],
         [  3, 158,  30, ...,  23,  46,  37],
         [  2, 113,  99, ...,   8,   4,  10]]], dtype=int32))