In [1]:
import os

import numpy as np
import tensorflow as tf
from tensorflow import keras

from typing import List
from tqdm import trange

import config, music_model, utils

### CONFIGURATION ###

# Paths are resolved relative to the parent of the current working directory.
ROOT_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATASET_NAME = 'lmd_matched_final_2048_cut'
# The checkpoint prefix lives in a directory that carries the same name.
CHECKPOINT_NAME = 'model_GPT_baseline_with_mse_vellmd_matched_2048'
WEIGHTS_PATH = os.path.join(ROOT_PATH, 'training', 'checkpoints', CHECKPOINT_NAME, CHECKPOINT_NAME)

USE_SMALL_GENRE_SET = DATASET_NAME == 'tf_data7dict'
USE_ONE_GPU = True

conf = config.Config("single_instruments_type", ROOT_PATH)

if USE_SMALL_GENRE_SET:
    conf.accepted_subgenres = ['folk', 'nes', 'maestro']
# If we need to use only the first GPU
if USE_ONE_GPU:
    available_gpus = tf.config.experimental.list_physical_devices('GPU')
    # Only pin a GPU when one is actually visible; indexing an empty list
    # would raise IndexError on CPU-only machines.
    if available_gpus:
        conf.GPUS = available_gpus[0]
    conf.GLOBAL_BATCH_SIZE = conf.BATCH_SIZE
    conf.num_devices = 1

### MODEL CREATION ###

def _build_model():
    """Create the generation model with regularization and masking layers disabled."""
    return music_model.create_model(conf,
                                    num_genres=len(conf.accepted_subgenres),
                                    use_regularization=False,
                                    use_masking_layers=False)

if conf.num_devices > 1:
    print("Using multiple GPUs with Mirrored Strategy")
    # Variables must be created inside the strategy scope to be mirrored.
    with conf.training_strategy.scope():
        model = _build_model()
else:
    print("Using single GPU/CPU device")
    model = _build_model()

2023-04-27 23:35:36.226295: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-27 23:35:36.331713: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-04-27 23:35:36.766350: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvrtc.so.11.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/volpepe/miniconda3/envs/music_gen/lib/:/home/volpepe/miniconda3/envs/music_gen/lib/python3.10/site-packages/nvidia/cudnn/lib
2023-04-27 23:35:36.766439: W tensorflow/stream_executor/platform/default/dso_l

Using single GPU/CPU device


2023-04-27 23:35:38.237196: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-27 23:35:38.237925: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-27 23:35:38.238078: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-27 23:35:38.238192: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

In [2]:
# Restore the trained weights for inference only. `expect_partial()` tells
# TensorFlow that checkpoint values not used by this model (e.g. optimizer
# state saved during training) are intentionally left unrestored, so they
# are not reported as warnings.
model.load_weights(WEIGHTS_PATH).expect_partial()

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7fe0215e6d70>

---

## Quick example

In [3]:
# Function to choose a style for the song
def encode_styles(styles_array):
    """One-hot encode genre names against ``conf.accepted_subgenres``.

    Args:
        styles_array: sequence of genre names, one per song.

    Returns:
        An ``int8`` array of shape ``(len(styles_array), len(conf.accepted_subgenres))``
        whose row ``r`` has a single 1 in the column of ``styles_array[r]``.

    Raises:
        ValueError: if a name is not present in ``conf.accepted_subgenres``.
    """
    genres = conf.accepted_subgenres
    n_rows = len(styles_array)
    # Column of each requested genre (first occurrence in the accepted list).
    hot_columns = np.array([genres.index(name) for name in styles_array], dtype=np.intp)
    encoding = np.zeros((n_rows, len(genres)), dtype=np.int8)
    encoding[np.arange(n_rows), hot_columns] = 1
    return encoding

# Example: one row per requested genre, one column per accepted genre
example_genres = ['rock', 'pop', 'dance', 'electronic']
styles = encode_styles(example_genres)
print(styles)

[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]]


In [4]:
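# NOTE: disabled prototype of the pipeline now implemented in generate_songs; `X` (model inputs) is not defined in this notebook.
# preprocessing_model = keras.Model(inputs=model.input, outputs=model.get_layer('final_encoding').output)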
# transformer = model.get_layer('tfgpt2_model')
# output_layers = [
#     model.get_layer('type_scores'),
#     model.get_layer('measure_scores'),
#     model.get_layer('beat_scores'),
#     model.get_layer('position_scores'),
#     model.get_layer('duration_scores'),
#     model.get_layer('pitch_scores'),
#     model.get_layer('instrument_scores'),
#     model.get_layer('velocity_values'),
#     model.get_layer('keysign_scores'),
#     model.get_layer('timesign_scores'),
#     model.get_layer('tempo_scores')
# ]
# activations = [tf.keras.activations.softmax] * 7 + [tf.keras.activations.relu] + [tf.keras.activations.softmax] * 3

# ## Test this pipeline
# # Let's assume we only need to send the first 2 tokens.
# # Therefore we will build an attention mask where only the first 2 values are 1 and the others are 0
# CUR_IDX = 2
# GEN_BATCH_SIZE = 4
# attention_mask = np.ones((GEN_BATCH_SIZE, 1+CUR_IDX), dtype=np.int8)
# padding_attention_mask = np.zeros((GEN_BATCH_SIZE, conf.SEQ_LEN-1-CUR_IDX), dtype=np.int8)
# attention_mask = np.concatenate([attention_mask, padding_attention_mask], axis=-1)

# preprocessed_tensors = preprocessing_model(X)
# out_transformer      = transformer({'inputs_embeds': preprocessed_tensors},
#                                    attention_mask=attention_mask)['last_hidden_state']
# out_scores           = [output_layers[i](out_transformer)[:,:-1,:] 
#                         for i in range(len(output_layers))]
# out_probs            = [np.array(activations[i](out_scores[i])) 
#                         for i in range(len(activations))]

In [5]:
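# NOTE: disabled prototype of the sampling step in generate_songs; it relies on out_probs, CUR_IDX and GEN_BATCH_SIZE from the disabled cell above.
# # Create the token components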
# generation_mode = 'top_p_sampling'

# if generation_mode == 'top_p_sampling':
#     # Compute the dynamic top-p thresholds
#     top_p_sequence = np.linspace(0.9, 0.6, conf.SEQ_LEN-1)  # Generate max_length-1 evenly spaced values

# token_components = []
# for head_idx in range(len(out_probs)):
#     # Velocity is just a number, not a probability distribution
#     if head_idx == 7: token_components.append(
#         np.asarray(out_probs[7][:, CUR_IDX, :], dtype=np.int32))
#     else:
#     # Sample the next token from the output probabilities using the desired sampling mode

#         ## 1. Top-k sampling: sample from the top-k most probable tokens.
#         ## k is expressed as a ratio of the number of possibilities, because each component has different ranges.
#         if generation_mode == 'top_k_sampling':
#             k = int(out_probs[head_idx].shape[-1] * 0.3)
#             top_k_indices = np.argsort(-np.asarray(out_probs[head_idx][:, CUR_IDX, :]))[:, :k]
#             for song in range(GEN_BATCH_SIZE):
#                 redistrib_mass = sum(np.take(out_probs[head_idx][song, CUR_IDX, :], top_k_indices[song]))
#                 # redistrib_mass : 1 = prob[i] : x --> x = prob[i] / redistrib_mass
#                 for idx in range(out_probs[head_idx].shape[-1]):
#                     out_probs[head_idx][song, CUR_IDX, idx] = \
#                         (out_probs[head_idx][song, CUR_IDX, idx] / redistrib_mass) if idx in top_k_indices[song] else 0

#         ## 2. Top-p sampling: sample from the most probable tokens until the cumulative probability exceeds p
#         elif generation_mode == 'top_p_sampling':
#             best_indices = np.argsort(-np.asarray(out_probs[head_idx][:, CUR_IDX, :]))
#             for song in range(GEN_BATCH_SIZE):
#                 # Take elements until the cumulative probability exceeds the threshold (always at least one element)
#                 cum_prob = 0; k = 0
#                 while cum_prob < top_p_sequence[CUR_IDX]:
#                     cum_prob += out_probs[head_idx][song, CUR_IDX, best_indices[song, k]]
#                     k += 1
#                 # Directly modify the probability array
#                 # cum_prob : 1 = prob[i] : x --> x = prob[i] / cum_prob
#                 # For the other elements, the probability is 0
#                 for idx in range(out_probs[head_idx].shape[-1]):
#                     out_probs[head_idx][song, CUR_IDX, idx] = \
#                         (out_probs[head_idx][song, CUR_IDX, idx] / cum_prob) if idx in best_indices[song, :k] else 0
        
#         ## 3. Standard mapping: simply sample from the probability distributions with no additional frills.
#         batch = [[np.random.choice(
#             np.arange(out_probs[head_idx].shape[-1]),
#             p=out_probs[head_idx][song, CUR_IDX, :]
#             )] for song in range(GEN_BATCH_SIZE)]
#         token_components.append(np.asarray(batch, dtype=np.int32))

# # Create the tokens concatenating the sampled components
# new_tokens = np.concatenate(token_components, axis=1)
# new_tokens

---

## Generation function

In [None]:
def mask_current_token(song, current_token_idx, current_token_probabilities, current_settings):
    """Work-in-progress: build validity masks for the token at ``current_token_idx``.

    First, the type of the previous token (``song[current_token_idx-1, 0]``) is used
    to zero out entries of ``mask_from_previous_token[0]`` for token types that are
    not allowed to follow it. Then a mask for the remaining token components is
    selected from the current token type, which is still a hard-coded placeholder.

    Known gaps visible in this stub:
      * nothing is returned, so both masks are discarded;
      * ``current_token_probabilities`` and ``current_settings`` are never read;
      * the current token type is fixed to 0, and only types 0-3 have a branch
        (type 3 is an empty stub).

    Args:
        song: indexed as ``song[row, component]``; component 0 is read as the token type.
        current_token_idx: row of the token being generated. With 0 the lookup
            ``song[-1]`` would read the LAST row instead of a previous token.
        current_token_probabilities: unused.
        current_settings: unused.
    """
    # NOTE(review): if conf.full_mask is a list of arrays, .copy() is shallow and the
    # in-place writes below would also modify conf.full_mask itself — confirm.
    mask_from_previous_token = conf.full_mask.copy() # TODO: initiate with np.ones with right dimensions (excluding the token type, see)
    mask_from_current_token = conf.full_mask.copy()

    previous_token = song[current_token_idx-1, :]

    if previous_token[0] == 0: # only type 1 is acceptable
        mask_from_previous_token[0][0] = 0
        mask_from_previous_token[0][2:] = 0
    elif previous_token[0] == 1: # only type 1 and 2 are acceptable
        mask_from_previous_token[0][0] = 0
        mask_from_previous_token[0][3:] = 0
    elif previous_token[0] == 2: # only type 4 is acceptable
        mask_from_previous_token[0][0:4] = 0
        mask_from_previous_token[0][5:] = 0
    elif previous_token[0] == 3: # only type 3-4-5-6-7 are acceptable
        mask_from_previous_token[0][0:3] = 0
    elif previous_token[0] == 4: # cannot write type 0-1-2
        mask_from_previous_token[0][0:3] = 0
        if np.sum(song[:,0] == 5) == 0: # if no time_sign has been defined, you must define it
            # Leaves only index 5 enabled among the indices touched here.
            mask_from_previous_token[0][3:5] = 0
            mask_from_previous_token[0][6:] = 0
        else:
            pass
    elif previous_token[0] == 5: # cannot write type 0-1-2
        mask_from_previous_token[0][0:3] = 0
        if np.sum(song[:,0] == 6) == 0: # if no tempo has been defined, you must define it
                # Leaves index 6 enabled; only index 7 is cleared above it (not a 7: slice).
                mask_from_previous_token[0][3:6] = 0
                mask_from_previous_token[0][7] = 0
        else:
            pass
    elif previous_token[0] == 6: # cannot write type 0-1-2
        mask_from_previous_token[0][0:3] = 0
    elif previous_token[0] == 7:
        # After a type-7 token only indices >= 7 stay enabled.
        mask_from_previous_token[0][0:7] = 0


    # TODO: now with this mask you sample the token type for the current token
    current_token_type = 0 # TODO: this is just a filler

    if current_token_type == 0:
        mask_from_current_token = conf.default_mask.copy() # only index zero on the other parts of the token
    elif current_token_type == 1:
        # NOTE(review): this only works as intended if `+` concatenates the mask
        # sequences and conf.full_mask[5] is itself a sequence; with a list of arrays
        # the middle term is probably meant to be [conf.full_mask[5]]. Also,
        # generate_songs lists the instrument head at index 6 (index 5 is pitch) —
        # confirm which index is meant here.
        mask_from_current_token = conf.default_mask.copy()[:5] + conf.full_mask.copy()[5] + conf.default_mask.copy()[6:] # only index zero on the other (except instruments)
    elif current_token_type == 2:
        mask_from_current_token = conf.default_mask.copy() # only index zero on the other parts of the token
    elif current_token_type == 3:

        # mask for measure / beat / position

        # mask for key_sign / time_sign / tempo
        pass





In [6]:
## Song generation function
def _top_k_filter(probs, k):
    """Return a copy of the 1-D distribution `probs` restricted to its k largest entries and renormalised."""
    kept = np.argsort(-probs)[:k]
    filtered = np.zeros(probs.shape, dtype=probs.dtype)
    filtered[kept] = probs[kept] / probs[kept].sum()
    return filtered


def _top_p_filter(probs, threshold):
    """Return a copy of `probs` restricted to the most probable entries whose mass reaches `threshold`, renormalised."""
    order = np.argsort(-probs)
    mass = 0
    n_kept = 0
    # Always keep at least one entry, and never run past the end of the vocabulary
    # (rounding can leave the total mass marginally below the threshold).
    while n_kept < len(order) and (n_kept == 0 or mass < threshold):
        mass += probs[order[n_kept]]
        n_kept += 1
    kept = order[:n_kept]
    filtered = np.zeros(probs.shape, dtype=probs.dtype)
    filtered[kept] = probs[kept] / mass
    return filtered


def generate_songs(model, style_list:List[str], max_length:int=conf.SEQ_LEN-1,
                   terminator_type:int=7, generation_mode:str='standard_sampling',
                   temperature:float=1.0, top_k_ratio=0.3, top_p_start:float=0.9,
                   top_p_min=0.9):
    """Autoregressively generate one song per entry of `style_list`.

    Each song starts with an all-zero start token; the remaining tokens are sampled
    one position at a time from the model's eleven output heads.

    Args:
        model: Keras model exposing the layers 'final_encoding', 'tfgpt2_model' and
            the eleven output heads retrieved by name below.
        style_list: one genre name per song (encoded with `encode_styles`).
        max_length: number of tokens per returned song, start token included.
            Must not exceed `conf.SEQ_LEN-1`.
        terminator_type: value of the first token component that marks the end of a song.
        generation_mode: 'standard_sampling', 'top_k_sampling' or 'top_p_sampling'.
        temperature: divisor applied to the scores of every head except velocity.
        top_k_ratio: fraction of each head's vocabulary kept by top-k sampling
            (rounded up, at least one entry).
        top_p_start: top-p threshold used for the first generated position.
        top_p_min: top-p threshold used for the last generated position; thresholds
            in between are linearly interpolated.

    Returns:
        An int32 array of shape `(len(style_list), max_length, 11)`.

    Raises:
        AssertionError: if `temperature`, `max_length` or `generation_mode` is invalid.
    """
    # Check the validity of the parameters
    assert temperature > 0, "Temperature must be greater than 0"
    assert max_length <= conf.SEQ_LEN-1, f"The maximum length of the generated song must be less than {conf.SEQ_LEN-1}"
    assert generation_mode in ['standard_sampling', 'top_k_sampling', 'top_p_sampling'], \
        f"Parameter 'generation_mode' must be one of the following: 'standard_sampling', 'top_k_sampling', 'top_p_sampling'"

    # Collect explicitly the number of songs to generate
    num_songs = len(style_list)

    # Separate preprocessing model, transformer and output layers in order to be able to
    # inject the attention masks into the transformer and still use the loaded weights
    preprocessing_model = keras.Model(inputs=model.input, outputs=model.get_layer('final_encoding').output)
    transformer = model.get_layer('tfgpt2_model')
    output_layers = [
        model.get_layer('type_scores'), model.get_layer('measure_scores'), model.get_layer('beat_scores'),
        model.get_layer('position_scores'), model.get_layer('duration_scores'), model.get_layer('pitch_scores'),
        model.get_layer('instrument_scores'), model.get_layer('velocity_values'), model.get_layer('keysign_scores'),
        model.get_layer('timesign_scores'), model.get_layer('tempo_scores')
    ]
    # Position of the velocity head in `output_layers`: it outputs a value (relu),
    # not a probability distribution (softmax) like the other heads.
    velocity_head = 7
    activations = [tf.keras.activations.softmax] * 7 + [tf.keras.activations.relu] + [tf.keras.activations.softmax] * 3

    # Create the empty song array.
    # The first token of the songs is always the start token (0,...,0)
    # The rest of the tensor is filled with padding of zeroes, which will be masked by the attention masks
    generated_songs = np.zeros((num_songs, conf.SEQ_LEN-1, 11), dtype=np.int32)

    if generation_mode == 'top_p_sampling':
        # Compute the dynamic top-p thresholds: max_length-1 evenly spaced values,
        # one per generated position
        top_p_sequence = np.linspace(top_p_start, top_p_min, max_length-1)

    # Generate the one-hot encodings of the genres
    styles = encode_styles(style_list)

    # Start to generate the songs token by token
    for i in trange(1, max_length):

        # Create the attention mask (the first 2 tokens are always attended: genre and starting token)
        attention_mask = np.ones((num_songs, 1+i), dtype=np.int8)
        padding_attention_mask = np.zeros((num_songs, conf.SEQ_LEN-1-i), dtype=np.int8)
        attention_mask = np.concatenate([attention_mask, padding_attention_mask], axis=-1)

        # Preprocess the songs and pass them through the transformer
        preprocessed_tensors = preprocessing_model((generated_songs, styles))
        out_transformer      = transformer({'inputs_embeds': preprocessed_tensors},
                               attention_mask=attention_mask)['last_hidden_state']
        # Use the output layers to generate the probabilities for the next token
        # Output from transformer has SEQ_LEN tokens, so we trim it by removing the last one,
        # since it's the probability of a token that's out of our bounds.
        out_scores           = [layer(out_transformer)[:,:-1,:] for layer in output_layers]
        # Apply temperature to the scores but mind the velocity scores which is a scalar
        out_scores_tempered  = [scores if head == velocity_head else scores / temperature
                                for head, scores in enumerate(out_scores)]
        out_probs            = [np.array(activation(scores))
                                for activation, scores in zip(activations, out_scores_tempered)]

        # Create the token components
        token_components = []
        for head_idx, head_probs in enumerate(out_probs):
            # Output of this head at the position being generated, one row per song
            step_probs = head_probs[:, i, :]

            # Velocity is just a number, not a probability distribution
            if head_idx == velocity_head:
                token_components.append(
                    np.asarray(step_probs * conf.INPUT_RANGES['velocity'], dtype=np.int32))
                continue

            if generation_mode == 'top_k_sampling':
                # k is expressed as a ratio of the number of possibilities, because each
                # component has different ranges. np.ceil returns a float, so it must be
                # cast to int before it can be used as a slice bound.
                k = max(1, int(np.ceil(step_probs.shape[-1] * top_k_ratio)))

            sampled = []
            for song in range(num_songs):
                distribution = step_probs[song]
                ## 1. Top-k sampling: sample from the top-k most probable tokens.
                if generation_mode == 'top_k_sampling':
                    distribution = _top_k_filter(distribution, k)
                ## 2. Top-p sampling: sample from the most probable tokens until the
                ## cumulative probability reaches the threshold of this position
                elif generation_mode == 'top_p_sampling':
                    distribution = _top_p_filter(distribution, top_p_sequence[i-1])
                ## Use standard mapping to sample from the (potentially modified) distribution
                sampled.append([np.random.choice(np.arange(distribution.shape[-1]), p=distribution)])
            token_components.append(np.asarray(sampled, dtype=np.int32))

        # Create the tokens concatenating the sampled components
        new_tokens = np.concatenate(token_components, axis=1)
        # Add the new tokens to the songs
        generated_songs[:, i] = new_tokens

    # If a token in a song has type of terminator, simply overwrite the rest of the song with end tokens
    for song in range(num_songs):
        terminator_indices = np.argwhere(generated_songs[song, :, 0] == terminator_type)
        if len(terminator_indices) > 0:
            first_terminator_index = terminator_indices[0,0]
            generated_songs[song, first_terminator_index:, :] = [7] + [0]*10

    # Internally, the generated song is always SEQ_LEN - 1 long, so we cut it before returning it.
    return generated_songs[:, :max_length, :]

In [7]:
# Generate a batch of songs for every accepted genre and store each batch as a .npy file.
SONGS_PER_GENRE = 4
OUTPUT_DIR = os.path.join(conf.DATA_PATH, 'generated_songs', 'repr')
# np.save does not create missing directories; create the target up front so a
# missing folder does not surface only after the first (minutes-long) generation.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for genre in conf.accepted_subgenres:
    print(f"Generating {genre} songs...")
    out_songs = generate_songs(model, [genre]*SONGS_PER_GENRE, max_length=2047, temperature=0.9,
                               generation_mode='top_p_sampling', top_p_start=0.9, top_p_min=0.6)
    np.save(os.path.join(OUTPUT_DIR, f'songs_{genre}.npy'), out_songs, allow_pickle=True)

Generating rock songs...


  0%|          | 0/2046 [00:00<?, ?it/s]2023-04-27 23:35:43.830282: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
100%|██████████| 2046/2046 [04:33<00:00,  7.49it/s]


Generating pop songs...


100%|██████████| 2046/2046 [04:34<00:00,  7.47it/s]


Generating dance songs...


100%|██████████| 2046/2046 [04:37<00:00,  7.37it/s]


Generating country songs...


100%|██████████| 2046/2046 [04:37<00:00,  7.37it/s]


Generating metal songs...


100%|██████████| 2046/2046 [04:36<00:00,  7.39it/s]


Generating classical songs...


100%|██████████| 2046/2046 [04:23<00:00,  7.76it/s]


Generating folk songs...


100%|██████████| 2046/2046 [04:23<00:00,  7.77it/s]


Generating blues songs...


100%|██████████| 2046/2046 [04:23<00:00,  7.77it/s]


Generating house songs...


100%|██████████| 2046/2046 [04:23<00:00,  7.75it/s]


Generating indie songs...


100%|██████████| 2046/2046 [04:39<00:00,  7.32it/s]


Generating latin songs...


100%|██████████| 2046/2046 [04:39<00:00,  7.32it/s]


Generating jazz songs...


100%|██████████| 2046/2046 [04:52<00:00,  7.00it/s]


Generating funk songs...


100%|██████████| 2046/2046 [04:38<00:00,  7.34it/s]


Generating rap songs...


100%|██████████| 2046/2046 [04:31<00:00,  7.55it/s]


Generating punk songs...


100%|██████████| 2046/2046 [04:32<00:00,  7.50it/s]


Generating r&b songs...


100%|██████████| 2046/2046 [04:31<00:00,  7.54it/s]


Generating gospel songs...


100%|██████████| 2046/2046 [04:22<00:00,  7.79it/s]


Generating electronic songs...


100%|██████████| 2046/2046 [04:23<00:00,  7.77it/s]
