In [1]:
# Imports
import os
import math

import numpy as np
import tensorflow as tf

from tqdm import tqdm
import time
import json
import scipy.stats as st

# # Workaround for very high loads on GPUs
tf.config.set_visible_devices([], 'GPU')
# # Or use single GPU
# gpus = tf.config.list_physical_devices('GPU')
# tf.config.set_visible_devices(gpus[0], 'GPU')

from config import Config
import utils

MODEL_TYPE = 'GPT'

ROOT_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
conf = Config("single_instruments_type", ROOT_PATH, model_type=MODEL_TYPE)

2023-06-04 21:49:18.893660: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-04 21:49:19.026741: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-04 21:49:19.065592: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-06-04 21:49:19.623197: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


2023-06-04 21:49:21.262958: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Music Generation Metrics

We implemented several metrics and statistics that we compare across our sets.

## Definitions

In [2]:
def is_note(token):
    return token[0] == 3 and token[6] < 128

In [3]:
def get_notes_in_measures(song, start_measure, end_measure):
    # If it's a note with the same measure and its pitch is not drums-related
    return [token for token in song 
            if (is_note(token) and start_measure <= token[1] < end_measure)]

def pitch_class_histogram_entropy_metric(song, window_size=1):
    # Compute the mean pitch class histogram entropy in a song
    # using the specified number of measures (window_size).
    # Usually, interesting metrics use window_size 1 and 4
    song_measures = np.unique(song[:,1])
    if len(song_measures) < window_size:
        # print(f"\tSong has too few measures for window size {window_size}:"
        #      f" reverting back to a window size of {len(song_measures)}")
        window_size = len(song_measures)
    # Slide the window over the song to compute the entropy of notes in those measures
    entropy_for_windows = []
    for st_measure in range(0, len(song_measures) - window_size + 1):
        end_measure = st_measure + window_size
        notes = get_notes_in_measures(song, st_measure, end_measure) 
        if len(notes) > 0:
            notes_pitches = np.array([n[5] for n in notes])
            notes_classes = notes_pitches % 12 # {C, C#, ..., Bb, B}
            hist, edges = np.histogram(notes_classes, bins=list(range(12)))
            hist = hist / np.sum(hist)   # Normalize by total note count in the period
            hist = hist + 1e-10 # Avoid log of 0
            entropy = -np.sum(hist * (np.log(hist) / np.log(2))) # Fast log2 implementation
            entropy_for_windows.append(entropy)
        else:
            # print(f"\tWindow from measures {st_measure} to {end_measure} has no notes.")
            continue
    if len(entropy_for_windows) > 0:
        return np.mean(entropy_for_windows)
    else:
        return None

In [4]:
def poliphony_metric(song):
    notes_by_start_time = {}
    notes_in_song = 0
    for token in song:
        if is_note(token):   # Notes
            notes_in_song += 1
            start_time = token[1]*conf.INPUT_RANGES['beat']*conf.INPUT_RANGES['position'] + \
                         token[2]*conf.INPUT_RANGES['position'] + token[3]
            if start_time in notes_by_start_time:
                notes_by_start_time[start_time].append(token)
            else:
                notes_by_start_time[start_time] = [token]
    return sum([len(notes_list) > 1 for notes_list in notes_by_start_time.values()]) / notes_in_song

In [5]:
def tone_span_metric(song):
    lowest_pitch = 128
    highest_pitch = 0
    for token in song:
        if is_note(token):  # Notes
            if token[5] < lowest_pitch:
                lowest_pitch = token[5]
            if token[5] > highest_pitch:
                highest_pitch = token[5]
    return highest_pitch - lowest_pitch

In [6]:
base_tones = {'C':   0,
              'C#':  1, 
              'D':   2,
              'D#':  3,
              'E':   4,
              'F':   5,
              'F#':  6,
              'G':   7,
              'G#':  8,
              'A':   9,
              'A#': 10,
              'B':  11}

scale = {}
# Most important scales:
# Major scale:
scale['major'] = [0,2,4,5,7,9,11]
#(W-W-H-W-W-W-H)
#(2 2 1 2 2 2 1)

# Natural minor scale:
scale['natural_minor'] = [0,2,3,5,7,8,10]
#(W-H-W-W-H-W-W)
#(2 1 2 2 1 2 2)
 
# Harmonic minor scale:
scale['harmonic_minor'] = [0,2,3,5,7,8,11]
#(W-H-W-W-H-WH-H)
#(2 1 2 2 1 3 1)
 
def tones_to_scales(tones):
    counts = {}
    for base_tone in base_tones:
        counts[base_tone] = {}
        counts[base_tone]['major'] = 0
        counts[base_tone]['natural_minor'] = 0
        counts[base_tone]['harmonic_minor'] = 0

    # If no tones, return 0 for each note in each scale
    if not len(tones):
        frequencies = {}
        for base_tone in base_tones:
            frequencies[base_tone] = {}
            for scale_label in scale:
                frequencies[base_tone][scale_label] = 0.0
        return frequencies
    
    # Otherwise, compute scale consistency and their frequency
    for tone in tones:
        # For each note...
        for base_tone in base_tones:
            # For each of the base pitches...
            for scale_label in scale:
                # For each of the available scales...
                if tone%12-base_tones[base_tone] in scale[scale_label]:
                    # If the distance between the tone and the base tone is in a scale, 
                    # add 1 to that scale starting from that base tone
                    counts[base_tone][scale_label] += 1
    # Transform these counts into frequencies
    frequencies = {}
    for base_tone in counts:
        frequencies[base_tone] = {}
        for scale_label in counts[base_tone]:
            frequencies[base_tone][scale_label] = float(counts[base_tone][scale_label])/float(len(tones))
    return frequencies

def max_likelihood_scale(tones):
    # Get scale statistics
    scale_statistics = tones_to_scales(tones) 
    stat_list = []
    for base_tone in scale_statistics:
        for scale_label in scale_statistics[base_tone]:
            stat_list.append((base_tone, scale_label, scale_statistics[base_tone][scale_label]))
    # Get the most likely scale and its likelihood
    stat_list.sort(key=lambda e: e[2], reverse=True)
    return (stat_list[0][0]+' '+stat_list[0][1], stat_list[0][2])

def scale_consistency_metric(song):
    tones = [token[5] for token in song if is_note(token)]
    ml = max_likelihood_scale(tones)
    stats = {}
    stats['scale'] = ml[0] # <-- we ignore it for now but it's cool to see which is the most likely scale for a song
    stats['scale_score'] = ml[1]
    return stats['scale_score']

In [7]:
# NUMBER OF NOTES
def n_notes_metric(song):
    return sum([1 for token in song if is_note(token)])

# AVERAGE NOTE DURATION
def avg_note_duration_metric(song):
    return np.mean([token[4] for token in song if is_note(token)])

# MAX SILENCE BETWEEN EVENTS
def max_offset_duration_metric(song):
    offsets = []
    for i in range(len(song)-1):
        token = song[i]
        if is_note(token):
            j = i+1; next_token = song[j]
            while not is_note(next_token) and j < len(song)-1:
                j += 1; next_token = song[j]
            if not is_note(next_token): break
            empty_measures = next_token[1] - token[1]
            empty_beats = (int(next_token[2]) - int(token[2])) % conf.numerators[token[9] % conf.tot_numerators]
            empty_positions = (int(next_token[3]) - int(token[3])) % len(conf.np_positions)
            beats_offset = empty_measures * conf.numerators[token[9] % conf.tot_numerators] + empty_beats
            offset = beats_offset * len(conf.np_positions) + empty_positions
            offsets.append(offset)
        elif token[0] == 7:
            break
    return max(offsets) if len(offsets) > 0 else 0

# REPETITIONS IN MEASURES
def avg_unique_pitches_in_measure_metric(song):
    # Average number of different pitches in a measures
    pitches_per_measure = {}
    for token in song:
        if is_note(token):
            measure = token[1]
            if measure in pitches_per_measure:
                # Add pitch to set
                pitches_per_measure[measure].add(token[5])
            else:
                # Create set
                pitches_per_measure[measure] = set([token[5]])
    avg_pitches_per_measure = np.mean([len(s) for s in pitches_per_measure.values()])
    return avg_pitches_per_measure

def repetition_factor_metric(song):
    measures = {}
    for token in song:
        if is_note(token):
            measure = token[1]
            if measure in measures:
                measures[measure].append((token[2], token[3], token[5])) # beat, position, pitch
            else:
                measures[measure] = [(token[2], token[3], token[5])]
    tot_measures = len(measures)
    matchings = {}
    for measure_1 in measures:
        matchings[measure_1] = 0
        for measure_2 in measures:
            if measure_1 != measure_2 and measures[measure_1] == measures[measure_2]:
                matchings[measure_1] += 1
        matchings[measure_1] /= tot_measures
    return max(matchings.values())

## Metrics computation on our datasets

In [8]:
train_set, _, test_set = utils.get_dataset_splits(conf.dataset_paths['lmd_matched_final_2048_cut'], conf)

The generated set has to be created "manually". We load all of the related songs and stack them into a large matrix. Then use dataloader utilities to create a dataset.

In [24]:
from glob import glob

# FIND MATRICES
filename_pattern = os.path.join(conf.DATA_PATH, 'generated_songs', 'repr', '*top_p_0_9_*.npy')
filenames = glob(filename_pattern)
# LOAD + CONCAT MATRICES
songs = [np.load(fn) for fn in filenames]
songs = np.concatenate(songs, axis=0)
print(f"Loaded the songs of the generated dataset (tensor of shape {songs.shape})")
# CREATE DATASET AROUND MATRICES
gen_set = tf.data.Dataset.from_tensor_slices(songs).batch(conf.GLOBAL_BATCH_SIZE).\
                                                    cache().\
                                                    shuffle(conf.SHUFFLE_SIZE).\
                                                    prefetch(conf.PREFETCH_SIZE)
gen_set

Loaded the songs of the generated dataset (tensor of shape (4000, 2047, 11))


<PrefetchDataset element_spec=TensorSpec(shape=(None, 2047, 11), dtype=tf.int32, name=None)>

In [25]:
len(train_set) * conf.BATCH_SIZE, len(test_set) * conf.BATCH_SIZE, len(gen_set) * conf.BATCH_SIZE

(55596, 6960, 4002)

Note: for the metric computation we randomly sample 1/5 of the dataset, to make the analysis a little faster.

In [33]:
def _collect_stats_batch(song_batch, stat):
    vals = []
    for song in song_batch:
        if stat == 'entropy_1':
            val = pitch_class_histogram_entropy_metric(song, window_size = 1)
        elif stat == 'entropy_4':
            val = pitch_class_histogram_entropy_metric(song, window_size = 4)
        elif stat == 'poliphony':
            val = poliphony_metric(song)
        elif stat == 'tone_span':
            val = tone_span_metric(song)
        elif stat == 'scale_consistency':
            val = scale_consistency_metric(song)
        elif stat == 'n_notes':
            val = n_notes_metric(song)
        elif stat == 'avg_note_duration':
            val = avg_note_duration_metric(song)
        elif stat == 'max_offset':
            val = max_offset_duration_metric(song)
        elif stat == 'unique_pitches_in_measure':
            val = avg_unique_pitches_in_measure_metric(song)
        elif stat == 'measure_repetition':
            val = repetition_factor_metric(song)
        else:
            raise NotImplementedError
        vals.append(val)
    return vals

def collect_stats(dataset, stat, portion=20, iterator_mode='default'):
    values = []
    dataset = dataset.shuffle(len(dataset) // 100 * portion).take(len(dataset) // 100 * portion)
    dataset_iter = dataset.as_numpy_iterator()
    if iterator_mode == 'default':
        for X, _ in tqdm(dataset_iter, total=len(dataset)):
            batch_vals = _collect_stats_batch(X[0], stat)
            for v in batch_vals: values.append(v)
    elif iterator_mode == 'gen_set':
        for song_batch in tqdm(dataset_iter, total=len(dataset)):
            batch_vals = _collect_stats_batch(song_batch, stat)
            for v in batch_vals: values.append(v)
    else:
        raise ValueError(f'iterator_mode {iterator_mode} not supported')
    return values

In [34]:
def run_stat(dataset, stat_name, stats, eval_name = 'train'):
    print(f"Computing metric {stat_name}")
    s = time.time()
    vals = collect_stats(dataset, stat_name, portion=100, 
                         iterator_mode='default' if 'gen' not in eval_name else 'gen_set')
    e = time.time()
    stats[stat_name + '_mean'] = float(np.mean(vals))
    stats[stat_name + '_std'] = float(np.std(vals))
    stats[stat_name + '_interval'] = [float(x) 
        for x in st.t.interval(
            0.95, len(vals)-1, 
            loc=np.mean(vals), 
            scale=st.sem(vals)
        )]
    stats[stat_name + '_time'] = float(e-s)
    with open(f'evaluation_stats_{eval_name}.json', 'w') as f:
        json.dump(stats, f)

def evaluate(dataset, eval_name='train'):
    stats = {}
    run_stat(dataset, 'entropy_1', stats, eval_name=eval_name)
    run_stat(dataset, 'entropy_4', stats, eval_name=eval_name)
    run_stat(dataset, 'poliphony', stats, eval_name=eval_name)
    run_stat(dataset, 'tone_span', stats, eval_name=eval_name)
    run_stat(dataset, 'scale_consistency', stats, eval_name=eval_name)
    run_stat(dataset, 'n_notes', stats, eval_name=eval_name)
    run_stat(dataset, 'avg_note_duration', stats, eval_name=eval_name)
    run_stat(dataset, 'max_offset', stats, eval_name=eval_name)
    run_stat(dataset, 'unique_pitches_in_measure', stats, eval_name=eval_name)
    run_stat(dataset, 'measure_repetition', stats, eval_name=eval_name)
    return stats

### Train set evaluation

In [None]:
evaluate(train_set, eval_name='train')

### Test set evaluation

In [None]:
evaluate(test_set, eval_name='test')

### Generated set evaluation

In [35]:
evaluate(gen_set, eval_name='gen_p_0_9')

Computing metric entropy_1


100%|██████████| 600/600 [14:16<00:00,  1.43s/it]


Computing metric entropy_4


100%|██████████| 600/600 [12:54<00:00,  1.29s/it]


Computing metric poliphony


100%|██████████| 600/600 [00:45<00:00, 13.09it/s]


Computing metric tone_span


100%|██████████| 600/600 [00:21<00:00, 28.27it/s]


Computing metric scale_consistency


100%|██████████| 600/600 [07:18<00:00,  1.37it/s]


Computing metric n_notes


100%|██████████| 600/600 [00:19<00:00, 31.24it/s]


Computing metric avg_note_duration


100%|██████████| 600/600 [00:20<00:00, 29.72it/s]


Computing metric max_offset


100%|██████████| 600/600 [01:40<00:00,  5.94it/s]


Computing metric unique_pitches_in_measure


100%|██████████| 600/600 [00:22<00:00, 26.65it/s]


Computing metric measure_repetition


100%|██████████| 600/600 [00:25<00:00, 23.84it/s]


{'entropy_1_mean': 1.1916802662785957,
 'entropy_1_std': 0.5802743191916998,
 'entropy_1_interval': [1.1727107007735627, 1.2106498317836287],
 'entropy_1_time': 856.3519761562347,
 'entropy_4_mean': 1.2298731605167605,
 'entropy_4_std': 0.6255078695915011,
 'entropy_4_interval': [1.2094248792234268, 1.2503214418100943],
 'entropy_4_time': 774.6415026187897,
 'poliphony_mean': 0.07734177408677449,
 'poliphony_std': 0.10476434322110692,
 'poliphony_interval': [0.07391695584892308, 0.08076659232462591],
 'poliphony_time': 45.98257493972778,
 'tone_span_mean': 38.225403001667594,
 'tone_span_std': 21.483981895575045,
 'tone_span_interval': [37.523076897619525, 38.92772910571566],
 'tone_span_time': 21.375214338302612,
 'scale_consistency_mean': 0.9578259564450549,
 'scale_consistency_std': 0.08844709185678003,
 'scale_consistency_interval': [0.9549353643874049, 0.9607165485027048],
 'scale_consistency_time': 438.4157907962799,
 'n_notes_mean': 2026.0430794886047,
 'n_notes_std': 26.9312546

## Print

In [38]:
with open('evaluation_stats_train.json', 'r') as f:
    train_stats = json.load(f)
    
with open('evaluation_stats_test.json', 'r') as f:
    test_stats = json.load(f)
    
with open('evaluation_stats_gen_p_0_9.json', 'r') as f:
    gen_stats = json.load(f)

In [41]:
metrics = ['entropy_1', 'entropy_4', 'poliphony', 'tone_span',
           'scale_consistency', 'n_notes', 'avg_note_duration', 
           'max_offset', 'unique_pitches_in_measure', 
           'measure_repetition']

stat_sets = {'train': train_stats, 'test': test_stats, 'gen_p_0.9': gen_stats}

for k, v in stat_sets.items():
    print("============= " + k.upper() + " =============")
    for metric in metrics:
        m = v[f'{metric}_mean']
        m_around =  m - v[f'{metric}_interval'][0]
        print(f"{metric}: ${m:.2f} \pm {m_around:.2f}$")

entropy_1: $2.33 \pm 0.01$
entropy_4: $2.68 \pm 0.00$
poliphony: $0.21 \pm 0.00$
tone_span: $58.39 \pm 0.23$
scale_consistency: $0.81 \pm 0.00$
n_notes: $1929.77 \pm 6.46$
avg_note_duration: $26.23 \pm 0.19$
max_offset: $796.34 \pm 7.22$
unique_pitches_in_measure: $13.18 \pm 0.07$
measure_repetition: $0.00 \pm 0.00$
entropy_1: $2.31 \pm 0.02$
entropy_4: $2.68 \pm 0.01$
poliphony: $0.21 \pm 0.00$
tone_span: $58.10 \pm 0.66$
scale_consistency: $0.81 \pm 0.01$
n_notes: $1924.47 \pm 18.35$
avg_note_duration: $26.29 \pm 0.59$
max_offset: $805.82 \pm 31.45$
unique_pitches_in_measure: $12.99 \pm 0.21$
measure_repetition: $0.00 \pm 0.00$
entropy_1: $1.19 \pm 0.02$
entropy_4: $1.23 \pm 0.02$
poliphony: $0.08 \pm 0.00$
tone_span: $38.23 \pm 0.70$
scale_consistency: $0.96 \pm 0.00$
n_notes: $2026.04 \pm 0.88$
avg_note_duration: $13.21 \pm 0.64$
max_offset: $1099.71 \pm 144.42$
unique_pitches_in_measure: $5.22 \pm 0.11$
measure_repetition: $0.02 \pm 0.00$
