Plan: Take a collection of songs from a directory. Each folder will contain the song and a text file of labels. The song will be wav, text file will be comma separated labels.

ALL label files will be loaded and a vocabulary will be generated and embedded, this will be used as one input vector to the network.

The song vocabulary will be an embedding of 2 ** 16 -1 values (all uint16 values). These will be embedded into an input vector as well.

In [23]:
import os
import numpy as np
from scipy.io import wavfile
import re

ROOT = "./data"

def clean_label(label: str):
    return label.strip(" :,;_-.><[]{}")

def generate_label_vocabulary(source: str):
    directories = os.listdir(source)
    
    all_labels = set()
    all_labels.add("")
    
    for song_directory in directories:
        full_song_directory = os.path.join(source, song_directory)
        files = os.listdir(full_song_directory)
        
        for song_file in files:
            full_song_file = os.path.join(full_song_directory, song_file)
            

            
            if not full_song_file.endswith(".labels"):
                title = os.path.splitext(song_file)[0]
                all_labels.add(clean_label(title))
                continue
            
            
            
            with open(full_song_file, 'r') as label_file:
                labels = label_file.read().split(",")
                all_labels = all_labels.union(labels)
     
    word_to_index = {}
    index_to_word = np.ndarray(shape=(len(all_labels)), dtype=np.object_)
    for i, label in enumerate(sorted(all_labels)):
        clean = clean_label(label)
        word_to_index[clean] = i
        index_to_word[i] = clean

    
    return word_to_index, index_to_word
     
     
def vectorize_labels(labels: list[str], word_to_index: dict[str, int]):
       return np.array([word_to_index[label] for label in labels])
   
   
def normalize_song(song: np.ndarray):
    song = song / np.max(song)
    
    
    mean = np.mean(song)
    std = np.std(song)
    normalized = (song - mean) / std
    
    normalized_reduced_size = normalized.astype(np.float16)
    return normalized_reduced_size
        
def load_song_data(directory: str):
    files = os.listdir(directory)
    
    title: str
    labels: list[str]
    song: np.ndarray
    sample_rate: np.int32
    
    for song_file in files:
        full_song_file = os.path.join(directory, song_file)
        if full_song_file.endswith(".labels"):
            with open(full_song_file, 'r') as label_file:
                labels = label_file.read().split(",")
                labels = [clean_label(label) for label in labels]
                continue
        
        title = os.path.splitext(song_file)[0]
        labels.append(clean_label(title))
        sample_rate, song = wavfile.read(full_song_file)
        
    return labels, title, song, sample_rate
            
        
    
word_to_index, index_to_word = generate_label_vocabulary(ROOT)
vocab_size = len(word_to_index.items())

print("Label Vocabulary Length:")
print(vocab_size)

mario_index = word_to_index.get("mario")
word_at_mario_index = index_to_word[mario_index]

print("Index-encoded Label:")
print(mario_index)

print("Label at index:")
print(word_at_mario_index)
    
    
example_song_directory = os.path.join(ROOT, "Super Mario")

labels, title, song_data, sample_rate = load_song_data(example_song_directory)

print("Song labels:")
print(labels)

print("Vectorized Labels:")
print(vectorize_labels(labels, word_to_index))

print("Song title:")
print(title)

print("Sample Rate:")
print(sample_rate)

print("Song Data Shape:")

print(song_data.shape)


                
                    
                    
        

Label Vocabulary Length:
11
Index-encoded Label:
5
Label at index:
mario
Song labels:
['mario', 'super', 'bit', '8bit', '8-bit', 'nes', 'nintendo', 'game', 'video', 'sm64']
Vectorized Labels:
[ 5  9  3  2  1  6  7  4 10  8]
Song title:
sm64
Sample Rate:
44100
Song Data Shape:
(36097889,)


Now that we have a vocabulary generated and the song data loaded, we need to transform that data into forms that a neural network can consume. But that begs the question "what does the neural network LOOK like?"

There will be two inputs to the network: the encoded labels, and the song data:
The encoded labels will be transformed into an embedding.

The song data will be normalized and cut up into predictive segements: each previous segment being used to predict the segment that follows. This will be learned using an LSTM.

The outputs will then be concatenated and an output dense layer will predict the generated amplitude value of the waveform given a previous segment and a set of labels.

In [34]:
import keras
import tensorflow as tf


# There are a few problems with this model. 1. It's a simple LSTM model. 2. The concat label and song layers hardly makes any sense,
def build_model(song_sequence_length: int, combined_layer_units: int, rnn_units: int):
    label_input = keras.layers.Input(shape=vocab_size, name="Label_Input")
    label_embedding = keras.layers.Embedding(vocab_size, rnn_units, input_length=None, name="Label_Embedding")(label_input)
    
    label_out = keras.layers.Flatten(name="Label_Flattening")(label_embedding)
    
    
    song_input = keras.layers.Input(shape=(song_sequence_length, 1), name="Song_Sequence_Input")
    song_lstm = keras.layers.LSTM(rnn_units, activation=keras.activations.relu,  name="Song_Sequence_LSTM")(song_input)
    
    
    concat = keras.layers.concatenate([label_out, song_lstm], axis = -1)
    combined_learning_layer = keras.layers.Dense(combined_layer_units, activation=keras.activations.relu, name="Combined_Learning_Layer")(concat)
    
    
    predictive_layer = keras.layers.Dense(song_sequence_length, activation="sigmoid", name="Amplitude_Prediction_Layer")(combined_learning_layer)
    
    
    model =  keras.Model(inputs=[label_input, song_input], outputs = predictive_layer, name="Song_Sequence_Predictor", )
    return model

def pad_label_vector(label_vector: np.ndarray, vocab_size: int):
    return np.pad(label_vector, (0, vocab_size - len(label_vector)))
    

sequence_length = 1_000

model = build_model(sequence_length, 256, 99)


opt = keras.optimizers.Adam(learning_rate=0.000001)
loss = keras.losses.mean_squared_error
model.compile(opt, loss, metrics=["accuracy"])


example_song_directory = os.path.join(ROOT, "Super Mario")
labels, title, song_data, sample_rate = load_song_data(example_song_directory)



vectorized = pad_label_vector(vectorize_labels(labels, word_to_index), vocab_size)
vectorized = tf.expand_dims(vectorized, 0)


song_chunk =  tf.expand_dims(tf.expand_dims(normalize_song(song_data)[:sequence_length], 1), 0)


print("Vectorized Labels:")
print(vectorized.shape)

print("Song Data:")
print(song_chunk.shape)

print("Prediction (:10): ")
result = model([vectorized, song_chunk])[0][:10]
print(result)

print("Model Summary:")
model.summary()

Vectorized Labels:
(1, 11)
Song Data:
(1, 1000, 1)
Prediction (:10): 
tf.Tensor(
[0.4997029  0.5090413  0.49525148 0.50307536 0.49385962 0.4905735
 0.49291262 0.50268656 0.5028553  0.5057263 ], shape=(10,), dtype=float32)
Model Summary:
Model: "Song_Sequence_Predictor"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 Label_Input (InputLayer)    [(None, 11)]                 0         []                            
                                                                                                  
 Label_Embedding (Embedding  (None, 11, 99)               1089      ['Label_Input[0][0]']         
 )                                                                                                
                                                                                                  
 Song_Sequence_Input (Input  [(None, 

Now that we have a basic model together, we can transform data into inputs and outputs, and train the data set.

In [35]:
def generate_song_datasets(labels: list[str], vocab_size:int, word_to_index: dict, song_data:np.ndarray, sequence_length: int):
    # Vectorize the dataset into as many inputs as there are song sets.
    # vectorized = tf.expand_dims(tf.expand_dims(vectorize_labels(labels, word_to_index), 1), 0)
    
    sequences = []
    
    for i in range(0, len(song_data), sequence_length):
        sequence = song_data[i:i + sequence_length]
        if len(sequence) < sequence_length:
            sequence = np.pad(sequence, (0, sequence_length - len(sequence)))
        sequences.append(sequence)
        
    x = np.array(sequences[:-1])
    y = np.array(sequences[1:])
    
    vectorized = pad_label_vector(vectorize_labels(labels, word_to_index), vocab_size)
    vectorized_set = np.array([vectorized] * len(x))
    
    return vectorized_set, x, y
    

    
example_song_directory = os.path.join(ROOT, "Super Mario")
labels, title, song_data, sample_rate = load_song_data(example_song_directory)


labels, x, y = generate_song_datasets(labels, vocab_size, word_to_index, song_data, sequence_length)

model.fit([labels, x], y, epochs=10, batch_size=64)

Epoch 1/10
 174/3610 [>.............................] - ETA: 7:32 - loss: nan - accuracy: 0.0885

KeyboardInterrupt: 