In [1]:
import os
import tensorflow as tf
import numpy as np
from IPython.display import Audio
from scipy.io import wavfile
from sklearn.decomposition import PCA

In [2]:
# Read the source recording; the first returned value is the sample rate,
# the second is the sample data.
sample_rate, source_audio = wavfile.read("./data/donuts_hq.wav")

# Preview window used by the Audio cells: sample indices for 60 s .. 70 s.
sample_start, sample_end = 60 * sample_rate, 70 * sample_rate

print("Sample rate:", sample_rate)

Sample rate: 32000


In [3]:
# Play the preview window of the unmodified recording.
Audio(
    source_audio[sample_start:sample_end],
    rate=sample_rate,
)

In [4]:
def compress_with_pca(data, components, block_size=1024):
    """Compress a signal with PCA applied to fixed-size blocks of samples.

    The signal is zero-padded at the end up to a whole number of blocks,
    reshaped to ``(n_blocks, block_size)``, and a PCA model is fitted on those
    rows. Each block is then projected onto the principal components and
    projected back again.

    Args:
        data: Sample array. The padding/reshape logic assumes a 1-D (mono)
            signal -- TODO confirm for multi-channel input.
        components: Value passed to ``PCA(n_components=...)``.
        block_size: Number of samples per block (row of the PCA input).

    Returns:
        A tuple ``(pca, transformed, reconstructed)``: the fitted PCA object,
        the per-block coefficients, and the reconstruction flattened back to
        the padded length.
    """
    samples = len(data)
    # Padding needed to reach the next multiple of block_size. The modulo of
    # the negated length is 0 when the input is already aligned; the previous
    # ``block_size - samples % block_size`` added a full extra block of zeros
    # in that case.
    leftover = (-samples) % block_size
    # np.pad is the public entry point (np.lib.pad is only an alias of it).
    padded = np.pad(data, (0, leftover), "constant", constant_values=0)

    reshaped = padded.reshape((len(padded) // block_size, block_size))

    pca = PCA(n_components=components)
    pca.fit(reshaped)

    transformed = pca.transform(reshaped)
    reconstructed = pca.inverse_transform(transformed).reshape((len(padded)))
    return pca, transformed, reconstructed

In [5]:
# Number of principal components kept for every block.
COMPONENT_COUNT = 200
# Samples per PCA block: 1/10 of the sample rate, which should be 32000 here.
PCA_BLOCK_SIZE = 3200

pca, transformed, constructed = compress_with_pca(
    data=source_audio,
    components=COMPONENT_COUNT,
    block_size=PCA_BLOCK_SIZE,
)
print(f"PCA Transformed audio shape: {transformed.shape}")

# Listen to the same preview window after the PCA round trip.
Audio(constructed[sample_start:sample_end], rate=sample_rate)

PCA Transformed audio shape: (1741, 200)


In [49]:
# Scale the PCA coefficients by their largest value so the maximum becomes 1.0.
# The divisor is the maximum, not the maximum absolute value, so the minimum
# of the scaled data is not necessarily -1.
pca_max = transformed.max()
normalized = transformed / pca_max

# Report shape, maximum and minimum before and after scaling.
for coefficients in (transformed, normalized):
    print(coefficients.shape)
    print(coefficients.max())
    print(coefficients.min())

(1741, 200)
972767.7704313479
-871977.7014758553
(1741, 200)
1.0
-0.8963883549402547


In [50]:
# Slice the normalized coefficient matrix along its first axis: one dataset
# element per PCA block.
train_dataset = tf.data.Dataset.from_tensor_slices(
    normalized
)
train_dataset

<TensorSliceDataset shapes: (200,), types: tf.float64>

In [99]:
MAXIMUM_SEQUENCE_LENGTH = 16

# Group consecutive blocks into windows of MAXIMUM_SEQUENCE_LENGTH + 4
# elements; drop_remainder=True is set, so every window has the same length.
window_length = MAXIMUM_SEQUENCE_LENGTH + 4
sequences = train_dataset.batch(window_length, drop_remainder=True)
sequences

<BatchDataset shapes: (20, 200), types: tf.float64>

In [100]:
def split_input_target(sequence):
    """Split a sequence into (input, target) pairs for next-step prediction.

    Args:
        sequence: Any sliceable sequence (e.g. a tensor of consecutive blocks).

    Returns:
        A tuple ``(input_data, target_data)`` where ``input_data`` is the
        sequence without its last element and ``target_data`` is the sequence
        without its first element, so ``target_data[i]`` is the element that
        follows ``input_data[i]``.
    """
    # The original had a second, unreachable ``return sequence, sequence``
    # after this one; it has been removed.
    input_data = sequence[:-1]
    target_data = sequence[1:]
    return input_data, target_data

# Turn every window into an (input, target) pair shifted by one step.
prepared_dataset = sequences.map(
    split_input_target
)
prepared_dataset

<MapDataset shapes: ((19, 200), (19, 200)), types: (tf.float64, tf.float64)>

In [101]:
BATCH_SIZE = 32    # (input, target) pairs per training batch
BUFFER_SIZE = 1024  # size of the shuffle buffer

# Shuffle the pairs, then group them into fixed-size training batches.
shuffled_pairs = prepared_dataset.shuffle(BUFFER_SIZE)
shuffled_dataset = shuffled_pairs.batch(BATCH_SIZE, drop_remainder=True)

In [102]:
# Directory used both to look up the latest saved weights and to write the
# per-epoch checkpoints during training.
CHECKPOINT_DIR = "./training-checkpoints/music-generation-with-pca-and-gru"

In [103]:
def build_model(rnn_units, batch_size):
    """Create the sequence model: input layer -> stateful GRU -> Dense.

    Args:
        rnn_units: Number of units in the GRU layer.
        batch_size: Batch size declared on the input layer (the GRU is
            created with ``stateful=True``).

    Returns:
        A ``tf.keras.Sequential`` model taking inputs with
        ``COMPONENT_COUNT`` features per step and producing
        ``COMPONENT_COUNT`` outputs per step.
    """
    # Variable-length sequences of COMPONENT_COUNT features.
    sequence_input = tf.keras.layers.InputLayer(
        batch_size=batch_size,
        input_shape=(None, COMPONENT_COUNT),
        dtype="float32",
    )
    # return_sequences=True: one output per input step.
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer="glorot_uniform",
        dtype="float32",
    )
    # Project the GRU output back to the feature dimension of the input.
    projection = tf.keras.layers.Dense(COMPONENT_COUNT, dtype="float32")

    return tf.keras.Sequential([sequence_input, recurrent, projection])

In [104]:
# Training model: batch dimension fixed to BATCH_SIZE.
model = build_model(rnn_units=1024, batch_size=BATCH_SIZE)
model.build()

# Resume from the most recent checkpoint in CHECKPOINT_DIR when one is found.
latest_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_DIR)
if latest_checkpoint:
    model.load_weights(latest_checkpoint)

model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_8 (GRU)                  (32, None, 1024)          3766272   
_________________________________________________________________
dense_8 (Dense)              (32, None, 200)           205000    
Total params: 3,971,272
Trainable params: 3,971,272
Non-trainable params: 0
_________________________________________________________________


In [105]:
# Run one batch through the model and compare the first ten predicted values
# of the first step with the corresponding targets.
# input_batch / target_batch / predictions stay bound after the loop and are
# reused by the loss check cell.
for input_batch, target_batch in shuffled_dataset.take(1):
    predictions = model(input_batch)
    print(predictions.shape, "# (batch_size, sequence_length, output)")
    print(predictions[0, 0, :10])
    print(target_batch[0, 0, :10])

(32, 19, 200) # (batch_size, sequence_length, output)
tf.Tensor(
[-0.0552041   0.0135721   0.12493993 -0.03191961  0.04949732  0.07150979
 -0.00256165 -0.04370787  0.00874729  0.03567329], shape=(10,), dtype=float32)
tf.Tensor(
[-0.04246185  0.01961843  0.03615393 -0.04456432  0.04134095  0.02739182
  0.02832068 -0.1120001   0.03828289 -0.03845181], shape=(10,), dtype=float64)


In [106]:
def loss(target, predicted):
    """Mean squared error between the target and predicted sequences.

    Thin wrapper around ``tf.keras.losses.MSE`` so it can be passed to
    ``model.compile``.
    """
    mse = tf.keras.losses.MSE(target, predicted)
    return mse

In [107]:
# Evaluate the loss on the batch from the forward-pass cell.
# NOTE(review): the printed label says "vocabulary_size", but the last axis of
# the predictions is the Dense layer's COMPONENT_COUNT outputs.
batch_loss = loss(target_batch, predictions)
print("Predictions shape (batch_size, sequence_length, vocabulary_size)")
print(predictions.shape, "\n")
# Average the loss values into a single number for display.
print("scalar_loss:", np.mean(batch_loss.numpy()))

Predictions shape (batch_size, sequence_length, vocabulary_size)
(32, 19, 200) 

scalar_loss: 0.000402696


In [108]:
# Configure training: Adam optimizer with the MSE-based `loss` function
# defined in this notebook.
model.compile(optimizer="adam", loss=loss)

In [109]:
# Absolute checkpoint path template; "{epoch}" is left as a placeholder in
# the file name handed to ModelCheckpoint.
checkpoint_pattern = os.path.join(CHECKPOINT_DIR, "ckpt_{epoch}")
checkpoint_prefix = os.path.abspath(checkpoint_pattern)

# Save weights only (not the full model) while training.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [110]:
# Number of epochs passed to model.fit.
EPOCHS = 100

In [111]:
# Train on the shuffled batches, writing a weights checkpoint via the callback.
fit_callbacks = [checkpoint_callback]
history = model.fit(shuffled_dataset, epochs=EPOCHS, callbacks=fit_callbacks)

Train for 2 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epo

Epoch 98/100
Epoch 99/100
Epoch 100/100


In [112]:
# Rebuild the network with a batch size of 1 for step-by-step generation and
# load the most recently saved training weights into it.
model = build_model(
    rnn_units=1024,
    batch_size=1,
)
model.load_weights(tf.train.latest_checkpoint(CHECKPOINT_DIR))
# The build shape must match the model's input layer, which is
# (batch, steps, COMPONENT_COUNT). The previous rank-2 shape [1, None] did not
# describe this model's input; Keras versions that validate the shape against
# the input layer reject such a mismatch.
model.build(tf.TensorShape([1, None, COMPONENT_COUNT]))
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_9 (GRU)                  (1, None, 1024)           3766272   
_________________________________________________________________
dense_9 (Dense)              (1, None, 200)            205000    
Total params: 3,971,272
Trainable params: 3,971,272
Non-trainable params: 0
_________________________________________________________________


In [121]:
def generate_output(model, input_eval, samples_to_generate=4000):
    """Generate output vectors autoregressively from a seed input.

    The model's recurrent state is reset once, then on every iteration the
    model is called, the output for the last step of the first batch entry is
    recorded, and that output becomes the next input.

    Args:
        model: Callable model exposing ``reset_states()``; its result must
            support ``[0, -1]`` indexing and ``.numpy()``.
        input_eval: Seed input for the first call.
        samples_to_generate: Number of vectors to generate.

    Returns:
        A list with ``samples_to_generate`` arrays, one per generated step.
    """
    model.reset_states()

    generated_output = []
    for step in range(samples_to_generate):
        # Keep only the output for the final step of the (single) sequence.
        last_step = model(input_eval)[0, -1].numpy()
        generated_output.append(last_step)

        # Feed the new vector back in as a one-step sequence in a batch of one.
        input_eval = tf.expand_dims([last_step], 0)

        if step % 50 == 0:
            print(f"Generated {step} / {samples_to_generate}")
    print("Done!")

    return generated_output

In [114]:
# Take the first input sequence of one (shuffled) training batch as the seed
# and add a leading batch axis of size 1.
first_batch = list(shuffled_dataset.take(1))[0]
seed_sequence = first_batch[0][0]
initial = tf.expand_dims(seed_sequence, 0)

# Undo the normalization and the PCA projection to listen to the seed.
seed_audio = pca.inverse_transform(initial * pca_max).flatten()
Audio(seed_audio, rate=sample_rate)

In [126]:
# Generate 200 coefficient vectors from the seed and undo the normalization
# (multiply by pca_max) so they are back on the PCA coefficient scale.
generated_vectors = generate_output(model, initial, samples_to_generate=200)
output = np.array(generated_vectors) * pca_max

Generated 0 / 200
Generated 50 / 200
Generated 100 / 200
Generated 150 / 200
Done!


In [127]:
# Map the generated coefficients back to audio samples and play them.
# `output` was already multiplied by pca_max in the generation cell, so the
# reconstruction is not scaled again here; the previous extra
# `output_reconstructed *= pca_max` applied the factor a second time.
output_reconstructed = pca.inverse_transform(output).flatten()
Audio(output_reconstructed, rate=sample_rate)