In [166]:
import IPython
import tensorflow as tf
import numpy as np
import os

In [50]:
example_audio_file = "./data/donuts.wav"
example_audio_data = open(example_audio_file, "rb").read()
example_audio_tensor = tf.audio.decode_wav(example_audio_data)
sample_rate = example_audio_tensor.sample_rate.numpy()
print("Audio:", example_audio_tensor.audio.shape)
print("Sample rate:", sample_rate)

Audio: (1392188, 1)
Sample rate: 8000


In [51]:
output_audio = tf.audio.encode_wav(
    example_audio_tensor.audio,
    example_audio_tensor.sample_rate,
)
IPython.display.Audio(
    output_audio.numpy(),
    rate=example_audio_tensor.sample_rate,
)

In [52]:
example_audio_dataset = tf.data.Dataset.from_tensor_slices(
    example_audio_tensor.audio
)
example_audio_dataset

<TensorSliceDataset shapes: (1,), types: tf.float32>

In [64]:
# Half a second of audio
sequence_length = sample_rate * 0.5
num_sequences = example_audio_tensor.audio.shape[0] // sequence_length
print("Sequence length:", sequence_length)
print("Example data sequences:", num_sequences)

Sequence length: 4000.0
Example data sequences: 348.0


In [87]:
sequences = example_audio_dataset.batch(
    sequence_length + 1,
    drop_remainder=True,
)
sequences

<BatchDataset shapes: (4001, 1), types: tf.float32>

In [81]:
def split_input_target(sequence):
    input_data = sequence[:-1]
    target_data = sequence[1:]
    return input_data, target_data

prepared_dataset = sequences.map(split_input_target)
prepared_dataset

<MapDataset shapes: ((4000, 1), (4000, 1)), types: (tf.float32, tf.float32)>

In [108]:
BATCH_SIZE = 16
BUFFER_SIZE = 1024
shuffled_dataset = (
    prepared_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [109]:
CHECKPOINT_DIR = "./training-checkpoints/music-generation-with-gru"

In [132]:
def build_model(rnn_units, batch_size):
    model =  tf.keras.Sequential([
        tf.keras.layers.InputLayer(
            batch_size=batch_size,
            input_shape=(4000, 1)
        ),
        tf.keras.layers.GRU(
            rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer="glorot_uniform",
        ),
        tf.keras.layers.Dense(1),
    ])
    return model

In [133]:
latest_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_DIR)

model = build_model(
    rnn_units=1024,
    batch_size=BATCH_SIZE,
)
model.build()
if latest_checkpoint:
    model.load_weights(latest_checkpoint)
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_8 (GRU)                  (16, 4000, 1024)          3154944   
_________________________________________________________________
dense_8 (Dense)              (16, 4000, 1)             1025      
Total params: 3,155,969
Trainable params: 3,155,969
Non-trainable params: 0
_________________________________________________________________


In [134]:
for input_batch, target_batch in shuffled_dataset.take(1):
    predictions = model(input_batch)
    print(predictions.shape, "# (batch_size, sequence_length, output)")

(16, 4000, 1) # (batch_size, sequence_length, output)


In [135]:
def loss(target, prediction):
    return tf.keras.losses.MSE(target, prediction)

In [137]:
batch_loss = loss(target_batch, predictions)
print("Predictions shape (batch_size, sequence_length, vocabulary_size)")
print(predictions.shape, "\n")
print("scalar_loss:", batch_loss.numpy().mean())

Predictions shape (batch_size, sequence_length, vocabulary_size)
(16, 4000, 1) 

scalar_loss: 0.090968564


In [138]:
model.compile(optimizer="adam", loss=loss)

In [139]:
checkpoint_prefix = os.path.abspath(
    os.path.join(CHECKPOINT_DIR, "ckpt_{epoch}")
)

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [140]:
EPOCHS = 10

In [141]:
history = model.fit(
    shuffled_dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback],
)

Train for 21 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [142]:
model = build_model(
    rnn_units=1024,
    batch_size=1,
)
model.load_weights(tf.train.latest_checkpoint(CHECKPOINT_DIR))
model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_9 (GRU)                  (1, 4000, 1024)           3154944   
_________________________________________________________________
dense_9 (Dense)              (1, 4000, 1)              1025      
Total params: 3,155,969
Trainable params: 3,155,969
Non-trainable params: 0
_________________________________________________________________


In [173]:
def generate_output(model, input_eval, samples_to_generate=4000):
    generated_output = []
    
    model.reset_states()
    for i in range(samples_to_generate):
        predictions = model(input_eval)
        prediction = predictions[0, -1, 0].numpy()
        input_eval = predictions
        generated_output.append(prediction)
        if i % 50 == 0:
            print(f"Generated {i} / {samples_to_generate}")
    print("Done!")
    
    return generated_output

In [174]:
initial = list(shuffled_dataset.take(1))[0][0][0]
initial = tf.expand_dims(initial, 0)
print(initial.shape)

(1, 4000, 1)


In [175]:
output = generate_output(model, initial, samples_to_generate=4000)

Generated 0 / 4000
Generated 50 / 4000
Generated 100 / 4000
Generated 150 / 4000
Generated 200 / 4000
Generated 250 / 4000
Generated 300 / 4000
Generated 350 / 4000
Generated 400 / 4000
Generated 450 / 4000
Generated 500 / 4000
Generated 550 / 4000
Generated 600 / 4000
Generated 650 / 4000
Generated 700 / 4000
Generated 750 / 4000
Generated 800 / 4000
Generated 850 / 4000
Generated 900 / 4000
Generated 950 / 4000
Generated 1000 / 4000
Generated 1050 / 4000
Generated 1100 / 4000
Generated 1150 / 4000
Generated 1200 / 4000
Generated 1250 / 4000
Generated 1300 / 4000
Generated 1350 / 4000
Generated 1400 / 4000
Generated 1450 / 4000
Generated 1500 / 4000
Generated 1550 / 4000
Generated 1600 / 4000
Generated 1650 / 4000
Generated 1700 / 4000
Generated 1750 / 4000
Generated 1800 / 4000
Generated 1850 / 4000
Generated 1900 / 4000
Generated 1950 / 4000
Generated 2000 / 4000
Generated 2050 / 4000
Generated 2100 / 4000
Generated 2150 / 4000
Generated 2200 / 4000
Generated 2250 / 4000
Generated 2

In [177]:
output_numpy = np.array(output).reshape(len(output), 1)
encoded_output = tf.audio.encode_wav(
    output_numpy,
    sample_rate,
)
IPython.display.Audio(
    encoded_output.numpy(),
    rate=sample_rate,
)