In [6]:
import tensorflow as tf
import pandas as pd
import numpy as np
import string

In [3]:
!wget --no-check-certificate \
    https://drive.google.com/uc?id=1LiJFZd41ofrWoBtW-pMYsfz1w8Ny0Bj8 \
    -O /tmp/songdata.csv

In [50]:
# Read the downloaded CSV into a DataFrame and preview its first rows.
dataset = pd.read_csv(filepath_or_buffer="/tmp/songdata.csv")
dataset.head(n=5)

In [51]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
dataset.shape

In [52]:
def preprocess_data(dataframe, field):
  """Return the cleaned, non-empty text entries of one DataFrame column.

  Every character listed in ``string.punctuation`` is removed, the text is
  lower-cased and surrounding whitespace is stripped. Missing values and
  entries that end up empty are dropped. The input frame is left unmodified.

  Args:
    dataframe: pandas DataFrame that holds the text column.
    field: name of the column to clean.

  Returns:
    A list with the cleaned strings, in their original row order.
  """
  # A translation table deletes each punctuation character literally. The
  # former "[...]" pattern passed to str.replace only works in regex mode
  # (no longer the default since pandas 2.0) and, being unescaped, never
  # removed the backslash.
  punctuation_table = str.maketrans('', '', string.punctuation)
  cleaned = (
      dataframe[field]
      .str.translate(punctuation_table)
      .str.lower()
      .str.strip()
      .dropna()  # missing / non-string entries come out of .str as NaN
  )
  return [line for line in cleaned.to_numpy() if line != '']

In [53]:
# Rebind `dataset` to the list of cleaned strings taken from the "text" column.
# NOTE: after this cell `dataset` is a list, so the cell cannot be re-run
# without reloading the CSV first.
dataset = preprocess_data(dataframe=dataset, field="text")

In [54]:
# Number of text entries returned by preprocess_data.
len(dataset)

In [55]:
# Peek at the first ten cleaned entries.
dataset[:10]

In [56]:
def train_val_split(data, train_size=0.8, seed=None):
  """Randomly split `data` into a training part and a validation part.

  Args:
    data: sequence (or tensor) supporting `len()` and accepted by `tf.gather`.
    train_size: fraction of the elements, between 0 and 1, that goes into
      the training part; the remainder becomes the validation part.
    seed: optional integer forwarded to `tf.random.shuffle` as its seed.
      The default `None` keeps the previous, unseeded behaviour.

  Returns:
    A tuple `(x_train, x_val)` of tensors gathered from `data`.

  Raises:
    ValueError: if `train_size` lies outside the [0, 1] interval.
  """
  if not 0.0 <= train_size <= 1.0:
    raise ValueError(
        "train_size must be between 0 and 1, got {!r}".format(train_size))

  total = len(data)
  train_len = int(total * train_size)
  # One shuffled permutation of the positions; both parts are cut from it so
  # they never overlap.
  index = tf.random.shuffle(tf.range(total), seed=seed)
  x_train = tf.gather(data, index[:train_len])
  x_val = tf.gather(data, index[train_len:])

  return x_train, x_val

In [57]:
# Shuffle the cleaned entries and split them 80/20 into train and validation.
train, val = train_val_split(data=dataset, train_size=0.8)

In [58]:
# Collapse each split into a single string with one entry per line.
# reduce_join joins the elements of one string tensor; tf.strings.join is
# meant for joining a Python list of tensors element-wise.
train = tf.strings.reduce_join(train, separator='\n')
val = tf.strings.reduce_join(val, separator='\n')

In [59]:
# Break the joined text of each split into its individual UTF-8 characters.
train, val = (
    tf.strings.unicode_split(joined_text, input_encoding='UTF-8')
    for joined_text in (train, val)
)

In [61]:
# Inspect the training text after the character split.
train

In [62]:
# Collect the distinct characters of the training split (sorted) and build
# the lookup layer that maps each character to an integer id.
vocabulary = list(set(train.numpy()))
vocabulary.sort()
ids_from_chars = tf.keras.layers.StringLookup(
    mask_token=None, vocabulary=list(vocabulary))

len(vocabulary)

In [63]:
# Convert the characters of both splits to integer ids.
train, val = ids_from_chars(train), ids_from_chars(val)

In [64]:
# Inspect the training split after the id lookup.
train

In [65]:
# Inspect the validation split after the id lookup.
val

In [66]:
# Wrap each id tensor in a tf.data.Dataset that yields one id per element.
train, val = map(tf.data.Dataset.from_tensor_slices, (train, val))

In [67]:
# Inverse lookup layer: maps integer ids back to characters using the same
# vocabulary as ids_from_chars.
chars_from_ids = tf.keras.layers.StringLookup(
    invert=True,
    mask_token=None,
    vocabulary=ids_from_chars.get_vocabulary())

In [68]:
def text_from_ids(ids):
  """Turn ids into characters and join them along the last axis.

  Args:
    ids: integer ids understood by the `chars_from_ids` lookup layer.

  Returns:
    A string tensor in which the characters along the last axis of `ids`
    have been concatenated.
  """
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)

In [70]:
# Decode the first ten training ids back to characters as a sanity check.
for ids in train.take(10):
    char_bytes = chars_from_ids(ids).numpy()
    print(char_bytes.decode('utf-8'))

In [73]:
# Show the tf.data.Dataset that now holds the training ids.
train

In [75]:
def final_create_sequence_sequence_batch(dataset, window_size=1, batch_size=64, buffer_size=10000):
  """Turn a dataset of single elements into shuffled (input, target) batches.

  The elements are grouped into consecutive, non-overlapping chunks of
  `window_size + 1` items. Each chunk yields an input (all but the last item)
  and a target (all but the first item), i.e. the target is the input shifted
  by one position.

  Args:
    dataset: tf.data.Dataset to cut into chunks.
    window_size: length of each input/target sequence.
    batch_size: number of (input, target) pairs per batch; incomplete
      batches are dropped.
    buffer_size: size of the shuffle buffer.

  Returns:
    A cached, shuffled, batched and prefetched tf.data.Dataset of
    (input, target) pairs.
  """
  AUTOTUNE = tf.data.experimental.AUTOTUNE
  chunk_len = window_size + 1

  def split_input_target(chunk):
    # Input drops the last item, target drops the first one.
    return chunk[:-1], chunk[1:]

  return (
      dataset
      .window(chunk_len, shift=chunk_len, drop_remainder=True)
      .flat_map(lambda chunk: chunk.batch(chunk_len))
      .map(split_input_target, num_parallel_calls=AUTOTUNE)
      .cache()
      .shuffle(buffer_size)
      .batch(batch_size, drop_remainder=True)
      .prefetch(AUTOTUNE)
  )

In [76]:
# Length of every input/target sequence fed to the model.
seq_length = 500

# Cut both splits into batched (input, target) sequences of that length.
train = final_create_sequence_sequence_batch(dataset=train, window_size=seq_length)
val = final_create_sequence_sequence_batch(dataset=val, window_size=seq_length)

In [None]:
# Show one training batch as text: the inputs next to their shifted targets.
for input_example, target_example in train.take(1):
    input_text = text_from_ids(input_example).numpy()
    target_text = text_from_ids(target_example).numpy()
    print("Input :", input_text)
    print("Target:", target_text)
    print("\n\n")

In [79]:
# Model hyperparameters.

# vocab_size: number of entries in the StringLookup layer's vocabulary.
vocab_size = len(ids_from_chars.get_vocabulary())

# embedding_dim: the embedding dimension; rnn_units: the number of RNN units.
embedding_dim, rnn_units = 256, 1024

In [96]:
class MyModel(tf.keras.Model):
  """Embedding -> GRU -> Dense model that outputs one logit per vocabulary entry."""

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the layers.

    Args:
      vocab_size: number of distinct ids; used as the embedding input size
        and as the width of the output layer.
      embedding_dim: size of each embedding vector.
      rnn_units: number of units in the GRU layer.
    """
    # No positional arguments: passing `self` again handed a stray positional
    # argument to the Keras base initializer, which newer Keras versions
    # reject.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    # No activation: the outputs are raw logits.
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of ids.

    Args:
      inputs: batch of integer ids fed to the embedding layer.
      states: optional initial GRU state; `None` starts from the layer's
        default (zero) state.
      return_state: when true, also return the final GRU state.
      training: forwarded to every layer.

    Returns:
      The logits, or a `(logits, states)` tuple when `return_state` is true.
    """
    x = self.embedding(inputs, training=training)
    # `initial_state=None` makes the GRU create its own zero state, so there
    # is no need to call the version-dependent `get_initial_state` here.
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    return x

In [97]:
# Instantiate the model with the hyperparameters defined above.
model = MyModel(vocab_size, embedding_dim, rnn_units)

In [98]:
# Push one batch through the model to check the shape of its output.
for input_example_batch, target_example_batch in train.take(1):
    example_batch_predictions = model(input_example_batch)
    shape_note = "# (batch_size, sequence_length, vocab_size)"
    print(example_batch_predictions.shape, shape_note)

In [92]:
# Print the layer-by-layer summary of the model.
model.summary()

In [99]:
# The targets are integer ids and the loss is told to expect logits
# (from_logits=True).
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=loss, optimizer='adam')

In [102]:
# Keep only the best model under best_music/ and stop training once there
# has been no improvement for 20 epochs.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_music/',
    save_best_only=True,
)
earlystopping = tf.keras.callbacks.EarlyStopping(patience=20)

In [None]:
# Upper bound on the number of epochs; early stopping may end training sooner.
EPOCHS = 100
history = model.fit(
    train,
    validation_data=val,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback, earlystopping],
)