# Data preparation and preprocessing

In [1]:
import tensorflow as tf

import numpy as np
import pandas as pd
import os
import time

device_name = tf.test.gpu_device_name()# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

AUTOTUNE = tf.data.experimental.AUTOTUNE

Found GPU at: /device:GPU:0


## Load Dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

path_to_file = '/content/drive/MyDrive/Lyrics_generator/english_cleaned_lyrics.csv'

Mounted at /content/drive


In [3]:
# Load CSV data
data = pd.read_csv(path_to_file)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Select the columns: lyrics and genre
# columns = ['index', 'song', 'artist', 'year']

# data.drop(columns, inplace=True, axis=1)
#data.drop(data.columns[[0]], axis=1, inplace=True)

data = data[['lyrics', 'genre']]

#print(lyrics[:10])
#print(genre[:10])
print(data[:10])

                                              lyrics genre
0  Oh baby how you doing You know I'm gonna cut r...   Pop
1  playin everything so easy it's like you seem s...   Pop
2  If you search For tenderness It isn't hard to ...   Pop
3  Oh oh oh I oh oh oh I If I wrote a book about ...   Pop
4  Party the people the people the party it's pop...   Pop
5  I heard Church bells ringing I heard A choir s...   Pop
6  This is just another day that I would spend Wa...   Pop
7  Waiting waiting waiting waiting Waiting waitin...   Pop
8   I read all of the magazines while waiting aro...   Pop
9  N n now honey You better sit down and look aro...   Pop


## Preprocess and split dataset

In [4]:
# Lower
def lyrics_preprocessing(lyrics_text):
    return lyrics_text.lower() #.split()

data['lyrics'] = data['lyrics'].apply(lyrics_preprocessing)



# One Hot encoding
data = pd.get_dummies(data, columns=['genre'])

#print(data[:10])

lyrics = data[['lyrics']]
genre = data.iloc[:, 1:]

print(lyrics[:10])
print(genre[:10])

                                              lyrics
0  oh baby how you doing you know i'm gonna cut r...
1  playin everything so easy it's like you seem s...
2  if you search for tenderness it isn't hard to ...
3  oh oh oh i oh oh oh i if i wrote a book about ...
4  party the people the people the party it's pop...
5  i heard church bells ringing i heard a choir s...
6  this is just another day that i would spend wa...
7  waiting waiting waiting waiting waiting waitin...
8   i read all of the magazines while waiting aro...
9  n n now honey you better sit down and look aro...
   genre_Country  genre_Electronic  genre_Folk  genre_Hip-Hop  genre_Indie  \
0              0                 0           0              0            0   
1              0                 0           0              0            0   
2              0                 0           0              0            0   
3              0                 0           0              0            0   
4              0           

# Nuovo Tutorial TF


In [5]:
def split_l(x):
  splitted = x.split()[:6]
  if len(splitted) < 6: 
    return None
  else:
    return " ".join(splitted)

'''nan_values = l.isna()
nan_columns = nan_values.any()
print("Prima: ", nan_columns)'''

l = data['lyrics'].apply(split_l)
l.dropna(axis=0, inplace=True)

'''nan_values = l.isna()
nan_columns = nan_values.any()
print("Dopo: ", nan_columns)'''
#print("Dopo: ", l.shape)

tokens = tf.strings.split(l)

#tokens = data['lyrics'].apply(lambda t: tf.strings.split(t, maxsplit=5))

tokens[:10]

<tf.RaggedTensor [[b'oh', b'baby', b'how', b'you', b'doing', b'you'], [b'playin', b'everything', b'so', b'easy', b"it's", b'like'], [b'if', b'you', b'search', b'for', b'tenderness', b'it'], [b'oh', b'oh', b'oh', b'i', b'oh', b'oh'], [b'party', b'the', b'people', b'the', b'people', b'the'], [b'i', b'heard', b'church', b'bells', b'ringing', b'i'], [b'this', b'is', b'just', b'another', b'day', b'that'], [b'waiting', b'waiting', b'waiting', b'waiting', b'waiting', b'waiting'], [b'i', b'read', b'all', b'of', b'the', b'magazines'], [b'n', b'n', b'now', b'honey', b'you', b'better']]>

In [6]:
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

#lyrics_tensor = tf.convert_to_tensor(data['lyrics'].tolist())

layer = StringLookup()
layer.adapt(tokens)
vocab = layer.get_vocabulary()

vocab[:10]

['', '[UNK]', 'the', 'i', 'you', 'a', 'to', 'in', 'and', 'my']

In [7]:
ids_from_words = layer(tokens)
#ids_from_words = StringLookup(vocabulary=list(vocab))


ids_from_words[:2]
#ids = ids_from_words(tokens)
#ids[:2]

<tf.RaggedTensor [[29, 58, 66, 4, 608, 4], [1982, 171, 27, 322, 23, 32]]>

In [8]:
words_from_ids = StringLookup(
    vocabulary=layer.get_vocabulary(), invert=True)
#words_from_ids = StringLookup(
#    vocabulary=ids_from_words.get_vocabulary(), invert=True)

words = words_from_ids(ids_from_words)
#words = words_from_ids(ids)
words[:10]

<tf.RaggedTensor [[b'oh', b'baby', b'how', b'you', b'doing', b'you'], [b'playin', b'everything', b'so', b'easy', b"it's", b'like'], [b'if', b'you', b'search', b'for', b'tenderness', b'it'], [b'oh', b'oh', b'oh', b'i', b'oh', b'oh'], [b'party', b'the', b'people', b'the', b'people', b'the'], [b'i', b'heard', b'church', b'bells', b'ringing', b'i'], [b'this', b'is', b'just', b'another', b'day', b'that'], [b'waiting', b'waiting', b'waiting', b'waiting', b'waiting', b'waiting'], [b'i', b'read', b'all', b'of', b'the', b'magazines'], [b'n', b'n', b'now', b'honey', b'you', b'better']]>

In [9]:
def text_from_ids(ids):
  return tf.strings.reduce_join(words_from_ids(ids), axis=-1, separator=' ')

In [10]:
#seq_length = 32

#sequences = ids_from_words.batch(seq_length, drop_remainder=True)

sequences = tf.data.Dataset.from_tensor_slices(ids_from_words)
#sequences = tf.data.Dataset.from_tensor_slices(ids)

def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

for input_example, target_example in  dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'oh baby how you doing'
Target: b'baby how you doing you'


DA qui in poi abbiamo copiato senza verificare il codice

In [11]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, None), (64, None)), types: (tf.int64, tf.int64)>

In [12]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024


In [13]:
class MyModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True, 
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else: 
      return x

In [14]:
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)


In [15]:
#for input_example_batch, target_example_batch in dataset.take(1):
for input_example_batch, target_example_batch in dataset.take(15*BATCH_SIZE):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_size)
(64, 5, 36279) # (batch_size, sequence_length, vocab_siz

In [16]:
model.summary()

Model: "my_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  9287424   
_________________________________________________________________
gru (GRU)                    multiple                  3938304   
_________________________________________________________________
dense (Dense)                multiple                  37185975  
Total params: 50,411,703
Trainable params: 50,411,703
Non-trainable params: 0
_________________________________________________________________


# Training



In [17]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [18]:
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_predictions.numpy().mean()

print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_lenght, vocab_size)")
print("Mean loss:        ", mean_loss)

Prediction shape:  (64, 5, 36279)  # (batch_size, sequence_lenght, vocab_size)
Mean loss:         1.1824714e-06


In [19]:
tf.exp(mean_loss).numpy()

1.0000012

In [20]:
model.compile(optimizer='adam', loss=loss)

In [21]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


In [22]:
EPOCHS = 20

history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
 290/3403 [=>............................] - ETA: 8:39 - loss: 5.0915

KeyboardInterrupt: ignored

In [None]:
class OneStep(tf.keras.Model):
  def __init__(self, model, words_from_ids, ids_from_words, temperature=1.0):
    super().__init__()
    self.temperature=temperature
    self.model = model
    self.words_from_ids = words_from_ids
    self.ids_from_words = ids_from_words

    #'''
    # Create a mask to prevent "" or "[UNK]" from being generated.
    skip_ids = self.ids_from_words(['','[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices = skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_words.get_vocabulary())]) 
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)
    #'''

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    '''input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_words(input_chars).to_tensor())'''
    input_ids = self.ids_from_words([inputs])

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits] 
    predicted_logits, states =  self.model(inputs=input_ids, 
                                           states=states, 
                                           return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "" or "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_words = self.words_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_words, states

In [None]:
#one_step_model = OneStep(model, words_from_ids, ids_from_words)
one_step_model = OneStep(model, words_from_ids, layer)

In [None]:
start  = time.time()
states = None
next_word = tf.constant(['romeo'])
result    = [next_word]

for n in range(100):
  next_word, states = one_step_model.generate_one_step(next_word, states=states)
  result.append(next_word)

result = tf.strings.join(result)
end    = time.time()

print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)

print(f"\nRun time: {end - start}")


In [None]:
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')

In [None]:
states = None
next_char = tf.constant(['stock'])
result = [next_char]

for n in range(100):
  next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
  result.append(next_char)

print(tf.strings.join(result, separator=" ")[0].numpy().decode("utf-8"))

# TUTORIAL VECCHIO

## Vectorize the lyrics

In [None]:
# Build the Vectorizer
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Init
vectorize_layer = TextVectorization(
    output_mode='int',
    output_sequence_length=50)

# Load data as TF Dataset object
tf_lyrics = tf.data.Dataset.from_tensor_slices(lyrics)
tf_genre = tf.data.Dataset.from_tensor_slices(genre)

# Fit the Vectorizer
vectorize_layer.adapt(tf_lyrics.batch(1024))
inverse_vocab = vectorize_layer.get_vocabulary()

ids_from_chars = StringLookup(
    vocabulary=inverse_vocab)

# Traform (vectorize) the lyrics
def vectorize_text(text):
  text = tf.expand_dims(text, -1)
  return tf.squeeze(vectorize_layer(text))

# Vectorize the data in text_ds.
tf_vec_lyrics = tf_lyrics.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer)

ids = ids_from_chars(tf_vec_lyrics)



In [None]:
# Build the Vectorizer
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Init
vectorize_layer = TextVectorization(
    output_mode='int',
    output_sequence_length=50)

# Load data as TF Dataset object
tf_lyrics = tf.data.Dataset.from_tensor_slices(lyrics)
tf_genre = tf.data.Dataset.from_tensor_slices(genre)

# Fit the Vectorizer
vectorize_layer.adapt(tf_lyrics.batch(1024))

# Save the inverse vocabulary
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

In [None]:
'''
# Traform (vectorize) the lyrics
def vectorize_text(text):
  text = tf.expand_dims(text, -1)
  return tf.squeeze(vectorize_layer(text))

# Vectorize the data in text_ds.
tf_vec_lyrics = tf_lyrics.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

# Print the vectorized text
sequences = list(tf_vec_lyrics.as_numpy_iterator())
             
print(len(sequences))

#for seq in sequences[:5]:
#  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")
'''

In [None]:
# Transform (vectorize) the lyrics
def vectorize_text(text):
  text = tf.expand_dims(text, -1)
  return tf.squeeze(vectorize_layer(text))

# Vectorize the data in text_ds.
tf_vec_lyrics = tf_lyrics.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

# Print the vectorized text
sequences = tf_vec_lyrics.batch(seq_length+1, drop_remainder=True)

#print(len(sequences))

for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

In [None]:
ids_from_terms = preprocessing.StringLookup(
    vocabulary=list(inverse_vocab))

ids = ids_from_terms(tf_lyrics)

terms_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=ids_from_terms.get_vocabulary(), invert=True)

terms = terms_from_ids(ids)

tf.strings.reduce_join(terms, axis=-1).numpy()



def text_from_ids(term):
  return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

In [None]:
def split_input_target(lyrics):
    input_text = lyrics[:-1]
    target_text = lyrics[1:]
    return input_text, target_text

In [None]:
tf_vec_lyrics = sequences.map(split_input_target)

In [None]:
for input_example, target_example in tf_vec_lyrics.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

In [None]:
'''
ids_from_terms = preprocessing.StringLookup(
    vocabulary=list(inverse_vocab))

ids = ids_from_terms(terms)

chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True)

chars = chars_from_ids(ids)
'''

In [None]:
# The maximum length sentence you want for a single input in characters
#seq_length = 10
#examples_per_epoch = len(text)//(seq_length+1)

# Create training examples / targets
#char_dataset = tf.data.Dataset.from_tensor_slices(list(text_vector_ds.as_numpy_iterator())[0])

In [None]:
#sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

#def split_input_target(chunk):
#    input_text = chunk[:-1]
#    target_text = chunk[1:]
#    return input_text, target_text

#dataset = sequences.map(split_input_target)

## Build the model

In [None]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

tf_vec_lyrics = tf_vec_lyrics.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Length of the vocabulary in chars
vocab_size = len(inverse_vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

class MyModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True, 
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else: 
      return x

model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(inverse_vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)


In [None]:
'''
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
        tf.keras.layers.GRU(rnn_units,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

model = build_model(
    vocab_size=len(inverse_vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)
'''

In [None]:
#example_batch_predictions = 0

#for input_example_batch, target_example_batch in tf_vec_lyrics.take(1):
#    example_batch_predictions = model(input_example_batch)
#    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

#model.summary()


In [None]:
tf_vec_lyrics.batch(50, drop_remainder = True)

for lyric in tf_vec_lyrics.take(1):
  example_batch_predictions = model(lyric)

model.summary()

In [None]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

sampled_indices

In [None]:
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())
