In [0]:
# Mount Google Drive at /content/drive; the data and output paths used in this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

import numpy as np
import os
import time

In [0]:
# Location of the input text file on the mounted Google Drive.
path_to_file = '/content/drive/My Drive/lyric/preprocessed_data.txt'

In [0]:
# Read the whole input file into a list of lines (readlines keeps each line's
# trailing '\n'). The context manager closes the handle as soon as reading is
# done instead of leaving it open for the lifetime of the kernel.
with open(path_to_file, 'r', encoding='utf-8') as data:
    text = data.readlines()

# Number of lines read.
print(str(len(text)))

In [0]:
# vocab_num maps each token to (number of occurrences - 1): the first sighting
# stores 0 and every later sighting adds 1.
vocab_num = {}

for raw_line in text:
    # A bare '\n' line is kept untouched; on any other line the newline and the
    # parentheses get a leading space so they split off as separate tokens.
    if raw_line == '\n':
        spaced = raw_line
    else:
        spaced = raw_line.replace('\n', ' \n').replace(')', ' )').replace('(', ' (')
    for token in spaced.split(' '):
        vocab_num[token] = vocab_num.get(token, -1) + 1

# Keep only tokens whose stored count is positive, i.e. tokens seen at least
# twice, and order them so that ids derived from positions are deterministic.
vocab = sorted(token for token, count in vocab_num.items() if count > 0)

print('{} unique words'.format(len(vocab)))

In [0]:
# Report the vocabulary size.
print('vocab_len: ' + str(len(vocab)))

In [0]:
# Persist the vocabulary, one token per line, in the order held by `vocab`.
# The context manager guarantees the file is flushed and closed even if a
# write raises part-way through.
# NOTE(review): if '\n' itself is a vocabulary token it is written as an empty
# line followed by another newline - confirm whatever reads this file copes.
with open('/content/drive/My Drive/lyric/vocabs_word_token.txt', 'w', encoding='utf-8') as vocab_file:
    for voc in vocab:
        vocab_file.write(voc + '\n')

In [0]:
# Lookup tables: token -> integer id (its position in `vocab`) and id -> token.
word2idx = dict(zip(vocab, range(len(vocab))))
idx2word = np.array(vocab)

# Encode the corpus as one flat sequence of ids. Lines are spaced and split
# exactly as in the vocabulary-building cell.
encoded_tokens = []
for raw_line in text:
    spaced = raw_line
    if raw_line != '\n':
        spaced = raw_line.replace('\n', ' \n').replace(')', ' )').replace('(', ' (')
    # Tokens that are not in word2idx are skipped rather than mapped to an id.
    encoded_tokens.extend(
        word2idx[token] for token in spaced.split(' ') if token in word2idx
    )

text_as_int = np.array(encoded_tokens)

In [0]:
# Display the encoded corpus (the notebook shows the array's repr).
text_as_int

In [0]:
# Print up to the first 50 entries of word2idx as a dict-like preview.
# The keys are sliced instead of zipping against range(50) with a throwaway
# `_` loop variable: binding `_` at notebook top level stops IPython from
# updating its last-output variables.
print('{')
for word in list(word2idx)[:50]:
    print('  {:4s}: {:3d},'.format(repr(word), word2idx[word]))
print('  ...\n}')

In [0]:
# Number of input tokens per training example.
seq_length = 100
# Each example consumes seq_length+1 tokens (the input plus its one-step-shifted
# target), so the count must come from the encoded token sequence. `text` is a
# list of lines, so len(text) would count lines rather than tokens.
examples_per_epoch = len(text_as_int)//(seq_length+1)

# Stream of individual token ids.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode and print the first 10 tokens.
for i in char_dataset.take(10):
  print(idx2word[i.numpy()])

In [0]:
# Group consecutive token ids into chunks of seq_length+1; a final chunk that
# is shorter than that is dropped (drop_remainder=True).
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Preview the first two chunks, decoded back to words joined by spaces.
for item in sequences.take(2):
  print(repr(' '.join(idx2word[item.numpy()])))

In [0]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    Args:
        chunk: Any sliceable sequence (here, a tensor of token ids).

    Returns:
        A tuple ``(chunk[:-1], chunk[1:])``: the input is everything except
        the last element, the target is everything except the first, so
        ``target[i]`` is the element that follows ``input[i]``.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk in `sequences` into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [0]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64

# Number of elements held in the shuffle buffer.
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than from `dataset`
# itself, so re-running this cell does not shuffle and batch a dataset that
# was already batched by a previous run of the same cell.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

In [0]:
# Number of distinct tokens in the vocabulary.
vocab_size = len(vocab)

# Width of each token embedding vector.
embedding_dim = 256

# Number of units in the GRU layer.
rnn_units = 512

In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the Embedding -> GRU -> Dense model.

  Args:
    vocab_size: Size of the embedding table and of the Dense output layer.
    embedding_dim: Width of each embedding vector.
    rnn_units: Number of units in the GRU layer.
    batch_size: Fixed batch dimension of the input (the GRU is stateful);
      the sequence dimension is left as None.

  Returns:
    An uncompiled tf.keras.Sequential model. The final Dense layer has no
    activation, so the model outputs raw scores of width vocab_size for
    every time step.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size, embedding_dim,
      batch_input_shape=[batch_size, None])
  recurrent = tf.keras.layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

In [0]:
# Instantiate the training model. The vocabulary size is recomputed from
# `vocab` here; the other hyperparameters come from the constants defined
# in earlier cells.
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [0]:
# Run the untrained model on a single batch to check the output shape. The loop
# variables stay bound after the loop and are reused by later cells.
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

In [0]:
# Print the layer-by-layer summary of the model.
model.summary()

In [0]:
# Sample one token id per time step from the predictions for the first example
# in the batch, treating the model outputs as logits.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [0]:
def loss(labels, logits):
  """Sparse categorical cross-entropy computed on raw logits.

  Args:
    labels: Integer target ids.
    logits: Unnormalised model outputs (from_logits=True is passed, so no
      softmax is expected to have been applied).

  Returns:
    The loss tensor returned by
    tf.keras.losses.sparse_categorical_crossentropy.
  """
  per_step_loss = tf.keras.losses.sparse_categorical_crossentropy(
      labels, logits, from_logits=True)
  return per_step_loss

# Evaluate the loss of the untrained model on the example batch and print its
# mean as a single scalar.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

In [0]:
# Configure training with the Adam optimizer and the custom `loss` function.
model.compile(optimizer='adam', loss=loss)

In [0]:
# Directory on the mounted Drive where checkpoints are written.
checkpoint_dir = '/content/drive/My Drive/lyric/model_output_word'
# Checkpoint path template; `{epoch}` is a placeholder in the file name.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights (not the full model) to the
# template path above.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [0]:
# Number of passes over the training dataset.
EPOCHS=50

In [0]:
# Train for EPOCHS epochs with the checkpoint callback attached; the object
# returned by fit is kept in `history`.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])