Using the datasets: [Poems](https://www.kaggle.com/datasets/alindgupta99/poems-lyrics), [Quotes](https://www.kaggle.com/datasets/akmittal/quotes-dataset)

In [None]:

from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np, json, os, io, time, random, sys

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, LSTM, GRU
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import LambdaCallback, ModelCheckpoint, ReduceLROnPlateau

In [None]:
# Number of samples per gradient update; passed to model.fit() as `batch_size`
batch_size = 128

In [None]:
def get_poems_text() -> str:
    """Return the full contents of ``poems.txt`` as a single string.

    The file is looked up relative to the current working directory and is
    decoded as UTF-8 explicitly, so the result does not depend on the
    platform's default locale encoding (and matches how ``quotes.json`` is
    read).

    Returns:
        The complete text of the poems corpus.

    Raises:
        FileNotFoundError: If ``poems.txt`` is not present.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open("poems.txt", 'r', encoding="utf-8") as file:
        return file.read()

def get_quotes_text() -> str:
    """Return the first quarter of the quotes in ``quotes.json`` as one string.

    The file is parsed as UTF-8 JSON. The code slices the top-level value and
    reads the ``'Quote'`` entry of every item, so the file is expected to hold
    a list of objects that each carry that key. Each quote is converted with
    ``str`` and the quotes are joined with newlines.

    Returns:
        The selected quotes, one per line (empty string if fewer than four
        quotes are available).

    Raises:
        FileNotFoundError: If ``quotes.json`` is not present.
        KeyError: If an item has no ``'Quote'`` entry.
    """
    # NOTE: the training batch size is the module-level `batch_size`; assigning
    # a local of the same name here would not change it. Raise the module-level
    # value if a larger batch is wanted for this (much bigger) corpus.
    file_name = "quotes.json"
    with open(file_name, 'r', encoding="utf-8") as file:
        dataset = json.load(file)
    # Only the first quarter is kept: the full dataset is too large for the
    # resources this was trained with.
    subset = dataset[:len(dataset) // 4]
    return '\n'.join(str(item['Quote']) for item in subset)

In [None]:
# Training corpus as one long string. The quotes corpus is used here;
# call get_poems_text() instead to train on the poems corpus.
text = get_quotes_text()

In [None]:
# Every distinct character of the corpus, in sorted order, forms the vocabulary
vocabulary = sorted(set(text))
# Two-way lookup tables between a character and its position in the vocabulary
char_to_indices = {char: index for index, char in enumerate(vocabulary)}
indices_to_char = dict(enumerate(vocabulary))

In [None]:
# Slide a window of `max_length` characters over the text, advancing `steps`
# characters at a time. Each window is one training input and the character
# immediately following it is the target to predict.
max_length = 100
steps = 5
window_starts = range(0, len(text) - max_length, steps)
sentences = [text[start: start + max_length] for start in window_starts]
next_chars = [text[start + max_length] for start in window_starts]

### One-hot encoding each character into a boolean vector

In [None]:
# Allocate the one-hot tensors, initially all False:
#   X[i, t, k] is True when character t of window i is vocabulary entry k
#   y[i, k]    is True when the character following window i is vocabulary entry k
# The builtin `bool` is used as dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in later releases.
X = np.zeros((len(sentences), max_length, len(vocabulary)), dtype=bool)
y = np.zeros((len(sentences), len(vocabulary)), dtype=bool)
# Set the single True entry for every input character and every target character
for i, (sentence, next_char) in enumerate(zip(sentences, next_chars)):
    for t, char in enumerate(sentence):
        X[i, t, char_to_indices[char]] = True
    y[i, char_to_indices[next_char]] = True

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X = np.zeros((len(sentences), max_length, len(vocabulary)), dtype = np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = np.zeros((len(sentences), len(vocabulary)), dtype = np.bool)


### Model Creation

In [None]:
# Character-level model: one GRU layer reads a one-hot encoded window of
# `max_length` characters, and a softmax over the vocabulary scores the
# character that follows it.
model = Sequential([
    GRU(128, input_shape=(max_length, len(vocabulary))),
    Dense(len(vocabulary)),
    Activation('softmax'),
])
optimizer = RMSprop(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [None]:
def sample_index(preds, temperature = 1.0):
    """Sample an index from a probability array after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalized, then a single index is drawn at random from the result.
    Temperatures below 1 sharpen the distribution towards the most likely
    entries; temperatures above 1 flatten it.

    Args:
        preds: Array-like of probabilities (e.g. a softmax output).
        temperature: Non-negative scaling factor. ``0`` selects the most
            probable index deterministically (the limit of the scaling).

    Returns:
        The sampled index, as returned by ``np.argmax``.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")
    # float64 keeps the renormalized probabilities accurate enough for
    # np.random.multinomial's sum-to-one check
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        return np.argmax(preds)
    # Zero probabilities legitimately map to -inf (and back to 0 after exp),
    # so the divide-by-zero warning from log(0) is silenced here
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shifting by the maximum leaves the normalized result unchanged but stops
    # every exp() from underflowing to 0 (which gave NaN) at small temperatures
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    # One multinomial trial yields a one-hot row marking the drawn index
    draw = np.random.multinomial(1, probs, 1)
    return np.argmax(draw)

In [None]:
# Callback that saves the model to `filepath` whenever the training loss
# reaches a new minimum
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
# Callback that multiplies the learning rate by 0.2 once the training loss has
# not improved for 5 epochs, never going below 0.001
reduce_alpha = ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=5,
    min_lr=0.001,
)


In [None]:
# Train for 10 epochs on the one-hot encoded windows, saving the best weights
# and lowering the learning rate on plateaus through the two callbacks
model.fit(
    X,
    y,
    batch_size=batch_size,
    epochs=10,
    verbose=1,
    callbacks=[checkpoint, reduce_alpha],
)

Epoch 1/10






Epoch 10: loss did not improve from 1.58478




INFO:tensorflow:Assets written to: /tmp/tmpv_cdz0uh/model/data/model/assets


INFO:tensorflow:Assets written to: /tmp/tmpv_cdz0uh/model/data/model/assets


<keras.callbacks.History at 0x7f92907bb0d0>

In [None]:
def generate_text(length, diversity):
    """Generate text by repeatedly sampling the model's next-character output.

    A random window of ``max_length`` characters is taken from ``text`` as the
    seed. For each generated character the current window is one-hot encoded,
    passed to ``model.predict``, and the next character is drawn with
    ``sample_index`` using ``diversity`` as the temperature. The window then
    slides forward by one character.

    Args:
        length: Number of characters to generate after the seed.
        diversity: Temperature forwarded to ``sample_index``.

    Returns:
        The seed window followed by the ``length`` generated characters.
    """
    start_index = random.randint(0, len(text) - max_length - 1)
    sentence = text[start_index: start_index + max_length]
    # Collect the pieces and join once instead of growing a string with +=
    pieces = [sentence]
    for _ in range(length):
        # One-hot encode the current window as a batch of a single sample
        x_pred = np.zeros((1, max_length, len(vocabulary)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_to_indices[char]] = 1.
        preds = model.predict(x_pred, verbose=0)[0]
        # Draw the next character index at random from the predicted distribution
        next_index = sample_index(preds, diversity)
        next_char = indices_to_char[next_index]
        pieces.append(next_char)
        # Slide the window: drop the oldest character, append the new one
        sentence = sentence[1:] + next_char
    return ''.join(pieces)

In [None]:
# Print a random seed window followed by 300 generated characters, sampled
# at diversity (temperature) 0.2
print(generate_text(300, 0.2))

body will care for you.
Very occasionally, if you pay really close attention, life doesn't suck.
Love the the the and and e the f the was the the the the to the se to the s the and ther the an are the and are the the we and I I whe the warr are the the the and I con on the wome aner and to be pat the are to he the s the the s fome the wand the wile the and I the an and and cous the s as and the th
