# Text generator

In [1]:
%tensorflow_version 2.x
import tensorflow as tf
import string
import requests

In [2]:
# Download the complete works of Shakespeare (plain text) to use as the training corpus.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of silently using an error page as text.
response = requests.get(
    'https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt',
    timeout=60,
)
response.raise_for_status()

In [3]:
response.text[:1000]

'This is the 100th Etext file presented by Project Gutenberg, and\nis presented in cooperation with World Library, Inc., from their\nLibrary of the Future and Shakespeare CDROMS.  Project Gutenberg\noften releases Etexts that are NOT placed in the Public Domain!!\n\nShakespeare\n\n*This Etext has certain copyright implications you should read!*\n\n<<THIS ELECTRONIC VERSION OF THE COMPLETE WORKS OF WILLIAM\nSHAKESPEARE IS COPYRIGHT 1990-1993 BY WORLD LIBRARY, INC., AND IS\nPROVIDED BY PROJECT GUTENBERG ETEXT OF ILLINOIS BENEDICTINE COLLEGE\nWITH PERMISSION.  ELECTRONIC AND MACHINE READABLE COPIES MAY BE\nDISTRIBUTED SO LONG AS SUCH COPIES (1) ARE FOR YOUR OR OTHERS\nPERSONAL USE ONLY, AND (2) ARE NOT DISTRIBUTED OR USED\nCOMMERCIALLY.  PROHIBITED COMMERCIAL DISTRIBUTION INCLUDES BY ANY\nSERVICE THAT CHARGES FOR DOWNLOAD TIME OR FOR MEMBERSHIP.>>\n\n*Project Gutenberg is proud to cooperate with The World Library*\nin the presentation of The Complete Works of William Shakespeare\nfor your

### Text processing

In [4]:
data = response.text.split('\n')
data[0]

'This is the 100th Etext file presented by Project Gutenberg, and'

In [5]:
# Drop the first 253 lines so the corpus starts at the first line of verse
# (see the output below); the removed lines include the Project Gutenberg
# preamble displayed earlier.
# NOTE: this cell rebinds `data`, so running it a second time removes another 253 lines.
data = data[253:]
data[0]

'  From fairest creatures we desire increase,'

In [6]:
len(data)

124204

In [7]:
data = " ".join(data)

In [8]:
data[:1000]

"  From fairest creatures we desire increase,   That thereby beauty's rose might never die,   But as the riper should by time decease,   His tender heir might bear his memory:   But thou contracted to thine own bright eyes,   Feed'st thy light's flame with self-substantial fuel,   Making a famine where abundance lies,   Thy self thy foe, to thy sweet self too cruel:   Thou that art now the world's fresh ornament,   And only herald to the gaudy spring,   Within thine own bud buriest thy content,   And tender churl mak'st waste in niggarding:     Pity the world, or else this glutton be,     To eat the world's due, by the grave and thee.                        2   When forty winters shall besiege thy brow,   And dig deep trenches in thy beauty's field,   Thy youth's proud livery so gazed on now,   Will be a tattered weed of small worth held:     Then being asked, where all thy beauty lies,   Where all the treasure of thy lusty days;   To say within thine own deep sunken eyes,   Were an al

In [9]:
# cleaning the text
def clean_text(doc):
  """Turn raw text into a list of lowercase, purely alphabetic word tokens.

  The text is split on whitespace, every character in ``string.punctuation``
  is deleted from each piece, and pieces that are not entirely alphabetic
  afterwards (numbers, empty strings, mixed tokens) are discarded.
  """
  strip_punct = str.maketrans('', '', string.punctuation)
  words = []
  for raw in doc.split():
    stripped = raw.translate(strip_punct)
    # isalpha() is False for '' as well, so pieces that were only punctuation vanish here
    if stripped.isalpha():
      words.append(stripped.lower())
  return words

tokens = clean_text(data)
print(tokens[:50])

['from', 'fairest', 'creatures', 'we', 'desire', 'increase', 'that', 'thereby', 'beautys', 'rose', 'might', 'never', 'die', 'but', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease', 'his', 'tender', 'heir', 'might', 'bear', 'his', 'memory', 'but', 'thou', 'contracted', 'to', 'thine', 'own', 'bright', 'eyes', 'feedst', 'thy', 'lights', 'flame', 'with', 'selfsubstantial', 'fuel', 'making', 'a', 'famine', 'where', 'abundance', 'lies', 'thy']


In [10]:
len(tokens)

898199

In [11]:
len(set(tokens))

27956

In [12]:
length = 50 + 1  # 50 context words followed by the 1 word to predict

# Only the start of the corpus is used: the last window ends at token index
# 100001 (about one lakh, i.e. 100,000, words), or at len(tokens) - 1 if the
# corpus is shorter than that.
last_end = min(len(tokens) - 1, 100001)

# slide a `length`-word window over the tokens one word at a time;
# each window becomes one space-joined training line
lines = [' '.join(tokens[end - length:end]) for end in range(length, last_end + 1)]

print(len(lines))

99951


In [13]:
lines[0]

'from fairest creatures we desire increase that thereby beautys rose might never die but as the riper should by time decease his tender heir might bear his memory but thou contracted to thine own bright eyes feedst thy lights flame with selfsubstantial fuel making a famine where abundance lies thy self'

In [14]:
lines[1]

'fairest creatures we desire increase that thereby beautys rose might never die but as the riper should by time decease his tender heir might bear his memory but thou contracted to thine own bright eyes feedst thy lights flame with selfsubstantial fuel making a famine where abundance lies thy self thy'

## Build LSTM Model and Prepare X and y

In [15]:
import numpy as np
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [16]:
# converting words to numericals using Tokenizer
# fit_on_texts builds the word -> integer mapping (tokenizer.word_index) from `lines`;
# texts_to_sequences then encodes every line as a list of those integers.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [17]:
# The encoded lines are stacked into a 2-D array (this relies on every line
# encoding to the same number of indices). All columns but the last form the
# model input X; the last column is the word index to predict, y.
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:,-1]
X[0]

array([  49, 1437, 1624,   44,  427, 1305,    9, 2338,  669,  849,  165,
        131,  229,   17,   27,    1, 2977,   86,   31,   83, 2976,   19,
        771, 1436,  165,  205,   19,  975,   17,   22, 2975,    4,  160,
        119,  741,  133, 8841,   26, 8839, 1918,   16, 8838, 8837,  596,
          7, 2974,  107, 1917,  458,   26])

In [18]:
y[0]

176

In [19]:
X[1]

array([1437, 1624,   44,  427, 1305,    9, 2338,  669,  849,  165,  131,
        229,   17,   27,    1, 2977,   86,   31,   83, 2976,   19,  771,
       1436,  165,  205,   19,  975,   17,   22, 2975,    4,  160,  119,
        741,  133, 8841,   26, 8839, 1918,   16, 8838, 8837,  596,    7,
       2974,  107, 1917,  458,   26,  176])

In [20]:
vocab_size = len(tokenizer.word_index) + 1

In [21]:
# one hot encoding of y
y = to_categorical(y, num_classes=vocab_size)
print(y[0])

[0. 0. 0. ... 0. 0. 0.]


In [22]:
seq_length = X.shape[1]
seq_length

50

## LSTM Model

In [23]:
# Network: word embedding -> two stacked LSTMs -> dense hidden layer -> softmax over the vocabulary
model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_length),
    LSTM(100, return_sequences=True),  # emit the full sequence so the next LSTM can consume it
    LSTM(100),
    Dense(100, activation='relu'),
    Dense(vocab_size, activation='softmax'),
])

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            442200    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 8844)              893244    
Total params: 1,486,344
Trainable params: 1,486,344
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
# increase the epochs to get more accuracy
history = model.fit(X, y, batch_size = 256, epochs = 100)

In [27]:
model.save("/content/drive/MyDrive/Colab Notebooks/text-generator.h5")

In [28]:
# Reload through tf.keras, the same Keras implementation that built and saved the model,
# rather than the standalone `keras` package, whose installed version may not match it.
model = tf.keras.models.load_model("/content/drive/MyDrive/Colab Notebooks/text-generator.h5")

In [29]:
# function to generate the next text of words from the given text
def generate_text_seq(model, tokenizer, text_seq_length, seed_text, n_words):
  """
  Generate `n_words` words that continue `seed_text`, one word at a time.

  model: object whose predict() returns per-word scores for a batch of one sequence.
  tokenizer: fitted tokenizer providing texts_to_sequences() and index_word.
  text_seq_length: number of word indices fed to the model at each step.
  seed_text is the input text.
  n_words is the number of words to be predicted after the input text.

  Returns the predicted words joined by single spaces (the seed text is not included).
  """
  text = []

  for _ in range(n_words):
    # encode the running text and keep only its last `text_seq_length` word indices
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    encoded = pad_sequences([encoded], maxlen = text_seq_length, truncating='pre')

    # greedy decoding: highest-scoring index of the single batch row, as a plain int
    # (verbose=0 avoids printing a progress bar for every generated word)
    y_predict = int(np.argmax(model.predict(encoded, verbose=0), axis=-1)[0])

    # direct index -> word lookup instead of scanning word_index each step;
    # an index with no word (e.g. 0, which word_index never assigns) yields ''
    predicted_word = tokenizer.index_word.get(y_predict, '')
    seed_text = seed_text + ' ' + predicted_word
    text.append(predicted_word)
  return ' '.join(text)

In [32]:
seed_text = lines[1234]
seed_text

'i behold the violet past prime and sable curls all silvered oer with white when lofty trees i see barren of leaves which erst from heat did canopy the herd and summers green all girded up in sheaves borne on the bier with white and bristly beard then of thy beauty'

In [36]:
generate_text_seq(model, tokenizer, seq_length, seed_text, 10)

'and most best eyes can make times fickle glass if'