## Import Packages

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, Dropout, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np

## Download Dataset

In [2]:
!wget --no-check-certificate http://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt -O sonnets.txt

--2022-01-27 13:54:52--  http://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.218.128, 142.250.153.128, 142.250.145.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.218.128|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93578 (91K) [text/plain]
Saving to: ‘sonnets.txt’


2022-01-27 13:54:52 (356 MB/s) - ‘sonnets.txt’ saved [93578/93578]



## Tokenize Text

In [None]:
tokenizer = Tokenizer()

# Read the corpus through a context manager so the file handle is closed as
# soon as the text is in memory. The relative path matches the `-O sonnets.txt`
# target of the download cell, so this also works outside of Colab's /content.
with open("sonnets.txt", encoding="utf-8") as sonnets_file:
  data = sonnets_file.read()

# One training "sentence" per line of the file, lower-cased.
corpus = data.lower().split("\n")
tokenizer.fit_on_texts(corpus)

# +1 on top of the vocabulary size: the sequences built from this tokenizer are
# later padded with 0, so the output layer needs a slot for that index as well.
total_words = len(tokenizer.word_index) + 1

## From Text to Numbers

In [None]:
# Encode each corpus line as word indices and keep every prefix of length >= 2.
# A line encoded as [a, b, c] contributes [a, b] and [a, b, c].
input_sequences = [
  encoded_line[:end]
  for encoded_line in (tokenizer.texts_to_sequences([text])[0] for text in corpus)
  for end in range(2, len(encoded_line) + 1)
]

# Left-pad every prefix to the length of the longest one so they can be stacked
# into a single 2-D array ("pre" padding keeps the final token in the last column).
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(
  pad_sequences(input_sequences, maxlen=max_sequence_len, padding="pre")
)

# Training pairs: everything except the last column is the model input, the
# last column is the word to predict, one-hot encoded over the vocabulary.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

## Build the LSTM Model

In [32]:
model = Sequential([
  # Maps each word index to a trainable 100-dimensional vector.
  Embedding(input_dim = total_words, output_dim = 100, input_length= max_sequence_len -1),
  # Every LSTM layer except the last must return the full sequence so the next
  # LSTM receives one vector per time step.
  LSTM(150, return_sequences = True),
  Dropout(0.3),
  LSTM(96),
  # Floor division keeps the unit count an integer; `total_words / 2` is a
  # float (e.g. 1605.5) and only worked when the layer truncated it silently.
  Dense(total_words // 2, activation = "relu"),
  # One output per vocabulary index; softmax turns them into a distribution.
  Dense(total_words, activation = "softmax")
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 10, 100)           321100    
                                                                 
 lstm_2 (LSTM)               (None, 10, 150)           150600    
                                                                 
 dropout_1 (Dropout)         (None, 10, 150)           0         
                                                                 
 lstm_3 (LSTM)               (None, 96)                94848     
                                                                 
 dense_2 (Dense)             (None, 1605)              155685    
                                                                 
 dense_3 (Dense)             (None, 3211)              5156866   
                                                                 
Total params: 5,879,099
Trainable params: 5,879,099
Non-

## Train the Neural Network

In [36]:
# The targets are one-hot rows over the vocabulary, which is the label format
# categorical cross-entropy expects.
model.compile(optimizer="adam", loss="categorical_crossentropy")
model.fit(x=xs, y=ys, epochs=200, verbose=1)

## Generate Text

In [42]:

def predict_next_words(seed_text, next_words):
  """Greedily extend ``seed_text`` with up to ``next_words`` predicted words.

  On every step the current text is tokenized, left-padded to the model's
  input length, and the highest-scoring vocabulary index is appended as a word.

  Uses the notebook-level ``tokenizer``, ``model`` and ``max_sequence_len``.

  Args:
    seed_text: Text the generation starts from.
    next_words: Maximum number of words to append.

  Returns:
    The seed text followed by the generated words, separated by single spaces.
    The same string is also printed.
  """
  for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    padded = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    probabilities = model.predict(padded, verbose=0)
    # argmax over the vocabulary axis yields a length-1 array (batch of one);
    # unwrap it to a plain int so it can be used as a dictionary key.
    predicted = int(np.argmax(probabilities, axis=-1)[0])
    # index_word is the tokenizer's reverse mapping, so no vocabulary scan is needed.
    output_word = tokenizer.index_word.get(predicted, "")
    if not output_word:
      # The predicted index has no word (e.g. the padding index 0). Appending
      # an empty word would leave the tokenized input unchanged, so every
      # remaining step would repeat this prediction and only add spaces.
      break
    seed_text += " " + output_word

  print(seed_text)
  return seed_text

In [45]:
# Prompt to continue and the number of words to generate after it.
seed_text = "I love you"
next_words = 100
generated_text = predict_next_words(seed_text=seed_text, next_words=next_words)

I love you bore i beauty nourish'd of those old near care it wastes new lie date to heart ' ' gladly forth forth me with seen lease of day day trust night oppress'd night trust bow bow bow thereby face staineth lie it it in me lies lies lies you ' untrue ' 'no ' untrue untrue untrue untrue untrue untrue untrue untrue forth bright to lease days ' wrong seen days days ' ' wrong new gone esteem'd face seen untrue seen days days days or near be more ' wrong seen ' date staineth faith torn gone seen of trust


"I love you bore i beauty nourish'd of those old near care it wastes new lie date to heart ' ' gladly forth forth me with seen lease of day day trust night oppress'd night trust bow bow bow thereby face staineth lie it it in me lies lies lies you ' untrue ' 'no ' untrue untrue untrue untrue untrue untrue untrue untrue forth bright to lease days ' wrong seen days days ' ' wrong new gone esteem'd face seen untrue seen days days days or near be more ' wrong seen ' date staineth faith torn gone seen of trust"