# Word Suggestions

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
import string
import requests

In [None]:
# Download the complete works of Shakespeare (a Project Gutenberg etext) to use as
# the training corpus for the model.
response = requests.get(
    'https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt',
    timeout=60,  # do not hang the notebook forever if the server stops responding
)
# Fail fast on an HTTP error instead of silently training on an error page.
response.raise_for_status()

### Text Preprocessing

In [None]:
# Split the downloaded text into individual lines; the first line is displayed
# as a quick sanity check of what was fetched.
data = response.text.split('\n')
data[0]

'This is the 100th Etext file presented by Project Gutenberg, and'

In [None]:
# Skip the first 253 lines (the Project Gutenberg front matter) so the corpus
# starts at the first line of verse, which is displayed as a check.
# The slice is taken from a fresh split of the response rather than from `data`
# itself, which keeps this cell idempotent: re-running it does not drop a
# further 253 lines.
data = response.text.split('\n')[253:]
data[0]

'  From fairest creatures we desire increase,'

In [None]:
# Collapse the list of lines into one space-separated string for tokenisation.
# NOTE: `data` changes type here (list of lines -> single str). Re-running this
# cell on its own would therefore put a space between every character.
data = " ".join(data)

In [None]:
# Normalise raw text into a list of lowercase, purely alphabetic words.
def clean_text(doc):
  """Tokenise ``doc`` on whitespace and return the cleaned words.

  Every whitespace-separated token has all ``string.punctuation`` characters
  removed. Tokens that are then not purely alphabetic (including tokens that
  became empty) are dropped, and the remaining ones are lower-cased.
  """
  strip_punctuation = str.maketrans('', '', string.punctuation)
  stripped_words = (raw.translate(strip_punctuation) for raw in doc.split())
  return [w.lower() for w in stripped_words if w.isalpha()]

# Tokenise the whole corpus and preview the first 50 cleaned words.
tokens = clean_text(data)
print(tokens[:50])

['from', 'fairest', 'creatures', 'we', 'desire', 'increase', 'that', 'thereby', 'beautys', 'rose', 'might', 'never', 'die', 'but', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease', 'his', 'tender', 'heir', 'might', 'bear', 'his', 'memory', 'but', 'thou', 'contracted', 'to', 'thine', 'own', 'bright', 'eyes', 'feedst', 'thy', 'lights', 'flame', 'with', 'selfsubstantial', 'fuel', 'making', 'a', 'famine', 'where', 'abundance', 'lies', 'thy']


In [None]:
# Each training example is a window of 11 consecutive words:
# 10 words of context followed by the word to be predicted.
length = 10 + 1
# Largest window end index that is still emitted, so the windows only draw on
# roughly the first 1 lakh (100,000) words of the corpus.
last_window_end = 100000 + 1

# Slide the window forward one word at a time and store every window as a
# space-joined string.
lines = []
for end in range(length, min(len(tokens), last_window_end + 1)):
  lines.append(' '.join(tokens[end - length:end]))

print(len(lines))

99991


In [None]:
import numpy as np
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# converting words to numericals using Tokenizer:
# fit_on_texts builds the word -> integer id vocabulary from the training lines,
# then texts_to_sequences encodes every line as a list of those ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [None]:
# Stack the encoded lines into a 2-D integer array
# (assumes every line encodes to the same number of ids — TODO confirm).
sequences = np.array(sequences)
# Inputs are all ids of a row except the last one; the target is the last id.
X, y = sequences[:, :-1], sequences[:,-1]

In [None]:
# Number of output classes and embedding rows. The +1 presumably accounts for
# id 0, which Keras' Tokenizer does not assign to any word (it is the padding id).
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# one hot encoding of y: every target id becomes a vector with vocab_size entries,
# matching the 'categorical_crossentropy' loss the model is compiled with.
# NOTE: `y` is overwritten, so re-running this cell alone would encode the
# already one-hot matrix a second time.
y = to_categorical(y, num_classes=vocab_size)

In [None]:
# Number of context words per training example (the column count of X). It is
# reused as the model's input length and as the padding length at prediction time.
seq_length = X.shape[1]

## LSTM model

In [None]:
# Network: word embedding -> two stacked LSTMs -> dense hidden layer -> softmax
# over the vocabulary.
model = Sequential([
  # Map every word id to a trainable 50-dimensional vector.
  Embedding(vocab_size, 50, input_length=seq_length),
  # return_sequences=True passes the full output sequence on to the next LSTM.
  LSTM(100, return_sequences=True),
  LSTM(100),
  Dense(100, activation='relu'),
  # One probability per vocabulary entry for the next word.
  Dense(vocab_size, activation='softmax'),
])

In [None]:
# Configure training: cross-entropy loss against the one-hot targets, the Adam
# optimiser, and accuracy reported as a metric.
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
# Train the model on the (context, next word) pairs; `history` keeps the object
# returned by fit.
# Increase the number of epochs to get more accuracy.
history = model.fit(X, y, batch_size = 256, epochs = 100)

In [None]:
# Persist the trained model as an .h5 file on Google Drive.
# NOTE(review): the path assumes Drive is already mounted at /content/drive — no
# mount call is visible in this notebook; confirm before running.
model.save("/content/drive/MyDrive/Colab Notebooks/word-suggest.h5")

In [None]:
# Reload the saved model through tf.keras — the same Keras implementation the
# model was built and saved with — rather than the standalone `keras` package,
# whose version may differ from the one bundled with TensorFlow.
model = tf.keras.models.load_model("/content/drive/MyDrive/Colab Notebooks/word-suggest.h5")

In [None]:
# function to suggest the next word(s) for the given text
def generate_text_seq(model, tokenizer, text_seq_length, seed_text, n_words):
  """
  Print the three most likely next words, with their confidences, for seed_text.

  model is the trained network; its predict() must return one probability per
  vocabulary id for each input row.
  tokenizer is the fitted Tokenizer that was used to encode the training text.
  text_seq_length is the number of word ids the model expects as input.
  seed_text is the input text.
  n_words is the number of words to be predicted after the input text. After
  each round the top suggestion is appended to the text, so every round
  suggests the word that follows the previous suggestion.

  Nothing is returned; the suggestions are printed.
  """
  # Build the id -> word lookup once instead of scanning word_index for every suggestion.
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}

  for _ in range(n_words):
    # Encode the text and keep only its last text_seq_length ids.
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    encoded = pad_sequences([encoded], maxlen = text_seq_length, truncating='pre')

    # Run the model once and reuse the probabilities for both the ranking and
    # the confidence values.
    probabilities = np.max(model.predict(encoded), axis=0)
    y_predicts = np.argsort(probabilities)[[-1,-2,-3]]
    y_confidences = probabilities[y_predicts]

    print("Suggestions: ", end='')
    for y_predict in y_predicts:
      word = index_to_word.get(int(y_predict))
      # Ids that map to no word (e.g. the padding id 0) print nothing.
      if word is not None:
        print(word , end='     ')
    print(f"\nConfidence:  {y_confidences[0]:.2f}   {y_confidences[1]:.2f}   {y_confidences[2]:.2f}")

    # Feed the best suggestion back in so the next round predicts the word after it.
    best_word = index_to_word.get(int(y_predicts[0]))
    if best_word is None:
      break
    seed_text = f"{seed_text} {best_word}"


In [None]:
# Use one of the training lines as a sample input and display it.
# A training line holds `length` words; when it is encoded for prediction only
# the last `seq_length` ids are kept (truncating='pre' in generate_text_seq).
seed_text = lines[526]
seed_text

'was but flowers distilled though they with winter meet leese but'

In [None]:
# Print the top-3 next-word suggestions for the sample text (one prediction round).
generate_text_seq(model, tokenizer, seq_length, seed_text, 1)

Suggestions: their     all     the     
Confidence:  0.71   0.05   0.03
