# Neural Language Models and Word Embeddings

This notebook uses the text of the book [Frankenstein; Or, The Modern Prometheus by Mary Wollstonecraft Shelley](https://www.gutenberg.org/ebooks/42324) as its training dataset.

TENSORFLOW

In [None]:
# LIBRARIES
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import tensorflow as tf 
import logging
tf.get_logger().setLevel(logging.ERROR)

In [None]:
# PARAMETERS
# Input corpus.
INPUT_FILE_NAME = 'frankenstein.txt'

# Training examples: each example is WINDOW_LENGTH consecutive words, and
# consecutive examples start WINDOW_STEP words apart.
WINDOW_LENGTH = 40
WINDOW_STEP = 3

# Vocabulary size (tokenizer limit and width of the softmax output) and
# dimensionality of the learned word embeddings.
MAX_WORDS = 10000
EMBEDDING_WIDTH = 100

# Training schedule.
EPOCHS = 32
BATCH_SIZE = 256

# Number of words generated after the seed words at inference time.
PREDICT_LENGTH = 3

In [None]:
# Read the whole corpus into memory. The context manager guarantees that the
# file handle is closed even if reading or decoding raises.
with open(INPUT_FILE_NAME, 'r', encoding='utf-8-sig') as file:  # 'utf-8-sig' handles a leading BOM
    text = file.read()

In [None]:
# Lower-case the corpus and split it into a list of individual words.
# Every character in `word_filters` is stripped from the text; besides ASCII
# punctuation, tab and newline, the set also contains the closing curly quote.
# Note: `text` is a single string before this cell and a list of words after it.
word_filters = '!"#$%&()*+,-./:;<=>?@[\\]^”_`{|}~\t\n'
text = text_to_word_sequence(text, filters=word_filters)

In [None]:
# Create training examples.
# Slide a WINDOW_LENGTH-word window over the corpus in steps of WINDOW_STEP:
# each window is one input fragment, and the word immediately following the
# window is the target the model has to predict for it.
window_starts = range(0, len(text) - WINDOW_LENGTH, WINDOW_STEP)
fragments = [text[start: start + WINDOW_LENGTH] for start in window_starts]
targets = [text[start + WINDOW_LENGTH] for start in window_starts]

In [None]:
# Number of training examples (input fragments) that were generated.
len(fragments)

In [None]:
# Example
# Show one input fragment together with its target word. The index 10000 is
# arbitrary; it must be smaller than len(fragments).
fragments[10000], targets[10000]

In [None]:
# Convert to indices.
# The tokenizer is limited to MAX_WORDS entries; 'UNK' is the stand-in token
# for out-of-vocabulary words.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='UNK')
tokenizer.fit_on_texts(text)
fragments_indexed = tokenizer.texts_to_sequences(fragments)
targets_indexed = tokenizer.texts_to_sequences(targets)

# Convert to appropriate input and output formats.
# `np.int` was removed in NumPy 1.24 (AttributeError); the builtin `int` is
# the type it used to alias, so the resulting dtype is unchanged.
X = np.array(fragments_indexed, dtype=int)
# One-hot encode the targets. float32 halves the memory of this
# (number of examples x MAX_WORDS) matrix compared with NumPy's float64 default.
y = np.zeros((len(targets_indexed), MAX_WORDS), dtype=np.float32)
for i, target_index in enumerate(targets_indexed):
    # target_index is the list of indices the tokenizer produced for one
    # target word, so this sets the corresponding column(s) of row i.
    y[i, target_index] = 1

In [None]:
# Build and compile the training model.
# Architecture: embedding -> two stacked LSTM layers -> dense ReLU layer ->
# softmax over the MAX_WORDS vocabulary entries.
training_model = Sequential([
    Embedding(
        input_dim=MAX_WORDS,
        output_dim=EMBEDDING_WIDTH,
        mask_zero=True,
        input_length=None),
    # The first LSTM returns its full output sequence so that it can feed
    # the second LSTM; the second one only returns its final output.
    LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(128, activation='relu'),
    Dense(MAX_WORDS, activation='softmax'),
])
training_model.compile(optimizer='adam', loss='categorical_crossentropy')
training_model.summary()

In [None]:
# Training
# Fit on the indexed fragments and one-hot targets, holding out 5% of the
# examples for validation. `history` keeps the object returned by fit().
history = training_model.fit(
    X, y,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.05,
    shuffle=True,
    verbose=2)

In [None]:
# Inference model.
# Text generation feeds the model one word at a time, so every prediction has
# to build on the words that were fed in before it. The LSTM layers are
# therefore created with stateful=True, which keeps their state from the
# previous prediction(s).
# Apart from that flag and the fixed batch_input_shape of (1, 1), the
# architecture is exactly the same as the training model, so the trained
# weights can simply be transferred.
inference_model = Sequential([
    Embedding(
        input_dim=MAX_WORDS,
        output_dim=EMBEDDING_WIDTH,
        mask_zero=True,
        batch_input_shape=(1, 1)),
    LSTM(128, return_sequences=True, stateful=True,
         dropout=0.2, recurrent_dropout=0.2),
    LSTM(128, stateful=True, dropout=0.2, recurrent_dropout=0.2),
    Dense(128, activation='relu'),
    Dense(MAX_WORDS, activation='softmax'),
])
weights = training_model.get_weights()
inference_model.set_weights(weights)

In [None]:
# The prediction
# Clear the stateful LSTM layers first, so that no state left over from
# earlier calls influences the generated text.
inference_model.reset_states()
# Seed words that start the generated text, and their vocabulary indices
# (one single-element index list per seed word).
first_words = ['no', 'man']
first_words_indexed = tokenizer.texts_to_sequences(first_words)

In [None]:
# Start every run of this cell from a clean LSTM state, so that re-running
# the cell produces the same text instead of continuing from leftover state.
inference_model.reset_states()
predicted_string = ''
# Feed initial words to the model, one word per predict() call. Only the
# prediction made after the last seed word is kept in y_predict.
for i, word_index in enumerate(first_words_indexed):
  # `np.int` was removed in NumPy 1.24 (AttributeError); the builtin `int`
  # is the type it used to alias.
  x = np.zeros((1,1), dtype=int)
  x[0][0] = word_index[0]
  predicted_string += first_words[i]
  predicted_string += ' '
  y_predict = inference_model.predict(x, verbose=0)[0]

In [None]:
# Predict PREDICT_LENGTH words.
# Greedy decoding: take the most probable word, append it to the output and
# feed it back into the stateful model as the next input.
for i in range(PREDICT_LENGTH):
  new_word_index = y_predict.argmax()
  word = tokenizer.sequences_to_texts([[new_word_index]])
  predicted_string += word[0] + ' '
  x[0][0] = new_word_index
  y_predict = inference_model.predict(x, verbose=0)[0]

# Print the predictions
print(predicted_string)

In [None]:
# Explore embedding similarities.
# For each lookup word, print the NUM_NEIGHBORS vocabulary entries whose
# embedding vectors have the smallest Euclidean distance to the lookup word's
# own embedding (the word itself comes first, at distance 0).
NUM_NEIGHBORS = 5
embeddings = training_model.layers[0].get_weights()[0]
lookup_words = ['the', 'saw', 'see', 'of', 'and',
                'monster', 'frankenstein', 'read', 'eat']
for lookup_word in lookup_words:
    lookup_word_indexed = tokenizer.texts_to_sequences(
        [lookup_word])
    print('words close to:', lookup_word)
    # Indexing with the one-element index list yields a single-row matrix,
    # which broadcasts against every row of `embeddings` below.
    lookup_embedding = embeddings[lookup_word_indexed[0]]
    # Calculate the distance to every embedding in one vectorized step.
    distances = np.linalg.norm(embeddings - lookup_embedding, axis=1)
    # Rank by row index instead of keying a dict on the distance: with a
    # distance-keyed dict, two words at exactly the same distance overwrite
    # each other and one of them silently disappears from the result.
    nearest = np.argsort(distances, kind='stable')[:NUM_NEIGHBORS]
    # Print sorted by distance.
    for word_index in nearest:
        distance = distances[word_index]
        word = tokenizer.sequences_to_texts([[int(word_index)]])[0]
        print(word + ': ', distance)
    print('')