<a href="https://colab.research.google.com/github/MartinNde/MartinN_1/blob/main/Challenge_4_AustenWordLevelTextGen_Sln.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generation in the style of Jane Austen - Exercise Solutions

## 1. Change the hyper parameters, including the size and number of LSTM layers and number of epochs to see if you get better results.

Simplifying the network to one LSTM layer and increasing the number of epochs to 50 improves the BLEU score to 0.95. More epochs will improve this further.

In [1]:
# import python libraries
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.utils import to_categorical
from random import randint
import re

import nltk   # natural language tool kit library
nltk.download('gutenberg')  # downloads a library that NLTK uses

from nltk.corpus import gutenberg as gut  # downloads the gutenberg dataset
print(gut.fileids())    # prints the name of the files in the dataset

# get the book text: the raw, untokenised contents of one file from the NLTK Gutenberg corpus
# NOTE(review): this cell loads 'blake-poems.txt', whereas the notebook title refers to
# Jane Austen — confirm that using a different corpus for this exercise is intentional.
book_text = nltk.corpus.gutenberg.raw('blake-poems.txt')

# Data preprocessing
def preprocess_text(sen):
    """Clean raw text down to lower-case alphabetic words.

    Three regular-expression passes are applied in order, each replacing its
    matches with a single space:

    1. every character that is not an ASCII letter (punctuation, digits, newlines);
    2. a single letter that has whitespace on both sides;
    3. any run of whitespace.

    Args:
        sen: the text to clean.

    Returns:
        The cleaned text, converted to lower case. A leading or trailing
        space may remain; it is not stripped.
    """
    cleaning_patterns = ('[^a-zA-Z]', r"\s+[a-zA-Z]\s+", r'\s+')

    cleaned = sen
    for pattern in cleaning_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    return cleaned.lower()
book_text = preprocess_text(book_text)

book_text = book_text[:5000]  # limit text to the first 5000 characters, just for this exercise

# convert words to numbers
from nltk.tokenize import word_tokenize
nltk.download('punkt')
book_text_words = (word_tokenize(book_text))
n_words = len(book_text_words)
unique_words = len(set(book_text_words))

from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=unique_words)
tokenizer.fit_on_texts(book_text_words)

# word_index is the dictionary (word -> integer).  Keras numbers the words from 1 and reserves
# index 0, so the number of possible indices is the dictionary size plus one.
vocab_size = len(tokenizer.word_index) + 1
word_2_index = tokenizer.word_index           # store the dictionary in the variable called word_2_index

# Create the input sequences: slide a window of input_seq_length words over the text one word
# at a time; the word that follows each window is the target the model must predict.
input_sequence_words = []  # input sequences in words (used for metric evaluation later on)
input_sequence = []   # the same sequences as lists of integers (input to the model)
output_words = []     # integer index of the word that follows each sequence
input_seq_length = 100  # length of the input sequence
for i in range(0, n_words - input_seq_length , 1):
    in_seq = book_text_words[i:i + input_seq_length]
    input_sequence_words.append(in_seq)
    out_seq = book_text_words[i + input_seq_length]
    input_sequence.append([word_2_index[word] for word in in_seq])
    output_words.append(word_2_index[out_seq])

# reshape the input sequences to be 3-dimensional: (number of sequences, sequence length, 1 feature)
X = np.reshape(input_sequence, (len(input_sequence), input_seq_length, 1))

# Normalise the data by dividing by the vocab size so every input value lies between 0 and 1
X = X / float(vocab_size)

# one-hot encode the output words so that they can be used by the model (converts the output to 2-dimensions).
# num_classes is passed explicitly: without it to_categorical sizes the encoding from the largest
# index that happens to occur in output_words, which can be smaller than the vocabulary and would
# leave the softmax layer without a unit for some words.
y = to_categorical(output_words, num_classes=vocab_size)

# create, compile and fit the model: one LSTM layer followed by a softmax over the vocabulary
model = Sequential()
model.add(LSTM(800, input_shape=(X.shape[1], X.shape[2]), return_sequences=False))
model.add(Dense(y.shape[1], activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, batch_size=64, epochs=50, verbose=1)

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 800)               2566400   
                                                                 
 dense (Dense)               (None, 389)               311589    
                                                                 
Total params: 2,877,989
Trainable params: 2,877,989
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 3

<keras.callbacks.History at 0x7f30b80ff850>

In [3]:
# Make Predictions
# np.random.randint excludes its upper bound, so len(input_sequence) allows every sequence to be picked
random_seq_index = np.random.randint(0, len(input_sequence))
# take a copy of the chosen sequence (a list of integers): the loop below appends to it, and
# appending to the original list would silently lengthen one of the training sequences
random_seq = list(input_sequence[random_seq_index])

index_2_word = dict(map(reversed, word_2_index.items())) # reverse dictionary: integer index -> word
seed_word_sequence = [index_2_word[value] for value in random_seq]  # the words of the randomly picked sequence

# join the words in the list and print the sequence of words
print(' '.join(seed_word_sequence))  # this prints the words from the randomly picked sequence that will be the seed for our prediction

# Predict next 100 words
word_sequence = []
for i in range(100):
    int_sample = np.reshape(random_seq, (1, len(random_seq), 1))    # reshape to make 3-D input (1 sequence, length of the sequence, 1 feature)
    int_sample = int_sample / float(vocab_size)                     # normalise (as we normalised the training data)

    predicted_word_index = model.predict(int_sample, verbose=0)     # an array of the probabilities for each word in the vocab is returned
    predicted_word_id = int(np.argmax(predicted_word_index))        # index of the most probable word
    word_sequence.append(index_2_word[predicted_word_id])           # look up the predicted word and add it to our predicted word sequence list

    random_seq.append(predicted_word_id)                            # append the predicted word index to the sequence that is input next
    random_seq = random_seq[1:]                                     # drop the first element so the sequence keeps its length

# BLEU score
# sentence_bleu expects token lists: a list of reference sentences (each a list of words) and a
# hypothesis that is a list of words.  Passing joined strings makes it compare characters
# instead of words, so the word lists are passed directly.
from nltk.translate.bleu_score import sentence_bleu
reference = input_sequence_words
candidate = ' '.join(word_sequence) # the predicted words as one string, for printing
score = sentence_bleu(reference, word_sequence)
print('Seed word sequence: %s'%(' '.join(seed_word_sequence)))
print('Predicted words: %s'%(candidate))
print('BLEU Score for predicted words: %s'%(score))

little lamb who make thee dost thou know who made thee gave thee life and bid thee feed by the stream and er the mead gave thee clothing of delight softest clothing wolly bright gave thee such tender voice making all the vales rejoice little lamb who made thee dost thou know who made thee little lamb ll tell thee little lamb ll tell thee he is called by thy name for he calls himself lamb he is meek and he is mild he became little child a child and thou lamb we are called by his name little lamb
Seed word sequence: little lamb who make thee dost thou know who made thee gave thee life and bid thee feed by the stream and er the mead gave thee clothing of delight softest clothing wolly bright gave thee such tender voice making all the vales rejoice little lamb who made thee dost thou know who made thee little lamb ll tell thee little lamb ll tell thee he is called by thy name for he calls himself lamb he is meek and he is mild he became little child a child and thou lamb we are called by h

## 2. Try adding dropout after the LSTM layers and Dense layers.

In this particular model, Dropout prevents the model from learning at 50 epochs, but it does learn by 85 epochs. Dropout slows learning down in this case.

In [4]:
# import python libraries
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
from random import randint
import re

import nltk   # natural language tool kit library
nltk.download('gutenberg')  # downloads a library that NLTK uses

from nltk.corpus import gutenberg as gut  # downloads the gutenberg dataset
print(gut.fileids())    # prints the name of the files in the dataset

# get the book text: the raw, untokenised contents of 'austen-sense.txt' from the NLTK Gutenberg corpus
book_text = nltk.corpus.gutenberg.raw('austen-sense.txt')

# Data preprocessing
def preprocess_text(sen):
    """Clean raw text down to lower-case alphabetic words.

    Three regular-expression passes are applied in order, each replacing its
    matches with a single space:

    1. every character that is not an ASCII letter (punctuation, digits, newlines);
    2. a single letter that has whitespace on both sides;
    3. any run of whitespace.

    Args:
        sen: the text to clean.

    Returns:
        The cleaned text, converted to lower case. A leading or trailing
        space may remain; it is not stripped.
    """
    cleaning_patterns = ('[^a-zA-Z]', r"\s+[a-zA-Z]\s+", r'\s+')

    cleaned = sen
    for pattern in cleaning_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    return cleaned.lower()
book_text = preprocess_text(book_text)

book_text = book_text[:5000]  # limit text to the first 5000 characters, just for this exercise

# convert words to numbers
from nltk.tokenize import word_tokenize
nltk.download('punkt')
book_text_words = (word_tokenize(book_text))
n_words = len(book_text_words)
unique_words = len(set(book_text_words))

from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=unique_words)
tokenizer.fit_on_texts(book_text_words)

# word_index is the dictionary (word -> integer).  Keras numbers the words from 1 and reserves
# index 0, so the number of possible indices is the dictionary size plus one.
vocab_size = len(tokenizer.word_index) + 1
word_2_index = tokenizer.word_index           # store the dictionary in the variable called word_2_index

# Create the input sequences: slide a window of input_seq_length words over the text one word
# at a time; the word that follows each window is the target the model must predict.
input_sequence_words = []  # input sequences in words (used for metric evaluation later on)
input_sequence = []   # the same sequences as lists of integers (input to the model)
output_words = []     # integer index of the word that follows each sequence
input_seq_length = 100  # length of the input sequence
for i in range(0, n_words - input_seq_length , 1):
    in_seq = book_text_words[i:i + input_seq_length]
    input_sequence_words.append(in_seq)
    out_seq = book_text_words[i + input_seq_length]
    input_sequence.append([word_2_index[word] for word in in_seq])
    output_words.append(word_2_index[out_seq])

# reshape the input sequences to be 3-dimensional: (number of sequences, sequence length, 1 feature)
X = np.reshape(input_sequence, (len(input_sequence), input_seq_length, 1))

# Normalise the data by dividing by the vocab size so every input value lies between 0 and 1
X = X / float(vocab_size)

# one-hot encode the output words so that they can be used by the model (converts the output to 2-dimensions).
# num_classes is passed explicitly: without it to_categorical sizes the encoding from the largest
# index that happens to occur in output_words, which can be smaller than the vocabulary and would
# leave the softmax layer without a unit for some words.
y = to_categorical(output_words, num_classes=vocab_size)

# create, compile and fit the model: one LSTM layer, dropout on its output, then a softmax over the vocabulary
model = Sequential()
model.add(LSTM(800, input_shape=(X.shape[1], X.shape[2]), return_sequences=False))
model.add(Dropout(0.5))   # randomly zero half of the LSTM outputs during training
model.add(Dense(y.shape[1], activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, batch_size=64, epochs=100, verbose=1)

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 800)               2566400   
                                                                 
 dropout (Dropout)           (None, 800)               0         
                                                                 
 dense_1 (Dense)             (None, 344)               275544    
                                                                 
Total params: 2,841,944
Traina

<keras.callbacks.History at 0x7f30ac08a760>

In [5]:
# Make Predictions
# np.random.randint excludes its upper bound, so len(input_sequence) allows every sequence to be picked
random_seq_index = np.random.randint(0, len(input_sequence))
# take a copy of the chosen sequence (a list of integers): the loop below appends to it, and
# appending to the original list would silently lengthen one of the training sequences
random_seq = list(input_sequence[random_seq_index])

index_2_word = dict(map(reversed, word_2_index.items())) # reverse dictionary: integer index -> word
seed_word_sequence = [index_2_word[value] for value in random_seq]  # the words of the randomly picked sequence

# join the words in the list and print the sequence of words
print(' '.join(seed_word_sequence))  # this prints the words from the randomly picked sequence that will be the seed for our prediction

# Predict next 100 words
word_sequence = []
for i in range(100):
    int_sample = np.reshape(random_seq, (1, len(random_seq), 1))    # reshape to make 3-D input (1 sequence, length of the sequence, 1 feature)
    int_sample = int_sample / float(vocab_size)                     # normalise (as we normalised the training data)

    predicted_word_index = model.predict(int_sample, verbose=0)     # an array of the probabilities for each word in the vocab is returned
    predicted_word_id = int(np.argmax(predicted_word_index))        # index of the most probable word
    word_sequence.append(index_2_word[predicted_word_id])           # look up the predicted word and add it to our predicted word sequence list

    random_seq.append(predicted_word_id)                            # append the predicted word index to the sequence that is input next
    random_seq = random_seq[1:]                                     # drop the first element so the sequence keeps its length

# BLEU score
# sentence_bleu expects token lists: a list of reference sentences (each a list of words) and a
# hypothesis that is a list of words.  Passing joined strings makes it compare characters
# instead of words, so the word lists are passed directly.
from nltk.translate.bleu_score import sentence_bleu
reference = input_sequence_words
candidate = ' '.join(word_sequence) # the predicted words as one string, for printing
score = sentence_bleu(reference, word_sequence)
print('Seed word sequence: %s'%(' '.join(seed_word_sequence)))
print('Predicted words: %s'%(candidate))
print('BLEU Score for predicted words: %s'%(score))

relish to his existence by former marriage mr henry dashwood had one son by his present lady three daughters the son steady respectable young man was amply provided for by the fortune of his mother which had been large and half of which devolved on him on his coming of age by his own marriage likewise which happened soon afterwards he added to his wealth to him therefore the succession to the norland estate was not so really important as to his sisters for their fortune independent of what might arise to them from their father inheriting that property could
Seed word sequence: relish to his existence by former marriage mr henry dashwood had one son by his present lady three daughters the son steady respectable young man was amply provided for by the fortune of his mother which had been large and half of which devolved on him on his coming of age by his own marriage likewise which happened soon afterwards he added to his wealth to him therefore the succession to the norland estate was n

## 3. Normalisation does not always provide the best results. Remove normalisation and see if this improves the results (this will probably mean the model hyper-parameters also need changing).

The model has learnt by 20 epochs, so not having normalisation has sped up the model training.  The number of neurons in the model can also be reduced but this increases the number of epochs it takes to train.



In [6]:
# import python libraries
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.utils import to_categorical
from random import randint
import re

import nltk   # natural language tool kit library
nltk.download('gutenberg')  # downloads a library that NLTK uses

from nltk.corpus import gutenberg as gut  # downloads the gutenberg dataset
print(gut.fileids())    # prints the name of the files in the dataset

# get the book text: the raw, untokenised contents of 'austen-sense.txt' from the NLTK Gutenberg corpus
book_text = nltk.corpus.gutenberg.raw('austen-sense.txt')

# Data preprocessing
def preprocess_text(sen):
    """Clean raw text down to lower-case alphabetic words.

    Three regular-expression passes are applied in order, each replacing its
    matches with a single space:

    1. every character that is not an ASCII letter (punctuation, digits, newlines);
    2. a single letter that has whitespace on both sides;
    3. any run of whitespace.

    Args:
        sen: the text to clean.

    Returns:
        The cleaned text, converted to lower case. A leading or trailing
        space may remain; it is not stripped.
    """
    cleaning_patterns = ('[^a-zA-Z]', r"\s+[a-zA-Z]\s+", r'\s+')

    cleaned = sen
    for pattern in cleaning_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    return cleaned.lower()
book_text = preprocess_text(book_text)

book_text = book_text[:5000]  # limit text to the first 5000 characters, just for this exercise

# convert words to numbers
from nltk.tokenize import word_tokenize
nltk.download('punkt')
book_text_words = (word_tokenize(book_text))
n_words = len(book_text_words)
unique_words = len(set(book_text_words))

from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=unique_words)
tokenizer.fit_on_texts(book_text_words)

# word_index is the dictionary (word -> integer).  Keras numbers the words from 1 and reserves
# index 0, so the number of possible indices is the dictionary size plus one.
vocab_size = len(tokenizer.word_index) + 1
word_2_index = tokenizer.word_index           # store the dictionary in the variable called word_2_index

# Create the input sequences: slide a window of input_seq_length words over the text one word
# at a time; the word that follows each window is the target the model must predict.
input_sequence_words = []  # input sequences in words (used for metric evaluation later on)
input_sequence = []   # the same sequences as lists of integers (input to the model)
output_words = []     # integer index of the word that follows each sequence
input_seq_length = 100  # length of the input sequence
for i in range(0, n_words - input_seq_length , 1):
    in_seq = book_text_words[i:i + input_seq_length]
    input_sequence_words.append(in_seq)
    out_seq = book_text_words[i + input_seq_length]
    input_sequence.append([word_2_index[word] for word in in_seq])
    output_words.append(word_2_index[out_seq])

# reshape the input sequences to be 3-dimensional: (number of sequences, sequence length, 1 feature)
X = np.reshape(input_sequence, (len(input_sequence), input_seq_length, 1))

# No normalisation in this experiment: the raw integer word indices are fed to the LSTM
# (X is deliberately NOT divided by vocab_size here).

# one-hot encode the output words so that they can be used by the model (converts the output to 2-dimensions).
# num_classes is passed explicitly: without it to_categorical sizes the encoding from the largest
# index that happens to occur in output_words, which can be smaller than the vocabulary and would
# leave the softmax layer without a unit for some words.
y = to_categorical(output_words, num_classes=vocab_size)

# create, compile and fit the model: one LSTM layer followed by a softmax over the vocabulary
model = Sequential()
model.add(LSTM(800, input_shape=(X.shape[1], X.shape[2]), return_sequences=False))
model.add(Dense(y.shape[1], activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, batch_size=64, epochs=20, verbose=1)

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 800)               2566400   
                                                                 
 dense_2 (Dense)             (None, 344)               275544    
                                                                 
Total params: 2,841,944
Trainable params: 2,841,944
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/2

<keras.callbacks.History at 0x7f30a7e9be80>

In [None]:
# Make Predictions
# np.random.randint excludes its upper bound, so len(input_sequence) allows every sequence to be picked
random_seq_index = np.random.randint(0, len(input_sequence))
# take a copy of the chosen sequence (a list of integers): the loop below appends to it, and
# appending to the original list would silently lengthen one of the training sequences
random_seq = list(input_sequence[random_seq_index])

index_2_word = dict(map(reversed, word_2_index.items())) # reverse dictionary: integer index -> word
seed_word_sequence = [index_2_word[value] for value in random_seq]  # the words of the randomly picked sequence

# join the words in the list and print the sequence of words
print(' '.join(seed_word_sequence))  # this prints the words from the randomly picked sequence that will be the seed for our prediction

# Predict next 100 words
word_sequence = []
for i in range(100):
    # reshape to make 3-D input (1 sequence, length of the sequence, 1 feature).
    # The sample is not normalised, because the training data for this model was not normalised either.
    int_sample = np.reshape(random_seq, (1, len(random_seq), 1))

    predicted_word_index = model.predict(int_sample, verbose=0)     # an array of the probabilities for each word in the vocab is returned
    predicted_word_id = int(np.argmax(predicted_word_index))        # index of the most probable word
    word_sequence.append(index_2_word[predicted_word_id])           # look up the predicted word and add it to our predicted word sequence list

    random_seq.append(predicted_word_id)                            # append the predicted word index to the sequence that is input next
    random_seq = random_seq[1:]                                     # drop the first element so the sequence keeps its length

# BLEU score
# sentence_bleu expects token lists: a list of reference sentences (each a list of words) and a
# hypothesis that is a list of words.  Passing joined strings makes it compare characters
# instead of words, so the word lists are passed directly.
from nltk.translate.bleu_score import sentence_bleu
reference = input_sequence_words
candidate = ' '.join(word_sequence) # the predicted words as one string, for printing
score = sentence_bleu(reference, word_sequence)
print('Seed word sequence: %s'%(' '.join(seed_word_sequence)))
print('Predicted words: %s'%(candidate))
print('BLEU Score for predicted words: %s'%(score))

promised to do every thing in his power to make them comfortable his father was rendered easy by such an assurance and mr john dashwood had then leisure to consider how much there might prudently be in his power to do for them he was not an ill disposed young man unless to be rather cold hearted and rather selfish is to be ill disposed but he was in general well respected for he conducted himself with propriety in the discharge of his ordinary duties had he married more amiable woman he might have been made still more respectable than
Seed word sequence: promised to do every thing in his power to make them comfortable his father was rendered easy by such an assurance and mr john dashwood had then leisure to consider how much there might prudently be in his power to do for them he was not an ill disposed young man unless to be rather cold hearted and rather selfish is to be ill disposed but he was in general well respected for he conducted himself with propriety in the discharge of his 

## 4. Add an Embedding Layer into the DNN to see if this improves the model.

Adding an embedding layer and removing normalisation allows the model to achieve good results after 35 epochs.

The dimension of the dense embedding layer was arbitrarily set to 50. This is a parameter that can be experimented with.

In [7]:
# import python libraries
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.utils import to_categorical
from random import randint
import re

import nltk   # natural language tool kit library
nltk.download('gutenberg')  # downloads a library that NLTK uses

from nltk.corpus import gutenberg as gut  # downloads the gutenberg dataset
print(gut.fileids())    # prints the name of the files in the dataset

# get the book text: the raw, untokenised contents of 'austen-sense.txt' from the NLTK Gutenberg corpus
book_text = nltk.corpus.gutenberg.raw('austen-sense.txt')

# Data preprocessing
def preprocess_text(sen):
    """Clean raw text down to lower-case alphabetic words.

    Three regular-expression passes are applied in order, each replacing its
    matches with a single space:

    1. every character that is not an ASCII letter (punctuation, digits, newlines);
    2. a single letter that has whitespace on both sides;
    3. any run of whitespace.

    Args:
        sen: the text to clean.

    Returns:
        The cleaned text, converted to lower case. A leading or trailing
        space may remain; it is not stripped.
    """
    cleaning_patterns = ('[^a-zA-Z]', r"\s+[a-zA-Z]\s+", r'\s+')

    cleaned = sen
    for pattern in cleaning_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    return cleaned.lower()
book_text = preprocess_text(book_text)

book_text = book_text[:5000]  # limit text to the first 5000 characters, just for this exercise

# convert words to numbers
from nltk.tokenize import word_tokenize
nltk.download('punkt')
book_text_words = (word_tokenize(book_text))
n_words = len(book_text_words)
unique_words = len(set(book_text_words))

from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=unique_words)
tokenizer.fit_on_texts(book_text_words)

# word_index is the dictionary (word -> integer).  Keras numbers the words from 1 and reserves
# index 0, so the number of possible indices is the dictionary size plus one.
vocab_size = len(tokenizer.word_index) + 1
word_2_index = tokenizer.word_index           # store the dictionary in the variable called word_2_index

# Create the input sequences: slide a window of input_seq_length words over the text one word
# at a time; the word that follows each window is the target the model must predict.
input_sequence_words = []  # input sequences in words (used for metric evaluation later on)
input_sequence = []   # the same sequences as lists of integers (input to the model)
output_words = []     # integer index of the word that follows each sequence
input_seq_length = 100  # length of the input sequence
for i in range(0, n_words - input_seq_length , 1):
    in_seq = book_text_words[i:i + input_seq_length]
    input_sequence_words.append(in_seq)
    out_seq = book_text_words[i + input_seq_length]
    input_sequence.append([word_2_index[word] for word in in_seq])
    output_words.append(word_2_index[out_seq])

# arrange the input sequences as a 2-dimensional integer array: (number of sequences, sequence length).
# The Embedding layer takes integer word indices in this shape and adds the feature dimension
# itself, so no trailing axis of size 1 is needed here.
X = np.reshape(input_sequence, (len(input_sequence), input_seq_length))

# No normalisation: the Embedding layer looks words up by their integer index, so the indices
# must stay as whole numbers (X is deliberately NOT divided by vocab_size).

# one-hot encode the output words so that they can be used by the model (converts the output to 2-dimensions).
# num_classes is passed explicitly: without it to_categorical sizes the encoding from the largest
# index that happens to occur in output_words, which can be smaller than the vocabulary and would
# leave the softmax layer without a unit for some words.
y = to_categorical(output_words, num_classes=vocab_size)

# create, compile and fit the model: embedding -> LSTM -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=X.shape[1]))   # each word index becomes a 50-dimensional vector
model.add(LSTM(800, return_sequences=False))                    # input shape is inferred from the Embedding output
model.add(Dense(y.shape[1], activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, batch_size=64, epochs=100, verbose=1)

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 50)           17200     
                                                                 
 lstm_3 (LSTM)               (None, 800)               2723200   
                                                                 
 dense_3 (Dense)             (None, 344)               275544    
                                                                 
Total params: 3,015,944
Traina

<keras.callbacks.History at 0x7f30a7aa7670>

In [8]:
# Make Predictions
# np.random.randint excludes its upper bound, so len(input_sequence) allows every sequence to be picked
random_seq_index = np.random.randint(0, len(input_sequence))
# take a copy of the chosen sequence (a list of integers): the loop below appends to it, and
# appending to the original list would silently lengthen one of the training sequences
random_seq = list(input_sequence[random_seq_index])

index_2_word = dict(map(reversed, word_2_index.items())) # reverse dictionary: integer index -> word
seed_word_sequence = [index_2_word[value] for value in random_seq]  # the words of the randomly picked sequence

# join the words in the list and print the sequence of words
print(' '.join(seed_word_sequence))  # this prints the words from the randomly picked sequence that will be the seed for our prediction

# Predict next 100 words
word_sequence = []
for i in range(100):
    # reshape to a 2-D batch of one sequence (1, length of the sequence): the model starts with an
    # Embedding layer, which takes integer word indices.  The sample is not normalised, because
    # the indices must stay whole numbers for the embedding lookup.
    int_sample = np.reshape(random_seq, (1, len(random_seq)))

    predicted_word_index = model.predict(int_sample, verbose=0)     # an array of the probabilities for each word in the vocab is returned
    predicted_word_id = int(np.argmax(predicted_word_index))        # index of the most probable word
    word_sequence.append(index_2_word[predicted_word_id])           # look up the predicted word and add it to our predicted word sequence list

    random_seq.append(predicted_word_id)                            # append the predicted word index to the sequence that is input next
    random_seq = random_seq[1:]                                     # drop the first element so the sequence keeps its length

# BLEU score
# sentence_bleu expects token lists: a list of reference sentences (each a list of words) and a
# hypothesis that is a list of words.  Passing joined strings makes it compare characters
# instead of words, so the word lists are passed directly.
from nltk.translate.bleu_score import sentence_bleu
reference = input_sequence_words
candidate = ' '.join(word_sequence) # the predicted words as one string, for printing
score = sentence_bleu(reference, word_sequence)
print('Seed word sequence: %s'%(' '.join(seed_word_sequence)))
print('Predicted words: %s'%(candidate))
print('BLEU Score for predicted words: %s'%(score))

him on his coming of age by his own marriage likewise which happened soon afterwards he added to his wealth to him therefore the succession to the norland estate was not so really important as to his sisters for their fortune independent of what might arise to them from their father inheriting that property could be but small their mother had nothing and their father only seven thousand pounds in his own disposal for the remaining moiety of his first wife fortune was also secured to her child and he had only life interest in it the old gentleman died
Seed word sequence: him on his coming of age by his own marriage likewise which happened soon afterwards he added to his wealth to him therefore the succession to the norland estate was not so really important as to his sisters for their fortune independent of what might arise to them from their father inheriting that property could be but small their mother had nothing and their father only seven thousand pounds in his own disposal for th