In [1]:
# Import necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.python.keras.preprocessing.text import Tokenizer
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM
from keras.optimizers import SGD
import pickle

In [2]:
# Get data
file = 'data.txt'
data = [open(file).read().lower()]

In [3]:
# Create a tokenizer and fit it on the data
token_obj = Tokenizer()
token_obj.fit_on_texts(data)

In [4]:
# Get the size of the vocabulary
vocab_size = len(token_obj.word_index) + 1

In [5]:
# Tokenize the data using our fitted tokenizer
data_tokens = token_obj.texts_to_sequences(data)[0]

In [6]:
# Define the memory length i.e. how many words the model is predicting on
MEMORY_LENGTH = 5

In [7]:
# Create lists to store sets of previous words X, and their corresponding
# next word y
X = []
y = []

In [8]:
# Populate these lists with all the data retrieved from the txt file
for j in range(len(data_tokens) - MEMORY_LENGTH):
    X.append(data_tokens[j:j + MEMORY_LENGTH])
    y.append(data_tokens[j + MEMORY_LENGTH])

In [9]:
print(X)

[[59, 502, 1, 14, 6], [502, 1, 14, 6, 383], [1, 14, 6, 383, 11], [14, 6, 383, 11, 392], [6, 383, 11, 392, 1], [383, 11, 392, 1, 38], [11, 392, 1, 38, 117], [392, 1, 38, 117, 22], [1, 38, 117, 22, 845], [38, 117, 22, 845, 12], [117, 22, 845, 12, 474], [22, 845, 12, 474, 42], [845, 12, 474, 42, 1], [12, 474, 42, 1, 38], [474, 42, 1, 38, 265], [42, 1, 38, 265, 301], [1, 38, 265, 301, 4], [38, 265, 301, 4, 846], [265, 301, 4, 846, 4], [301, 4, 846, 4, 121], [4, 846, 4, 121, 301], [846, 4, 121, 301, 12], [4, 121, 301, 12, 6], [121, 301, 12, 6, 314], [301, 12, 6, 314, 5], [12, 6, 314, 5, 4], [6, 314, 5, 4, 150], [314, 5, 4, 150, 314], [5, 4, 150, 314, 572], [4, 150, 314, 572, 315], [150, 314, 572, 315, 4], [314, 572, 315, 4, 384], [572, 315, 4, 384, 121], [315, 4, 384, 121, 301], [4, 384, 121, 301, 85], [384, 121, 301, 85, 4], [121, 301, 85, 4, 329], [301, 85, 4, 329, 42], [85, 4, 329, 42, 7], [4, 329, 42, 7, 21], [329, 42, 7, 21, 26], [42, 7, 21, 26, 88], [7, 21, 26, 88, 13], [21, 26, 88, 1

In [10]:
print(y)

[383, 11, 392, 1, 38, 117, 22, 845, 12, 474, 42, 1, 38, 265, 301, 4, 846, 4, 121, 301, 12, 6, 314, 5, 4, 150, 314, 572, 315, 4, 384, 121, 301, 85, 4, 329, 42, 7, 21, 26, 88, 13, 2, 301, 4, 743, 846, 4, 384, 85, 4, 329, 846, 8, 41, 6, 77, 129, 22, 44, 99, 330, 22, 1, 14, 2, 34, 2, 990, 13, 2, 34, 7, 4, 846, 2321, 536, 19, 1533, 42, 3, 29, 285, 1534, 537, 18, 94, 10, 502, 75, 2, 744, 2322, 5, 991, 1534, 18, 181, 8, 41, 6, 77, 2, 475, 4, 1206, 314, 5, 1207, 7, 12, 4, 150, 102, 19, 273, 2, 475, 133, 1207, 183, 11, 4, 669, 68, 1535, 7, 21, 26, 990, 13, 2, 1, 151, 847, 11, 4, 314, 407, 475, 5, 151, 847, 331, 104, 11, 4, 845, 5, 407, 1207, 7, 31, 2323, 4, 314, 1, 45, 265, 2, 2324, 1, 96, 7, 2325, 9, 22, 1536, 2326, 1208, 51, 28, 7, 8, 2327, 11, 210, 100, 1, 408, 2, 847, 11, 4, 573, 100, 5, 345, 4, 670, 5, 52, 42, 28, 7, 8, 2328, 7, 2329, 4, 1206, 2330, 2, 22, 1208, 98, 623, 409, 10, 146, 14, 2, 1537, 7, 98, 223, 1, 410, 71, 18, 26, 6, 316, 31, 503, 10, 475, 7, 2, 6, 244, 669, 345, 4, 846, 98,

In [11]:
X = np.array(X)
y = np.array(y)

In [12]:
# Define the embedding dimension
EMBEDDING_DIM = 100

In [13]:
# Build a Sequential model
model = Sequential()
# Add embedding layer with the set embedding dimension
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length = MEMORY_LENGTH))
# Add a Long-Short Term Memory (LSTM) layer with n nodes
model.add(LSTM(128, activation = 'relu',
            kernel_initializer = 'he_uniform'))
# Add a dense layer with softmax activation representing the output layer
model.add(Dense(vocab_size, activation = 'softmax'))

In [14]:
# Get a summary of the model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 5, 100)            437100    
_________________________________________________________________
lstm (LSTM)                  (None, 128)               117248    
_________________________________________________________________
dense (Dense)                (None, 4371)              563859    
Total params: 1,118,207
Trainable params: 1,118,207
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Create a stochastic gradient descent optimizer
opt = SGD(learning_rate = 0.02)

In [16]:
# Compile the model with the SGD optimizer, categorical crossentropy loss,
# and accuracy as it's metric
model.compile(optimizer = opt, loss = 'sparse_categorical_crossentropy',
                metrics = ['accuracy'])

In [None]:
# Fit the model on the data with 10% validation split, run for 30 epochs
# with a batch size of 32, and save its history
hist = model.fit(X, y, validation_split = 0.1, epochs = 30, batch_size = 32, verbose = 1).history

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30

In [None]:
# Save the model in an h5 file
model.save('5w-model.h5')

In [None]:
# Save graph in a jpg file
plt.plot(hist['accuracy'])
plt.plot(hist['val_accuracy'])
plt.title('Accuracy of next word prediction model')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.ylim([0, 1])
plt.legend(['train', 'test'])
plt.savefig('Extras/5w-graph.jpg')