In [1]:
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
import keras
from keras import backend as K
from tensorflow.python.client import device_lib
import pandas as pd
import re
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import requests
from nltk.tokenize import word_tokenize

from gensim.models import KeyedVectors

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding


from timeit import default_timer as timer

Using TensorFlow backend.


Only run this code if you have a GPU. This part of the code makes the code run on your GPU, I used the tensorflow-gpu version 1.15 for this to work. It's considerably faster using my GPU (NVIDIA GTX 1060 6GB), than my CPU. About ~5 times faster, depending on the dataset I use.

In [2]:
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

print(device_lib.list_local_devices())
K.tensorflow_backend._get_available_gpus()

Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce GTX 1060 6GB, pci bus id: 0000:27:00.0, compute capability: 6.1

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12005222237026359134
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 5083824128
locality {
  bus_id: 1
  links {
  }
}
incarnation: 8974174928950180734
physical_device_desc: "device: 0, name: GeForce GTX 1060 6GB, pci bus id: 0000:27:00.0, compute capability: 6.1"
]


['/job:localhost/replica:0/task:0/device:GPU:0']

Several different datasets I experimented on, they are listed from short to long. The shortest one takes a couple of seconds to train, the longest one can, depending on your computer, take hours.

In [1]:
with open('wikiped.txt', 'r') as file:
    data = file.read().replace('\n', '')
print(len(data))

29088


In [3]:
df = pd.read_csv("jokes.csv")
data = ' '.join(df['Joke'].tolist()).replace("\'", "")
print(len(data))

128866


In [19]:
url = "http://gutenberg.org/files/1342/1342-0.txt"
book = requests.get(url)
data = book.text
data = data[2440:]
print(len(data))

797205


In [50]:
url = "https://www.gutenberg.org/files/24869/24869-0.txt"
book = requests.get(url)
data = book.text
print(len(data))

2396753


Cleaning of the data

In [20]:
def clean_dataset(dataset, char_filter = r"[^\w]"):

    # convert words to lower case
    dataset = dataset.lower()
    dataset = dataset.replace("Ã¢", "a")
    # tokenise words
    words = word_tokenize(dataset)

    # strip whitespace from all words
    words = [word.strip() for word in words]

        
    # join back words to get dataset
    dataset = " ".join(words)

    # remove unwanted characters
    dataset = re.sub(char_filter, " ", dataset)

    # replace multiple whitespaces with single whitespace
    dataset = re.sub(r"\s+", " ", dataset)

    # strip whitespace from dataset
    dataset = dataset.strip()

    return dataset

data = clean_dataset(data)

In [21]:
print(data[0:1000])

it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife however little known the feelings or views of such a man may be on his first entering a neighbourhood this truth is so well fixed in the minds of the surrounding families that he is considered the rightful property of some one or other of their daughters a my dear mr bennet a said his lady to him one day a have you heard that netherfield park is let at last a mr bennet replied that he had not a but it is a returned she a for mrs long has just been here and she told me all about it a mr bennet made no answer a do you not want to know who has taken it a cried his wife impatiently a _you_ want to tell me and i have no objection to hearing it a this was invitation enough a why my dear you must know mrs long says that netherfield is taken by a young man of large fortune from the north of england that he came down on monday in a chaise and four to see the place and was so much d

In [22]:
word_tokeniser = Tokenizer()
word_tokeniser.fit_on_texts([data])
encoded_words = word_tokeniser.texts_to_sequences([data])[0]

In [23]:
VOCABULARY_SIZE = len(word_tokeniser.word_index) + 1
print('Vocabulary Size: {}'.format(VOCABULARY_SIZE))

Vocabulary Size: 6875


In [24]:
sequences = []
MAX_SEQ_LENGTH = 10

for i in range(MAX_SEQ_LENGTH, len(encoded_words)):
    sequence = encoded_words[i-MAX_SEQ_LENGTH:i+1]
    sequences.append(sequence)
sequences = np.array(sequences)

In [25]:
# divide the sequence into X and y
sequences = np.array(sequences)

X = sequences[:80000,:-1]  # assign all but last words of a sequence to X
y = sequences[:80000,-1]   # assign last word of each sequence to y
y = to_categorical(y, num_classes=VOCABULARY_SIZE)

In [26]:
X = pad_sequences(X, maxlen=MAX_SEQ_LENGTH, padding='pre')

In [27]:
# create model architecture

EMBEDDING_SIZE = 100


model = Sequential()

# embedding layer
model.add(Embedding(VOCABULARY_SIZE, EMBEDDING_SIZE, input_length = MAX_SEQ_LENGTH))

# lstm layer 1
model.add(LSTM(128, return_sequences=True))

# lstm layer 2
model.add(LSTM(128))

# output layer
model.add(Dense(VOCABULARY_SIZE, activation='softmax'))

# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  
# summarize defined model
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 10, 100)           687500    
_________________________________________________________________
lstm_3 (LSTM)                (None, 10, 128)           117248    
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_2 (Dense)              (None, 6875)              886875    
Total params: 1,823,207
Trainable params: 1,823,207
Non-trainable params: 0
_________________________________________________________________


In [35]:
# The 'joke' database on GPU 154.4100482 seconds for 100 epochs seq length of 4.
# The 'joke' database on CPU 439.003922 seconds for 100 epochs seq length of 4.

start = timer()
model.fit(X, y, epochs=6, verbose=1, batch_size=256)
end = timer()
print(end - start)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
55.24112939999986


Make predictions using this function

In [14]:
def generate_words(model, word_tokeniser, MAX_SEQ_LENGTH, seed, n_words):
    
    text = seed
    
    # generate n_words
    for _ in range(n_words):
        
        # encode text as integers
        encoded_words = word_tokeniser.texts_to_sequences([text])[0]
        
        # pad sequences
        padded_words = pad_sequences([encoded_words], maxlen=MAX_SEQ_LENGTH, padding='pre')
        
        # predict next word
        prediction = model.predict_classes(padded_words, verbose=0)
        
        print(sorted(model.predict(padded_words)[0], reverse=True)[0:10])
        
        # convert predicted index to its word
        next_word = ""
        for word, i in word_tokeniser.word_index.items():
            if i == prediction:
                next_word = word
                break
        
        # append predicted word to text
        text += " " + next_word
        
    return text

Pride and Prejudice corpus results

In [51]:
num_words = 4

sentence = "I was very much flattered by"
print(generate_words(model, word_tokeniser, MAX_SEQ_LENGTH, sentence, num_words))

[0.99917847, 0.00068758277, 6.306314e-05, 1.7433202e-05, 1.1614113e-05, 8.018825e-06, 5.4203883e-06, 5.2689443e-06, 3.340532e-06, 2.7873257e-06]
[0.18028742, 0.17101498, 0.16787572, 0.09690676, 0.09265507, 0.06591675, 0.062038455, 0.042713653, 0.017685033, 0.011922607]
[0.19502637, 0.1772658, 0.1566488, 0.121589184, 0.07760419, 0.07511853, 0.05972236, 0.04104813, 0.021070376, 0.014065979]
[0.765685, 0.069594964, 0.040247027, 0.018514656, 0.016270895, 0.012419882, 0.010289915, 0.01003431, 0.0062004873, 0.005689317]
I was very much flattered by the regiment since mr


In [54]:
num_words = 3

sentence = "though he was now only established as a"
print(generate_words(model, word_tokeniser, MAX_SEQ_LENGTH, sentence, num_words))

[0.28368878, 0.14794074, 0.1075356, 0.09502737, 0.08129186, 0.061073847, 0.043049652, 0.03332956, 0.030446982, 0.012883066]
[0.7236329, 0.27459297, 0.0012267408, 0.0002207649, 0.00012167183, 5.8653062e-05, 4.212259e-05, 1.758753e-05, 1.7078182e-05, 1.6908958e-05]
[0.6196457, 0.15557612, 0.06316139, 0.049632628, 0.033423506, 0.015990576, 0.012141923, 0.007345006, 0.006750378, 0.005515686]
though he was now only established as a full of men


With the joke dataset 

In [16]:
num_words = 2

sentence = "Knock knock"
print(generate_words(model, word_tokeniser, MAX_SEQ_LENGTH, sentence, num_words))

[0.6683171, 0.25462034, 0.0651055, 0.00431349, 0.0012285054, 0.00091190205, 0.0007116308, 0.00050085026, 0.00046884821, 0.00038164938]
[0.98092365, 0.0048741344, 0.0023324254, 0.0018169106, 0.0009334787, 0.00072973984, 0.0006090417, 0.0005649004, 0.00053405005, 0.00049594225]
Knock knock whos there


In [18]:
num_words = 4

sentence = "What did the"
print(generate_words(model, word_tokeniser, MAX_SEQ_LENGTH, sentence, num_words))

[0.18106021, 0.10527202, 0.087918885, 0.06749019, 0.04438426, 0.027411213, 0.02216347, 0.020651741, 0.020146707, 0.019159257]
[0.8620607, 0.061113253, 0.012960028, 0.010936391, 0.010249718, 0.006638202, 0.005750539, 0.005358668, 0.003060069, 0.002434764]
[0.752341, 0.04861866, 0.04823112, 0.039533246, 0.015625905, 0.013603913, 0.013017845, 0.0076805474, 0.0075942194, 0.0070863375]
[0.7690748, 0.110211566, 0.0147162145, 0.014382733, 0.014136069, 0.009521721, 0.008592104, 0.008185829, 0.007048806, 0.005682896]
What did the rubber band factory worker
