## 1. Load and preprocess the text

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Read the raw corpus as bytes and decode it explicitly, so the original
# line endings are kept exactly as stored in the file. The `with` block
# guarantees the file handle is closed even if reading or decoding fails.
with open('bible.txt', 'rb') as bible_file:
    text = bible_file.read().decode(encoding='utf-8').lower()
text[:250]

'1:1 in the beginning god created the heaven and the earth.\r\n\r\n1:2 and the earth was without form, and void; and darkness was upon\r\nthe face of the deep. and the spirit of god moved upon the face of the\r\nwaters.\r\n\r\n1:3 and god said, let there be light'

In [2]:
# Flatten the text onto one line: carriage returns are deleted and every
# line feed becomes a single space (done in one pass over the string).
text = text.translate(str.maketrans({'\r': None, '\n': ' '}))
text[:250]

'1:1 in the beginning god created the heaven and the earth.  1:2 and the earth was without form, and void; and darkness was upon the face of the deep. and the spirit of god moved upon the face of the waters.  1:3 and god said, let there be light: and '

In [3]:
# Remove the "chapter:verse" references (e.g. "1:1") that prefix each verse.
import re
pattern = r'[0-9]+:[0-9]+'
text = re.sub(pattern, '', text)
# Drop the first character (the space left behind by the very first
# reference), then squeeze triple spaces and double spaces, in that order.
# NOTE(review): some longer runs (e.g. five spaces) are not fully collapsed
# by these two passes; left unchanged so the resulting token stream stays
# identical.
text = text[1:]
for spaces in ('   ', '  '):
    text = text.replace(spaces, ' ')
text[:250]

'in the beginning god created the heaven and the earth. and the earth was without form, and void; and darkness was upon the face of the deep. and the spirit of god moved upon the face of the waters. and god said, let there be light: and there was ligh'

In [4]:
# Remove punctuation: delete every character that is neither a word
# character nor whitespace.
non_word_or_space = re.compile(r'[^\w\s]')
text = non_word_or_space.sub('', text)
text[:250]

'in the beginning god created the heaven and the earth and the earth was without form and void and darkness was upon the face of the deep and the spirit of god moved upon the face of the waters and god said let there be light and there was light and g'

In [5]:
# Split on single spaces and keep only the first 500,000 tokens.
tokens = text.split(' ')[:500_000]

number_of_unique_tokens = len(set(tokens))

print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % number_of_unique_tokens)

sequence_length = 20

# Each training line holds `sequence_length` input words plus one output
# word, so every sliding window is `length` tokens wide.
length = sequence_length + 1

# Slide the window one token at a time and store each window as a
# space-joined line.
# NOTE(review): the window ending on the very last token is not produced
# (the start index stops one short); kept as-is to preserve the data set.
sequences = [
    ' '.join(tokens[start:start + length])
    for start in range(len(tokens) - length)
]

print ('Total Sequences: %d' % len(sequences))
print ('This is the first sequence: {0}'.format(sequences[0]))
print ('This is the second sequence: {0}'.format(sequences[1]))
print ('This is the third sequence: {0}'.format(sequences[2]))

Total Tokens: 500000
Unique Tokens: 10071
Total Sequences: 499979
This is the first sequence: in the beginning god created the heaven and the earth and the earth was without form and void and darkness was
This is the second sequence: the beginning god created the heaven and the earth and the earth was without form and void and darkness was upon
This is the third sequence: beginning god created the heaven and the earth and the earth was without form and void and darkness was upon the


## 2. Train the model

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Dropout
from keras.layers import Embedding
 
# Map every word to an integer id and encode the text windows.
# NOTE: `sequences` is overwritten here (lines of text -> lists of ids), so
# re-run the sequence-building cell before re-running this one.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
sequences = tokenizer.texts_to_sequences(sequences)

# Keep only the windows that still hold exactly `length` ids after encoding.
sequences = [encoded for encoded in sequences if len(encoded) == length]

# Word ids start at 1 (0 is reserved by the Keras Tokenizer), hence the +1.
# The ids come from the tokenizer, not from `set(tokens)`, so the class count
# must cover the tokenizer's vocabulary as well; otherwise `to_categorical`
# can receive an id >= num_classes. `max` never shrinks the previous value,
# so the shape of `y` is unchanged whenever the old size was already enough.
vocab_size = max(number_of_unique_tokens, len(tokenizer.word_index)) + 1

# Inputs are the first `sequence_length` ids of each window, the target is
# the last id.
sequences0 = np.array(sequences)
X, y = sequences0[:, :-1], sequences0[:, -1]
# NOTE(review): this builds a dense one-hot matrix of shape
# (number of sequences, vocab_size); with the counts printed above that is
# on the order of billions of entries - confirm it fits in memory.
y = to_categorical(y, num_classes=vocab_size)

## 3. Predict word by word

In [7]:
import keras

# Load the saved network without its stored training configuration, then
# compile it explicitly with the loss, optimizer and metric below.
model = keras.models.load_model('bible_model', compile=False)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# The seed must contain at least `sequence_length` words; only its last
# `sequence_length` words are fed to the model.
seed = "and the lord said unto moses and aaron go get thee down for thy people i will make of thee a great nation the twelve children of"
n_predictions = 100
# id -> word lookup (the inverse of tokenizer.word_index)
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

print('seed : \n' + seed)
last_words = seed.split(' ')[-sequence_length:]

# Fail early with a clear message: a seed that is too short, or that contains
# words the tokenizer does not know (they are dropped during encoding), would
# otherwise hand the model an input of the wrong length.
encoded_seed = tokenizer.texts_to_sequences([last_words])[0]
if len(encoded_seed) != sequence_length:
    raise ValueError(
        'The seed must provide %d words known to the tokenizer, but only %d '
        'of its last words could be encoded.' % (sequence_length, len(encoded_seed))
    )

preds = []

for i in range(n_predictions):
    # Encode the current window and ask the model for the next-word scores.
    example = tokenizer.texts_to_sequences([last_words])
    prediction = model.predict(np.array(example), verbose=0)
    # Greedy decoding: take the id with the highest score.
    predicted_word = np.argmax(prediction)
    next_word = reverse_word_map[predicted_word]
    # Slide the window: drop the oldest word, append the predicted one.
    last_words = last_words[1:] + [next_word]
    preds.append(next_word)

print('next words : \n' + ' '.join(preds))

seed : 
and the lord said unto moses and aaron go get thee down for thy people i will make of thee a great nation the twelve children of
next words : 
israel which thou hast brought forth from the hand of the lord and the lord spake unto moses saying speak unto the children of israel and say unto them ye shall not be eaten and they shall not be ashamed and they shall be as the thing that is in the land of egypt and the lord said unto moses stretch out thine hand and the angel of the lord was kindled against israel and he said unto him i pray thee to the king of israel and say unto him behold i will not go down to the children
