In [0]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
import pandas as pd

# Number of preceding words used as context for predicting the next word.
# It is passed to frame_lm() as the window length, so every training
# sequence holds `sequence_length` input words plus one target word.
sequence_length=1

# define the model
def define_model(vocab_size, max_length):
    """Build and compile the next-word prediction network.

    The network is an Embedding layer (10-dimensional vectors), an LSTM
    with 50 units and a softmax Dense layer with one output per vocabulary
    index.

    Args:
        vocab_size: Number of distinct token indices; used both as the
            embedding input dimension and as the size of the output layer.
        max_length: Length of a full training sequence. The network input
            length is ``max_length - 1``.

    Returns:
        The compiled ``Sequential`` model.

    Side effects:
        Prints the model summary and writes a diagram to ``model.png``.
    """
    layers = [
        Embedding(vocab_size, 10, input_length=max_length - 1),
        LSTM(50),
        Dense(vocab_size, activation='softmax'),
    ]
    model = Sequential(layers)
    # Targets are one-hot encoded, hence categorical cross-entropy.
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    # Report the architecture on stdout and as an image file.
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)
    return model

# generate a sequence from a language model
def generate_text(model, tokenizer, max_length, seed_text, n_words):
    """Greedily extend ``seed_text`` with words predicted by ``model``.

    At every step the text generated so far is encoded, pre-padded (and
    truncated) to ``max_length`` tokens, and the most probable next word is
    appended.

    Args:
        model: Trained model whose ``predict`` returns one probability per
            vocabulary index for each input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        max_length: Number of tokens the model expects as input.
        seed_text: Text used to start the generation.
        n_words: Number of words to append.

    Returns:
        ``seed_text`` followed by the ``n_words`` generated words, separated
        by single spaces. An index with no entry in the tokenizer's
        vocabulary contributes an empty word.
    """
    # Invert the word -> index mapping once instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text = seed_text
    for _ in range(n_words):
        # Encode the text generated so far as integer token indices.
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # Pre-pad (or truncate) to the fixed input length of the model.
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # Sequential.predict_classes() no longer exists in current Keras
        # releases; taking the argmax over the predicted probabilities
        # yields the same class index and works on old and new versions.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(probabilities.argmax(axis=-1)[0])
        # Map the predicted index back to its word ('' when unknown).
        out_word = index_to_word.get(predicted_index, '')
        in_text += ' ' + out_word
    return in_text


# framing language modelling
def frame_lm(encoded, length):
    """Slice ``encoded`` into overlapping windows of ``length + 1`` items.

    Each window consists of ``length`` consecutive items followed by the
    item that comes right after them.

    Args:
        encoded: Sequence of encoded tokens.
        length: Number of leading items in every window.

    Returns:
        A list of windows in order of appearance; empty when ``encoded``
        has no more than ``length`` items.
    """
    return [
        encoded[end - length:end + 1]
        for end in range(length, len(encoded))
    ]

# Training corpus: a short nursery rhyme.
data = """ Jack and Jill went up the hill\n
		To fetch a pail of water\n
		Jack fell down and broke his crown\n
		And Jill came tumbling after\n """

# Fit the tokenizer on the corpus and turn the text into integer indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]

# Vocabulary size is one more than the number of entries in the word index.
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# Frame the corpus as windows of `sequence_length` words plus the next word.
sequences = frame_lm(encoded, sequence_length)
print('Total Sequences: %d' % len(sequences))

# Bring every window to the length of the longest one.
max_length = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

# Inputs are all columns but the last; the last column is the target word.
sequences = array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]

# One-hot encode the targets over the whole vocabulary.
y = to_categorical(y, num_classes=vocab_size)
print(y.shape)

# Build and train the network.
model = define_model(vocab_size, max_length)
model.fit(X, y, epochs=500, verbose=2)

# Generate ten words; the model input is one token shorter than a sequence.
print(generate_text(model, tokenizer, max_length - 1, 'Jack and Jill', 10))