Step 0: Make the necessary imports

In [0]:
from numpy import array
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.utils.vis_utils import plot_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


Code to upload files to Colab from your file system

In [0]:
from google.colab import files

# Open the Colab upload widget; the returned object is keyed by file name.
uploaded = files.upload()

# Report the name and size of each uploaded file.
for fn in uploaded:
  size_in_bytes = len(uploaded[fn])
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=size_in_bytes))

A utility function to read a file from the Colab file system

In [0]:
def read_file(filename):
  """Return the entire contents of a text file as one string.

  Args:
    filename: Path of the file to read; it is opened read-only in text mode.

  Returns:
    The full text of the file.

  Raises:
    OSError: If the file cannot be opened or read (for example
      FileNotFoundError when the path does not exist).
  """
  # The context manager closes the handle even when read() raises, which a
  # manual open()/close() pair does not guarantee.
  with open(filename, 'r') as file:
    return file.read()

A utility function to return a rhyme text

In [0]:
def read_rhyme():
  """Return the hard-coded "Old MacDonald" rhyme used as training text.

  In the returned string the rhyme lines are separated by an empty line,
  every line after the first is indented by two spaces, and the text ends
  with a single newline.
  """
  rhyme_lines = (
      'Old MACDONALD had a farm',
      'E-I-E-I-O',
      'And on his farm he had a cow',
      'E-I-E-I-O',
      'With a moo moo here',
      'And a moo moo there',
      'Here a moo, there a moo',
      'Everywhere a moo moo',
      'Old MacDonald had a farm',
      'E-I-E-I-O',
  )
  # two newlines plus the two-space indent sit between consecutive lines
  return '\n\n  '.join(rhyme_lines) + '\n'

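Step 1 - Define your training data - this function creates overlapping character sequences from the given text; each sequence holds `length` input characters followed by one target character (`length + 1` characters in total)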

In [0]:
def create_char_sequences(text,length):
  """Cut text into overlapping windows of length + 1 characters.

  The text is printed as received, whitespace-normalised (every run of
  spaces/newlines becomes one space), printed again, and then sliced with a
  window that advances one character at a time.

  Args:
    text: Raw text to slice.
    length: Number of leading characters per window; each window also
      carries one extra trailing character.

  Returns:
    List of windows in order of position; empty when the cleaned text has
    no more than `length` characters.
  """
  # show the raw input
  print(text)
  # collapse all whitespace runs (including newlines) into single spaces
  text = ' '.join(text.split())
  print(text)
  # one window per end position, starting at index `length`
  char_sequences = [text[end - length:end + 1]
                    for end in range(length, len(text))]
  print('Total Sequences: %d' % len(char_sequences))
  return char_sequences

Step 1 - Define your training data - this function creates the character-to-index mapping used to encode the character sequences created above

In [0]:
def define_mapping(text):
  """Build a character-to-integer lookup for every distinct character in text.

  Args:
    text: String whose distinct characters form the vocabulary.

  Returns:
    Dict mapping each character to its rank (0-based) in sorted order.
  """
  # sorting makes the assigned indices deterministic for a given text
  unique_chars = sorted(set(text))
  return {char: index for index, char in enumerate(unique_chars)}

Step 1 - Define your training data - this function encodes character sequences to integer sequences

In [0]:
def encode_char_sequences(text,mapping):
  """Integer-encode newline-separated character sequences.

  Args:
    text: Sequences joined by '\\n'; each line is encoded separately.
    mapping: Dict from character to integer index.

  Returns:
    List with one list of integers per line of text.

  Raises:
    KeyError: If a line contains a character that is not in mapping.
  """
  # look up every character of every line in the mapping
  return [[mapping[char] for char in line] for line in text.split('\n')]

Step 1 - Define your training data

In [0]:
def create_training_data(text,mapping,vocab_size):
  """Turn newline-separated character sequences into one-hot inputs and targets.

  Args:
    text: Character sequences joined by '\\n'.
    mapping: Dict from character to integer index.
    vocab_size: Number of classes used for the one-hot encoding.

  Returns:
    Tuple (X, y): X is the one-hot encoding of all but the last character
    of every sequence, y is the one-hot encoding of each sequence's last
    character.
  """
  # integer encode the sequences; the 2-D slicing below presumably relies on
  # every line having the same length -- verify against the caller
  encoded = array(encode_char_sequences(text,mapping))
  # the final column is the target, everything before it is the input
  inputs = encoded[:,:-1]
  targets = encoded[:,-1]
  # one-hot encode the inputs row by row, then the targets in one call
  X = array([to_categorical(row, num_classes=vocab_size) for row in inputs])
  y = to_categorical(targets, num_classes=vocab_size)
  return X,y

Steps 2 & 3 - Define your model and configure the learning process

In [0]:
def define_model(X,vocab_size):
  """Build and compile the LSTM character model.

  Args:
    X: Training input; only X.shape[1] and X.shape[2] are read, and they
      become the input shape of the LSTM layer.
    vocab_size: Size of the softmax output layer.

  Returns:
    The compiled Sequential model. As side effects the model summary is
    printed and a diagram is written to 'model.png'.
  """
  timesteps, features = X.shape[1], X.shape[2]
  recurrent_layer = LSTM(100,input_shape=(timesteps, features))
  output_layer = Dense(vocab_size,activation='softmax')
  model = Sequential()
  model.add(recurrent_layer)
  model.add(output_layer)
  # compile for multi-class classification over the vocabulary
  model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])
  # print the layer summary and save a diagram of the architecture
  model.summary()
  plot_model(model, to_file='model.png', show_shapes=True)
  return model

Step 4: Train your model and save the trained model

In [0]:
def train_model(model,X,y):
  """Fit the model on (X, y) and persist it to 'model.h5'.

  Args:
    model: Compiled model exposing fit() and save().
    X: Training inputs passed straight to model.fit.
    y: Training targets passed straight to model.fit.

  Returns:
    None; the trained model is written to 'model.h5' in the working directory.
  """
  # 100 epochs, verbose=2 as the logging level
  fit_options = dict(epochs=100, verbose=2)
  model.fit(X, y, **fit_options)
  # save the trained model to file
  model.save('model.h5')

Step 5: Generate Text

In [0]:
def generate_text(model, mapping, seq_length, seed_text, n_chars):
  """Extend seed_text by repeatedly predicting the next character.

  Args:
    model: Trained model exposing predict(); it receives a one-hot batch
      holding the last seq_length encoded characters.
    mapping: Dict from character to integer index.
    seq_length: Number of trailing characters fed to the model per step.
    seed_text: Starting text; every character must be a key of mapping.
    n_chars: Number of prediction steps to run.

  Returns:
    seed_text followed by the generated characters.

  Raises:
    KeyError: If seed_text contains a character missing from mapping and
      at least one character is requested.
  """
  # nothing to generate: return the seed untouched without encoding it
  if n_chars <= 0:
    return seed_text
  # invert the mapping once instead of scanning it for every prediction;
  # setdefault keeps the first character when two share an index, which is
  # what a linear scan with break would return
  index_to_char = {}
  for char, index in mapping.items():
    index_to_char.setdefault(index, char)
  vocab_size = len(mapping)
  in_text = seed_text
  # encode the seed once and grow the list as characters are generated,
  # rather than re-encoding the whole text on every step
  encoded_text = [mapping[char] for char in seed_text]
  for _ in range(n_chars):
    # keep only the last seq_length indices (shorter input is padded)
    window = pad_sequences([encoded_text], maxlen=seq_length, truncating='pre')
    # one hot encode
    one_hot = to_categorical(window, num_classes=vocab_size)
    # Sequential.predict_classes() no longer exists in recent Keras
    # releases; taking the argmax of predict() selects the same class for a
    # softmax output and works on old and new versions alike
    probabilities = model.predict(one_hot, verbose=0)
    predicted_index = int(probabilities.argmax(axis=-1)[0])
    # an index without a character appends nothing
    if predicted_index not in index_to_char:
      continue
    in_text += index_to_char[predicted_index]
    encoded_text.append(predicted_index)
  return in_text

Step 5: Predict only one next character

In [0]:
def predict_next_char(model,mapping,seq_length,seed_text):
  """Predict the single character most likely to follow seed_text.

  Args:
    model: Trained model exposing predict().
    mapping: Dict from character to integer index.
    seq_length: Number of trailing characters fed to the model.
    seed_text: Context text; every character must be a key of mapping.

  Returns:
    The predicted character, or '' when the predicted index has no
    character in mapping.

  Raises:
    KeyError: If seed_text contains a character missing from mapping.
  """
  # encode the seed itself; reading an `in_text` name that is never assigned
  # here would raise NameError or silently pick up a stale global
  encoded = [mapping[char] for char in seed_text]
  # truncate sequences to a fixed length
  encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
  # one hot encode
  encoded = to_categorical(encoded, num_classes=len(mapping))
  # Sequential.predict_classes() no longer exists in recent Keras releases;
  # the argmax of predict() selects the same class for a softmax output
  probabilities = model.predict(encoded, verbose=0)
  predicted_index = int(probabilities.argmax(axis=-1)[0])
  # reverse map integer to character
  for char, index in mapping.items():
    if index == predicted_index:
      return char
  return ''

Main code that calls all the functions for different steps

In [0]:
# Settings shared by data preparation and text generation, named once so the
# training window and the generation window cannot drift apart.
SEQ_LENGTH = 10          # input characters per training sequence
N_GENERATED_CHARS = 50   # characters generated after each seed

# read and preprocess (clean) text
char_sequence_list = create_char_sequences(read_rhyme(), SEQ_LENGTH)
raw_text = '\n'.join(char_sequence_list)
# define the mapping from chars to integers
# NOTE: raw_text contains the '\n' separators added by the join above, so the
# mapping (and therefore vocab_size) also has an entry for the newline.
mapping = define_mapping(raw_text)
# vocabulary size
vocab_size = len(mapping)
print('Vocabulary Size: %d' % vocab_size)
# create training data
X, y = create_training_data(raw_text, mapping, vocab_size)
# define model
model = define_model(X, vocab_size)
# train model
train_model(model, X, y)
# generate text from three seeds: the start of the rhyme, the middle of a
# line, and a phrase that does not occur in the original text
for seed_text in ('Old MACDON', 'With a moo', 'Hello worl'):
  print(generate_text(model, mapping, SEQ_LENGTH, seed_text, N_GENERATED_CHARS))