In [1]:
# load doc into memory
def load_doc(filename):
    """Read a whole text file into memory and return its contents.

    Args:
        filename: path of the text file to read.

    Returns:
        The complete contents of the file as a single string.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, 'r') as file:
        return file.read()

# read the nursery rhyme from disk and echo it unchanged
raw_text = load_doc(filename='rhyme.txt')
print(raw_text)

Sing a song of sixpence,

A pocket full of rye.

Four and twenty blackbirds,

Baked in a pie.

When the pie was opened

The birds began to sing;

Wasn't that a dainty dish,

To set before the king.

The king was in his counting house,

Counting out his money;

The queen was in the parlour,

Eating bread and honey.

The maid was in the garden,

Hanging out the clothes,

When down came a blackbird

And pecked off her nose.



In [2]:
# collapse every run of whitespace (newlines included) into a single space
tokens = raw_text.split()
raw_text = ' '.join(tokens)
# show the flattened text first, then the whitespace-separated tokens
print(raw_text, tokens, sep='\n')

Sing a song of sixpence, A pocket full of rye. Four and twenty blackbirds, Baked in a pie. When the pie was opened The birds began to sing; Wasn't that a dainty dish, To set before the king. The king was in his counting house, Counting out his money; The queen was in the parlour, Eating bread and honey. The maid was in the garden, Hanging out the clothes, When down came a blackbird And pecked off her nose.
['Sing', 'a', 'song', 'of', 'sixpence,', 'A', 'pocket', 'full', 'of', 'rye.', 'Four', 'and', 'twenty', 'blackbirds,', 'Baked', 'in', 'a', 'pie.', 'When', 'the', 'pie', 'was', 'opened', 'The', 'birds', 'began', 'to', 'sing;', "Wasn't", 'that', 'a', 'dainty', 'dish,', 'To', 'set', 'before', 'the', 'king.', 'The', 'king', 'was', 'in', 'his', 'counting', 'house,', 'Counting', 'out', 'his', 'money;', 'The', 'queen', 'was', 'in', 'the', 'parlour,', 'Eating', 'bread', 'and', 'honey.', 'The', 'maid', 'was', 'in', 'the', 'garden,', 'Hanging', 'out', 'the', 'clothes,', 'When', 'down', 'came', 

In [3]:
# slide a window over the text: `length` input characters plus the one that follows
length = 10
sequences = [
    raw_text[end - length:end + 1]
    for end in range(length, len(raw_text))
]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 399


In [4]:
# save a list of strings to file, one string per line
def save_doc(lines, filename):
    """Write ``lines`` to ``filename``, separated by newlines.

    Args:
        lines: iterable of strings; they are joined with '\\n', so no
            newline is written after the last one.
        filename: path of the file to create or overwrite.

    Raises:
        OSError: if the file cannot be opened or written.
    """
    data = '\n'.join(lines)
    # the context manager flushes and closes the handle even when write() raises
    with open(filename, 'w') as file:
        file.write(data)

In [5]:
# write the character sequences to disk, one sequence per line
out_filename = 'char_sequences.txt'
save_doc(lines=sequences, filename=out_filename)

In [6]:
# read the saved sequences back in as one newline-separated string
chars = load_doc(filename='char_sequences.txt')
print(chars)

Sing a song
ing a song 
ng a song o
g a song of
 a song of 
a song of s
 song of si
song of six
ong of sixp
ng of sixpe
g of sixpen
 of sixpenc
of sixpence
f sixpence,
 sixpence, 
sixpence, A
ixpence, A 
xpence, A p
pence, A po
ence, A poc
nce, A pock
ce, A pocke
e, A pocket
, A pocket 
 A pocket f
A pocket fu
 pocket ful
pocket full
ocket full 
cket full o
ket full of
et full of 
t full of r
 full of ry
full of rye
ull of rye.
ll of rye. 
l of rye. F
 of rye. Fo
of rye. Fou
f rye. Four
 rye. Four 
rye. Four a
ye. Four an
e. Four and
. Four and 
 Four and t
Four and tw
our and twe
ur and twen
r and twent
 and twenty
and twenty 
nd twenty b
d twenty bl
 twenty bla
twenty blac
wenty black
enty blackb
nty blackbi
ty blackbir
y blackbird
 blackbirds
blackbirds,
lackbirds, 
ackbirds, B
ckbirds, Ba
kbirds, Bak
birds, Bake
irds, Baked
rds, Baked 
ds, Baked i
s, Baked in
, Baked in 
 Baked in a
Baked in a 
aked in a p
ked in a pi
ed in a pie
d in a pie.
 in a pie. 
in a pie. W
n a pie. Wh
 a p

Now that we have prepared our sequences of characters (time series data), we are ready to build our neural language model (or simply "language model").

# Building and Training our Model

In [7]:
# split the text loaded from char_sequences.txt back into one sequence per line
lines = chars.split('\n')

In [8]:
# build the vocabulary: every distinct character of the text, in sorted order
chars = sorted(set(raw_text))
# map each character to its position in that sorted vocabulary
mapping = {c: i for i, c in enumerate(chars)}

In [9]:
# integer encode every line: one list of character indices per sequence
sequences = [[mapping[char] for char in line] for line in lines]

In [10]:
# vocabulary size: the number of distinct characters in the mapping
# (used below as the one-hot width and the size of the softmax output layer)
vocab_size = len(mapping)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 37


In [11]:
# show the character -> integer lookup table
print(mapping)

{' ': 0, "'": 1, ',': 2, '.': 3, ';': 4, 'A': 5, 'B': 6, 'C': 7, 'E': 8, 'F': 9, 'H': 10, 'S': 11, 'T': 12, 'W': 13, 'a': 14, 'b': 15, 'c': 16, 'd': 17, 'e': 18, 'f': 19, 'g': 20, 'h': 21, 'i': 22, 'k': 23, 'l': 24, 'm': 25, 'n': 26, 'o': 27, 'p': 28, 'q': 29, 'r': 30, 's': 31, 't': 32, 'u': 33, 'w': 34, 'x': 35, 'y': 36}


In [12]:
# show the integer-encoded sequences (one inner list per line of char_sequences.txt)
print(sequences)

[[11, 22, 26, 20, 0, 14, 0, 31, 27, 26, 20], [22, 26, 20, 0, 14, 0, 31, 27, 26, 20, 0], [26, 20, 0, 14, 0, 31, 27, 26, 20, 0, 27], [20, 0, 14, 0, 31, 27, 26, 20, 0, 27, 19], [0, 14, 0, 31, 27, 26, 20, 0, 27, 19, 0], [14, 0, 31, 27, 26, 20, 0, 27, 19, 0, 31], [0, 31, 27, 26, 20, 0, 27, 19, 0, 31, 22], [31, 27, 26, 20, 0, 27, 19, 0, 31, 22, 35], [27, 26, 20, 0, 27, 19, 0, 31, 22, 35, 28], [26, 20, 0, 27, 19, 0, 31, 22, 35, 28, 18], [20, 0, 27, 19, 0, 31, 22, 35, 28, 18, 26], [0, 27, 19, 0, 31, 22, 35, 28, 18, 26, 16], [27, 19, 0, 31, 22, 35, 28, 18, 26, 16, 18], [19, 0, 31, 22, 35, 28, 18, 26, 16, 18, 2], [0, 31, 22, 35, 28, 18, 26, 16, 18, 2, 0], [31, 22, 35, 28, 18, 26, 16, 18, 2, 0, 5], [22, 35, 28, 18, 26, 16, 18, 2, 0, 5, 0], [35, 28, 18, 26, 16, 18, 2, 0, 5, 0, 28], [28, 18, 26, 16, 18, 2, 0, 5, 0, 28, 27], [18, 26, 16, 18, 2, 0, 5, 0, 28, 27, 16], [26, 16, 18, 2, 0, 5, 0, 28, 27, 16, 23], [16, 18, 2, 0, 5, 0, 28, 27, 16, 23, 18], [18, 2, 0, 5, 0, 28, 27, 16, 23, 18, 32], [2, 0, 5,

In [13]:
import numpy as np
# stack the encoded sequences into a 2-D integer array
sequences = np.array(sequences)
# every column except the last is model input ...
X = sequences[:, :-1]
# ... and the last column is the character to predict
y = sequences[:, -1]

In [14]:
import tensorflow as tf
# one-hot encode inputs and targets over the character vocabulary
# NOTE: X and y are overwritten in place, so run this cell only once per kernel session
one_hot = tf.keras.utils.to_categorical
sequences = [one_hot(row, num_classes=vocab_size) for row in X]
X = np.array(sequences)
y = one_hot(y, num_classes=vocab_size)

In [15]:
# define the model
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from pickle import dump

def define_model(X):
    """Build, compile and summarise the character-level LSTM model.

    Args:
        X: input array; its second and third dimensions are used as the
            LSTM input shape.

    Returns:
        The compiled Sequential model. The softmax output layer has
        ``vocab_size`` units, read from the notebook's global namespace.

    Side effects:
        Prints the model summary and writes the architecture plot to
        'model.png'.
    """
    timesteps, features = X.shape[1], X.shape[2]
    network = Sequential([
        LSTM(75, input_shape=(timesteps, features)),
        Dense(vocab_size, activation='softmax'),
    ])
    # multi-class next-character prediction over one-hot targets
    network.compile(optimizer='adam',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])
    # report the layer/parameter table and save the diagram
    network.summary()
    plot_model(network, show_shapes=True, to_file='model.png')
    return network

Using TensorFlow backend.


In [16]:
# build and compile the network; X supplies the LSTM input shape
model = define_model(X)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 75)                33900     
_________________________________________________________________
dense_1 (Dense)              (None, 37)                2812      
Total params: 36,712
Trainable params: 36,712
Non-trainable params: 0
_________________________________________________________________


In [17]:
# train on the one-hot encoded sequences, logging one line per epoch
model.fit(X, y, epochs=100, verbose=2)
# save the model
model.save('LSTM_model.h5')
# save the mapping; the with-block flushes and closes the pickle file
with open('mapping.pkl', 'wb') as mapping_file:
    dump(mapping, mapping_file)

Epoch 1/100
 - 0s - loss: 3.5862 - accuracy: 0.1178
Epoch 2/100
 - 0s - loss: 3.4432 - accuracy: 0.1880
Epoch 3/100
 - 0s - loss: 3.1678 - accuracy: 0.1905
Epoch 4/100
 - 0s - loss: 3.0592 - accuracy: 0.1905
Epoch 5/100
 - 0s - loss: 3.0128 - accuracy: 0.1905
Epoch 6/100
 - 0s - loss: 2.9971 - accuracy: 0.1905
Epoch 7/100
 - 0s - loss: 2.9785 - accuracy: 0.1905
Epoch 8/100
 - 0s - loss: 2.9639 - accuracy: 0.1905
Epoch 9/100
 - 0s - loss: 2.9507 - accuracy: 0.1905
Epoch 10/100
 - 0s - loss: 2.9352 - accuracy: 0.1905
Epoch 11/100
 - 0s - loss: 2.9349 - accuracy: 0.1905
Epoch 12/100
 - 0s - loss: 2.9137 - accuracy: 0.2080
Epoch 13/100
 - 0s - loss: 2.8961 - accuracy: 0.1980
Epoch 14/100
 - 0s - loss: 2.8883 - accuracy: 0.1905
Epoch 15/100
 - 0s - loss: 2.8618 - accuracy: 0.2005
Epoch 16/100
 - 0s - loss: 2.8357 - accuracy: 0.2005
Epoch 17/100
 - 0s - loss: 2.8176 - accuracy: 0.2105
Epoch 18/100
 - 0s - loss: 2.7928 - accuracy: 0.2055
Epoch 19/100
 - 0s - loss: 2.7715 - accuracy: 0.2607
Ep

In [20]:
# generate characters
from pickle import load
from keras.models import load_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    """Extend ``seed_text`` with ``n_chars`` characters predicted by ``model``.

    Args:
        model: trained model exposing ``predict(x, verbose=0)`` that returns
            one row of per-character scores for a one-hot encoded input of
            shape (1, seq_length, len(mapping)).
        mapping: dict from character to integer index.
        seq_length: number of trailing characters fed to the model per step.
        seed_text: starting text; every character must be a key of ``mapping``.
        n_chars: number of characters to generate.

    Returns:
        ``seed_text`` followed by the generated characters.

    Raises:
        KeyError: if the text contains a character missing from ``mapping``.
    """
    import numpy as np  # local import keeps this cell runnable on its own

    vocab_size = len(mapping)
    # invert the mapping once instead of scanning it after every prediction
    index_to_char = {index: char for char, index in mapping.items()}
    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # keep only the last seq_length characters (left-padded when shorter)
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # one hot encode
        encoded = to_categorical(encoded, num_classes=vocab_size)
        # shape follows the arguments rather than a hard-coded (1, 10, 37)
        encoded = encoded.reshape(1, seq_length, vocab_size)
        # most likely next character; replaces the removed predict_classes()
        scores = model.predict(encoded, verbose=0)
        y_pred = int(np.argmax(scores, axis=-1)[0])
        # reverse map integer to character; append nothing for an unknown index
        in_text += index_to_char.get(y_pred, '')
    return in_text

# load the model
model = load_model('LSTM_model.h5')
# load the mapping; the with-block closes the pickle file once it has been read
# NOTE: only unpickle files you created yourself - pickle can run arbitrary code
with open('mapping.pkl', 'rb') as mapping_file:
    mapping = load(mapping_file)
# test start of rhyme
print(generate_seq(model, mapping, 10, 'Sing a son', 20))
# test mid-line
print(generate_seq(model, mapping, 10, 'king was i', 20))
# test not in original
print(generate_seq(model, mapping, 10, 'hello worl', 20))

Sing a song of sixpence, A poc
king was in his counting house
hello worl, The ind w seey  ap
