In [1]:
from numpy import array 
from pickle import dump 
from keras.utils import to_categorical 
from keras.utils.vis_utils import plot_model 
from keras.models import Sequential 
from keras.layers import Dense 
from keras.layers import LSTM
from pickle import load 
from keras.models import load_model 
from keras.utils import to_categorical 
from keras.preprocessing.sequence import pad_sequences


Using TensorFlow backend.


In [2]:
def load_doc(filename):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: Path of the file to read. It is opened in text mode with
            the platform's default encoding.

    Returns:
        The complete contents of the file.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
    """
    # The context manager closes the handle even when read() raises,
    # which the previous open()/close() pair did not guarantee.
    with open(filename, 'r') as file:
        return file.read()

In [3]:
# load the raw rhyme text from disk and echo it for a visual check
raw_text = load_doc('rhyme.txt')
print(raw_text)


Sing a song of sixpence, A pocket full of rye. Four and twenty blackbirds, Baked in a pie.
When the pie was opened The birds began to sing; Wasn't that a dainty dish, To set before the king.
The king was in his counting house, Counting out his money; The queen was in the parlour, Eating bread and honey.
The maid was in the garden, Hanging out the clothes, When down came a blackbird And pecked off her nose.



In [4]:
# clean: split on any run of whitespace and re-join with single spaces,
# which flattens the multi-line rhyme into one long line of text
tokens = raw_text.split()
raw_text = ' '.join(tokens)


In [5]:
raw_text

"Sing a song of sixpence, A pocket full of rye. Four and twenty blackbirds, Baked in a pie. When the pie was opened The birds began to sing; Wasn't that a dainty dish, To set before the king. The king was in his counting house, Counting out his money; The queen was in the parlour, Eating bread and honey. The maid was in the garden, Hanging out the clothes, When down came a blackbird And pecked off her nose."

In [6]:
len(raw_text)

409

In [7]:
# organize into overlapping windows of characters:
# each window holds `length` input characters plus the one character that follows them
length = 10
sequences = [
    raw_text[start:start + length + 1]
    for start in range(len(raw_text) - length)
]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 399


In [8]:
sequences[0:3]

['Sing a song', 'ing a song ', 'ng a song o']

In [9]:
# save sequences to file, one sequence per line
def save_doc(lines, filename):
    """Write an iterable of strings to a text file, separated by newlines.

    Args:
        lines: Iterable of strings; they are joined with ``'\\n'`` (no
            trailing newline is added after the last item).
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    data = '\n'.join(lines)
    # The context manager flushes and closes the handle even if write() raises.
    with open(filename, 'w') as file:
        file.write(data)


In [10]:
# save the character sequences to disk (save_doc joins them with newlines)
# so the training cells below can start from the file
out_filename = 'char_sequences.txt'
save_doc(sequences, out_filename)


In [11]:
# load the saved sequences back and split them into one string per line.
# NOTE: this rebinds raw_text to the contents of the sequence file (which
# contains '\n' separators), so it no longer holds the cleaned rhyme; any
# later cell that reads raw_text sees the sequence file instead.
in_filename = 'char_sequences.txt'
raw_text = load_doc(in_filename)
lines = raw_text.split('\n')


In [12]:
lines

['Sing a song',
 'ing a song ',
 'ng a song o',
 'g a song of',
 ' a song of ',
 'a song of s',
 ' song of si',
 'song of six',
 'ong of sixp',
 'ng of sixpe',
 'g of sixpen',
 ' of sixpenc',
 'of sixpence',
 'f sixpence,',
 ' sixpence, ',
 'sixpence, A',
 'ixpence, A ',
 'xpence, A p',
 'pence, A po',
 'ence, A poc',
 'nce, A pock',
 'ce, A pocke',
 'e, A pocket',
 ', A pocket ',
 ' A pocket f',
 'A pocket fu',
 ' pocket ful',
 'pocket full',
 'ocket full ',
 'cket full o',
 'ket full of',
 'et full of ',
 't full of r',
 ' full of ry',
 'full of rye',
 'ull of rye.',
 'll of rye. ',
 'l of rye. F',
 ' of rye. Fo',
 'of rye. Fou',
 'f rye. Four',
 ' rye. Four ',
 'rye. Four a',
 'ye. Four an',
 'e. Four and',
 '. Four and ',
 ' Four and t',
 'Four and tw',
 'our and twe',
 'ur and twen',
 'r and twent',
 ' and twenty',
 'and twenty ',
 'nd twenty b',
 'd twenty bl',
 ' twenty bla',
 'twenty blac',
 'wenty black',
 'enty blackb',
 'nty blackbi',
 'ty blackbir',
 'y blackbird',
 ' black

In [13]:
# Build the vocabulary: every distinct character, in sorted order, gets its
# position as an integer index. raw_text here is the newline-joined sequence
# file, so '\n' receives an index as well.
chars = sorted(set(raw_text))
mapping = {char: index for index, char in enumerate(chars)}

In [14]:
#raw_text

In [15]:
mapping

{'\n': 0,
 ' ': 1,
 "'": 2,
 ',': 3,
 '.': 4,
 ';': 5,
 'A': 6,
 'B': 7,
 'C': 8,
 'E': 9,
 'F': 10,
 'H': 11,
 'S': 12,
 'T': 13,
 'W': 14,
 'a': 15,
 'b': 16,
 'c': 17,
 'd': 18,
 'e': 19,
 'f': 20,
 'g': 21,
 'h': 22,
 'i': 23,
 'k': 24,
 'l': 25,
 'm': 26,
 'n': 27,
 'o': 28,
 'p': 29,
 'q': 30,
 'r': 31,
 's': 32,
 't': 33,
 'u': 34,
 'w': 35,
 'x': 36,
 'y': 37}

In [16]:
# integer encode every line: each character is replaced by its index in `mapping`
sequences = [[mapping[char] for char in line] for line in lines]


In [17]:
# vocabulary size: number of distinct characters in the mapping; used below
# as the one-hot width (num_classes) and as the size of the output layer
vocab_size = len(mapping)
print('Vocabulary Size: %d' % vocab_size)


Vocabulary Size: 38


In [18]:
sequences[0:2]

[[12, 23, 27, 21, 1, 15, 1, 32, 28, 27, 21],
 [23, 27, 21, 1, 15, 1, 32, 28, 27, 21, 1]]

In [19]:
# Stack the encoded sequences into a 2-D array, then separate the inputs
# (every column but the last) from the target (the last column).
sequences = array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]


In [20]:
X,y

(array([[12, 23, 27, ..., 32, 28, 27],
        [23, 27, 21, ..., 28, 27, 21],
        [27, 21,  1, ..., 27, 21,  1],
        ...,
        [28, 20, 20, ...,  1, 27, 28],
        [20, 20,  1, ..., 27, 28, 32],
        [20,  1, 22, ..., 28, 32, 19]]),
 array([21,  1, 28, 20,  1, 32, 23, 36, 29, 19, 27, 17, 19,  3,  1,  6,  1,
        29, 28, 17, 24, 19, 33,  1, 20, 34, 25, 25,  1, 28, 20,  1, 31, 37,
        19,  4,  1, 10, 28, 34, 31,  1, 15, 27, 18,  1, 33, 35, 19, 27, 33,
        37,  1, 16, 25, 15, 17, 24, 16, 23, 31, 18, 32,  3,  1,  7, 15, 24,
        19, 18,  1, 23, 27,  1, 15,  1, 29, 23, 19,  4,  1, 14, 22, 19, 27,
         1, 33, 22, 19,  1, 29, 23, 19,  1, 35, 15, 32,  1, 28, 29, 19, 27,
        19, 18,  1, 13, 22, 19,  1, 16, 23, 31, 18, 32,  1, 16, 19, 21, 15,
        27,  1, 33, 28,  1, 32, 23, 27, 21,  5,  1, 14, 15, 32, 27,  2, 33,
         1, 33, 22, 15, 33,  1, 15,  1, 18, 15, 23, 27, 33, 37,  1, 18, 23,
        32, 22,  3,  1, 13, 28,  1, 32, 19, 33,  1, 16, 19, 20, 28,

In [21]:
# Next, we need to one hot encode each character.
# Each integer index becomes a vector of length vocab_size, for both the
# input windows (X) and the target character (y).
# NOTE: this cell rebinds sequences, X and y in place, so it is not safe to
# re-run on its own; re-run the integer-encoding and split cells first.
sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
X = array(sequences)
y = to_categorical(y, num_classes=vocab_size)


In [22]:
X,y

(array([[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 1., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 1., 0., ..., 0., 0., 0.]],
 
        ...,
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 1., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],


In [59]:
X.shape

(399, 10, 38)

In [60]:
y.shape

(399, 38)

In [23]:
# define the model
def define_model(X):
    """Build, compile, summarize and plot the character-level LSTM model.

    Args:
        X: Input array; only ``X.shape[1]`` and ``X.shape[2]`` are read, to
            set the LSTM layer's ``input_shape``.

    Returns:
        The compiled ``Sequential`` model.

    Notes:
        The output layer width comes from the notebook-level ``vocab_size``,
        not from ``X``. Calling this prints the model summary and writes
        ``character_text_model.png`` via ``plot_model``.
    """
    window_shape = (X.shape[1], X.shape[2])
    model = Sequential()
    # single recurrent layer with 75 units reading the whole input window
    recurrent = LSTM(75, input_shape=window_shape)
    model.add(recurrent)
    # softmax over the vocabulary: one score per candidate next character
    classifier = Dense(vocab_size, activation='softmax')
    model.add(classifier)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    # report the architecture on stdout and as an image file
    model.summary()
    plot_model(model, show_shapes=True, to_file='character_text_model.png')
    return model


In [24]:
# build the model; its input shape is taken from the one-hot encoded X
model = define_model(X)






_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 75)                34200     
_________________________________________________________________
dense_1 (Dense)              (None, 38)                2888      
Total params: 37,088
Trainable params: 37,088
Non-trainable params: 0
_________________________________________________________________


In [25]:
# fit model for 100 epochs; verbose=2 prints one summary line per epoch.
# NOTE(review): no validation data is passed, so the reported accuracy is
# measured on the training sequences themselves.
model.fit(X, y, epochs=100, verbose=2)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/100
 - 2s - loss: 3.6246 - acc: 0.0702
Epoch 2/100
 - 0s - loss: 3.5600 - acc: 0.1880
Epoch 3/100
 - 0s - loss: 3.3581 - acc: 0.1905
Epoch 4/100
 - 0s - loss: 3.0755 - acc: 0.1905
Epoch 5/100
 - 0s - loss: 3.0286 - acc: 0.1905
Epoch 6/100
 - 0s - loss: 2.9963 - acc: 0.1905
Epoch 7/100
 - 0s - loss: 2.9735 - acc: 0.1905
Epoch 8/100
 - 0s - loss: 2.9655 - acc: 0.1905
Epoch 9/100
 - 0s - loss: 2.9425 - acc: 0.1905
Epoch 10/100
 - 0s - loss: 2.9231 - acc: 0.1905
Epoch 11/100
 - 0s - loss: 2.9079 - acc: 0.1905
Epoch 12/100
 - 0s - loss: 2.8768 - acc: 0.1905
Epoch 13/100
 - 0s - loss: 2.8454 - acc: 0.1905
Epoch 14/100
 - 0s - loss: 2.8093 - acc: 0.1930
Epoch 15/100
 - 0s - loss: 2.7736 - acc: 0.2080
Epoch 16/100
 - 0s - loss: 2.7253 - acc: 0.2256
Epoch 17/100
 - 0s - loss: 2.6700 - acc: 0.2381
Epoch 18/100
 - 0s - loss: 2.6234 - acc: 0.3058
Epoch 19/100
 - 0s - loss: 2.5791 - acc: 0.2607
Ep

<keras.callbacks.History at 0x1fdb8f32160>

In [26]:
# save the trained model to file so it can be reloaded later with load_model
model.save('character_text_model.h5')


In [27]:
# save the mapping
# We also save the mapping from characters to integers
# that we will need to encode any input when using the model
# and decode any output from the model.
# The context manager closes the file as soon as the dump finishes; the
# previous bare open() left the handle open until garbage collection.
with open('mapping.pkl', 'wb') as mapping_file:
    dump(mapping, mapping_file)


In [63]:
# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    """Extend ``seed_text`` with ``n_chars`` characters predicted by ``model``.

    At every step the text produced so far is integer-encoded, cut or padded
    to ``seq_length`` characters (keeping the most recent ones), one-hot
    encoded, and fed to the model; the highest-scoring class is mapped back
    to a character and appended.

    Args:
        model: Model whose ``predict`` takes a one-hot array of shape
            ``(1, seq_length, len(mapping))`` and returns one score per class.
        mapping: Dict from character to integer index.
        seq_length: Number of characters the model sees per prediction.
        seed_text: Text to start from.
        n_chars: Number of characters to generate.

    Returns:
        ``seed_text`` followed by the generated characters.

    Raises:
        KeyError: If the text contains a character that is not in ``mapping``.
    """
    # Invert the mapping once instead of scanning it for every generated
    # character. setdefault keeps the first character seen for an index,
    # matching the first-match behavior of a linear scan.
    index_to_char = {}
    for char, index in mapping.items():
        index_to_char.setdefault(index, char)
    n_classes = len(mapping)

    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # truncate sequences to a fixed length (keeps the most recent characters)
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # one hot encode
        encoded = to_categorical(encoded, num_classes=n_classes)
        # Force a (batch, timesteps, features) layout. Using -1 for the
        # timestep axis also covers the case where to_categorical collapses
        # a length-1 sequence axis and returns a 2-D array.
        encoded = encoded.reshape(1, -1, n_classes)
        # predict character: argmax over the class scores. This replaces
        # Sequential.predict_classes, which newer Keras releases removed.
        scores = model.predict(encoded, verbose=0)
        yhat = int(scores.argmax(axis=-1)[0])
        # reverse map integer to character and append to input;
        # an index missing from the mapping contributes nothing
        in_text += index_to_char.get(yhat, '')
    return in_text


In [29]:
# load the model back from the file written by model.save above;
# this rebinds `model`, replacing the in-memory trained instance
model = load_model('character_text_model.h5')


In [30]:
# load the mapping
# NOTE: unpickling can execute arbitrary code, so only load a mapping.pkl
# that this notebook (or another trusted source) wrote.
# The context manager closes the file once the mapping has been read.
with open('mapping.pkl', 'rb') as mapping_file:
    mapping = load(mapping_file)


In [64]:
print(generate_seq(model, mapping, 10, 'Sing a son', 20))


Sing a song of sixpence, A poc


In [47]:
# scratch walkthrough of the encoding steps inside generate_seq,
# starting from a 10-character seed
te="Sing a son"

In [50]:
# integer-encode the seed: one mapping index per character
c=[mapping[char] for char in te]

In [51]:
# wrap the encoded seed in a batch of one and fix its length at 10;
# 'pre' truncation would drop the oldest characters of a longer seed
e = pad_sequences([c], maxlen=10, truncating='pre')

In [52]:
e

array([[12, 23, 27, 21,  1, 15,  1, 32, 28, 27]])

In [53]:
e.shape

(1, 10)

In [54]:
# one-hot encode each index (rebinds e, so re-running this cell alone
# would encode the already one-hot array again)
e=to_categorical(e, num_classes=len(mapping))

In [55]:
e

array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0.

In [56]:
e.shape

(1, 10, 38)

In [61]:
# reshape to (1, timesteps, features); e already has shape (1, 10, 38)
# at this point, so the shape is unchanged here
e =e.reshape(1, e.shape[1],e.shape[2])

In [62]:
e.shape

(1, 10, 38)

In [65]:
# test mid-line: the seed is taken from the middle of the training rhyme
print(generate_seq(model, mapping, 10, 'king was i', 20))
# test not in original: the seed never occurs in the training text, though
# every character in it must still be a key of `mapping`
print(generate_seq(model, mapping, 10, 'hello worl', 20))


king was in his counting house
hello worls ;uu atteeett hhe i


In [66]:
print(generate_seq(model, mapping, 10, 'please ent', 20))

please ent he kbrerdw  aag bbb
