In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM , Dense , Dropout
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

from tensorflow.python.keras.optimizers import TFOptimizer

Using TensorFlow backend.


In [2]:
# load the text file and normalise it to lower case

filename = 'wonder.txt'
# the context manager closes the file handle even if reading raises
with open(filename, 'r', encoding='utf-8') as text_file:
    raw_txt = text_file.read()
raw_txt = raw_txt.lower()

In [3]:
# Map every distinct character of the corpus to an integer code.
# Sorting makes the assignment deterministic for a given text.

chars = sorted(set(raw_txt))
chars_to_int = {symbol: code for code, symbol in enumerate(chars)}
chars_to_int

{'\n': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 '$': 4,
 '%': 5,
 "'": 6,
 '(': 7,
 ')': 8,
 '*': 9,
 ',': 10,
 '-': 11,
 '.': 12,
 '/': 13,
 '0': 14,
 '1': 15,
 '2': 16,
 '3': 17,
 '4': 18,
 '5': 19,
 '6': 20,
 '7': 21,
 '8': 22,
 '9': 23,
 ':': 24,
 ';': 25,
 '?': 26,
 '@': 27,
 '[': 28,
 ']': 29,
 '_': 30,
 'a': 31,
 'b': 32,
 'c': 33,
 'd': 34,
 'e': 35,
 'f': 36,
 'g': 37,
 'h': 38,
 'i': 39,
 'j': 40,
 'k': 41,
 'l': 42,
 'm': 43,
 'n': 44,
 'o': 45,
 'p': 46,
 'q': 47,
 'r': 48,
 's': 49,
 't': 50,
 'u': 51,
 'v': 52,
 'w': 53,
 'x': 54,
 'y': 55,
 'z': 56}

In [4]:
# corpus length in characters and number of distinct characters
n_chars, n_vocab = len(raw_txt), len(chars)
print("No of characters: " , n_chars)
print("No of vocabulary: " , n_vocab)

No of characters:  163170
No of vocabulary:  57


In [5]:
# Build the training pairs with a sliding window: each input is seg_length
# consecutive characters (as integer codes) and the target is the integer
# code of the character that immediately follows the window.

seg_length = 100
datax = []
datay = []

for i in range(n_chars - seg_length):
    window_end = i + seg_length
    seg_in = raw_txt[i:window_end]
    seg_out = raw_txt[window_end]

    datax.append([chars_to_int[ch] for ch in seg_in])
    datay.append(chars_to_int[seg_out])

n_patterns = len(datax)
print(n_patterns)

163070


In [6]:
# arrange the integer windows as (samples, timesteps, features) with one feature per step
x = np.asarray(datax).reshape((n_patterns, seg_length, 1))
x

array([[[31],
        [42],
        [39],
        ...,
        [12],
        [ 1],
        [34]],

       [[42],
        [39],
        [33],
        ...,
        [ 1],
        [34],
        [45]],

       [[39],
        [33],
        [35],
        ...,
        [34],
        [45],
        [53]],

       ...,

       [[44],
        [34],
        [31],
        ...,
        [ 1],
        [31],
        [32]],

       [[34],
        [31],
        [50],
        ...,
        [31],
        [32],
        [45]],

       [[31],
        [50],
        [39],
        ...,
        [32],
        [45],
        [51]]])

In [7]:
# Scale the integer codes by the vocabulary size.
# The tensor is rebuilt from datax rather than dividing x by itself, so
# re-running this cell cannot scale the data a second time.
x = np.reshape(datax , (n_patterns , seg_length , 1)) / float(n_vocab)
x

array([[[0.54385965],
        [0.73684211],
        [0.68421053],
        ...,
        [0.21052632],
        [0.01754386],
        [0.59649123]],

       [[0.73684211],
        [0.68421053],
        [0.57894737],
        ...,
        [0.01754386],
        [0.59649123],
        [0.78947368]],

       [[0.68421053],
        [0.57894737],
        [0.61403509],
        ...,
        [0.59649123],
        [0.78947368],
        [0.92982456]],

       ...,

       [[0.77192982],
        [0.59649123],
        [0.54385965],
        ...,
        [0.01754386],
        [0.54385965],
        [0.56140351]],

       [[0.59649123],
        [0.54385965],
        [0.87719298],
        ...,
        [0.54385965],
        [0.56140351],
        [0.78947368]],

       [[0.54385965],
        [0.87719298],
        [0.68421053],
        ...,
        [0.56140351],
        [0.78947368],
        [0.89473684]]])

In [8]:
# one hot encode the output variable
# num_classes is pinned to the vocabulary size: left to inference it becomes
# max(datay) + 1, which is too small whenever the highest-coded character
# never occurs as a target.
y = np_utils.to_categorical(datay , num_classes = n_vocab)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [9]:
# Character model: one 256-unit LSTM over the input window, 20% dropout,
# then a softmax layer with one output per column of y.
model = Sequential([
    LSTM(256, input_shape=x.shape[1:]),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy')

In [10]:
# Checkpoint callback: writes the weights whenever the monitored training
# loss reaches a new minimum; the file name records the epoch and the loss.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [11]:
# Train the model. The checkpoint callback must be passed in explicitly,
# otherwise no weight files are written for the generation step to load.
model.fit(x , y , epochs = 10 , batch_size = 128 , callbacks = callbacks_list)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1d76f6eda08>

### Generate Text

In [17]:
# Restore previously saved weights into the network.
# NOTE(review): the file name is hard-coded and must match a checkpoint that
# exists on disk — confirm it against the files produced by training.

filename = "weights-improvement-01-3.0620.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [21]:
# reverse lookup: integer code -> character (position in the sorted chars list)
int_to_char = dict(enumerate(chars))

In [22]:
# pick random seed

# randint's upper bound is exclusive, so len(datax) makes every pattern
# (including the last one) eligible
start = np.random.randint(0 , len(datax))
# copy the window so later edits to `pattern` can never alter datax
pattern = list(datax[start])
print("Seed")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

Seed
" g at the hatter,
who turned pale and fidgeted.

'give your evidence,' said the king; 'and don't be n "


In [None]:
# generate Characters
#
# Repeatedly feed the current window to the network, emit the most likely
# next character, and slide the window forward by one position.

import sys

n_generate = 1000  # number of characters to emit

# work on a private copy so the seed list stored in datax is never modified
pattern = list(pattern)

for _ in range(n_generate):
    # a separate name keeps the training tensor `x` intact for later re-runs
    net_input = np.reshape(pattern , (1 , len(pattern) , 1)) / float(n_vocab)

    prediction = model.predict(net_input , verbose = 0)

    # greedy decoding; int() keeps the window made of plain Python ints
    index = int(np.argmax(prediction))
    result = int_to_char[index]

    sys.stdout.write(result)
    # drop the oldest code and add the prediction, building a new list
    # instead of mutating the existing one in place
    pattern = pattern[1:] + [index]


print("\nDONE")

                                                                                                                                                                                                                                                                                                                          