In [4]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM, Dropout
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import keras
import random
import numpy as np

# text generation using RNN - how to make a better prediction character by character 

In [5]:
text=open('datasets/SherlockHolmes.txt').read().lower()
print('given script has '+ str(len(text))+' characters')

given script has 581862 characters


# strip the dataset and perform basic perprocessing 

In [6]:
text=text[1302:]
for ch in ['0','1', '2', '3', '4', '5', '6', '7', '8', '9', '!', '"', '$', '%', '&', '~', '`', '(', ')', '*',
          '-', '/', ';', '@', '?', ':', '©', '¢', 'ã', '\xa0', '\n', '\r', '.']:
    if ch in text:
        text=text.replace(ch,' ')
print(set(text))

{'t', 'é', 'd', 'e', 'x', 'i', 'w', 'm', 'h', 'u', 'o', 'â', 'z', "'", 'p', 'l', 'à', 'n', 'f', 'g', 'r', 'v', 's', 'q', ',', 'b', 'a', 'j', 'c', ' ', 'y', 'k', 'è'}


# create sliding window function, all the characters inside the window are treated as input 
- window size =50 and step size =3

In [7]:
def window_transform(text,window,step_size):
    input=[]
    output=[]
    n_batches=int((len(text)-window)/step_size)

    for i in range(n_batches-1):
        a=text[i*step_size:(i*step_size)+window]
        # print('a '+str(i*step_size)+':'+str((i*step_size)+window))
        input.append(a)
        b=text[(i*step_size)+window]
        # print('b '+str((i*step_size)+window))
        output.append(b)
    return input,output


window=50
step_size=3
input,output = window_transform(text,window,step_size)
# print(input,output)

In [29]:
input[6], output[6]

('s and predominates the whole of her sex  it was no', 't')

# sort the output of set(text) and map to unique numerical value 

In [8]:
#sort 
chars = sorted(list(set(text)))
#encoding 
char_to_indices = dict((c,i) for i,c in enumerate(chars))
#decoding
indices_to_chars= dict((i,c) for i,c in enumerate(chars))



# we have each character mapped to a numerical value, we need to transform the input/output vector in the same numerical format 

In [20]:
def encode_io_pairs(text,window,step_size):
    num_chars = len(chars)
    inputs,outputs=window_transform(text,window,step_size)
    #create empty vessels for one-hot encoded input/output
    x=np.zeros((len(inputs),window,num_chars),dtype=bool)
    y=np.zeros((len(inputs),num_chars),dtype=bool)
   
    #loop over inputs/outputs and transform and store in x/y
    for i,sentence in enumerate(inputs):
        for t,char in enumerate(sentence):
            x[i,t,char_to_indices[char]]=1
        y[i, char_to_indices[outputs[i]]]=1
    return x,y        

x,y = encode_io_pairs(text,window,step_size)
print(x.shape,y.shape)

(193502, 50, 33) (193502, 33)


# build LSTM network starting with starting with the first layer having 120 nodes followed by a fully connected linear layer and a softmax layer 

In [31]:
model = Sequential([
    keras.layers.LSTM(120,input_shape=(window,len(chars))),
    keras.layers.Dropout(0.22),
    keras.layers.Dense(len(chars), activation='linear'),
    keras.layers.Dense(len(chars), activation='softmax')
])

#compiling the model
model.compile(loss='categorical_crossentropy',optimizer='adam')
xsmall=x[:20000,:,:]
ysmall=y[:20000,:]
model.fit(xsmall,ysmall,batch_size=500,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x269a23b8d30>

- top proceed with prediction
  - our script accepts window size of 50 and takes the output as the 51st character
  - we need to predict a character
  - later remove the first charecter from our previous window and add the newly predicted character at the end making it still a window of size 50
  - predict the second character and keep following the process

In [32]:
def predict_next_chars(model,input_chars,num_to_predict):
    pred_chars=''
    for i in range(num_to_predict):
        #convert this round's predicted charaters to numberical input
        x_test=np.zeros((1,window,len(chars)))
        for t, char in enumerate(input_chars):
            x_test[0,t,char_to_indices[char]] = 1

        #make this round's prediction 
        test_predict = model.predict(x_test,verbose=0)[0]

        #translate the numerical prediction back to characters
        r=np.argmax(test_predict)
        d=indices_to_chars[r]

        #update predicted chars and input
        pred_chars+=d
        input_chars+=d
        input_chars = input_chars[1:]
    return pred_chars


start=89
num_to_predict=10
input_chars = text[start:start+window]
print('complete sequence:', text[start:start+window+num_to_predict])
print('input sequence:', input_chars)
print('output sequence:', predict_next_chars(model,input_char,num_to_predict=num_to_predict))

complete sequence: otion akin to love for irene adler  all emotions, and that o
input sequence: otion akin to love for irene adler  all emotions, 
output sequence: an  ou the
