In [3]:
!pip install tensorflow



In [4]:
import numpy as np
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense,Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Using TensorFlow backend.


In [5]:
#load data
file=open("frankenstein.txt").read()

In [6]:
#tokenize and standardize
def tok_words(input):
    input=input.lower()
    tokenizer=RegexpTokenizer(r'\w+')
    tokens=tokenizer.tokenize(input)
    fil=filter(lambda token:token not in stopwords.words('english'),tokens)
    return "".join(fil)
processed_inputs=tok_words(file)
 

In [7]:
#char to number
chars=sorted(list(set(processed_inputs)))
c_to_n=dict((c,i) for i,c in enumerate(chars))

In [8]:
input_len=len(processed_inputs)
vocab_len=len(chars)
print("Total number of characters: ",input_len)
print("Total vocab: ",vocab_len)

Total number of characters:  220850
Total vocab:  41


In [9]:
#seq length
seq_len=100
x_data=[]
y_data=[]

In [10]:
#loop through sequence
for i in range(0, input_len-seq_len, 1):
    in_seq=processed_inputs[i:i + seq_len]
    out_seq=processed_inputs[i + seq_len]
    x_data.append([c_to_n[char] for char in in_seq])
    y_data.append(c_to_n[out_seq])
    
n_patterns=len(x_data)
print("Total Patterns: ",n_patterns)

Total Patterns:  220750


In [11]:
#input sequence to np array
X=np.reshape(x_data,(n_patterns,seq_len,1))
X=X/float(vocab_len)

In [12]:
#one-hot encoding
y=np_utils.to_categorical(y_data)

In [14]:
#create model
model=Sequential()
model.add(LSTM(256,input_shape=(X.shape[1],X.shape[2]),return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [16]:
#compile the model
model.compile(loss='categorical_crossentropy',optimizer='adam')

In [17]:
#saving weights
filepath="model_weights_saved.hdf5"
checkpoint=ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks=[checkpoint]

In [30]:
#fit the model and train
model.fit(X,y, epochs=10,batch_size=256,callbacks=desired_callbacks)

Epoch 1/10

Epoch 00001: loss improved from 2.83619 to 2.79319, saving model to model_weights_saved.hdf5
Epoch 2/10

Epoch 00002: loss improved from 2.79319 to 2.73680, saving model to model_weights_saved.hdf5
Epoch 3/10

Epoch 00003: loss improved from 2.73680 to 2.67460, saving model to model_weights_saved.hdf5
Epoch 4/10

Epoch 00004: loss improved from 2.67460 to 2.61300, saving model to model_weights_saved.hdf5
Epoch 5/10

Epoch 00005: loss improved from 2.61300 to 2.55661, saving model to model_weights_saved.hdf5
Epoch 6/10

Epoch 00006: loss improved from 2.55661 to 2.50755, saving model to model_weights_saved.hdf5
Epoch 7/10

Epoch 00007: loss improved from 2.50755 to 2.46342, saving model to model_weights_saved.hdf5
Epoch 8/10

Epoch 00008: loss improved from 2.46342 to 2.42697, saving model to model_weights_saved.hdf5
Epoch 9/10

Epoch 00009: loss improved from 2.42697 to 2.39416, saving model to model_weights_saved.hdf5
Epoch 10/10

Epoch 00010: loss improved from 2.39416 to

<keras.callbacks.History at 0x7f09dc060550>

In [32]:
#recompile model with saved weights
filename="model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy',optimizer='adam')

In [33]:
#output of model back to chars
n_to_c=dict((i,c) for i,c in enumerate(chars))

In [34]:
#random seed to help generate
start=np.random.randint(0,len(x_data)-1)
pattern=x_data[start]
print("Random seed: ")
print("\"", ''.join([n_to_c[value] for value in pattern]),"\"")

Random seed: 
" edadvancedreadingthousandnamelessevilsmadetremblealthoughunabledefineremainedtwodayslausannepainfuls "


In [35]:
#generate the text
for i in range(1000):
    x=np.reshape(pattern, (1,len(pattern),1))
    x=x/float(vocab_len)
    pred=model.predict(x,verbose=0)
    index=np.argmax(pred)
    result=n_to_c[index]
    seq_in=[n_to_c[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern=pattern[1:len(pattern)]


alessedtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredtertedsertedsestrnedseatedaredter