In [1]:
def read_file(filepath, encoding=None):
    """Return the entire contents of a text file as one string.

    Parameters
    ----------
    filepath : str or path-like
        Location of the file to read.
    encoding : str, optional
        Text encoding used to decode the file (for example ``"utf-8"``).
        The default ``None`` keeps the platform's default encoding, so
        existing callers behave exactly as before; pass an explicit value
        to make the read reproducible across machines.

    Returns
    -------
    str
        The decoded file contents.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [2]:
import spacy

In [3]:
# Load the small English pipeline with the parser, tagger and NER components
# disabled; the cells below only read token.text, so tokenisation is enough.
nlp = spacy.load("en_core_web_sm", disable=['parser','tagger', 'ner'])

In [4]:
# Raise spaCy's per-document character limit so the whole text can be passed
# to nlp() in a single call.
# NOTE(review): 1198623 is presumably sized to the input file — confirm.
nlp.max_length = 1198623

In [9]:
def separate_punc(doc_text):
    """Tokenise ``doc_text`` with ``nlp`` and return the lowercased tokens,
    leaving out punctuation and whitespace tokens.

    A token is discarded when its text occurs as a substring of the filter
    string below (this is a string ``in`` test, not a per-character set
    lookup), which is how multi-character tokens such as ``--`` or runs of
    newlines are removed as well.
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '

    kept = []
    for token in nlp(doc_text):
        if token.text in unwanted:
            continue
        kept.append(token.text.lower())
    return kept

In [10]:
# Read the whole text file into a single string.
d = read_file('datasets/melville-moby_dick.txt')

In [11]:
# Tokenise the text into lowercased tokens with punctuation/whitespace removed.
tokens = separate_punc(d)

In [13]:
# Number of tokens kept after filtering.
len(tokens)

215339

In [None]:
## Create the sequences

In [14]:
# Organize the token stream into fixed-length, overlapping windows.
# Each window holds 25 context words followed by 1 target word.
train_len = 25 + 1

# The window that ends just before index i is tokens[i - train_len:i];
# consecutive windows are shifted by one token.
# NOTE(review): the range stops at len(tokens) - 1, so the window ending on
# the very last token is not produced — confirm whether that is intended.
text_sequences = [tokens[i - train_len:i] for i in range(train_len, len(tokens))]

In [16]:
from keras.preprocessing.text import Tokenizer


In [20]:
# Build the word -> integer index vocabulary from the token windows.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [25]:
# Replace every word in every window with its integer index from the tokenizer.
sequences = tokenizer.texts_to_sequences(text_sequences)

In [26]:
# Convert the list of index sequences into a numpy array (one row per window).
# NOTE: this rebinds `sequences` from a list to an ndarray; the original list
# is no longer reachable under this name afterwards.
import numpy as np
sequences = np.array(sequences)

In [27]:
# Inspect the encoded windows (numpy abbreviates the display of large arrays).
sequences

array([[  159,  9473, 17255, ...,   218,   446,     5],
       [ 9473, 17255,   406, ...,   446,     5,  1180],
       [17255,   406,    42, ...,     5,  1180,    42],
       ...,
       [  240,   946,   354, ...,  1431,  1327,    74],
       [  946,   354,  1430, ...,  1327,    74,   219],
       [  354,  1430,     3, ...,    74,   219,   220]])

In [28]:
from keras.utils import to_categorical

In [None]:
# Split each sequence: every column except the last is a feature, the last column is the target

In [32]:
# Number of distinct words seen by the tokenizer. Keras word indices start at 1
# (index 0 is reserved), which is why later cells use vocabulary_size + 1.
vocabulary_size = len(tokenizer.word_counts)

In [33]:
# Features: every column except the last, i.e. the context words of each window.
X = sequences[:,:-1]

In [34]:
# Target: the last column, i.e. the word that follows the context.
y = sequences[:,-1]

In [36]:
# One-hot encode the targets; +1 because word indices start at 1, so the
# largest index equals vocabulary_size.
# NOTE(review): this materialises a dense (n_samples, vocabulary_size + 1)
# matrix, which is memory-hungry for a large vocabulary — consider integer
# targets with sparse_categorical_crossentropy instead.
y = to_categorical(y, num_classes =  vocabulary_size + 1)

In [37]:
# Number of context words per sample (the column count of X).
seq_len = X.shape[1]

In [39]:
# Sanity check: (number of samples, context length).
X.shape

(215313, 25)

In [41]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [47]:
def create_model(vocabulary_size, seq_len):
    """Assemble, compile and summarise the next-word prediction network.

    Parameters
    ----------
    vocabulary_size : int
        Size of the embedding input and of the final softmax layer.
    seq_len : int
        Length of each input sequence. It is also used as the embedding
        vector size, and twice its value as the width of the first LSTM.

    Returns
    -------
    The compiled Sequential model. Its summary is printed as a side effect.
    """
    layers = [
        # Map each word index to a dense vector of size seq_len.
        Embedding(vocabulary_size, seq_len, input_length=seq_len),
        # The first LSTM returns the full sequence so a second LSTM can be stacked.
        LSTM(seq_len * 2, return_sequences=True),
        LSTM(50),
        Dense(50, activation='relu'),
        # One output probability per vocabulary entry.
        Dense(vocabulary_size, activation='softmax'),
    ]
    net = Sequential(layers)

    net.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    net.summary()

    return net
    

In [48]:
# Build the network; +1 so that the largest word index (word indices start at 1)
# fits inside the embedding and softmax layers.
model = create_model(vocabulary_size+1,seq_len )

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 25, 25)            431400    
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 50)            15200     
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_2 (Dense)              (None, 17256)             880056    
Total params: 1,349,406
Trainable params: 1,349,406
Non-trainable params: 0
_________________________________________________________________


In [49]:
from pickle import dump,load

In [50]:
# Train on the full set of windows.
# NOTE(review): only 2 epochs are run — presumably a quick demonstration;
# generations from such a short run are likely to be repetitive.
model.fit(X, y, batch_size=128, epochs=2, verbose=1)

Instructions for updating:
Use tf.cast instead.
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x7f89027becd0>

In [51]:
# Save the trained model to an HDF5 file so it can be reloaded without retraining.
model.save('my_moby.h5')

In [53]:
# Persist the fitted tokenizer alongside the model so text generation can reuse
# the exact same word <-> index mapping. The context manager guarantees the
# file handle is flushed and closed once the pickle has been written.
with open('my_moby_tokenizer', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [54]:
from keras.preprocessing.sequence import pad_sequences

In [64]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    """Generate ``num_gen_words`` words that continue ``seed_text``.

    At every step the running text is encoded with ``tokenizer``, padded or
    truncated (from the front) to ``seq_len`` indices, and fed to ``model``;
    the most probable next word is appended to the running text and to the
    result. Decoding is greedy: the highest-scoring class is always taken.

    Parameters
    ----------
    model : trained Keras model returning one score per vocabulary index.
    tokenizer : fitted Keras ``Tokenizer`` used to train ``model``.
    seq_len : int
        Number of word indices the model expects as input.
    seed_text : str
        Space-separated text used as the starting context.
    num_gen_words : int
        How many words to generate.

    Returns
    -------
    str
        The generated words joined by single spaces (the seed is not included).
    """
    output_text = []
    input_text = seed_text

    for _ in range(num_gen_words):
        # Encode the running text the same way the training data was encoded.
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]

        # Left-pad short inputs and drop the oldest words of long ones so the
        # model always receives exactly seq_len indices.
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Most probable class for the single sample in the batch. argmax over
        # predict() is used because Sequential.predict_classes is not available
        # in newer Keras/TensorFlow releases; for a softmax output it yields
        # the same index.
        pred_word_ind = model.predict(pad_encoded, verbose=0).argmax(axis=-1)[0]

        # Map the index back to its word.
        pred_word = tokenizer.index_word[pred_word_ind]

        # Extend the context for the next prediction.
        input_text += ' ' + pred_word
        output_text.append(pred_word)

    return ' '.join(output_text)

In [65]:
# Inspect the first training window (context words followed by the target word).
text_sequences[0]

['chapter',
 '1',
 'loomings',
 'call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to']

In [66]:
# Use the first window, joined back into a space-separated string, as the seed.
seed_text = ' '.join(text_sequences[0])

In [67]:
# Show the seed string that will be fed to the generator.
seed_text

'chapter 1 loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to'

In [68]:
# Generate 25 words that continue the seed text.
generate_text(model, tokenizer, seq_len, seed_text=seed_text, num_gen_words=25)

'the whale of the whale of the whale of the whale of the whale of the whale of the whale of the whale of the'